-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_prep.py
More file actions
41 lines (28 loc) · 1.16 KB
/
data_prep.py
File metadata and controls
41 lines (28 loc) · 1.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import pandas as pd
import numpy as np
# Announce progress, then read the raw records. The path is relative, so it is
# resolved against the current working directory.
print("Loading dataset...")
raw_csv_path = 'dataset.csv'
df = pd.read_csv(raw_csv_path)
# Columns that are coerced to numbers and imputed below.
numeric_cols = ['duration_min', 'sleep_hours', 'energy_level', 'stress_level']
for col in numeric_cols:
    # errors='coerce' turns unparseable entries into NaN so they are imputed
    # together with the values that were missing to begin with.
    coerced = pd.to_numeric(df[col], errors='coerce')
    # The median is taken after coercion, so only parseable values feed it.
    df[col] = coerced.fillna(coerced.median())
# Text columns whose gaps get an explicit placeholder instead of NaN.
text_cols = ['previous_day_mood', 'face_emotion_hint', 'reflection_quality']
# One column-wise fill over the listed columns; other columns are untouched.
df[text_cols] = df[text_cols].fillna('unknown')
# Split the raw 'time_of_day' column into a numeric part ('time_numeric') and a
# lower-cased, typo-corrected label ('time_period'), then drop the raw column.
raw_time = df['time_of_day']
# Stringify only the values that are present. A bare astype(str) can turn a
# missing value into the literal text 'nan'; its letters would then be
# extracted as the period and slip past the 'unknown' fallback below.
time_text = raw_time.astype(str).where(raw_time.notna())

# First run of digits in each value; rows with no digits (or no value) get 0.
df['time_numeric'] = (
    time_text.str.extract(r'(\d+)', expand=False).astype(float).fillna(0)
)

# Misspelled period labels mapped to the spelling they are replaced with.
typo_fixes = {
    'moming': 'morning',
    'maming': 'morning',
    'marring': 'morning',
    'attemoon': 'afternoon',
    'nicht': 'night',
}
# First run of ASCII letters in each value, lower-cased before the typo lookup.
time_period = time_text.str.extract(r'([a-zA-Z]+)', expand=False).str.lower()
# Rows with no letters at all (including missing rows) fall back to 'unknown'.
df['time_period'] = time_period.replace(typo_fixes).fillna('unknown')

# The raw column is fully represented by the two derived columns above.
df = df.drop(columns=['time_of_day'])
# Write the cleaned frame without the index column, then print the per-column
# count of values that are still missing.
df.to_csv('dataset_cleaned.csv', index=False)
print("\nCleaning complete! Saved as 'dataset_cleaned.csv'.")

remaining_missing = df.isna().sum()
print("\n--- Any missing values left? ---")
print(remaining_missing)