-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocess.py
More file actions
35 lines (30 loc) · 1.23 KB
/
Copy pathpreprocess.py
File metadata and controls
35 lines (30 loc) · 1.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from config import FEATURES
def drop_missing(df, limit=0.5):
    """Drop rows that are missing too many of the configured feature columns.

    Only the entries of ``FEATURES`` that are actual columns of ``df`` are
    taken into account. A row is kept when the share of those columns that
    is missing does not exceed ``limit``.

    Args:
        df: DataFrame to filter. It is not modified; ``dropna`` returns a
            new frame.
        limit: Largest fraction of the available feature columns a row may
            be missing and still be kept.

    Returns:
        The filtered DataFrame.
    """
    feature_cols = [feature for feature in FEATURES if feature in df.columns]
    n_before = len(df)
    # Round the number of tolerated gaps DOWN, so a kept row never misses
    # more than `limit` of its features. Truncating the required count
    # instead (int(n * (1 - limit))) lets e.g. 2-of-3 missing through at
    # limit=0.5 and yields thresh=0 for a single feature. The epsilon absorbs
    # float noise such as 0.29 * 100 == 28.999999999999996.
    max_missing = int(len(feature_cols) * limit + 1e-9)
    min_present = len(feature_cols) - max_missing
    df = df.dropna(subset=feature_cols, thresh=min_present)
    print(f'\n[preprocess] | Dropped {n_before - len(df)} tracks with insuficient features; {len(df)} tracks still left.')
    return df
def fill_missing(df):
    """Fill gaps in the configured feature columns with each column's median.

    Only the entries of ``FEATURES`` that are actual columns of ``df`` are
    touched; every other column is passed through unchanged.

    Args:
        df: DataFrame whose feature columns may contain missing values.
            It is not modified.

    Returns:
        A copy of ``df`` in which missing feature values are replaced by the
        median of their column.
    """
    feature_cols = [feature for feature in FEATURES if feature in df.columns]
    # Work on a copy so the caller's frame is never mutated as a side effect.
    df = df.copy()
    if not feature_cols:
        # Nothing to impute; skip the empty-column assignment altogether.
        return df
    df[feature_cols] = df[feature_cols].fillna(df[feature_cols].median())
    return df
def normalise(df):
    """Min-max scale the configured feature columns.

    The entries of ``FEATURES`` that are columns of ``df`` are run through a
    ``MinMaxScaler`` built with its default settings and fitted on this very
    data. The input frame is left alone; the scaling happens on a copy.

    Args:
        df: DataFrame holding the feature columns to rescale.

    Returns:
        A ``(scaled_df, scaler)`` tuple: the copy with rescaled feature
        columns, and the fitted scaler.
    """
    columns = [name for name in FEATURES if name in df.columns]
    scaled = df.copy()
    scaler = MinMaxScaler()
    rescaled_values = scaler.fit_transform(scaled[columns])
    scaled[columns] = rescaled_values
    print(f'\n[preprocess] | Normalised {len(columns)} features.')
    return scaled, scaler
def feature_matrix(df):
    """Return the feature columns of ``df`` as a NumPy array.

    Columns are taken in the order they appear in ``FEATURES``; entries of
    ``FEATURES`` that are not columns of ``df`` are skipped.

    Args:
        df: DataFrame to extract the feature values from.

    Returns:
        The ``.values`` array of the selected columns.
    """
    available = df.columns
    selected = [name for name in FEATURES if name in available]
    features_only = df[selected]
    return features_only.values
def full_preprocess(df):
    """Run the whole preprocessing pipeline on ``df``.

    Steps, in order: ``drop_missing`` -> ``fill_missing`` -> ``normalise`` ->
    ``feature_matrix``.

    Args:
        df: Raw DataFrame to preprocess.

    Returns:
        A ``(df, X, scaler)`` tuple: the frame returned by ``normalise``, the
        array ``feature_matrix`` builds from that frame, and the scaler
        returned by ``normalise``.
    """
    cleaned = fill_missing(drop_missing(df))
    normalised, scaler = normalise(cleaned)
    return normalised, feature_matrix(normalised), scaler