forked from VikashBasfore/osha_ai_app
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathencoding.py
More file actions
77 lines (59 loc) · 1.95 KB
/
Copy pathencoding.py
File metadata and controls
77 lines (59 loc) · 1.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import pandas as pd
from sklearn.preprocessing import LabelEncoder
class Encoder:
    """Encode incident records held in a pandas DataFrame.

    ``fit`` learns three things from a training frame:

    * occurrence counts for every value of the frequency-encoded columns,
    * the label mapping for ``incident_outcome`` (through ``LabelEncoder``),
    * the exact output column layout produced by encoding the training frame.

    ``transform`` applies those learned encodings to a frame and, once the
    encoder is fitted, aligns the result to the locked column layout.

    Attributes:
        freq_maps: Maps a column name to the ``value_counts()`` Series
            learned for it in ``fit``.
        label_encoder: ``LabelEncoder`` fitted on ``incident_outcome``.
        columns: Column index locked by ``fit``; ``None`` before fitting.
    """

    # Columns discarded before any encoding happens.
    _DROP_COLUMNS = [
        'date_of_incident',
        'establishment_name',
        'street_address',
        'city',
        'naics_code',
    ]

    # Columns whose values are replaced by their training-set counts.
    _FREQUENCY_COLUMNS = [
        'industry_description',
        'soc_description',
        'company_name',
        'soc_code',
    ]

    # Columns expanded into indicator (dummy) columns.
    _ONE_HOT_COLUMNS = [
        'shift', 'size_category', 'season', 'establishment_type',
        'soc_reviewed', 'time_unknown', 'industry_sector',
        'type_of_incident', 'naics_year', 'soc_group',
    ]

    _TARGET_COLUMN = 'incident_outcome'

    def __init__(self):
        self.freq_maps = {}
        self.label_encoder = LabelEncoder()
        self.columns = None

    def fit(self, df: pd.DataFrame) -> "Encoder":
        """Learn the encodings and lock the output column layout.

        Args:
            df: Training frame. It must contain the frequency-encoded
                columns, the one-hot columns and ``incident_outcome``.

        Returns:
            This encoder, to allow call chaining.

        Raises:
            KeyError: If a required column is missing from ``df``.
        """
        # Forget any earlier fit first. Otherwise the ``transform`` call
        # below would align the new training frame to the old layout.
        self.columns = None

        # ``value_counts`` does not modify ``df``, so no copy is needed here;
        # ``transform`` removes the discarded columns itself.
        self.freq_maps = {
            col: df[col].value_counts() for col in self._FREQUENCY_COLUMNS
        }

        self.label_encoder.fit(df[self._TARGET_COLUMN])

        # Encode the training frame once to record the resulting columns.
        self.columns = self.transform(df).columns
        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return an encoded copy of ``df``; the input frame is not modified.

        Args:
            df: Frame to encode. Columns from the discard list may be
                present or absent. ``incident_outcome`` is label-encoded
                only when present.

        Returns:
            The encoded frame. After ``fit`` it has exactly the locked
            columns, in the locked order.

        Raises:
            KeyError: If a frequency-encoded or one-hot column is missing.

        Any error raised by the label encoder (for example for labels it was
        not fitted on) is propagated unchanged.

        NOTE(review): the locked layout includes ``incident_outcome`` when
        the training frame had it, so a frame without that column gets it
        back filled with 0 -- confirm callers drop it before prediction.
        """
        fitted = self.columns is not None

        # ``drop`` returns a new frame, so the assignments below never
        # touch the caller's data.
        out = df.drop(columns=self._DROP_COLUMNS, errors='ignore')

        # Replace each value by its training count; unseen values become 0.
        for col, counts in self.freq_maps.items():
            out[col] = out[col].map(counts).fillna(0)

        # The baseline level is dropped only while ``fit`` is locking the
        # layout. Afterwards every level present in the batch is emitted and
        # the alignment step removes the training baseline (and any level
        # never seen in training). Dropping "the first level of this batch"
        # here would encode small batches wrongly: a one-row frame would
        # lose its only level and have it filled back in as 0.
        out = pd.get_dummies(
            out,
            columns=list(self._ONE_HOT_COLUMNS),
            drop_first=not fitted,
            sparse=True,
        )

        if self._TARGET_COLUMN in out.columns:
            out[self._TARGET_COLUMN] = self.label_encoder.transform(
                out[self._TARGET_COLUMN]
            )

        if fitted:
            # Add missing columns as 0, discard extras, restore the order.
            out = out.reindex(columns=self.columns, fill_value=0)

        return out