# osha_ai_app/text_processing.py
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer

class TextProcessor:
    """Turn free-text incident columns into TF-IDF feature columns.

    ``fit`` learns the TF-IDF vocabulary from the columns listed in
    ``text_cols``; ``transform`` replaces those columns with one numeric
    column per learned term. Both methods build their input through the
    same ``_combined_text`` helper, so the text seen at fit time and at
    transform time is always cleaned identically.
    """

    # Everything except lowercase ASCII letters and whitespace is deleted
    # (not replaced by a space), so e.g. "co-worker" becomes "coworker".
    _NON_LETTERS = re.compile(r'[^a-z\s]')

    def __init__(self):
        # Hyperparameters are pinned: per the original author's note they
        # must stay exactly the same as in the training notebook.
        self.vectorizer = TfidfVectorizer(
            max_features=500,
            min_df=5,
            max_df=0.90,
            stop_words='english'
        )

        # Columns whose contents are concatenated (in this order) into the
        # single document that is vectorized for each row.
        self.text_cols = [
            'job_description',
            'NEW_NAR_WHAT_HAPPENED',
            'NEW_NAR_BEFORE_INCIDENT',
            'NEW_INCIDENT_LOCATION',
            'NEW_NAR_INJURY_ILLNESS',
            'NEW_NAR_OBJECT_SUBSTANCE',
            'NEW_INCIDENT_DESCRIPTION'
        ]

    # -------------------------
    # SHARED CLEANING
    # -------------------------
    def _combined_text(self, df: pd.DataFrame) -> pd.Series:
        """Return one cleaned text document per row of *df*.

        Missing values become empty strings, the text columns are joined
        with single spaces, the result is lower-cased and every character
        that is not ``a-z`` or whitespace is removed.

        Raises:
            KeyError: if any column in ``text_cols`` is absent from *df*.
        """
        missing = [col for col in self.text_cols if col not in df.columns]
        if missing:
            raise KeyError(f"Missing required text column(s): {missing}")

        parts = df[self.text_cols].fillna('').astype(str)

        # Join row by row in plain Python: this also works for a zero-row
        # frame, where there is simply nothing to join.
        joined = [
            ' '.join(row)
            for row in parts.itertuples(index=False, name=None)
        ]
        text = pd.Series(joined, index=df.index, dtype=object)

        return text.str.lower().apply(
            lambda doc: self._NON_LETTERS.sub('', doc)
        )

    # -------------------------
    # FIT
    # -------------------------
    def fit(self, df: pd.DataFrame) -> "TextProcessor":
        """Learn the TF-IDF vocabulary from the text columns of *df*.

        *df* is not modified. Returns ``self`` so calls can be chained.
        """
        self.vectorizer.fit(self._combined_text(df))
        return self

    # -------------------------
    # TRANSFORM
    # -------------------------
    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Replace the text columns of *df* with TF-IDF feature columns.

        The returned frame is a new object with a fresh ``0..n-1`` index:
        the non-text columns of *df* come first, followed by one column per
        vocabulary term. *df* itself is left untouched. A zero-row *df*
        yields a zero-row frame with the same columns.

        Raises:
            KeyError: if any column in ``text_cols`` is absent from *df*.
        """
        corpus = self._combined_text(df)
        feature_names = self.vectorizer.get_feature_names_out()

        if len(corpus) == 0:
            # Nothing to vectorize; build the empty feature block directly
            # instead of asking the vectorizer to transform zero documents.
            tfidf_df = pd.DataFrame(
                index=pd.RangeIndex(0),
                columns=feature_names,
                dtype='float64'
            )
        else:
            tfidf_df = pd.DataFrame(
                self.vectorizer.transform(corpus).toarray(),
                columns=feature_names
            )

        # Both sides must share a positional index before the column-wise
        # concat, otherwise rows would be aligned on the caller's labels.
        # 'combined_text' is dropped too (if present) so the output columns
        # stay the same as in earlier versions, which used that name as a
        # scratch column and removed it afterwards.
        kept = df.drop(
            columns=self.text_cols + ['combined_text'],
            errors='ignore'
        ).reset_index(drop=True)

        return pd.concat([kept, tfidf_df], axis=1)