-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtagger.py
More file actions
104 lines (93 loc) · 3.82 KB
/
Copy pathtagger.py
File metadata and controls
104 lines (93 loc) · 3.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import numpy as np
import os
import nltk
from nltk.corpus import stopwords
stop = stopwords.words('english')
import pandas as pd
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
import string
from nltk import ngrams
from collections import Counter
# One-time environment bootstrap: flip `setup` to True to fetch the NLTK
# resources this module relies on, then set it back to False.
# NOTE: `stopwords.words('english')` is evaluated in the import section above,
# so the stopwords corpus must already be installed for the module to get this
# far; on a completely fresh machine download it once from a Python shell.
setup = False
if setup:
    nltk.download('stopwords')
    nltk.download('averaged_perceptron_tagger')
    # word_tokenize() relies on the Punkt tokenizer models, which the original
    # setup step did not fetch.
    nltk.download('punkt')
class TagGenerator:
    """Derive short descriptive tags for products from their review texts.

    Reviews are cleaned (POS filtering, punctuation and stop-word removal),
    the remaining words are pooled, and the most frequent uni-, bi- and
    tri-grams are combined into a tag list.
    """

    # Conversational filler words removed in addition to the NLTK stop-words.
    _EXTRA_WORDS = "34 almost a about all and are as at back be because been but can can't come could did didn't do don't for from get go going good got had have he her here he's hey him his how I if I'll I'm in is it it's just know like look me mean my no not now of oh OK okay on one or out really right say see she so some something tell that that's the then there they think this time to up want was we well were what when who why will with would yeah yes you your you're"

    # Translation table that deletes every ASCII punctuation character.
    _PUNCT_TABLE = str.maketrans("", "", string.punctuation)

    # Number of tag columns written per product by `collect`.
    _TAGS_PER_PRODUCT = 12

    def __init__(self):
        """Set up the word filters and weighting parameters."""
        self.stop = [i.lower() for i in stop]
        # Apostrophes are stripped from the extra words because
        # `preprocess_sent` strips them from the text before filtering.
        self.unimportant_words = self.stop + [
            i.lower() for i in self._EXTRA_WORDS.replace("'", "").split()
        ]
        # Penn Treebank POS tags whose words are dropped before tagging.
        self.rejected_tags = ['PRP', 'NNP', 'WDT', 'WP', 'WP$', 'WRB', 'UH', 'PRP$', 'CC', 'IN', 'TO', 'MD', 'DT']
        # The next four attributes are not read by any method of this class;
        # they are kept so existing callers that access them keep working.
        self.w_dual = [0.3, 0.5]
        self.w_tri = [0.2, 0.3, 0.3]
        self.dual_weight, self.tri_weight = 0.5, 0.4
        # Blend factor between raw counts (0) and IDF-scaled counts (1).
        self.ids_weight = 0
        # When True the tqdm progress bar in `collect` is suppressed.
        self.disable = False

    def preprocess_sent(self, sent):
        """Return a lower-cased, filtered version of one review.

        Words whose POS tag is in ``self.rejected_tags`` are removed, then
        punctuation characters, then words in ``self.unimportant_words``.

        Args:
            sent: The review text; converted with ``str()`` first.

        Returns:
            The surviving words joined by single spaces.
        """
        text = str(sent).replace("'", "")
        tagged = nltk.pos_tag(nltk.word_tokenize(text))
        # Sets are rebuilt per call so later edits to the public list
        # attributes are honoured while lookups stay O(1).
        rejected = set(self.rejected_tags)
        blocked = set(self.unimportant_words)
        kept = " ".join(word for word, tag in tagged if tag not in rejected)
        kept = kept.translate(self._PUNCT_TABLE)
        # Lower-case BEFORE the stop-word test: the filter list is lower-case,
        # so testing the raw token let capitalised stop-words slip through.
        lowered = (token.lower() for token in kept.split())
        return " ".join(word for word in lowered if word not in blocked)

    def preprocess(self, arr):
        """Apply `preprocess_sent` to every item of *arr* and return a list."""
        return [self.preprocess_sent(item) for item in arr]

    def n_grammer(self, arr, n):
        """Count the n-grams of the pooled words of *arr*.

        Args:
            arr: Sequence of preprocessed, whitespace-separated strings.
            n: Size of the n-grams.

        Returns:
            ``(grams, weights)``: two NumPy arrays of equal length, ordered by
            descending weight. Ties keep first-seen order.
        """
        # NOTE: all sentences are pooled into one token stream, so n-grams can
        # span the boundary between two consecutive sentences.
        tokens = [word for sent in arr for word in sent.split()]
        counts = Counter(
            ' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)
        )
        grams = np.array(list(counts.keys()))
        weights = np.array(list(counts.values()), dtype=float)
        if self.ids_weight:
            # Document frequency uses substring containment, not whole-word
            # matching, and is quadratic; it is skipped when unused.
            doc_freq = np.array(
                [sum(1 for sent in arr if gram in sent) for gram in grams],
                dtype=float,
            )
            ids = np.log((len(arr) + 1) / (1 + doc_freq))
            weights = weights * ids * self.ids_weight + weights * (1 - self.ids_weight)
        # Sort on the FINAL weights so index 0 really is the top gram even
        # when IDF scaling is active; a stable sort keeps ties deterministic.
        order = np.argsort(-weights, kind="stable")
        return grams[order], weights[order]

    def tag_collection(self, arr):
        """Build the tag list for one product.

        The list starts with the six heaviest bigrams, then adds trigrams
        weighing more than half the top bigram and unigrams weighing more than
        four times the top bigram. If that yields at most twelve tags, the top
        four trigrams and top four unigrams are appended as a fallback.

        Args:
            arr: Sequence of preprocessed review strings.

        Returns:
            A list of unique tag strings in priority order; may be empty.
        """
        grams, weights = {}, {}
        for n in range(1, 4):
            grams[n], weights[n] = self.n_grammer(arr, n)
        tags_final = list(grams[2][:6])
        # With no bigrams there is no reference weight (previously an
        # IndexError); only the fallback below applies then.
        if len(weights[2]):
            top_bigram = weights[2][0]
            tags_final.extend(
                t for t, w in zip(grams[3], weights[3]) if w > top_bigram / 2
            )
            tags_final.extend(
                t for t, w in zip(grams[1], weights[1]) if w > top_bigram * 4
            )
        if len(tags_final) <= 12:
            tags_final.extend(grams[3][:4])
            tags_final.extend(grams[1][:4])
        # The fallback can repeat grams already selected; drop repeats while
        # keeping order, and hand back plain `str` objects.
        return [str(tag) for tag in dict.fromkeys(tags_final)]

    def collect(self, dir_path='./product_wise_reviews/', save_path='./product_wise_tags.csv', return_tags=False, start=0, end=None):
        """Generate tags for every review CSV in *dir_path*.

        Each CSV must have a ``reviewText`` column; the file name without its
        ``.csv`` suffix is used as the product id.

        Args:
            dir_path: Directory holding one CSV per product.
            save_path: Output CSV, overwritten by the first row written and
                appended to afterwards. Unused when *return_tags* is true.
            return_tags: Return the tags instead of writing them to disk.
            start: Index of the first file (in sorted order) to process.
            end: Index one past the last file; ``None`` means all files.

        Returns:
            A list of tag lists (at most 12 tags each) when *return_tags* is
            true, otherwise ``None``.
        """
        # endswith() rather than a substring test, so names that merely
        # contain ".csv" (e.g. "x.csv.bak") are not picked up.
        csv_names = sorted(
            name for name in os.listdir(dir_path) if name.endswith('.csv')
        )
        if end is None:
            end = len(csv_names)
        collected_tags = []
        first_write = True
        for name in tqdm(csv_names[start:end], disable=self.disable):
            frame = pd.read_csv(os.path.join(dir_path, name), usecols=['reviewText'])
            # Missing reviews would otherwise be stringified to "nan" and
            # counted as a word.
            reviews = frame['reviewText'].dropna()
            tags = self.tag_collection(self.preprocess(reviews))[:self._TAGS_PER_PRODUCT]
            if return_tags:
                collected_tags.append(tags)
                continue
            # Pad so products with few distinct grams still fill every column
            # (previously an IndexError).
            padded = tags + [''] * (self._TAGS_PER_PRODUCT - len(tags))
            tags_output = {'product_id': [os.path.splitext(name)[0]]}
            tags_output.update(
                {'tag_' + str(i + 1): [tag] for i, tag in enumerate(padded)}
            )
            pd.DataFrame(tags_output).to_csv(
                save_path,
                header=first_write,
                index=False,
                mode='w' if first_write else 'a',
            )
            first_write = False
        if return_tags:
            return collected_tags
def main():
    """Generate tags for all products using the default input and output paths."""
    tg = TagGenerator()
    # With the default return_tags=False, collect() writes the CSV and
    # returns None, so there is nothing to keep.
    tg.collect()
# Run the pipeline only when executed as a script, so importing this module
# to reuse TagGenerator does not start a full tagging run.
if __name__ == "__main__":
    main()