-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtrain-ternary-classifier.py
More file actions
117 lines (92 loc) · 4.02 KB
/
Copy pathtrain-ternary-classifier.py
File metadata and controls
117 lines (92 loc) · 4.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import set_seed
import argparse
import json
# Random seeds, one per independent training run.
seeds = [16, 42, 123, 456, 78]
# Test-set metrics collected for each run, keyed by run index.
results_dict = {}

# Map string labels to the integer class ids used for training.
label_mapping = {'not_gender_related': 0, 'm': 1, 'inc': 2}


def load_split(path):
    """Read one tab-separated split and encode its ``label`` column as ints.

    Args:
        path: Path of a TSV file that contains a ``label`` column.

    Returns:
        The loaded DataFrame with ``label`` replaced by the ids from
        ``label_mapping``.

    Raises:
        ValueError: If the file contains a label that is not a key of
            ``label_mapping``.
    """
    frame = pd.read_csv(path, delimiter='\t')
    encoded = frame['label'].map(label_mapping)
    # Series.map yields NaN for values missing from the dict; fail fast here
    # instead of handing float/NaN labels to the model later on.
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = sorted(frame.loc[unmapped, 'label'].astype(str).unique())
        raise ValueError(f"Unknown labels in {path}: {unknown}")
    frame['label'] = encoded
    return frame


# Load the train / dev / test splits using pandas.
df = load_split('ternary_task_train_data.tsv')
dev_df = load_split('ternary_task_dev_data.tsv')
test_df = load_split('ternary_task_test_data.tsv')
# Parse the command line once: the single positional argument is the model
# name or path handed to ``from_pretrained``.
parser = argparse.ArgumentParser()
parser.add_argument("model")
args = parser.parse_args()
model_name = args.model
# Drop "/" so a name such as "org/model" can be embedded in file names.
model_str = model_name.replace("/", "")

# The tokenizer does not depend on the seed, so it is loaded a single time.
tokenizer = AutoTokenizer.from_pretrained(model_name)


def preprocess_function(examples):
    """Tokenize the ``text`` field, truncating to at most 512 tokens."""
    return tokenizer(examples['text'], truncation=True, padding=True, max_length=512)


def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged precision / recall / F1.

    Args:
        eval_pred: A ``(logits, labels)`` pair as passed in by the Trainer.

    Returns:
        Dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='macro')
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}


# The dev and test splits are identical for every run: convert and tokenize
# them once, using batched mapping for all splits.
dev_dataset = Dataset.from_pandas(dev_df)
test_dataset = Dataset.from_pandas(test_df)
tokenized_dev_dataset = dev_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

for i, seed in enumerate(seeds):
    # Seed the global RNGs before the model is created so the freshly
    # initialised classification head differs between runs.
    set_seed(seed)

    # Convert the pandas DataFrame to a Hugging Face Dataset, shuffled per run.
    dataset = Dataset.from_pandas(df).shuffle(seed=seed)
    tokenized_dataset = dataset.map(preprocess_function, batched=True)

    # A fresh model is loaded for every seed.
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

    # Define training arguments.
    training_args = TrainingArguments(
        output_dir="./results",
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=32,
        num_train_epochs=10,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        # TrainingArguments.seed defaults to 42 and the Trainer reseeds with
        # it; pass the run's seed so the runs do not share one training RNG.
        seed=seed,
    )

    # Initialize the Trainer.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        eval_dataset=tokenized_dev_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # Train the model.
    trainer.train()

    # Save model and tokenizer side by side so the directory can be reloaded
    # on its own and runs for different models do not overwrite each other.
    model_dir = f"./ternary_classifier_model_{model_str}-{i}"
    trainer.save_model(model_dir)
    tokenizer.save_pretrained(model_dir)

    # Evaluate the model on the test dataset.
    results = trainer.evaluate(eval_dataset=tokenized_test_dataset)
    print("Evaluation Results:", results)
    results_dict[i] = results

    # Make predictions on the test dataset.
    predictions = trainer.predict(tokenized_test_dataset)
    predicted_labels = predictions.predictions.argmax(axis=-1)

    # Print the first few predictions and their corresponding true labels.
    print("Predicted Labels:", predicted_labels[:10])
    print("True Labels:", predictions.label_ids[:10])

    # Save predictions and true labels to a CSV file.
    output_df = pd.DataFrame({
        "text": tokenized_test_dataset["text"],  # Match text with predictions
        "true_label": predictions.label_ids,
        "predicted_label": predicted_labels,
    })
    output_df.to_csv(f"test_predictions_ternary_{model_str}_{i}.csv", index=False)
# Persist the collected per-run test metrics as pretty-printed JSON.
scores_path = f"scores{model_str}.json"
with open(scores_path, "w") as out_file:
    out_file.write(json.dumps(results_dict, indent=4))