# Source: statutes/train-ternary-classifier.py (dhfbk/statutes on GitHub, branch main)
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import set_seed
import argparse
import json

# Random seeds; one complete fine-tuning run is performed per seed.
seeds = [16, 42, 123, 456, 78]
# Test-set metrics of each run, keyed by the run's index in `seeds`.
results_dict = {}

# Map string labels to integers
label_mapping = {'not_gender_related': 0,'m': 1, 'inc': 2}


def _load_split(path):
    """Load one tab-separated data split and integer-encode its 'label' column.

    Args:
        path: Path of a TSV file that has a 'label' column.

    Returns:
        The split as a DataFrame whose 'label' column holds the integer codes
        taken from `label_mapping`.

    Raises:
        ValueError: If a label in the file is not a key of `label_mapping`.
    """
    frame = pd.read_csv(path, delimiter='\t')
    encoded = frame['label'].map(label_mapping)
    # Series.map turns every value that is not a key of the mapping into NaN.
    # Fail here with a clear message instead of handing NaN labels to the
    # trainer, where the error would be much harder to trace back.
    unmapped = frame.loc[encoded.isna(), 'label']
    if not unmapped.empty:
        raise ValueError(
            f"{path}: labels not covered by label_mapping: "
            f"{unmapped.unique().tolist()}"
        )
    frame['label'] = encoded
    return frame


# Load the dataset using pandas
df = _load_split('ternary_task_train_data.tsv')
dev_df = _load_split('ternary_task_dev_data.tsv')
test_df = _load_split('ternary_task_test_data.tsv')

# The model identifier is the script's only command-line argument. It is the
# same for every run, so it is parsed once here instead of once per seed.
parser = argparse.ArgumentParser()
parser.add_argument("model")
args = parser.parse_args()
model_name = args.model
# Model identifier with "/" removed, for embedding in output file names.
model_str = model_name.replace("/", "")

# The tokenizer does not depend on the seed: load it once and reuse it.
tokenizer = AutoTokenizer.from_pretrained(model_name)


def preprocess_function(examples):
    """Tokenize the 'text' field, truncating to at most 512 tokens."""
    return tokenizer(examples['text'], truncation=True, padding=True, max_length=512)


def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged precision, recall and F1.

    Args:
        eval_pred: A (logits, labels) pair; the predicted class of each
            example is the argmax of its logits over the last axis.

    Returns:
        Dict with the keys "accuracy", "f1", "precision" and "recall".
    """
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='macro')
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}


# The dev and test splits are never shuffled, so they are converted and
# tokenized once and shared by all runs. All splits use batched mapping.
dev_dataset = Dataset.from_pandas(dev_df)
test_dataset = Dataset.from_pandas(test_df)
tokenized_dev_dataset = dev_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

# One independent fine-tuning run per seed; `i` (the position of the seed in
# `seeds`) identifies the run in all output names.
for i, seed in enumerate(seeds):
    set_seed(seed)

    # Convert the pandas DataFrame to a Hugging Face Dataset, shuffled with
    # this run's seed.
    dataset = Dataset.from_pandas(df).shuffle(seed=seed)

    # Every run starts from a freshly loaded pretrained model with a
    # three-class classification head.
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

    tokenized_dataset = dataset.map(preprocess_function, batched=True)

    # Define training arguments: evaluate and checkpoint after every epoch,
    # and restore the checkpoint with the best "f1" when training ends.
    training_args = TrainingArguments(
        output_dir="./results",
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=32,
        num_train_epochs=10,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
    )

    # Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        eval_dataset=tokenized_dev_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # Train the model
    trainer.train()

    # Save the model and its tokenizer into the same run directory, so the
    # directory can be reloaded on its own. (The tokenizer used to be written
    # to a separate path that lacked the model name.)
    model_dir = f"./ternary_classifier_model_{model_str}-{i}"
    trainer.save_model(model_dir)
    tokenizer.save_pretrained(model_dir)

    # Evaluate the model on the test dataset
    results = trainer.evaluate(eval_dataset=tokenized_test_dataset)
    print("Evaluation Results:", results)

    results_dict[i] = results

    # Make predictions on the test dataset
    predictions = trainer.predict(tokenized_test_dataset)
    predicted_labels = predictions.predictions.argmax(axis=-1)

    # Print the first few predictions and their corresponding true labels
    print("Predicted Labels:", predicted_labels[:10])
    print("True Labels:", predictions.label_ids[:10])

    # Save predictions and true labels to a CSV file
    output_df = pd.DataFrame({
        "text": tokenized_test_dataset["text"],  # Match text with predictions
        "true_label": predictions.label_ids,
        "predicted_label": predicted_labels
    })
    output_df.to_csv(f"test_predictions_ternary_{model_str}_{i}.csv", index=False)

# Persist the test metrics collected for every run as indented JSON; the
# output file name embeds the model identifier.
scores_path = f"scores{model_str}.json"
with open(scores_path, "w") as scores_file:
    json.dump(results_dict, scores_file, indent=4)