# facexbench/aggregate_results.py at main · Kartik-3004/facexbench · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import os
import json
import sys
import re
from collections import defaultdict
import argparse

# Regexes that capture a single-letter option label, compiled once at import
# time instead of on every call.
_LABEL_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r'\b\(?([A-Za-z])\)?\b',         # Matches 'A', '(A)', etc.
        r'\bOption\s+([A-Za-z])\b',      # Matches 'Option A', etc.
        r'\b([A-Za-z])\.',               # Matches 'A.', etc.
        r'\b([A-Za-z]):',                # Matches 'A:', etc.
        r'\b([A-Za-z])\s+-',             # Matches 'A -', etc.
        r'\b\(([A-Za-z])\)',             # Matches '(A)', etc.
    )
)


def extract_option_label(model_output, option_labels, options):
    """Map a free-form model answer to one of the option labels.

    The lookup runs in three stages and returns on the first hit:

    1. a label pattern anchored at the start of the (stripped) output;
    2. a label pattern anywhere in the output, in text order;
    3. the option texts themselves: an exact match against the whole output
       first, then the longest option text contained in the output.

    Args:
        model_output: Raw model answer. ``None`` yields ``None``; any other
            non-string value is converted with ``str``.
        option_labels: Valid labels (e.g. ``['A', 'B', 'C']``). The comparison
            is case-sensitive.
        options: Option texts, paired positionally with ``option_labels``.

    Returns:
        The matching label, or ``None`` when nothing matches.
    """
    if model_output is None:
        return None

    # Remove any leading/trailing whitespace
    model_output_clean = str(model_output).strip()

    # Stage 1: a label right at the beginning of the answer.
    for pattern in _LABEL_PATTERNS:
        match = pattern.match(model_output_clean)
        if match and match.group(1) in option_labels:
            return match.group(1)

    # Stage 2: a label anywhere in the answer.
    # NOTE: the first pattern accepts any standalone letter, so an answer that
    # merely contains the article "A" is read as label 'A' when 'A' is valid.
    for pattern in _LABEL_PATTERNS:
        for label in pattern.findall(model_output_clean):
            if label in option_labels:
                return label

    # Stage 3: fall back to the option texts. Empty texts are dropped because
    # the empty string is contained in every output and would always "match".
    candidates = [
        (text, label)
        for text, label in (
            (str(option_text).strip(), label)
            for label, option_text in zip(option_labels, options)
        )
        if text
    ]

    # An exact match on any option outranks a substring match on another one,
    # otherwise option "1" would shadow option "10" for the answer "10".
    for text, label in candidates:
        if text == model_output_clean:
            return label

    # Among contained option texts prefer the longest (most specific) one;
    # ``max`` keeps the earliest option on ties.
    contained = [(text, label) for text, label in candidates if text in model_output_clean]
    if contained:
        return max(contained, key=lambda pair: len(pair[0]))[1]

    return None

def _write_accuracy_section(out, heading, counts_by_key):
    """Write ``heading`` followed by one ``key: accuracy% (correct/total)`` line per key."""
    out.write(heading)
    for key, counts in counts_by_key.items():
        correct = counts['correct']
        total = counts['total']
        accuracy = (correct / total) * 100 if total > 0 else 0
        out.write(f"{key}: {accuracy:.2f}% ({correct}/{total})\n")


def main():
    """Aggregate per-question predictions into accuracy statistics.

    Walks ``--results_dir`` recursively, and for every ``.json`` file:

    * reads ``category``, ``sub-category`` and ``num_images`` (each falls back
      to ``'unknown'``) and the ``questions`` mapping;
    * extracts an option label from each question's ``prediction`` and stores
      it in the question as ``prediction_answer_option``;
    * counts the question as correct when that label equals
      ``correct_answer_option``;
    * writes the updated JSON back to the same path.

    A summary (overall, per category, per sub-category and per number of
    images) is written to ``results.txt`` inside the results directory.

    Exits with status 1 when ``--results_dir`` is not an existing directory.
    JSON files whose top level is not an object, or whose ``questions`` entry
    is not a mapping, are reported on stderr and left untouched. Malformed
    JSON still raises.
    """
    parser = argparse.ArgumentParser(description="Process FacexBench results.")
    parser.add_argument('--model', type=str, required=True, help='Name of the model')
    parser.add_argument('--results_dir', type=str, required=True, help='Directory containing result JSON files')
    args = parser.parse_args()

    model_name = args.model
    results_dir = args.results_dir

    # isdir rather than exists: a regular file would pass an existence check
    # and only fail later when results.txt is opened beneath it.
    if not os.path.isdir(results_dir):
        print(f"Directory {results_dir} does not exist or is not a directory.")
        sys.exit(1)

    total_correct = 0
    total_questions = 0

    category_counts = defaultdict(lambda: {'correct': 0, 'total': 0})
    subcategory_counts = defaultdict(lambda: {'correct': 0, 'total': 0})
    num_images_counts = defaultdict(lambda: {'correct': 0, 'total': 0})

    for root, dirs, files in os.walk(results_dir):
        # Sorting in place fixes the traversal order, so the order of the
        # lines in results.txt no longer depends on the file system.
        dirs.sort()
        for file_name in sorted(files):
            if not file_name.endswith('.json'):
                continue

            json_path = os.path.join(root, file_name)
            with open(json_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            questions = data.get('questions', {}) if isinstance(data, dict) else None
            if not isinstance(questions, dict):
                print(
                    f"Skipping {json_path}: expected a JSON object with a 'questions' mapping.",
                    file=sys.stderr,
                )
                continue

            category = data.get('category', 'unknown')
            subcategory = data.get('sub-category', 'unknown')
            num_images = data.get('num_images', 'unknown')

            # The three tallies this file contributes to.
            tallies = (
                category_counts[category],
                subcategory_counts[subcategory],
                num_images_counts[num_images],
            )

            for q_data in questions.values():
                total_questions += 1
                for tally in tallies:
                    tally['total'] += 1

                correct_answer_option = q_data.get('correct_answer_option')

                # A JSON null (or a non-string value) must not reach the
                # string handling in the label extractor.
                prediction = q_data.get('prediction', '')
                if prediction is None:
                    prediction = ''
                elif not isinstance(prediction, str):
                    prediction = str(prediction)

                options_list = q_data.get('options', [])
                option_labels = [chr(ord('A') + i) for i in range(len(options_list))]

                extracted_prediction = extract_option_label(prediction, option_labels, options_list)
                q_data['prediction_answer_option'] = extracted_prediction

                if extracted_prediction == correct_answer_option:
                    total_correct += 1
                    for tally in tallies:
                        tally['correct'] += 1

            # Write updated JSON
            with open(json_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=4)

    # Compute overall accuracy
    total_accuracy = (total_correct / total_questions) * 100 if total_questions > 0 else 0

    # Write results to file
    results_file = os.path.join(results_dir, 'results.txt')
    with open(results_file, 'w', encoding='utf-8') as f:
        f.write(f"Model Name: {model_name} | Total Questions: {total_questions}\n")
        f.write(f"Total Accuracy: {total_accuracy:.2f}% ({total_correct}/{total_questions})\n\n")

        _write_accuracy_section(f, "Category Accuracies:\n", category_counts)
        _write_accuracy_section(f, "\nSub-category Accuracies:\n", subcategory_counts)
        _write_accuracy_section(f, "\nNumber of Images Accuracies:\n", num_images_counts)

# Run the aggregation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()