-
-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathcer_wer_tamil.py
More file actions
74 lines (56 loc) · 2.26 KB
/
Copy pathcer_wer_tamil.py
File metadata and controls
74 lines (56 loc) · 2.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import sys
import unicodedata
import argparse
from jiwer import wer
from difflib import SequenceMatcher
def normalize(text):
    """Return *text* in Unicode NFKC form with surrounding whitespace removed.

    NFKC folds compatibility variants (ligatures, full-width forms, ...)
    into their canonical equivalents so that visually identical strings
    compare equal before any metric is computed.  Interior whitespace is
    left untouched; only leading/trailing whitespace is stripped.
    """
    return unicodedata.normalize('NFKC', text).strip()
def _edit_distance(source, target):
    """Return the Levenshtein distance between two strings.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions that turn *source* into *target*.  Only
    two rows of the dynamic-programming table are kept in memory.
    """
    # The distance is symmetric, so keep the shorter string on the inner
    # axis to minimise the size of each row.
    if len(source) < len(target):
        source, target = target, source

    previous = list(range(len(target) + 1))
    for row, src_char in enumerate(source, start=1):
        current = [row]
        for col, tgt_char in enumerate(target, start=1):
            current.append(
                min(
                    previous[col] + 1,          # drop src_char
                    current[col - 1] + 1,       # add tgt_char
                    previous[col - 1] + (src_char != tgt_char),  # keep / substitute
                )
            )
        previous = current
    return previous[-1]


def calculate_cer(reference, hypothesis):
    """Calculate Character Error Rate (CER).

    CER is the Levenshtein distance between the two texts divided by the
    length of the reference.  Both texts are passed through ``normalize``
    first, and lengths are counted in Unicode code points.

    Insertions, deletions and substitutions each cost one edit, so a
    hypothesis with many spurious characters can yield a value above 1.0.

    Args:
        reference: Ground-truth text.
        hypothesis: Predicted text to score against the reference.

    Returns:
        The error rate as a float (0.0 means an exact match).  For an empty
        reference the result is 0.0 when the hypothesis is also empty and
        1.0 otherwise.

    Note:
        Runtime is proportional to ``len(reference) * len(hypothesis)``.
    """
    reference = normalize(reference)
    hypothesis = normalize(hypothesis)

    # Guard first: an empty reference would otherwise divide by zero.
    if not reference:
        return 0.0 if not hypothesis else 1.0

    return _edit_distance(reference, hypothesis) / len(reference)
def calculate_wer(reference, hypothesis):
    """Calculate Word Error Rate (WER).

    Both texts are passed through ``normalize`` and then every run of
    whitespace (spaces, tabs, newlines) is collapsed to a single space
    before the texts are handed to ``jiwer.wer``.

    Args:
        reference: Ground-truth text.
        hypothesis: Predicted text to score against the reference.

    Returns:
        The word error rate as a float (0.0 means an exact match).  For a
        reference that contains no words the result is 0.0 when the
        hypothesis is also empty and 1.0 otherwise, matching the
        convention used by ``calculate_cer``.
    """
    # NOTE(review): jiwer's default transform appears to split words on the
    # space character only, so without this step words separated by a line
    # break would be scored as one token -- confirm against the installed
    # jiwer version.
    reference = " ".join(normalize(reference).split())
    hypothesis = " ".join(normalize(hypothesis).split())

    # Handle the empty reference here rather than relying on jiwer, whose
    # treatment of empty references is presumably version-dependent.
    if not reference:
        return 0.0 if not hypothesis else 1.0

    return wer(reference, hypothesis)
def calculate_rc(reference, hypothesis):
    """Calculate Recognition Coverage (RC) as a percentage.

    RC is the length of the normalized hypothesis divided by the length of
    the normalized reference, times 100.  It compares lengths only, not
    content, and exceeds 100 when the hypothesis is longer than the
    reference.

    Returns:
        The coverage percentage as a float.  For an empty reference the
        result is 100.0 when the hypothesis is also empty and 0.0
        otherwise.
    """
    ref_text = normalize(reference)
    hyp_text = normalize(hypothesis)
    ref_len, hyp_len = len(ref_text), len(hyp_text)

    # Avoid dividing by zero when there is nothing to recognise.
    if ref_len == 0:
        if hyp_len == 0:
            return 100.0
        return 0.0

    return (hyp_len / ref_len) * 100
def read_file(file_path):
    """Return the entire contents of the UTF-8 text file at *file_path*."""
    with open(file_path, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents
def main():
    """Parse the command line, score the prediction, and print the metrics.

    Reads the files given by ``--ground_truth`` and ``--prediction`` and
    prints RC, CER and WER as percentages with two decimal places.
    """
    arg_parser = argparse.ArgumentParser(
        description="Calculate RC, CER, and WER for Tamil OCR evaluation."
    )
    arg_parser.add_argument(
        "--ground_truth", required=True, help="Path to the ground truth text file"
    )
    arg_parser.add_argument(
        "--prediction", required=True, help="Path to the prediction text file"
    )
    options = arg_parser.parse_args()

    expected_text = read_file(options.ground_truth)
    predicted_text = read_file(options.prediction)

    coverage = calculate_rc(expected_text, predicted_text)
    char_error = calculate_cer(expected_text, predicted_text)
    word_error = calculate_wer(expected_text, predicted_text)

    # CER and WER are fractions, so scale them to percentages; RC already is one.
    print(f"Recognition Coverage (RC): {coverage:.2f}%")
    print(f"Character Error Rate (CER): {char_error * 100:.2f}%")
    print(f"Word Error Rate (WER): {word_error * 100:.2f}%")
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()