Skip to content

Commit fa55288

Browse files
Add v3 canonicalization utilities
1 parent 1416778 commit fa55288

2 files changed

Lines changed: 187 additions & 0 deletions

File tree

tests/test_v3_canonicalize.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
from pathlib import Path
2+
import sys
3+
4+
import pandas as pd
5+
6+
7+
REPO_ROOT = Path(__file__).resolve().parents[1]
8+
sys.path.insert(0, str(REPO_ROOT))
9+
10+
from versions.v3.src.core.canonicalize import ( # noqa: E402
11+
build_row_hash,
12+
normalize_reference,
13+
parse_amount,
14+
parse_date,
15+
)
16+
17+
18+
def test_normalize_reference_removes_noise_and_uppercases():
    """Noise characters are dropped, letters uppercased, and blanks map to None."""
    expected_by_input = {
        " ref-000123 / abc ": "REF000123ABC",
        "Ref 777": "REF777",
    }
    for raw, expected in expected_by_input.items():
        assert normalize_reference(raw) == expected

    # Both the empty string and None count as "no reference".
    for blank in ("", None):
        assert normalize_reference(blank) is None
23+
24+
25+
def test_parse_amount_handles_common_formats():
    """Thousands separators, a dollar sign and accounting parentheses are understood."""
    parseable = [
        ("1,250.50", 1250.50),
        ("$1,250.50", 1250.50),
        ("(1,250.50)", -1250.50),  # parentheses denote a negative amount
    ]
    for text, expected in parseable:
        assert parse_amount(text) == expected

    for unparseable in ("not-a-number", None):
        assert parse_amount(unparseable) is None
31+
32+
33+
def test_parse_date_returns_iso_date_or_none():
    """String and Timestamp inputs yield YYYY-MM-DD; bad or missing input yields None."""
    iso_day = "2026-03-12"
    for date_like in (iso_day, pd.Timestamp(iso_day)):
        assert parse_date(date_like) == iso_day

    for invalid in ("not-a-date", None):
        assert parse_date(invalid) is None
38+
39+
40+
def test_build_row_hash_is_deterministic_and_field_based():
    """The hash ignores key order, reacts to value changes and is 64 characters long."""
    fields = ["account_id", "currency", "amount"]

    baseline = {"account_id": "ACC1", "currency": "CAD", "amount": "100.00"}
    # Same content as the baseline, but with the keys inserted in reverse order.
    reordered = {key: baseline[key] for key in reversed(fields)}
    # Differs from the baseline only in the amount.
    changed_amount = {**baseline, "amount": "101.00"}

    baseline_hash, reordered_hash, changed_hash = (
        build_row_hash(row, fields=fields)
        for row in (baseline, reordered, changed_amount)
    )

    assert baseline_hash == reordered_hash
    assert baseline_hash != changed_hash
    assert len(baseline_hash) == 64
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
from __future__ import annotations
2+
3+
import hashlib
4+
import json
5+
import re
6+
from decimal import Decimal, InvalidOperation
7+
from typing import Any, Iterable
8+
9+
import pandas as pd
10+
11+
12+
# Lower-cased, stripped string spellings that are treated as "no value".
MISSING_STRINGS = {"", "nan", "none", "null", "nat"}


def is_missing(value: Any) -> bool:
    """Return True when *value* represents an absent scalar.

    A value is missing when it is ``None``, when ``pd.isna`` reports a scalar
    ``True`` for it, or when it is a string whose stripped, lower-cased form is
    one of ``MISSING_STRINGS``.

    Containers are never considered missing, whatever they contain: for
    list-like input ``pd.isna`` returns an element-wise array, and that array
    is deliberately ignored so that ``[None]`` and ``[None, None]`` are treated
    the same way.
    """
    if value is None:
        return True

    try:
        na_flag = pd.isna(value)
    except (TypeError, ValueError):
        # Best effort: a value pandas cannot inspect is not reported as missing here.
        na_flag = False

    # Only a genuine scalar bool counts. Testing the truth of an element-wise
    # result would be True for a one-element array and ambiguous for longer ones.
    if isinstance(na_flag, bool) and na_flag:
        return True

    if isinstance(value, str) and value.strip().lower() in MISSING_STRINGS:
        return True

    return False
29+
30+
31+
def normalize_reference(value: Any) -> str | None:
    """Reduce a transaction reference to its uppercase alphanumeric core.

    Every character other than ASCII letters and digits is removed and the
    remainder is uppercased, e.g. ``" ref-000123 "`` becomes ``"REF000123"``.

    Returns:
        The normalized reference, or ``None`` when the value is missing or
        nothing alphanumeric remains.
    """
    if is_missing(value):
        return None

    # Render integral floats through int() first: str(123.0) is "123.0", and
    # dropping the "." below would otherwise leave "1230".
    text = (
        str(int(value))
        if isinstance(value, float) and value.is_integer()
        else str(value)
    )

    cleaned = re.sub(r"[^A-Za-z0-9]", "", text.strip()).upper()
    if not cleaned:
        return None
    return cleaned
47+
48+
49+
def parse_amount(value: Any) -> float | None:
    """Parse an amount-like value into a finite float.

    Numeric inputs (``int``, ``float``, ``Decimal``; not ``bool``) are converted
    directly. Anything else is converted with ``str()`` and cleaned first: the
    markers ``$``, ``,``, ``CAD``, ``USD`` and spaces are removed, and a value
    wrapped in parentheses is negated. Supported formats include:

        "1,250.50"
        "$1,250.50"
        "(1,250.50)"
        "$(1,250.50)"
        "(1,250.50) CAD"

    Returns:
        The amount as a float, or ``None`` when the value is missing, cannot be
        parsed, or is not a finite number (infinity / NaN).
    """
    import math  # local import so the block does not depend on a file-level import

    if is_missing(value):
        return None

    if isinstance(value, (int, float, Decimal)) and not isinstance(value, bool):
        try:
            number = float(value)
        except (OverflowError, ValueError):
            # A huge int overflows; Decimal("sNaN") cannot be converted to float.
            return None
        return number if math.isfinite(number) else None

    # Remove currency markers and separators BEFORE looking for parentheses, so
    # that "$(1,250.50)" and "(1,250.50) CAD" are seen as parenthesized values.
    text = (
        str(value)
        .strip()
        .replace("$", "")
        .replace(",", "")
        .replace("CAD", "")
        .replace("USD", "")
        .replace(" ", "")
        .strip()
    )

    is_negative_parentheses = text.startswith("(") and text.endswith(")")
    if is_negative_parentheses:
        text = text[1:-1].strip()

    try:
        amount = Decimal(text)
    except InvalidOperation:
        return None

    # Decimal accepts "Infinity", "NaN" and "sNaN"; none of them is an amount.
    if not amount.is_finite():
        return None

    if is_negative_parentheses:
        amount = -amount

    number = float(amount)
    # A finite Decimal can still be outside float range and convert to inf.
    return number if math.isfinite(number) else None
87+
88+
89+
def parse_date(value: Any) -> str | None:
    """Convert a date-like value to an ISO ``YYYY-MM-DD`` string.

    Returns ``None`` when the value is missing or when pandas cannot parse it
    (parsing uses ``errors="coerce"``, which yields ``NaT`` on failure).
    """
    if is_missing(value):
        return None

    timestamp = pd.to_datetime(value, errors="coerce")
    return None if pd.isna(timestamp) else timestamp.date().isoformat()
100+
101+
102+
def _stable_string(value: Any) -> str:
    """Render a value as text for hashing.

    Missing values become the empty string, ``pd.Timestamp`` values use their
    ISO-8601 form, and everything else is ``str(value)`` with surrounding
    whitespace removed.
    """
    if is_missing(value):
        return ""

    return (
        value.isoformat()
        if isinstance(value, pd.Timestamp)
        else str(value).strip()
    )
110+
111+
112+
def build_row_hash(row: dict[str, Any] | pd.Series, fields: Iterable[str] | None = None) -> str:
    """Return a deterministic SHA-256 hex digest of a row for traceability.

    Args:
        row: A mapping or ``pd.Series`` holding the row values.
        fields: Keys to include in the hash. Keys absent from the row are
            hashed as missing values. When ``None``, every key in the row is
            included.

    Returns:
        The hex digest of the JSON-serialized payload. Keys are sorted before
        serialization, so key order in ``row`` does not affect the result.
    """
    source = dict(row) if not isinstance(row, pd.Series) else row.to_dict()

    if fields is None:
        payload = {key: _stable_string(value) for key, value in source.items()}
    else:
        payload = {name: _stable_string(source.get(name)) for name in fields}

    encoded = json.dumps(payload, sort_keys=True, ensure_ascii=False).encode("utf-8")
    return hashlib.sha256(encoded).hexdigest()

0 commit comments

Comments
 (0)