Skip to content

Commit 05932c8

Browse files
Add v3 standardization layer
1 parent fa55288 commit 05932c8

2 files changed

Lines changed: 239 additions & 0 deletions

File tree

tests/test_v3_standardize.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
from pathlib import Path
2+
import sys
3+
4+
import pandas as pd
5+
6+
7+
REPO_ROOT = Path(__file__).resolve().parents[1]
8+
sys.path.insert(0, str(REPO_ROOT))
9+
10+
from versions.v3.src.core.standardize import ( # noqa: E402
11+
standardize_bank_transactions,
12+
standardize_internal_ledger,
13+
)
14+
15+
16+
def test_standardize_bank_transactions_creates_canonical_columns():
    """A single well-formed bank row is mapped onto the canonical columns."""
    bank_record = {
        "bank_transaction_id": "B001",
        "account_id": " ACC1 ",
        "transaction_date": "2026-03-12",
        "posting_date": "2026-03-13",
        "currency": "CAD",
        "amount": "1,250.50",
        "direction": "credit",
        "transaction_type": "wire",
        "reference_id": "REF-001",
        "raw_reference": " ref-001 ",
        "counterparty": "Test Counterparty",
        "description": "Test transaction",
        "source_file_id": "BANK_FILE_001",
    }
    raw_frame = pd.DataFrame([bank_record])

    result = standardize_bank_transactions(raw_frame, run_id="test_run")

    assert len(result) == 1

    # Expected canonical values for the only row (index label 0).
    expected = {
        "run_id": "test_run",
        "source_row_id": 2,
        "source_name": "bank_statement",
        "account_id": "ACC1",
        "canonical_date": "2026-03-12",
        "amount_numeric": 1250.50,
        "normalized_reference": "REF001",
    }
    for column, value in expected.items():
        assert result.loc[0, column] == value, column

    # The row hash is checked by length only (64 characters).
    assert len(result.loc[0, "row_hash"]) == 64
46+
47+
48+
def test_standardize_internal_ledger_creates_canonical_columns():
    """A single well-formed ledger row is mapped onto the canonical columns."""
    ledger_record = {
        "ledger_transaction_id": "L001",
        "account_id": "ACC1",
        "entity": "Entity A",
        "ledger_date": "2026-03-12",
        "value_date": "2026-03-13",
        "currency": "USD",
        "amount": "(500.00)",
        "direction": "debit",
        "transaction_type": "payment",
        "reference_id": "ref 777",
        "raw_reference": "ref 777",
        "counterparty": "Vendor A",
        "description": "Ledger transaction",
        "source_system": "ERP",
        "batch_id": "BATCH001",
        "created_by": "analyst",
    }
    raw_frame = pd.DataFrame([ledger_record])

    result = standardize_internal_ledger(raw_frame, run_id="test_run")

    assert len(result) == 1

    # Expected canonical values for the only row (index label 0).
    expected = {
        "run_id": "test_run",
        "source_row_id": 2,
        "source_name": "internal_cash_ledger",
        "canonical_date": "2026-03-12",
        "value_date": "2026-03-13",
        "amount_numeric": -500.00,
        "normalized_reference": "REF777",
    }
    for column, value in expected.items():
        assert result.loc[0, column] == value, column

    # The row hash is checked by length only (64 characters).
    assert len(result.loc[0, "row_hash"]) == 64
Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
from __future__ import annotations
2+
3+
import sys
4+
from pathlib import Path
5+
from typing import Any
6+
7+
import pandas as pd
8+
9+
10+
REPO_ROOT = Path(__file__).resolve().parents[4]
11+
12+
if str(REPO_ROOT) not in sys.path:
13+
sys.path.insert(0, str(REPO_ROOT))
14+
15+
from versions.v3.src.core.canonicalize import ( # noqa: E402
16+
build_row_hash,
17+
normalize_reference,
18+
parse_amount,
19+
parse_date,
20+
)
21+
22+
23+
24+
# main() reads its input CSVs from the v2 data directory and writes the
# canonical CSVs into the v3 output directory.
V2_DATA_DIR = REPO_ROOT.joinpath("versions", "v2", "data")
V3_OUTPUT_DIR = REPO_ROOT.joinpath("versions", "v3", "output")
26+
27+
28+
# Raw bank-statement columns handed to build_row_hash as ``fields`` by
# standardize_bank_transactions.
# NOTE(review): the hash may depend on field order -- keep this order stable.
BANK_HASH_FIELDS = [
    # identifiers
    "bank_transaction_id", "account_id",
    # dates
    "transaction_date", "posting_date",
    # monetary attributes
    "currency", "amount", "direction",
    # references
    "reference_id", "raw_reference",
]
39+
40+
# Raw internal-ledger columns handed to build_row_hash as ``fields`` by
# standardize_internal_ledger.
# NOTE(review): the hash may depend on field order -- keep this order stable.
LEDGER_HASH_FIELDS = [
    # identifiers
    "ledger_transaction_id", "account_id",
    # dates
    "ledger_date", "value_date",
    # monetary attributes
    "currency", "amount", "direction",
    # references
    "reference_id", "raw_reference",
    # provenance
    "source_system", "batch_id",
]
53+
54+
55+
def _clean_string(value: Any) -> str | None:
56+
if pd.isna(value):
57+
return None
58+
59+
text = str(value).strip()
60+
return text if text else None
61+
62+
63+
def standardize_bank_transactions(df: pd.DataFrame, run_id: str) -> pd.DataFrame:
    """Map raw bank-statement rows onto the canonical transaction columns.

    Args:
        df: Raw bank statement rows. Every column named in
            ``required_columns`` below must be present.
        run_id: Identifier written to the ``run_id`` column of every row.

    Returns:
        A new frame that shares ``df``'s index and holds the cleaned,
        parsed and hashed columns. ``df`` itself is not modified.

    Raises:
        KeyError: If one or more required input columns are absent. The
            message lists the missing names.
    """
    required_columns = (
        "bank_transaction_id",
        "source_file_id",
        "account_id",
        "transaction_date",
        "posting_date",
        "amount",
        "currency",
        "direction",
        "transaction_type",
        "reference_id",
        "raw_reference",
        "counterparty",
        "description",
    )
    # Fail early with a readable message: ``df.get`` would return None for an
    # absent column and the subsequent ``.map`` would raise an opaque
    # AttributeError.
    missing = [name for name in required_columns if name not in df.columns]
    if missing:
        raise KeyError(
            f"bank statement input is missing required columns: {missing}"
        )

    standardized = pd.DataFrame(index=df.index)

    standardized["run_id"] = run_id
    # Index label 0 becomes source row 2.
    # NOTE(review): presumably a 1-based CSV line number that counts the
    # header row -- confirm against the consumers of source_row_id.
    standardized["source_row_id"] = df.index + 2
    standardized["source_name"] = "bank_statement"

    standardized["bank_transaction_id"] = df["bank_transaction_id"].map(_clean_string)
    standardized["source_file_id"] = df["source_file_id"].map(_clean_string)

    standardized["account_id"] = df["account_id"].map(_clean_string)
    standardized["canonical_date"] = df["transaction_date"].map(parse_date)
    standardized["posting_date"] = df["posting_date"].map(parse_date)

    standardized["amount_numeric"] = df["amount"].map(parse_amount)
    standardized["currency"] = df["currency"].map(_clean_string)
    standardized["direction"] = df["direction"].map(_clean_string)
    standardized["transaction_type"] = df["transaction_type"].map(_clean_string)

    standardized["reference_id"] = df["reference_id"].map(_clean_string)
    standardized["raw_reference"] = df["raw_reference"].map(_clean_string)
    # The normalized reference is derived from raw_reference, not reference_id.
    standardized["normalized_reference"] = df["raw_reference"].map(normalize_reference)

    standardized["counterparty"] = df["counterparty"].map(_clean_string)
    standardized["description"] = df["description"].map(_clean_string)

    if len(df) == 0:
        # A row-wise apply on a zero-row frame may hand back a DataFrame,
        # which cannot be assigned to a single column; emit an empty column.
        standardized["row_hash"] = pd.Series(index=df.index, dtype=object)
    else:
        # The hash is computed from the raw input row, not the cleaned values.
        standardized["row_hash"] = df.apply(
            lambda row: build_row_hash(row, fields=BANK_HASH_FIELDS),
            axis=1,
        )

    return standardized
95+
96+
97+
def standardize_internal_ledger(df: pd.DataFrame, run_id: str) -> pd.DataFrame:
    """Map raw internal-ledger rows onto the canonical transaction columns.

    Args:
        df: Raw internal cash ledger rows. Every column named in
            ``required_columns`` below must be present.
        run_id: Identifier written to the ``run_id`` column of every row.

    Returns:
        A new frame that shares ``df``'s index and holds the cleaned,
        parsed and hashed columns. ``df`` itself is not modified.

    Raises:
        KeyError: If one or more required input columns are absent. The
            message lists the missing names.
    """
    required_columns = (
        "ledger_transaction_id",
        "source_system",
        "batch_id",
        "account_id",
        "entity",
        "ledger_date",
        "value_date",
        "amount",
        "currency",
        "direction",
        "transaction_type",
        "reference_id",
        "raw_reference",
        "counterparty",
        "description",
        "created_by",
    )
    # Fail early with a readable message: ``df.get`` would return None for an
    # absent column and the subsequent ``.map`` would raise an opaque
    # AttributeError.
    missing = [name for name in required_columns if name not in df.columns]
    if missing:
        raise KeyError(
            f"internal ledger input is missing required columns: {missing}"
        )

    standardized = pd.DataFrame(index=df.index)

    standardized["run_id"] = run_id
    # Index label 0 becomes source row 2.
    # NOTE(review): presumably a 1-based CSV line number that counts the
    # header row -- confirm against the consumers of source_row_id.
    standardized["source_row_id"] = df.index + 2
    standardized["source_name"] = "internal_cash_ledger"

    standardized["ledger_transaction_id"] = df["ledger_transaction_id"].map(_clean_string)
    standardized["source_system"] = df["source_system"].map(_clean_string)
    standardized["batch_id"] = df["batch_id"].map(_clean_string)

    standardized["account_id"] = df["account_id"].map(_clean_string)
    standardized["entity"] = df["entity"].map(_clean_string)
    standardized["canonical_date"] = df["ledger_date"].map(parse_date)
    standardized["value_date"] = df["value_date"].map(parse_date)

    standardized["amount_numeric"] = df["amount"].map(parse_amount)
    standardized["currency"] = df["currency"].map(_clean_string)
    standardized["direction"] = df["direction"].map(_clean_string)
    standardized["transaction_type"] = df["transaction_type"].map(_clean_string)

    standardized["reference_id"] = df["reference_id"].map(_clean_string)
    standardized["raw_reference"] = df["raw_reference"].map(_clean_string)
    # The normalized reference is derived from raw_reference, not reference_id.
    standardized["normalized_reference"] = df["raw_reference"].map(normalize_reference)

    standardized["counterparty"] = df["counterparty"].map(_clean_string)
    standardized["description"] = df["description"].map(_clean_string)
    standardized["created_by"] = df["created_by"].map(_clean_string)

    if len(df) == 0:
        # A row-wise apply on a zero-row frame may hand back a DataFrame,
        # which cannot be assigned to a single column; emit an empty column.
        standardized["row_hash"] = pd.Series(index=df.index, dtype=object)
    else:
        # The hash is computed from the raw input row, not the cleaned values.
        standardized["row_hash"] = df.apply(
            lambda row: build_row_hash(row, fields=LEDGER_HASH_FIELDS),
            axis=1,
        )

    return standardized
132+
133+
134+
def main() -> None:
    """Standardize the v2 bank and ledger CSV extracts into canonical CSVs.

    Reads ``bank_statement_v2.csv`` and ``internal_cash_ledger_v2.csv`` from
    ``V2_DATA_DIR``, runs both standardizers under a fixed run id, writes the
    results into ``V3_OUTPUT_DIR`` (created if needed) and prints a summary.
    """
    V3_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    run_id = "v3_local_standardization_run"

    # Read every column as text. Default type inference would rewrite raw
    # values before they are cleaned and hashed (for example, dropping
    # leading zeros from identifiers or turning amounts into floats), while
    # the standardizers are exercised with string inputs in the unit tests.
    bank_df = pd.read_csv(V2_DATA_DIR / "bank_statement_v2.csv", dtype=str)
    ledger_df = pd.read_csv(V2_DATA_DIR / "internal_cash_ledger_v2.csv", dtype=str)

    canonical_bank = standardize_bank_transactions(bank_df, run_id=run_id)
    canonical_ledger = standardize_internal_ledger(ledger_df, run_id=run_id)

    bank_output_path = V3_OUTPUT_DIR / "canonical_bank_transactions.csv"
    ledger_output_path = V3_OUTPUT_DIR / "canonical_internal_transactions.csv"

    # The frame index is not written to the output files.
    canonical_bank.to_csv(bank_output_path, index=False)
    canonical_ledger.to_csv(ledger_output_path, index=False)

    print("Standardization complete.")
    print(f"Canonical bank rows: {len(canonical_bank)}")
    print(f"Canonical ledger rows: {len(canonical_ledger)}")
    print(f"Bank output written to: {bank_output_path}")
    print(f"Ledger output written to: {ledger_output_path}")


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)