Add v3 timing-difference matching

xiangtinghe616-blip · xiangtinghe616-blip · commit 4b18af631849 · 2026-05-18T07:07:07.000-03:00
diff --git a/tests/test_v3_pipeline_runner.py b/tests/test_v3_pipeline_runner.py
@@ -32,6 +32,8 @@ def test_run_v3_pipeline_creates_expected_outputs():
     assert result["canonical_ledger_count"] == 600
     assert result["validation_issue_count"] >= 1
     assert result["exact_match_count"] >= 1
+    assert result["timing_match_count"] >= 0
+    assert result["deterministic_match_count"] >= result["exact_match_count"]
     assert result["exception_count"] >= 1
 
     reconciliation_links = pd.read_csv(output_dir / "reconciliation_links.csv")
@@ -50,6 +52,6 @@ def test_run_v3_pipeline_creates_expected_outputs():
         "schema_validation",
         "bank_standardization",
         "ledger_standardization",
-        "deterministic_exact_matching",
+        "deterministic_matching",
         "exception_queue_build",
     }
diff --git a/versions/v3/src/matching/deterministic_rules.py b/versions/v3/src/matching/deterministic_rules.py
@@ -5,7 +5,7 @@
 import pandas as pd
 
 
-MATCH_KEY_COLUMNS = [
+EXACT_MATCH_KEY_COLUMNS = [
     "account_id",
     "currency",
     "direction",
@@ -14,6 +14,13 @@
     "canonical_date",
 ]
 
+TIMING_MATCH_KEY_COLUMNS = [
+    "account_id",
+    "currency",
+    "direction",
+    "amount_numeric",
+    "normalized_reference",
+]
 
 RECONCILIATION_LINK_COLUMNS = [
     "run_id",
@@ -42,44 +49,40 @@
 def _clean_value(value: Any) -> Any:
     if pd.isna(value):
         return None
-
     return value
 
 
-def _eligible_for_exact_match(df: pd.DataFrame) -> pd.DataFrame:
-    return df.dropna(subset=MATCH_KEY_COLUMNS).copy()
+def _empty_links() -> pd.DataFrame:
+    return pd.DataFrame(columns=RECONCILIATION_LINK_COLUMNS)
+
+
+def _matched_ids(reconciliation_links: pd.DataFrame, column_name: str) -> set[int]:
+    if reconciliation_links.empty or column_name not in reconciliation_links.columns:
+        return set()
+    return {int(value) for value in reconciliation_links[column_name].dropna()}
 
 
 def find_exact_matches(
     canonical_bank: pd.DataFrame,
     canonical_ledger: pd.DataFrame,
     run_id: str,
+    link_start_index: int = 1,
 ) -> pd.DataFrame:
-    """Find high-confidence exact matches between canonical bank and ledger rows.
-
-    Exact match criteria:
-    - same account
-    - same currency
-    - same direction
-    - same amount
-    - same normalized reference
-    - same canonical date
-    """
-    bank_candidates = _eligible_for_exact_match(canonical_bank)
-    ledger_candidates = _eligible_for_exact_match(canonical_ledger)
+    bank_candidates = canonical_bank.dropna(subset=EXACT_MATCH_KEY_COLUMNS).copy()
+    ledger_candidates = canonical_ledger.dropna(subset=EXACT_MATCH_KEY_COLUMNS).copy()
 
     if bank_candidates.empty or ledger_candidates.empty:
-        return pd.DataFrame(columns=RECONCILIATION_LINK_COLUMNS)
+        return _empty_links()
 
     merged = bank_candidates.merge(
         ledger_candidates,
-        on=MATCH_KEY_COLUMNS,
+        on=EXACT_MATCH_KEY_COLUMNS,
         suffixes=("_bank", "_ledger"),
         how="inner",
     )
 
     if merged.empty:
-        return pd.DataFrame(columns=RECONCILIATION_LINK_COLUMNS)
+        return _empty_links()
 
     merged = merged.sort_values(
         by=["source_row_id_bank", "source_row_id_ledger"],
@@ -96,14 +99,13 @@ def find_exact_matches(
 
         if bank_source_row_id in matched_bank_rows:
             continue
-
         if ledger_source_row_id in matched_ledger_rows:
             continue
 
         matched_bank_rows.add(bank_source_row_id)
         matched_ledger_rows.add(ledger_source_row_id)
 
-        link_number = len(links) + 1
+        link_number = link_start_index + len(links)
 
         links.append(
             {
@@ -131,3 +133,145 @@ def find_exact_matches(
         )
 
     return pd.DataFrame(links, columns=RECONCILIATION_LINK_COLUMNS)
+
+
+def find_timing_difference_matches(
+    canonical_bank: pd.DataFrame,
+    canonical_ledger: pd.DataFrame,
+    run_id: str,
+    existing_links: pd.DataFrame | None = None,
+    max_day_gap: int = 2,
+    link_start_index: int = 1,
+) -> pd.DataFrame:
+    existing_links = existing_links if existing_links is not None else _empty_links()
+
+    matched_bank_rows = _matched_ids(existing_links, "bank_source_row_id")
+    matched_ledger_rows = _matched_ids(existing_links, "ledger_source_row_id")
+
+    bank_candidates = canonical_bank.dropna(
+        subset=TIMING_MATCH_KEY_COLUMNS + ["canonical_date"]
+    ).copy()
+    ledger_candidates = canonical_ledger.dropna(
+        subset=TIMING_MATCH_KEY_COLUMNS + ["canonical_date"]
+    ).copy()
+
+    bank_candidates = bank_candidates[
+        ~bank_candidates["source_row_id"].isin(matched_bank_rows)
+    ].copy()
+    ledger_candidates = ledger_candidates[
+        ~ledger_candidates["source_row_id"].isin(matched_ledger_rows)
+    ].copy()
+
+    if bank_candidates.empty or ledger_candidates.empty:
+        return _empty_links()
+
+    merged = bank_candidates.merge(
+        ledger_candidates,
+        on=TIMING_MATCH_KEY_COLUMNS,
+        suffixes=("_bank", "_ledger"),
+        how="inner",
+    )
+
+    if merged.empty:
+        return _empty_links()
+
+    candidates = []
+
+    for _, row in merged.iterrows():
+        bank_date = pd.to_datetime(row["canonical_date_bank"], errors="coerce")
+        ledger_date = pd.to_datetime(row["canonical_date_ledger"], errors="coerce")
+
+        if pd.isna(bank_date) or pd.isna(ledger_date):
+            continue
+
+        date_gap_days = abs((bank_date - ledger_date).days)
+
+        if date_gap_days == 0:
+            continue
+        if date_gap_days > max_day_gap:
+            continue
+
+        candidate = row.to_dict()
+        candidate["date_gap_days"] = date_gap_days
+        candidates.append(candidate)
+
+    if not candidates:
+        return _empty_links()
+
+    timing_df = pd.DataFrame(candidates).sort_values(
+        by=["date_gap_days", "source_row_id_bank", "source_row_id_ledger"],
+        kind="stable",
+    )
+
+    matched_bank_rows = set()
+    matched_ledger_rows = set()
+    links: list[dict[str, Any]] = []
+
+    for _, row in timing_df.iterrows():
+        bank_source_row_id = int(row["source_row_id_bank"])
+        ledger_source_row_id = int(row["source_row_id_ledger"])
+
+        if bank_source_row_id in matched_bank_rows:
+            continue
+        if ledger_source_row_id in matched_ledger_rows:
+            continue
+
+        matched_bank_rows.add(bank_source_row_id)
+        matched_ledger_rows.add(ledger_source_row_id)
+
+        link_number = link_start_index + len(links)
+
+        links.append(
+            {
+                "run_id": run_id,
+                "link_id": f"LINK-{link_number:06d}",
+                "match_type": "POTENTIAL_TIMING_DIFFERENCE",
+                "stage_detected": "deterministic_timing",
+                "confidence_score": 0.95,
+                "bank_source_row_id": bank_source_row_id,
+                "ledger_source_row_id": ledger_source_row_id,
+                "bank_transaction_id": _clean_value(row.get("bank_transaction_id")),
+                "ledger_transaction_id": _clean_value(row.get("ledger_transaction_id")),
+                "account_id": row["account_id"],
+                "currency": row["currency"],
+                "direction": row["direction"],
+                "amount_bank": row["amount_numeric"],
+                "amount_internal": row["amount_numeric"],
+                "transaction_date_bank": row["canonical_date_bank"],
+                "transaction_date_internal": row["canonical_date_ledger"],
+                "normalized_reference": row["normalized_reference"],
+                "counterparty_bank": _clean_value(row.get("counterparty_bank")),
+                "counterparty_internal": _clean_value(row.get("counterparty_ledger")),
+                "rationale": f"Matched on account, currency, direction, amount, and normalized reference with a {int(row['date_gap_days'])}-day date gap.",
+            }
+        )
+
+    return pd.DataFrame(links, columns=RECONCILIATION_LINK_COLUMNS)
+
+
+def find_deterministic_matches(
+    canonical_bank: pd.DataFrame,
+    canonical_ledger: pd.DataFrame,
+    run_id: str,
+) -> pd.DataFrame:
+    exact_matches = find_exact_matches(
+        canonical_bank=canonical_bank,
+        canonical_ledger=canonical_ledger,
+        run_id=run_id,
+        link_start_index=1,
+    )
+
+    timing_matches = find_timing_difference_matches(
+        canonical_bank=canonical_bank,
+        canonical_ledger=canonical_ledger,
+        run_id=run_id,
+        existing_links=exact_matches,
+        link_start_index=len(exact_matches) + 1,
+    )
+
+    all_links = [df for df in [exact_matches, timing_matches] if not df.empty]
+
+    if not all_links:
+        return _empty_links()
+
+    return pd.concat(all_links, ignore_index=True)
diff --git a/versions/v3/src/reconciliation/run_v3_pipeline.py b/versions/v3/src/reconciliation/run_v3_pipeline.py
@@ -18,7 +18,7 @@
     standardize_bank_transactions,
     standardize_internal_ledger,
 )
-from versions.v3.src.matching.deterministic_rules import find_exact_matches  # noqa: E402
+from versions.v3.src.matching.deterministic_rules import find_deterministic_matches  # noqa: E402
 from versions.v3.src.reconciliation.exception_builder import build_exception_queue  # noqa: E402
 
 
@@ -93,18 +93,32 @@ def run_v3_pipeline() -> dict[str, Any]:
     print(f"Canonical bank output: {canonical_bank_output_path}")
     print(f"Canonical ledger output: {canonical_ledger_output_path}")
 
-    print("Step 3/5: Running deterministic exact matching...")
+    print("Step 3/5: Running deterministic matching...")
 
-    reconciliation_links = find_exact_matches(
+    reconciliation_links = find_deterministic_matches(
         canonical_bank=canonical_bank,
         canonical_ledger=canonical_ledger,
         run_id=run_id,
     )
 
+    exact_match_count = (
+        int((reconciliation_links["match_type"] == "EXACT_CANONICAL_MATCH").sum())
+        if not reconciliation_links.empty
+        else 0
+    )
+
+    timing_match_count = (
+        int((reconciliation_links["match_type"] == "POTENTIAL_TIMING_DIFFERENCE").sum())
+        if not reconciliation_links.empty
+        else 0
+    )
+
     reconciliation_links_output_path = V3_OUTPUT_DIR / "reconciliation_links.csv"
     write_csv(reconciliation_links, reconciliation_links_output_path)
 
-    print(f"Exact reconciliation links: {len(reconciliation_links)}")
+    print(f"Exact reconciliation links: {exact_match_count}")
+    print(f"Timing-difference links: {timing_match_count}")
+    print(f"Total deterministic links: {len(reconciliation_links)}")
     print(f"Reconciliation links output: {reconciliation_links_output_path}")
 
     print("Step 4/5: Building exception queue...")
@@ -146,7 +160,7 @@ def run_v3_pipeline() -> dict[str, Any]:
             },
             {
                 "run_id": run_id,
-                "stage": "deterministic_exact_matching",
+                "stage": "deterministic_matching",
                 "output_file": "reconciliation_links.csv",
                 "record_count": len(reconciliation_links),
             },
@@ -170,7 +184,9 @@ def run_v3_pipeline() -> dict[str, Any]:
         "validation_issue_count": len(validation_issues_df),
         "canonical_bank_count": len(canonical_bank),
         "canonical_ledger_count": len(canonical_ledger),
-        "exact_match_count": len(reconciliation_links),
+        "exact_match_count": exact_match_count,
+        "timing_match_count": timing_match_count,
+        "deterministic_match_count": len(reconciliation_links),
         "exception_count": len(exception_queue),
         "summary_output_path": summary_output_path,
     }