Skip to content

Commit d42f01c

Browse files
committed
upd and add example + add new metric + cosmetic
1 parent 27cc596 commit d42f01c

9 files changed

Lines changed: 542 additions & 205 deletions

File tree

examples/advanced/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,5 @@ These scenarios illustrate various nuances, for example those concerning pattern
99
+ [comparison_pfd_vs_afd.py](https://github.com/Desbordante/desbordante-core/tree/main/examples/advanced/comparison_pfd_vs_afd.py) — a scenario comparing probabilistic functional dependency with approximate functional dependency.
1010
+ [comparison_ucc_and_aucc_1.py](https://github.com/Desbordante/desbordante-core/tree/main/examples/advanced/comparison_ucc_and_aucc_1.py) — a scenario showing how to search for errors in data using exact unique column combination mining and approximate unique column combination verification algorithms.
1111
+ [comparison_ucc_and_aucc_2.py](https://github.com/Desbordante/desbordante-core/tree/main/examples/advanced/comparison_ucc_and_aucc_2.py) — a scenario showing how to search for errors in data using exact and approximate unique column combination mining algorithms.
12+
+ [fd_and_afd_via_ga-rfd.py](https://github.com/Desbordante/desbordante-core/tree/main/examples/advanced/fd_and_afd_via_ga-rfd.py) — a scenario showing how to discover exact and approximate functional dependencies using the GA-RFD algorithm and then validate them with the AFD verifier.
1213
+ [md_semantic_checks.py](https://github.com/Desbordante/desbordante-core/tree/main/examples/advanced/md_semantic_checks.py) — a scenario demonstrating the use of a meaningful matching dependency to explore and repair a dataset.
Lines changed: 330 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,330 @@
1+
"""
2+
Example: Mining FDs / AFDs via GA-RFD and verifying AFDs - Advanced
3+
====================================================================
4+
5+
This advanced example extends the basic (`mining_ga_rfd.py`) tutorial.
6+
It uses GA-RFD to mine exact functional dependencies (FDs) and
7+
approximate FDs (AFDs), and then validates the discovered AFDs with
8+
Desbordante's dedicated AFD verifier. We compare the confidence reported
9+
by GA-RFD with the g₁ error computed by the verifier, showing how these
10+
two measures relate.
11+
12+
The algorithm is based on the paper:
13+
L. Caruccio, V. Deufemia, G. Polese.
14+
"A genetic algorithm to discover relaxed functional dependencies from data".
15+
SEBD 2017, Symposium on Advanced Database Systems.
16+
"""
17+
18+
import desbordante
19+
import pandas as pd
20+
from tabulate import tabulate
21+
import textwrap
22+
23+
# ------------------------------------------------------------
24+
# Styling utilities
25+
# ------------------------------------------------------------
26+
# ANSI SGR escape sequences used to colour terminal output. Any coloured
# fragment should be closed with RESET so later text is not affected.
YELLOW = "\x1b[1;33m"  # bold yellow
CYAN = "\x1b[1;36m"  # bold cyan
GREEN = "\x1b[1;32m"  # bold green
RED = "\x1b[1;31m"  # bold red
BLUE = "\x1b[1;34m"  # bold blue
BOLD = "\x1b[1m"  # bold, current colour
RESET = "\x1b[0m"  # clear all attributes
33+
34+
35+
def prints(s, width=80, end='\n'):
    """Print *s* re-wrapped as a single paragraph of at most *width* columns.

    Args:
        s: text to print.
        width: maximum line length handed to ``textwrap.fill``.
        end: terminator passed through to ``print``.
    """
    wrapped = textwrap.fill(s, width=width)
    print(wrapped, end=end)
37+
38+
39+
def printlns(s, width=80):
    """Print *s* wrapped to *width* columns, followed by one blank line."""
    # Ending with two newlines emits the same bytes as a wrapped print
    # followed by a bare print().
    prints(s, width, end="\n\n")
42+
43+
44+
def banner(title, num=None):
    """Print a section heading framed by two 80-character rules.

    Args:
        title: heading text, printed in cyan.
        num: optional section number; when given it is shown as ``"<num>. "``
            in front of the title.
    """
    rule = "=" * 80
    heading = title if num is None else f"{num}. {title}"
    print("\n" + rule)
    print(f"{CYAN}{heading}{RESET}")
    print(rule)
49+
50+
51+
def print_table(df, title=None, show_index=True, highlight_rows=None):
    """Render *df* as a psql-style table, optionally colouring some rows.

    Args:
        df: pandas DataFrame to display.
        title: optional caption printed in yellow above the table.
        show_index: when true, rows are renumbered from 1 under a "#" column;
            otherwise the table is rendered without an index column.
        highlight_rows: optional collection of 0-based data-row positions
            that are printed in blue.
    """
    if title:
        print(f"\n{YELLOW}{title}{RESET}")

    display_df = df
    if show_index:
        # reset_index returns a new frame, so the caller's frame keeps its index.
        display_df = df.reset_index(drop=True)
        display_df.index = display_df.index + 1
        display_df.index.name = "#"

    rendered = tabulate(display_df, headers="keys", tablefmt="psql", showindex=show_index)

    # The rendered table has three lines (top border, header, separator)
    # before the first data row, so output line k holds data row k - 3.
    for line_no, text in enumerate(rendered.split('\n')):
        row_pos = line_no - 3
        if highlight_rows and row_pos >= 0 and row_pos in highlight_rows:
            text = f"{BLUE}{text}{RESET}"
        print(text)
    print()
74+
75+
76+
def make_rfd_key(col_names, lhs_list, rhs):
    """Build the ``(lhs_mask, rhs_index)`` key identifying a dependency.

    Args:
        col_names: list of column names; a column's position is its bit/index.
        lhs_list: names of the left-hand-side columns.
        rhs: name of the right-hand-side column.

    Returns:
        A tuple ``(mask, rhs_idx)`` where bit ``i`` of ``mask`` is set for
        every LHS column at position ``i`` and ``rhs_idx`` is the RHS position.

    Raises:
        ValueError: if any name is not present in *col_names*.
    """
    position_of = col_names.index
    lhs_bits = 0
    for name in lhs_list:
        lhs_bits = lhs_bits | (1 << position_of(name))
    return lhs_bits, position_of(rhs)
82+
83+
84+
def print_rfds_table(rfds, col_names, title=None, highlight=None, color=YELLOW):
    """Print a numbered list of dependencies with an aligned statistics column.

    Dependencies are listed in ``(rhs_index, lhs_mask)`` order as
    ``"NN. [lhs] -> [rhs]   (conf=..., supp=...)"``.

    Args:
        rfds: sequence of dependency objects exposing ``lhs_mask``,
            ``rhs_index``, ``confidence`` and ``support``.
        col_names: column names; bit ``i`` of ``lhs_mask`` and the value of
            ``rhs_index`` both index into this list.
        title: optional caption printed in yellow above the list.
        highlight: optional set of ``(lhs_mask, rhs_index)`` keys whose rows
            are printed in *color*.
        color: ANSI escape sequence wrapped around highlighted rows.
    """
    if title:
        print(f"{YELLOW}{title}{RESET}")
    if not rfds:
        print(" (none)\n")
        return

    if highlight is None:
        highlight = set()

    rows = []
    ordered = sorted(rfds, key=lambda r: (r.rhs_index, r.lhs_mask))
    for idx, rfd in enumerate(ordered, start=1):
        lhs_cols = [name for i, name in enumerate(col_names) if rfd.lhs_mask & (1 << i)]
        lhs_str = ", ".join(lhs_cols) if lhs_cols else "()"
        rule = f"{idx:>2}. [{lhs_str}] -> [{col_names[rfd.rhs_index]}] "
        stats = f"(conf={rfd.confidence:.3f}, supp={rfd.support:.3f})"
        rows.append((rule, stats, (rfd.lhs_mask, rfd.rhs_index) in highlight))

    # Pad on the plain-text rule and apply the colour afterwards. Measuring a
    # string that already carries the escape sequence would count invisible
    # characters and shift the statistics column of highlighted rows.
    width = max((len(rule) for rule, _, _ in rows), default=0) + 2
    for rule, stats, is_highlighted in rows:
        text = rule.ljust(width) + stats
        print(f"{color}{text}{RESET}" if is_highlighted else text)
    print()
124+
125+
126+
# ------------------------------------------------------------
127+
# 1. Introduction
128+
# ------------------------------------------------------------
129+
banner("Introduction", num=1)

printlns(
    " This example is intended for users who want to dive deeper into the "
    "algorithm. We strongly recommend going through the basic GA-RFD "
    "example first to become familiar with the core concepts and API. "
    "Here we move on to exact FDs and approximate FDs, and we show how to "
    "validate AFDs using the built-in verifier that computes the g₁ error."
)
# Name the denominator of each measure: the two differ only in which set of
# tuple pairs they are computed over, which is the point of this example.
printlns(
    " By the end you will understand the difference between the confidence "
    "reported by GA-RFD (the share of tuple pairs agreeing on the left-hand "
    "side that also agree on the right-hand side) and the g₁ error from the "
    "AFD verifier (the share of all tuple pairs that violate the dependency), "
    "and how these two measures correspond."
)
143+
144+
# ------------------------------------------------------------
145+
# 2. Dataset (the same as in basic)
146+
# ------------------------------------------------------------
147+
banner("Dataset", num=2)

DATA_PATH = "examples/datasets/sample_height_weight.csv"
# Positions in this list are used as column indices / bit positions by the
# helpers in this script.
# NOTE(review): presumably mirrors the CSV column order — confirm.
COL_NAMES = ["height_cm", "weight_kg", "shoe_size_eu"]

df = pd.read_csv(DATA_PATH, header=0)
print_table(df, title="Sample data (8 persons, 3 numeric attributes)")

printlns(
    f" {GREEN}Dataset description:{RESET} This dataset contains information about 8 people. "
    "Each row represents one person with three numeric attributes:"
)
for attribute, meaning in (
    ("height_cm", "person's height in centimeters"),
    ("weight_kg", "person's weight in kilograms"),
    ("shoe_size_eu", "European shoe size"),
):
    prints(f" * {BOLD}{attribute}{RESET} — {meaning}")
print()
163+
164+
# ------------------------------------------------------------
165+
# 3. Exact FDs (minconf=1.0, equality metrics)
166+
# ------------------------------------------------------------
167+
banner("Exact FDs (minconf=1.0, equality metrics)", num=3)

printlns(
    " Setting minconf = 1.0 and using the default equality metric "
    "makes GA-RFD mine classical exact functional dependencies."
)

print_table(df, title="Sample data - note duplicate weights in rows 1-2 and 5-6",
            highlight_rows=[0, 1, 4, 5])
algo_fd = desbordante.rfd.algorithms.GaRfd()
algo_fd.load_data(table=(DATA_PATH, ",", True))
# Set minconf explicitly so the run matches the text above instead of
# silently depending on the library default.
algo_fd.set_option("minconf", 1.0)
algo_fd.set_option("max_generations", 100)
algo_fd.set_option("seed", 42)
algo_fd.execute()
fds = algo_fd.get_rfds()

# Highlight the dependency explained below: weight_kg -> height_cm
highlight_fd = make_rfd_key(COL_NAMES, ["weight_kg"], "height_cm")
print_rfds_table(fds, COL_NAMES, title=f"Found {len(fds)} exact FD(s) with minconf=1.0",
                 highlight={highlight_fd})

printlns(f"{YELLOW}>>> Why does [weight_kg] -> [height_cm] have conf=1.000 and supp=0.071?{RESET}")
printlns(
    " There are 8 rows, therefore 8*7/2 = 28 tuple pairs. "
    "Only two pairs share the same weight: (row 1, row 2) with weight 70, "
    "and (row 5, row 6) with weight 81. In both pairs the height is also equal "
    "(175 and 178 respectively). Hence, among the 2 pairs that agree on the left side, "
    "all 2 agree on the right side => confidence = 2/2 = 1.0. "
    "Support = 2/28 ≈ 0.071 because the whole dependency holds for exactly 2 pairs."
)
196+
197+
# ------------------------------------------------------------
198+
# 4. Approximate FDs (AFDs) - lowering confidence
199+
# ------------------------------------------------------------
200+
banner("Approximate FDs (AFDs): lowering minconf", num=4)

printlns(
    " When we keep equality metrics but lower minconf below 1.0, "
    "the RFD pattern reduces to an Approximate Functional Dependency (AFD). "
    "Minconf = 0.6 means we accept dependencies that hold in at least 60% "
    "of the cases."
)

algo_afd = desbordante.rfd.algorithms.GaRfd()
algo_afd.load_data(table=(DATA_PATH, ",", True))
# Same generation budget and seed as the exact run; only minconf is lowered.
for option, value in (("minconf", 0.6), ("max_generations", 100), ("seed", 42)):
    algo_afd.set_option(option, value)
algo_afd.execute()
afds = algo_afd.get_rfds()

# Highlight the dependency explained below: height_cm -> shoe_size_eu
highlight_afd = make_rfd_key(COL_NAMES, ["height_cm"], "shoe_size_eu")
afd_title = f"Found {len(afds)} AFD(s) with minconf>=0.6"
print_rfds_table(afds, COL_NAMES, title=afd_title, highlight={highlight_afd})

printlns(f"{YELLOW}>>> Why does [height_cm] -> [shoe_size_eu] have conf=0.750 and supp=0.107?{RESET}")
printlns(
    " There are 4 pairs with identical height: (1,2), (1,3), (2,3) from height 175 "
    "and (5,6) from height 178. Among them, the first three also share the same shoe size (40), "
    "but the pair (5,6) has different shoe sizes (42 vs 41). Hence confidence = 3/4 = 0.75. "
    "Support = 3/28 ≈ 0.107 because three pairs satisfy both sides."
)
229+
230+
# ------------------------------------------------------------
231+
# 5. Verifying AFDs with the AFD verifier (g₁ error)
232+
# ------------------------------------------------------------
233+
banner("Verifying AFDs with the AFD verifier (g₁ error)", num=5)

printlns(
    " An AFD can be quantified by its g₁ error: the fraction of all tuple "
    "pairs (i, j) that violate the dependency — that is, pairs where the "
    "left-hand side attributes are equal but the right-hand side differ. "
    "Desbordante provides a dedicated AFD verifier that computes exactly "
    "this measure. We will verify each AFD discovered by GA-RFD and compare "
    "the g₁ error with the confidence value."
)

verifier = desbordante.afd_verification.algorithms.Default()
verifier.load_data(table=(DATA_PATH, ",", True))

# One row per mined AFD: rule, GA-RFD confidence, verifier error, 1 - confidence.
table_data = []
for rfd in sorted(afds, key=lambda r: (r.rhs_index, r.lhs_mask)):
    lhs_indices = [i for i, _ in enumerate(COL_NAMES) if rfd.lhs_mask & (1 << i)]
    if not lhs_indices:
        # Dependencies with an empty left-hand side are not verified.
        continue
    rhs_index = rfd.rhs_index

    verifier.execute(lhs_indices=lhs_indices, rhs_indices=[rhs_index])
    g1_error = verifier.get_error()
    confidence = rfd.confidence

    lhs_label = ", ".join(COL_NAMES[i] for i in lhs_indices)
    table_data.append([
        f"[{lhs_label}] -> [{COL_NAMES[rhs_index]}]",
        f"{confidence:.3f}",
        f"{g1_error:.3f}",
        f"{1 - confidence:.3f}",
    ])

print(f"\n{YELLOW}Verification results:{RESET}\n")
headers = ["Rule", "Confidence", "g₁ error", "1 - Confidence"]
results_table = tabulate(
    table_data,
    headers=headers,
    tablefmt="psql",
    colalign=("left", "right", "right", "right"),
)
print(results_table)
print()

printlns(f"{YELLOW}>>> Observations{RESET}")
printlns(
    " The table compares the confidence reported by GA-RFD with the g₁ error "
    "from the verifier. Confidence is defined as the fraction of pairs with "
    "equal LHS that also have equal RHS. The g₁ error, on the other hand, is "
    "the fraction of all possible pairs in the dataset that violate the rule "
    "(LHS equal, RHS different)."
)
printlns(
    " Because they are computed over different sets of pairs, 1 - Confidence and g₁ error "
    "generally do not match. For example, in our 8-row dataset there are "
    "8·7/2 = 28 total pairs. For the dependency [height_cm] => [shoe_size_eu] "
    "only 4 pairs agree on height. Among those, 3 also agree on shoe size, "
    "so confidence = 3/4 = 0.75, and 1 - confidence = 0.25. However, the "
    "number of violating pairs is just 1 (rows 5 and 6), which gives a g₁ "
    "error of 1/28 ≈ 0.036 — exactly the value shown by the verifier."
)
printlns(
    " This illustrates the important difference: g₁ error gives a global, "
    "pair-based measure of how much the data deviates from a perfect FD, "
    "while confidence tells us how reliable the dependency is among the "
    "tuples that actually share the LHS values."
)
298+
299+
# ------------------------------------------------------------
300+
banner("Summary")

prints(" In this advanced example we:")
prints(
    " * Mined exact FDs and approximate FDs using GA-RFD with equality metrics."
)
prints(
    " * Verified the AFDs with the AFD verifier, computing the g₁ error and "
    "comparing it to the confidence reported by the mining algorithm."
)
# State which set of tuple pairs each measure is computed over; saying both
# are "based on tuple pairs" would not distinguish them.
printlns(
    " * Understood the difference: confidence is measured over the tuple pairs "
    "that agree on the left-hand side, while the g₁ error is the fraction of all "
    "tuple pairs that violate the dependency. Both are useful, but the verifier gives "
    "a direct measure of data quality at the pair level."
)
prints(
    " When using RFDs for data cleaning, you can first mine approximate dependencies "
    "with GA-RFD, then pass them to the verifier to obtain exact pair-level error "
    "statistics."
)
320+
321+
# ------------------------------------------------------------
322+
banner("See also")

print("Related primitives in Desbordante:")
# One bullet per related example script.
for entry in (
    " * FD mining - examples/basic/mining_fd.py",
    " * AFD mining - examples/basic/mining_afd.py",
    " * MFD verifying - examples/basic/verifying_mfd.py",
    " * MD mining - examples/basic/mining_md.py",
    " * RFD mining - examples/basic/mining_ga_rfd.py",
):
    print(entry)
print()

examples/basic/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ These scenarios showcase a single pattern by discussing its definition and provi
1515
+ [mining_dd.py](https://github.com/Desbordante/desbordante-core/tree/main/examples/basic/mining_dd.py) — a scenario showing how to discover differential dependencies.
1616
+ [mining_fd.py](https://github.com/Desbordante/desbordante-core/tree/main/examples/basic/mining_fd.py) — a scenario showing how to discover exact functional dependencies.
1717
+ [mining_fd_approximate.py](https://github.com/Desbordante/desbordante-core/tree/main/examples/basic/mining_fd_approximate.py) — a scenario showing how to discover exact functional dependencies using an approximate algorithm.
18+
+ [mining_ga_rfd.py](https://github.com/Desbordante/desbordante-core/tree/main/examples/basic/mining_ga_rfd.py) — a scenario showing how to discover relaxed functional dependencies and detect errors using the GA-RFD algorithm.
1819
+ [mining_gfd](https://github.com/Desbordante/desbordante-core/tree/main/examples/basic/mining_gfd) — scenarios showing how to discover graph functional dependencies.
1920
+ [mining_ind.py](https://github.com/Desbordante/desbordante-core/tree/main/examples/basic/mining_ind.py) — a scenario showing how to discover inclusion dependencies.
2021
+ [mining_list_od.py](https://github.com/Desbordante/desbordante-core/tree/main/examples/basic/mining_list_od.py) — a scenario showing how to discover order dependencies based on list axiomatization.

0 commit comments

Comments
 (0)