Skip to content

Commit f8d6052

Browse files
committed
fix examples
1 parent d42f01c commit f8d6052

3 files changed

Lines changed: 40 additions & 105 deletions

File tree

examples/advanced/fd_and_afd_via_ga-rfd.py renamed to examples/advanced/fd_and_afd_via_ga_rfd.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ def print_rfds_table(rfds, col_names, title=None, highlight=None, color=YELLOW):
130130

131131
printlns(
132132
" This example is intended for users who want to dive deeper into the " +
133-
"algorithm. We strongly recommend going through the basic GA-RFD " +
133+
"GA-RFD. We strongly recommend going through the basic GA-RFD " +
134134
"example first to become familiar with the core concepts and API. " +
135135
"Here we move on to exact FDs and approximate FDs, and we show how to " +
136136
"validate AFDs using the built-in verifier that computes the g₁ error."
@@ -184,7 +184,7 @@ def print_rfds_table(rfds, col_names, title=None, highlight=None, color=YELLOW):
184184
print_rfds_table(fds, COL_NAMES, title=f"Found {len(fds)} exact FD(s) with minconf=1.0",
185185
highlight={highlight_fd})
186186

187-
printlns(f"{YELLOW}>>> Why does [weight_kg] -> [height_cm] have conf=1.000 and supp=0.071?{RESET}")
187+
printlns(f"{YELLOW}Why does [weight_kg] -> [height_cm] have conf=1.000 and supp=0.071?{RESET}")
188188
printlns(
189189
" There are 8 rows, therefore 8*7/2 = 28 tuple pairs. " +
190190
"Only two pairs share the same weight: (row 1, row 2) with weight 70, " +
@@ -219,7 +219,7 @@ def print_rfds_table(rfds, col_names, title=None, highlight=None, color=YELLOW):
219219
print_rfds_table(afds, COL_NAMES, title=f"Found {len(afds)} AFD(s) with minconf>=0.6",
220220
highlight={highlight_afd})
221221

222-
printlns(f"{YELLOW}>>> Why does [height_cm] -> [shoe_size_eu] have conf=0.750 and supp=0.107?{RESET}")
222+
printlns(f"{YELLOW}Why does [height_cm] -> [shoe_size_eu] have conf=0.750 and supp=0.107?{RESET}")
223223
printlns(
224224
" There are 4 pairs with identical height: (1,2), (1,3), (2,3) from height 175 " +
225225
"and (5,6) from height 178. Among them, the first three also share the same shoe size (40), " +
@@ -254,6 +254,7 @@ def print_rfds_table(rfds, col_names, title=None, highlight=None, color=YELLOW):
254254
verifier.execute(lhs_indices=lhs_indices, rhs_indices=[rhs_index])
255255
g1_error = verifier.get_error()
256256
confidence = rfd.confidence
257+
support = rfd.support
257258

258259
lhs_names = [COL_NAMES[i] for i in lhs_indices]
259260
rhs_name = COL_NAMES[rhs_index]
@@ -262,17 +263,18 @@ def print_rfds_table(rfds, col_names, title=None, highlight=None, color=YELLOW):
262263
table_data.append([
263264
rule_str,
264265
f"{confidence:.3f}",
266+
f"{support:.3f}",
265267
f"{g1_error:.3f}",
266268
f"{1 - confidence:.3f}"
267269
])
268270

269271
print(f"\n{YELLOW}Verification results:{RESET}\n")
270-
headers = ["Rule", "Confidence", "g₁ error", "1 - Confidence"]
272+
headers = ["rule", "conf", "supp", "g₁ error", "1 - conf"]
271273
print(tabulate(table_data, headers=headers, tablefmt="psql",
272-
colalign=("left", "right", "right", "right")))
274+
colalign=("center", "left", "left", "left", "left")))
273275
print()
274276

275-
printlns(f"{YELLOW}>>> Observations{RESET}")
277+
printlns(f"{YELLOW}Observations{RESET}")
276278
printlns(
277279
" The table compares the confidence reported by GA-RFD with the g₁ error " +
278280
"from the verifier. Confidence is defined as the fraction of pairs with " +
@@ -321,7 +323,7 @@ def print_rfds_table(rfds, col_names, title=None, highlight=None, color=YELLOW):
321323
# ------------------------------------------------------------
322324
banner("See also")
323325

324-
print("Related primitives in Desbordante:")
326+
print("Related patterns in Desbordante:")
325327
print(" * FD mining - examples/basic/mining_fd.py")
326328
print(" * AFD mining - examples/basic/mining_afd.py")
327329
print(" * MFD verifying - examples/basic/verifying_mfd.py")

examples/basic/mining_ga_rfd.py

Lines changed: 23 additions & 98 deletions
Original file line numberDiff line numberDiff line change
@@ -133,8 +133,8 @@ def print_link(text, url, end=''):
133133
printlns(
134134
" In this example we will learn the basics of RFD mining from tables. " +
135135
"RFD (Relaxed Functional Dependency) is a pattern that captures " +
136-
"the rule: «if two tuples are similar on a set of attributes X, " +
137-
"then they are likely similar on attribute Y». Similarity is defined " +
136+
"the rule: 'if two tuples are similar on a set of attributes X, " +
137+
"then they are likely similar on attribute Y'. Similarity is defined " +
138138
"via configurable metrics and thresholds, making RFD more flexible " +
139139
"than classical functional dependencies."
140140
)
@@ -147,7 +147,7 @@ def print_link(text, url, end=''):
147147
prints(
148148
f" This pattern is formally defined in the paper: " +
149149
f"{BOLD}L. Caruccio, V. Deufemia, G. Polese. " +
150-
"«A genetic algorithm to discover relaxed functional dependencies from data». " +
150+
"'A genetic algorithm to discover relaxed functional dependencies from data'. " +
151151
f"SEBD 2017{RESET}.", end=' '
152152
)
153153
print('(', end='')
@@ -158,15 +158,15 @@ def print_link(text, url, end=''):
158158
printlns(
159159
f"{YELLOW}!?{RESET} It is important not to confuse RFD as a general term for 'approximate FD'. " +
160160
"Here RFD refers to a concrete pattern defined by Caruccio et al. that " +
161-
"combines similarity metric for each column global coverage threshold."
161+
"combines similarity metric for each column with a global coverage threshold."
162162
)
163163

164164
# ------------------------------------------------------------
165165
# 2. What is an RFD?
166166
# ------------------------------------------------------------
167167
banner("What is an RFD?", num=2)
168168

169-
print(f"{YELLOW}>>> 2.1. Pattern definition{RESET}")
169+
print(f"{YELLOW}2.1. Pattern definition{RESET}")
170170
printlns(
171171
"A Relaxed Functional Dependency (RFD) is a specific pattern of the form"
172172
)
@@ -185,7 +185,7 @@ def print_link(text, url, end=''):
185185
"* Lowering confidence gives us AFDs."
186186
)
187187

188-
print(f"{YELLOW}>>> 2.2. Confidence and support{RESET}")
188+
print(f"{YELLOW}2.2. Confidence and support{RESET}")
189189
printlns("Two numbers describe an RFD.")
190190
printlns(
191191
f" {BOLD}Confidence{RESET} tells us how reliable the rule is. " +
@@ -205,7 +205,7 @@ def print_link(text, url, end=''):
205205
# ------------------------------------------------------------
206206
banner("Dataset", num=3)
207207

208-
DATA_PATH = "examples/datasets/sample_height_weight.csv"
208+
DATA_PATH = "examples/datasets/sample_original_from_paper.csv"
209209
COL_NAMES = ["height_cm", "weight_kg", "shoe_size_eu"]
210210

211211
df = pd.read_csv(DATA_PATH, header=0)
@@ -262,7 +262,7 @@ def print_link(text, url, end=''):
262262
"reproducible results."
263263
)
264264

265-
print(f"{YELLOW}>>> Where are similarity metrics defined?{RESET}")
265+
print(f"{YELLOW}Where are similarity metrics defined?{RESET}")
266266
printlns(
267267
f" Similarity metrics are set using the {BOLD}set_metrics(){RESET} method, which takes " +
268268
"a list of metric functions (one per column). For example:"
@@ -341,19 +341,21 @@ def print_link(text, url, end=''):
341341
# ------------------------------------------------------------
342342
banner("Verifying hypothesis", num=7)
343343

344-
printlns(
345-
f" {YELLOW}Here we set:{RESET} height <= 1 cm, weight <= 10 kg, shoe size <= 1. " +
346-
f"This models {BOLD}'people of practically the same height and roughly the same " +
347-
f"weight should have almost the same shoe size'{RESET}. " +
348-
"Since the metric returns 0 or 1, we set min_similarity=1.0 to accept only exact " +
349-
"matches according to these thresholds. "
350-
)
351344
printlns(
352345
f" {GREEN}Recall our hypothesis from Section 3:{RESET} we expect that similar height and " +
353346
"weight imply similar shoe size. The absolute metric lets us define " +
354347
"'similar' in concrete, measurable terms."
355348
)
356349

350+
printlns(
351+
f" {YELLOW}To check it we set the following attribute difference thresholds:{RESET} " +
352+
"height <= 1 cm, weight <= 10 kg, shoe size <= 1. " +
353+
f"This models {BOLD}'people of practically the same height and roughly the same " +
354+
f"weight should have almost the same shoe size'{RESET}. " +
355+
"Since the used abs_threshold_metric returns 0 or 1, we set min_similarity=1.0 to accept only exact " +
356+
"matches according to these thresholds. "
357+
)
358+
357359
print_table(df)
358360
algo_abs = desbordante.rfd.algorithms.GaRfd()
359361
algo_abs.load_data(table=(DATA_PATH, ",", True))
@@ -385,8 +387,8 @@ def print_link(text, url, end=''):
385387
)
386388

387389
prints(
388-
" Because the thresholds are strict, only a few pairs match - hence the " +
389-
"support is low, but the confidence can still be high. " +
390+
" Because the thresholds are strict, only a few pairs have similar lhs and similar " +
391+
"rhs - hence the support is low, but the confidence can still be high. " +
390392
"The absolute metric makes the similarity definition completely transparent."
391393
)
392394

@@ -436,8 +438,8 @@ def ngrams(s, n=2):
436438
title="RFDs with exact equality on all columns")
437439
printlns(
438440
" Without fuzzy matching, the only dependencies found involve cuisine " +
439-
"and district because they have exact duplicates. Restaurant names, " +
440-
"which are all unique due to typos, never appear in any rule."
441+
"and district because they contain exact duplicates. On the other hand, restaurant names, " +
442+
"which are all unique due to typos, never appear in any found RFD."
441443
)
442444

443445
algo_jac = desbordante.rfd.algorithms.GaRfd()
@@ -512,91 +514,14 @@ def ngrams(s, n=2):
512514
title=f"Found {len(discovered_rfds)} RFD(s) on dirty data",
513515
highlight={highlight_key})
514516

515-
printlns(f"{YELLOW}>>> Why do the RFD sets differ, and how to find out if there is an error?{RESET}")
517+
printlns(f"{YELLOW}Why do the RFD sets differ, and how to find out if there is an error?{RESET}")
516518
printlns(
517519
" On clean data, [cuisine] -> [district] holds with confidence 1.0 " +
518520
"because every cuisine appears in only one district. " +
519521
"After adding rows 10-11 (Italian in Uptown), the confidence drops to 0.6. " +
520522
"This change signals a potential inconsistency."
521523
)
522524

523-
def extract_violations_for_garfd_rfds(df, rfds, metrics, thresholds):
524-
from collections import defaultdict
525-
import itertools
526-
527-
violation_reports = []
528-
n = len(df)
529-
530-
for rfd in rfds:
531-
lhs_indices = [i for i in range(len(df.columns)) if rfd.lhs_mask & (1 << i)]
532-
rhs_idx = rfd.rhs_index
533-
534-
if not lhs_indices or rhs_idx >= len(df.columns):
535-
continue
536-
537-
violation_count = defaultdict(int)
538-
for i, j in itertools.combinations(range(n), 2):
539-
lhs_sim = all(
540-
metrics[col](str(df.iloc[i, col]), str(df.iloc[j, col])) >= thresholds[col]
541-
for col in lhs_indices
542-
)
543-
if not lhs_sim:
544-
continue
545-
546-
rhs_sim = metrics[rhs_idx](str(df.iloc[i, rhs_idx]), str(df.iloc[j, rhs_idx])) >= thresholds[rhs_idx]
547-
548-
if not rhs_sim:
549-
violation_count[i] += 1
550-
violation_count[j] += 1
551-
552-
violation_reports.append({
553-
"rfd": rfd,
554-
"violations": sorted(violation_count.items(), key=lambda x: x[1], reverse=True)
555-
})
556-
return violation_reports
557-
558-
reports = extract_violations_for_garfd_rfds(
559-
dirty_df,
560-
discovered_rfds,
561-
metrics=[jaccard_2gram, eq, eq],
562-
thresholds=[0.3, 1.0, 1.0]
563-
)
564-
565-
print("="*80)
566-
printlns(" Errors detected:")
567-
for report in reports:
568-
rfd = report["rfd"]
569-
lhs_names = [COL_NAMES_STR[i] for i in range(len(COL_NAMES_STR)) if rfd.lhs_mask & (1 << i)]
570-
rhs_name = COL_NAMES_STR[rfd.rhs_index]
571-
rule_str = f"[{', '.join(lhs_names)}] -> [{rhs_name}] (conf={rfd.confidence:.3f}, supp={rfd.support:.3f})"
572-
if len(report["violations"]):
573-
prints(f"{RED}Rule: {rule_str}{RESET}")
574-
575-
top_violations = report["violations"][:3] # show only top 3
576-
for j, (idx, count) in enumerate(top_violations):
577-
prints(f" Tuple #{idx+1} | Violating pairs: {count} | ")
578-
prints(f" Data: {dirty_df.iloc[idx].to_dict()}")
579-
580-
if j < len(top_violations) - 1:
581-
print("-" * 80)
582-
print()
583-
584-
printlns(
585-
f"{GREEN}Note{RESET} that Bella Napoli (row #4) is flagged because it violates the rule " +
586-
"when paired with the two erroneous Italian rows - it is an innocent bystander " +
587-
"that helps locate the true errors."
588-
)
589-
590-
printlns(f"{YELLOW}>>> How we spot errors?{RESET}")
591-
printlns(
592-
" For each RFD discovered on the dirty dataset we count how many times " +
593-
"a tuple violates the rule (LHS similar, RHS dissimilar). Tuples with " +
594-
"the highest violation counts are the best candidates for manual review. " +
595-
"In our example, rows 10 and 11 are flagged precisely because they " +
596-
"introduced Italian cuisine into Uptown, breaking the previously clean " +
597-
"pattern."
598-
)
599-
600525
# ------------------------------------------------------------
601526
# 10. Reproducibility note
602527
# ------------------------------------------------------------
@@ -633,7 +558,7 @@ def extract_violations_for_garfd_rfds(df, rfds, metrics, thresholds):
633558
# ------------------------------------------------------------
634559
banner("See also")
635560

636-
print("Related primitives in Desbordante:")
561+
print("Related patterns in Desbordante:")
637562
print(" * FD mining - examples/basic/mining_fd.py")
638563
print(" * AFD mining - examples/basic/mining_afd.py")
639564
print(" * MFD verifying - examples/basic/verifying_mfd.py")
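examples/datasets/sample_original_from_paper.csv (new file; path not captured in this view, inferred from the DATA_PATH change in examples/basic/mining_ga_rfd.py)

Lines changed: 8 additions & 0 deletions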
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
height_cm,weight_kg,shoe_size_eu
2+
175,70,40
3+
175,75,39
4+
175,69,40
5+
176,71,40
6+
178,81,41
7+
169,73,37
8+
170,62,39

0 commit comments

Comments
 (0)