@@ -133,8 +133,8 @@ def print_link(text, url, end=''):
133133printlns (
134134 " In this example we will learn the basics of RFD mining from tables. " +
135135 "RFD (Relaxed Functional Dependency) is a pattern that captures " +
136- "the rule: « if two tuples are similar on a set of attributes X, " +
137- "then they are likely similar on attribute Y» . Similarity is defined " +
136+ "the rule: ' if two tuples are similar on a set of attributes X, " +
137+ "then they are likely similar on attribute Y' . Similarity is defined " +
138138 "via configurable metrics and thresholds, making RFD more flexible " +
139139 "than classical functional dependencies."
140140)
@@ -147,7 +147,7 @@ def print_link(text, url, end=''):
147147prints (
148148 f" This pattern is formally defined in the paper: " +
149149 f"{ BOLD } L. Caruccio, V. Deufemia, G. Polese. " +
150- "« A genetic algorithm to discover relaxed functional dependencies from data» . " +
150+ "' A genetic algorithm to discover relaxed functional dependencies from data' . " +
151151 f"SEBD 2017{ RESET } ." , end = ' '
152152)
153153print ('(' , end = '' )
@@ -158,15 +158,15 @@ def print_link(text, url, end=''):
158158printlns (
159159 f"{ YELLOW } !?{ RESET } It is important not to confuse RFD as a general term for 'approximate FD'. " +
160160 "Here RFD refers to a concrete pattern defined by Caruccio et al. that " +
161- "combines similarity metric for each column global coverage threshold."
161+ "combines similarity metric for each column with a global coverage threshold."
162162)
163163
164164# ------------------------------------------------------------
165165# 2. What is an RFD?
166166# ------------------------------------------------------------
167167banner ("What is an RFD?" , num = 2 )
168168
169- print (f"{ YELLOW } >>> 2.1. Pattern definition{ RESET } " )
169+ print (f"{ YELLOW } 2.1. Pattern definition{ RESET } " )
170170printlns (
171171 "A Relaxed Functional Dependency (RFD) is a specific pattern of the form"
172172)
@@ -185,7 +185,7 @@ def print_link(text, url, end=''):
185185 "* Lowering confidence gives us AFDs."
186186)
187187
188- print (f"{ YELLOW } >>> 2.2. Confidence and support{ RESET } " )
188+ print (f"{ YELLOW } 2.2. Confidence and support{ RESET } " )
189189printlns ("Two numbers describe an RFD." )
190190printlns (
191191 f" { BOLD } Confidence{ RESET } tells us how reliable the rule is. " +
@@ -205,7 +205,7 @@ def print_link(text, url, end=''):
205205# ------------------------------------------------------------
206206banner ("Dataset" , num = 3 )
207207
208- DATA_PATH = "examples/datasets/sample_height_weight .csv"
208+ DATA_PATH = "examples/datasets/sample_original_from_paper .csv"
209209COL_NAMES = ["height_cm" , "weight_kg" , "shoe_size_eu" ]
210210
211211df = pd .read_csv (DATA_PATH , header = 0 )
@@ -262,7 +262,7 @@ def print_link(text, url, end=''):
262262 "reproducible results."
263263)
264264
265- print (f"{ YELLOW } >>> Where are similarity metrics defined?{ RESET } " )
265+ print (f"{ YELLOW } Where are similarity metrics defined?{ RESET } " )
266266printlns (
267267 f" Similarity metrics are set using the { BOLD } set_metrics(){ RESET } method, which takes " +
268268 "a list of metric functions (one per column). For example:"
@@ -341,19 +341,21 @@ def print_link(text, url, end=''):
341341# ------------------------------------------------------------
342342banner ("Verifying hypothesis" , num = 7 )
343343
344- printlns (
345- f" { YELLOW } Here we set:{ RESET } height <= 1 cm, weight <= 10 kg, shoe size <= 1. " +
346- f"This models { BOLD } 'people of practically the same height and roughly the same " +
347- f"weight should have almost the same shoe size'{ RESET } . " +
348- "Since the metric returns 0 or 1, we set min_similarity=1.0 to accept only exact " +
349- "matches according to these thresholds. "
350- )
351344printlns (
352345 f" { GREEN } Recall our hypothesis from Section 3:{ RESET } we expect that similar height and " +
353346 "weight imply similar shoe size. The absolute metric lets us define " +
354347 "'similar' in concrete, measurable terms."
355348)
356349
350+ printlns (
351+ f" { YELLOW } To check it we set the following attribute difference thresholds:{ RESET } " +
352+ "height <= 1 cm, weight <= 10 kg, shoe size <= 1. " +
353+ f"This models { BOLD } 'people of practically the same height and roughly the same " +
354+ f"weight should have almost the same shoe size'{ RESET } . " +
355+ "Since the used abs_threshold_metric returns 0 or 1, we set min_similarity=1.0 to accept only exact " +
356+ "matches according to these thresholds. "
357+ )
358+
357359print_table (df )
358360algo_abs = desbordante .rfd .algorithms .GaRfd ()
359361algo_abs .load_data (table = (DATA_PATH , "," , True ))
@@ -385,8 +387,8 @@ def print_link(text, url, end=''):
385387)
386388
387389prints (
388- " Because the thresholds are strict, only a few pairs match - hence the " +
389- "support is low, but the confidence can still be high. " +
390+ " Because the thresholds are strict, only a few pairs have similar lhs and similar " +
391+ "rhs - hence the support is low, but the confidence can still be high. " +
390392 "The absolute metric makes the similarity definition completely transparent."
391393)
392394
@@ -436,8 +438,8 @@ def ngrams(s, n=2):
436438 title = "RFDs with exact equality on all columns" )
437439printlns (
438440 " Without fuzzy matching, the only dependencies found involve cuisine " +
439- "and district because they have exact duplicates. Restaurant names, " +
440- "which are all unique due to typos, never appear in any rule ."
441+ "and district because they contain exact duplicates. On the other hand, restaurant names, " +
442+ "which are all unique due to typos, never appear in any found RFD ."
441443)
442444
443445algo_jac = desbordante .rfd .algorithms .GaRfd ()
@@ -512,91 +514,14 @@ def ngrams(s, n=2):
512514 title = f"Found { len (discovered_rfds )} RFD(s) on dirty data" ,
513515 highlight = {highlight_key })
514516
515- printlns (f"{ YELLOW } >>> Why do the RFD sets differ, and how to find out if there is an error?{ RESET } " )
517+ printlns (f"{ YELLOW } Why do the RFD sets differ, and how to find out if there is an error?{ RESET } " )
516518printlns (
517519 " On clean data, [cuisine] -> [district] holds with confidence 1.0 " +
518520 "because every cuisine appears in only one district. " +
519521 "After adding rows 10-11 (Italian in Uptown), the confidence drops to 0.6. " +
520522 "This change signals a potential inconsistency."
521523)
522524
523- def extract_violations_for_garfd_rfds (df , rfds , metrics , thresholds ):
524- from collections import defaultdict
525- import itertools
526-
527- violation_reports = []
528- n = len (df )
529-
530- for rfd in rfds :
531- lhs_indices = [i for i in range (len (df .columns )) if rfd .lhs_mask & (1 << i )]
532- rhs_idx = rfd .rhs_index
533-
534- if not lhs_indices or rhs_idx >= len (df .columns ):
535- continue
536-
537- violation_count = defaultdict (int )
538- for i , j in itertools .combinations (range (n ), 2 ):
539- lhs_sim = all (
540- metrics [col ](str (df .iloc [i , col ]), str (df .iloc [j , col ])) >= thresholds [col ]
541- for col in lhs_indices
542- )
543- if not lhs_sim :
544- continue
545-
546- rhs_sim = metrics [rhs_idx ](str (df .iloc [i , rhs_idx ]), str (df .iloc [j , rhs_idx ])) >= thresholds [rhs_idx ]
547-
548- if not rhs_sim :
549- violation_count [i ] += 1
550- violation_count [j ] += 1
551-
552- violation_reports .append ({
553- "rfd" : rfd ,
554- "violations" : sorted (violation_count .items (), key = lambda x : x [1 ], reverse = True )
555- })
556- return violation_reports
557-
558- reports = extract_violations_for_garfd_rfds (
559- dirty_df ,
560- discovered_rfds ,
561- metrics = [jaccard_2gram , eq , eq ],
562- thresholds = [0.3 , 1.0 , 1.0 ]
563- )
564-
565- print ("=" * 80 )
566- printlns (" Errors detected:" )
567- for report in reports :
568- rfd = report ["rfd" ]
569- lhs_names = [COL_NAMES_STR [i ] for i in range (len (COL_NAMES_STR )) if rfd .lhs_mask & (1 << i )]
570- rhs_name = COL_NAMES_STR [rfd .rhs_index ]
571- rule_str = f"[{ ', ' .join (lhs_names )} ] -> [{ rhs_name } ] (conf={ rfd .confidence :.3f} , supp={ rfd .support :.3f} )"
572- if len (report ["violations" ]):
573- prints (f"{ RED } Rule: { rule_str } { RESET } " )
574-
575- top_violations = report ["violations" ][:3 ] # show only top 3
576- for j , (idx , count ) in enumerate (top_violations ):
577- prints (f" Tuple #{ idx + 1 } | Violating pairs: { count } | " )
578- prints (f" Data: { dirty_df .iloc [idx ].to_dict ()} " )
579-
580- if j < len (top_violations ) - 1 :
581- print ("-" * 80 )
582- print ()
583-
584- printlns (
585- f"{ GREEN } Note{ RESET } that Bella Napoli (row #4) is flagged because it violates the rule " +
586- "when paired with the two erroneous Italian rows - it is an innocent bystander " +
587- "that helps locate the true errors."
588- )
589-
590- printlns (f"{ YELLOW } >>> How we spot errors?{ RESET } " )
591- printlns (
592- " For each RFD discovered on the dirty dataset we count how many times " +
593- "a tuple violates the rule (LHS similar, RHS dissimilar). Tuples with " +
594- "the highest violation counts are the best candidates for manual review. " +
595- "In our example, rows 10 and 11 are flagged precisely because they " +
596- "introduced Italian cuisine into Uptown, breaking the previously clean " +
597- "pattern."
598- )
599-
600525# ------------------------------------------------------------
601526# 10. Reproducibility note
602527# ------------------------------------------------------------
@@ -633,7 +558,7 @@ def extract_violations_for_garfd_rfds(df, rfds, metrics, thresholds):
633558# ------------------------------------------------------------
634559banner ("See also" )
635560
636- print ("Related primitives in Desbordante:" )
561+ print ("Related patterns in Desbordante:" )
637562print (" * FD mining - examples/basic/mining_fd.py" )
638563print (" * AFD mining - examples/basic/mining_afd.py" )
639564print (" * MFD verifying - examples/basic/verifying_mfd.py" )
0 commit comments