@@ -167,91 +167,127 @@ def jaccard_sim(a, b) -> float:
167167 return 1.0
168168 return len (set_a & set_b ) / len (set_a | set_b )
169169
170- # Create a new algorithm instance
171- algo2 = desbordante .rfd .algorithms .GaRfd ()
172- algo2 .load_data (table = (TABLE_PATH , ',' , False ))
173-
174- # Use set_metrics_py to include the Python function.
175- # Keep abs_diff for the numeric columns and use our Jaccard function for the species column.
176- algo2 .set_metrics_py ([abs_diff , abs_diff , abs_diff , abs_diff , jaccard_sim ])
177-
178- # Use slightly different parameters for variety
179- algo2 .set_option ('rfd_min_similarity' , 0.8 )
180- algo2 .set_option ('minconf' , 0.9 )
181- algo2 .set_option ('population_size' , 15 )
182- algo2 .set_option ('rfd_max_generations' , 10 )
183- algo2 .set_option ('seed' , 123 )
184- algo2 .execute ()
185-
186- rfds2 = algo2 .get_rfds ()
187- print (f"С пользовательской метрикой найдено { len (rfds2 )} зависимостей" )
188- for i , rfd in enumerate (rfds2 ):
189- print (f"{ i + 1 } . { rfd } " )
190-
191- import desbordante
192- import pandas
193- import logging
194-
195- # logging.basicConfig(level=logging.DEBUG, format='%(asctime)s [%(name)s] %(message)s')
196-
197- # Чтобы запустить
198- # PYTHONPATH=build/src/python_bindings python3 examples/basic/mining_ga_rfd.py
199-
200- # ===================================================
201- # Пример использования алгоритма GaRfd для поиска
202- # релаксированных функциональных зависимостей (RFD)
203- # ===================================================
204-
205- TABLE = 'examples/datasets/iris.csv'
206-
207- # Загружаем таблицу для просмотра
208- df = pandas .read_csv (TABLE )
209- print ("Таблица Iris:" )
210- print (df .head (), "\n " )
211-
212- # ---------- Первый пример (встроенные метрики) ----------
213- algo = desbordante .rfd .algorithms .GaRfd ()
# Custom-metric run: mine RFDs on DATA_PATH with one metric per column.
algo_custom = desbordante.rfd.algorithms.GaRfd()
# NOTE(review): the third tuple element is presumably a "has header" flag - confirm.
algo_custom.load_data(table=(DATA_PATH, ',', True))
# equality for Height and Weight, Jaccard for Shoe size
algo_custom.set_metrics([eq, eq, jaccard_sim])
# Options are applied in the same order as before; the seed is fixed to 42.
for option_name, option_value in (
    ('min_similarity', 0.3),
    ('minconf', 0.4),
    ('max_generations', 100),
    ('seed', 42),
):
    algo_custom.set_option(option_name, option_value)
algo_custom.execute()
custom_rfds = algo_custom.get_rfds()
print(f"Found {len(custom_rfds)} RFD(s) with custom metric:")
# Sort by (rhs_index, lhs_mask) so the listing order is stable.
ordered_custom_rfds = sorted(custom_rfds, key=lambda r: (r.rhs_index, r.lhs_mask))
for rfd in ordered_custom_rfds:
    print(f" {rfd} ")
214183
215- lev = desbordante .rfd .levenshtein_metric ()
216- eq = desbordante .rfd .equality_metric ()
217- abs_diff = desbordante .rfd .abs_diff_metric ()
218- algo .set_metrics ([abs_diff , abs_diff , abs_diff , lev , eq ])
# ------------------------------------------------------------
# Error detection and cleaning scenario
# ------------------------------------------------------------
# Section header: a 70-character rule above and below the title.
section_rule = "=" * 70
print("\n " + section_rule)
print("Error detection and data cleaning")
print(section_rule)
# Explanation of the scenario shown to the reader before the runs below.
scenario_intro = """
We deliberately introduce a mistake: change the Shoe size of the first person
from 40 to 47 (a likely typo). This breaks the exact FD [Height,Weight] -> Shoe_size.
After fixing it back, the FD is restored.
"""
print(scenario_intro)
219195
220- algo .load_data (table = (TABLE , ',' , False ))
# Plant the typo on a copy of the frame (row label 0, column 'shoe_size')
# and write it out as CSV so the algorithm can load it.
typo_df = df.copy()
typo_df.loc[0, 'shoe_size'] = 47
typo_path = 'typo_data.csv'
typo_df.to_csv(typo_path, header=True, index=False)

# Mine the corrupted table; no metrics or thresholds are set here, so the
# algorithm runs with its defaults apart from the two options below.
algo_typo = desbordante.rfd.algorithms.GaRfd()
algo_typo.load_data(table=(typo_path, ',', True))
for option_name, option_value in (('max_generations', 100), ('seed', 42)):
    algo_typo.set_option(option_name, option_value)
algo_typo.execute()
typo_fds = algo_typo.get_rfds()

print("Exact FDs on data with typo (fewer than original):")
for rfd in typo_fds:
    print(f" {rfd} ")
221210
222- algo .set_option ('rfd_min_similarity' , 0.8 )
223- algo .set_option ('minconf' , 0.9 )
224- algo .set_option ('population_size' , 22 )
225- algo .set_option ('rfd_max_generations' , 10 )
226- algo .set_option ('seed' , 42 )
227- algo .execute ()
# Fix the error back to original: restore the value 40 in the same cell
# and write the repaired frame to its own CSV file.
typo_df.loc[0, 'shoe_size'] = 40
fixed_path = 'fixed_data.csv'
typo_df.to_csv(fixed_path, header=True, index=False)

# Re-run the mining with the same two options on the repaired data.
algo_fixed = desbordante.rfd.algorithms.GaRfd()
algo_fixed.load_data(table=(fixed_path, ',', True))
for option_name, option_value in (('max_generations', 100), ('seed', 42)):
    algo_fixed.set_option(option_name, option_value)
algo_fixed.execute()
fixed_fds = algo_fixed.get_rfds()

print("\n After fixing the typo (exact FDs should be restored):")
for rfd in fixed_fds:
    print(f" {rfd} ")
228225
229- rfds = algo .get_rfds ()
230- print (f"Найдено { len (rfds )} релаксированных функциональных зависимостей:" )
231- for i , rfd in enumerate (rfds ):
232- print (f"{ i + 1 } . { rfd } " )
# Delete the two scratch CSV files written by the scenario, in creation order.
# NOTE(review): this only runs if every step above succeeded; an exception
# during mining would leave the files behind in the working directory.
# Consider try/finally or tempfile if that matters.
for scratch_path in (typo_path, fixed_path):
    os.remove(scratch_path)
233228
234- # ---------- Второй пример (пользовательская метрика) ----------
235- print ("\n --- Пример с пользовательской метрикой (Jaccard) ---" )
236- def jaccard_sim (a : str , b : str ) -> float :
237- set_a = set (a )
238- set_b = set (b )
239- if not set_a and not set_b :
240- return 1.0
241- return len (set_a & set_b ) / len (set_a | set_b )
# ------------------------------------------------------------
# Parameter tuning and reproducibility
# ------------------------------------------------------------
# Section header: a 70-character rule above and below the title.
section_rule = "=" * 70
print("\n " + section_rule)
print("The importance of seed for reproducibility")
print(section_rule)
# Background text explaining why a fixed seed matters for the runs that follow.
seed_explanation = """
GaRfd is a genetic algorithm - it uses random numbers to initialise the
population and to perform crossover/mutation. Therefore, two consecutive runs
with exactly the same parameters may yield different sets of RFDs.
"Reproducible results" means that if you fix a seed, the sequence of random
numbers is always the same, and the algorithm produces identical output
on any computer and at any time.

Below we first demonstrate two runs WITHOUT a seed (the results may differ).
Then we run the algorithm twice WITH the same seed - we will see the same output.
"""
print(seed_explanation)
242246
243- algo2 = desbordante .rfd .algorithms .GaRfd ()
244- algo2 .load_data (table = (TABLE , ',' , False ))
# Two runs without seed: a fresh algorithm instance per run, identical options,
# and the 'seed' option is intentionally never set.
print("--- Two runs without seed (results may vary) ---")
for run in (1, 2):
    algo_noseed = desbordante.rfd.algorithms.GaRfd()
    algo_noseed.load_data(table=(DATA_PATH, ',', True))
    for option_name, option_value in (('minconf', 0.6), ('max_generations', 100)):
        algo_noseed.set_option(option_name, option_value)
    algo_noseed.execute()
    res = algo_noseed.get_rfds()
    print(f"Run {run} : {len(res)} RFD(s)")
    # Sorted by (rhs_index, lhs_mask) so the two listings are easy to compare.
    ordered_res = sorted(res, key=lambda r: (r.rhs_index, r.lhs_mask))
    for rfd in ordered_res:
        print(f" {rfd} ")

# Two runs with seed = 42: same options as above plus a fixed seed.
# NOTE(review): unlike the unseeded runs, metrics are set explicitly here;
# presumably [eq, eq, eq] matches the default - confirm.
print("\n --- Two runs with seed = 42 (results must be identical) ---")
for run in (1, 2):
    algo_seed = desbordante.rfd.algorithms.GaRfd()
    algo_seed.load_data(table=(DATA_PATH, ',', True))
    algo_seed.set_metrics([eq, eq, eq])
    for option_name, option_value in (
        ('minconf', 0.6),
        ('max_generations', 100),
        ('seed', 42),
    ):
        algo_seed.set_option(option_name, option_value)
    algo_seed.execute()
    res = algo_seed.get_rfds()
    print(f"Run {run} : {len(res)} RFD(s)")
    ordered_res = sorted(res, key=lambda r: (r.rhs_index, r.lhs_mask))
    for rfd in ordered_res:
        print(f" {rfd} ")
245275
246- algo2 .set_metrics_py ([jaccard_sim , lev , lev , jaccard_sim , eq ]) # ← set_metrics_py!
247- algo2 .set_option ('rfd_min_similarity' , 0.8 )
248- algo2 .set_option ('minconf' , 0.9 )
249- algo2 .set_option ('population_size' , 10 )
250- algo2 .set_option ('rfd_max_generations' , 10 )
251- algo2 .set_option ('seed' , 42 )
252- algo2 .execute ()
# Closing remark for the reproducibility section.
seed_takeaway = """
As you can see, the random runs may differ in the number and content of RFDs,
while the seeded runs are perfectly reproducible.
"""
print(seed_takeaway)
253280
254- rfds2 = algo2 .get_rfds ()
255- print (f"С пользовательской метрикой найдено { len (rfds2 )} зависимостей" )
256- for i , rfd in enumerate (rfds2 ):
257- print (f"{ i + 1 } . { rfd } " )
# ------------------------------------------------------------
# Summary and next steps
# ------------------------------------------------------------
# Section header: a 70-character rule above and below the title.
section_rule = "=" * 70
print("\n " + section_rule)
print("Summary")
print(section_rule)
# Bullet-point recap of the options demonstrated in this example.
summary_text = """
- Default settings give exact FDs;
- Lower minconf => AFDs;
- Lower min_similarity + absolute difference metric => RFDs;
- You can pass any Python function as a custom metric;
- Use seed for reproducibility, tune population/generations for better results.
"""
print(summary_text)
0 commit comments