Desbordante
diff --git a/examples/basic/mining_ga_rfd.py b/examples/basic/mining_ga_rfd.py
Lines changed: 116 additions & 80 deletions
diff --git a/src/core/algorithms/algorithm_types.h b/src/core/algorithms/algorithm_types.h
Lines changed: 1 addition & 1 deletion
diff --git a/src/core/algorithms/rfd/ga_rfd/ga_rfd.cpp b/src/core/algorithms/rfd/ga_rfd/ga_rfd.cpp
Lines changed: 14 additions & 26 deletions
@@ -167,91 +167,127 @@ def jaccard_sim(a, b) -> float:
         return 1.0
     return len(set_a & set_b) / len(set_a | set_b)
 
-# Create a new algorithm instance
-algo2 = desbordante.rfd.algorithms.GaRfd()
-algo2.load_data(table=(TABLE_PATH, ',', False))
-
-# Use set_metrics_py to include the Python function.
-# Keep abs_diff for the numeric columns and use our Jaccard function for the species column.
-algo2.set_metrics_py([abs_diff, abs_diff, abs_diff, abs_diff, jaccard_sim])
-
-# Use slightly different parameters for variety
-algo2.set_option('rfd_min_similarity', 0.8)
-algo2.set_option('minconf', 0.9)
-algo2.set_option('population_size', 15)
-algo2.set_option('rfd_max_generations', 10)
-algo2.set_option('seed', 123)
-algo2.execute()
-
-rfds2 = algo2.get_rfds()
-print(f"С пользовательской метрикой найдено {len(rfds2)} зависимостей")
-for i, rfd in enumerate(rfds2):
-    print(f"{i+1}. {rfd}")
-
-import desbordante
-import pandas
-import logging
-
-# logging.basicConfig(level=logging.DEBUG, format='%(asctime)s [%(name)s] %(message)s')
-
-# Чтобы запустить
-# PYTHONPATH=build/src/python_bindings python3 examples/basic/mining_ga_rfd.py
-
-# ===================================================
-# Пример использования алгоритма GaRfd для поиска
-# релаксированных функциональных зависимостей (RFD)
-# ===================================================
-
-TABLE = 'examples/datasets/iris.csv'
-
-# Загружаем таблицу для просмотра
-df = pandas.read_csv(TABLE)
-print("Таблица Iris:")
-print(df.head(), "\n")
-
-# ---------- Первый пример (встроенные метрики) ----------
-algo = desbordante.rfd.algorithms.GaRfd()
+algo_custom = desbordante.rfd.algorithms.GaRfd()
+algo_custom.load_data(table=(DATA_PATH, ',', True))
+# Equality metric for Height and Weight; Jaccard similarity for Shoe size
+algo_custom.set_metrics([eq, eq, jaccard_sim])
+algo_custom.set_option('min_similarity', 0.3)
+algo_custom.set_option('minconf', 0.4)
+algo_custom.set_option('max_generations', 100)
+algo_custom.set_option('seed', 42)
+algo_custom.execute()
+custom_rfds = algo_custom.get_rfds()
+print(f"Found {len(custom_rfds)} RFD(s) with custom metric:")
+for rfd in sorted(custom_rfds, key=lambda r: (r.rhs_index, r.lhs_mask)):
+    print(f"  {rfd}")
 
-lev = desbordante.rfd.levenshtein_metric()
-eq  = desbordante.rfd.equality_metric()
-abs_diff = desbordante.rfd.abs_diff_metric()
-algo.set_metrics([abs_diff, abs_diff, abs_diff, lev, eq])
+# ------------------------------------------------------------
+# Error detection and cleaning scenario
+# ------------------------------------------------------------
+print("\n" + "=" * 70)
+print("Error detection and data cleaning")
+print("=" * 70)
+print("""
+We deliberately introduce a mistake: change the Shoe size of the first person
+from 40 to 47 (a likely typo). This breaks the exact FD [Height,Weight] -> Shoe_size.
+After fixing it back, the FD is restored.
+""")
 
-algo.load_data(table=(TABLE, ',', False))
+typo_df = df.copy()
+typo_df.loc[0, 'shoe_size'] = 47
+typo_path = 'typo_data.csv'
+typo_df.to_csv(typo_path, index=False, header=True)
+
+algo_typo = desbordante.rfd.algorithms.GaRfd()
+algo_typo.load_data(table=(typo_path, ',', True))
+algo_typo.set_option('max_generations', 100)
+algo_typo.set_option('seed', 42)
+algo_typo.execute()
+typo_fds = algo_typo.get_rfds()
+print("Exact FDs on data with typo (fewer than original):")
+for rfd in typo_fds:
+    print(f"  {rfd}")
 
-algo.set_option('rfd_min_similarity', 0.8)
-algo.set_option('minconf', 0.9)
-algo.set_option('population_size', 22)
-algo.set_option('rfd_max_generations', 10)
-algo.set_option('seed', 42)
-algo.execute()
+# Revert the typo, restoring the original value
+typo_df.loc[0, 'shoe_size'] = 40
+fixed_path = 'fixed_data.csv'
+typo_df.to_csv(fixed_path, index=False, header=True)
+
+algo_fixed = desbordante.rfd.algorithms.GaRfd()
+algo_fixed.load_data(table=(fixed_path, ',', True))
+algo_fixed.set_option('max_generations', 100)
+algo_fixed.set_option('seed', 42)
+algo_fixed.execute()
+fixed_fds = algo_fixed.get_rfds()
+print("\nAfter fixing the typo (exact FDs should be restored):")
+for rfd in fixed_fds:
+    print(f"  {rfd}")
 
-rfds = algo.get_rfds()
-print(f"Найдено {len(rfds)} релаксированных функциональных зависимостей:")
-for i, rfd in enumerate(rfds):
-    print(f"{i+1}. {rfd}")
+os.remove(typo_path)
+os.remove(fixed_path)
 
-# ---------- Второй пример (пользовательская метрика) ----------
-print("\n--- Пример с пользовательской метрикой (Jaccard) ---")
-def jaccard_sim(a: str, b: str) -> float:
-    set_a = set(a)
-    set_b = set(b)
-    if not set_a and not set_b:
-        return 1.0
-    return len(set_a & set_b) / len(set_a | set_b)
+# ------------------------------------------------------------
+# Parameter tuning and reproducibility
+# ------------------------------------------------------------
+print("\n" + "=" * 70)
+print("The importance of seed for reproducibility")
+print("=" * 70)
+print("""
+GaRfd is a genetic algorithm - it uses random numbers to initialise the
+population and to perform crossover/mutation. Therefore, two consecutive runs
+with exactly the same parameters may yield different sets of RFDs.
+"Reproducible results" means that if you fix a seed, the sequence of random
+numbers is always the same, and the algorithm produces identical output
+on any computer and at any time.
+
+Below we first demonstrate two runs WITHOUT a seed (the results may differ).
+Then we run the algorithm twice WITH the same seed - we will see the same output.
+""")
 
-algo2 = desbordante.rfd.algorithms.GaRfd()
-algo2.load_data(table=(TABLE, ',', False))
+# Two runs without seed
+print("--- Two runs without seed (results may vary) ---")
+for run in [1, 2]:
+    algo_noseed = desbordante.rfd.algorithms.GaRfd()
+    algo_noseed.load_data(table=(DATA_PATH, ',', True))
+    algo_noseed.set_option('minconf', 0.6)
+    algo_noseed.set_option('max_generations', 100)
+    # no seed set
+    algo_noseed.execute()
+    res = algo_noseed.get_rfds()
+    print(f"Run {run}: {len(res)} RFD(s)")
+    for rfd in sorted(res, key=lambda r: (r.rhs_index, r.lhs_mask)):
+        print(f"  {rfd}")
+
+# Two runs with seed = 42
+print("\n--- Two runs with seed = 42 (results must be identical) ---")
+for run in [1, 2]:
+    algo_seed = desbordante.rfd.algorithms.GaRfd()
+    algo_seed.load_data(table=(DATA_PATH, ',', True))
+    algo_seed.set_metrics([eq, eq, eq])
+    algo_seed.set_option('minconf', 0.6)
+    algo_seed.set_option('max_generations', 100)
+    algo_seed.set_option('seed', 42)
+    algo_seed.execute()
+    res = algo_seed.get_rfds()
+    print(f"Run {run}: {len(res)} RFD(s)")
+    for rfd in sorted(res, key=lambda r: (r.rhs_index, r.lhs_mask)):
+        print(f"  {rfd}")
 
-algo2.set_metrics_py([jaccard_sim, lev, lev, jaccard_sim, eq])   # ← set_metrics_py!
-algo2.set_option('rfd_min_similarity', 0.8)
-algo2.set_option('minconf', 0.9)
-algo2.set_option('population_size', 10)
-algo2.set_option('rfd_max_generations', 10)
-algo2.set_option('seed', 42)
-algo2.execute()
+print("""
+As you can see, the random runs may differ in the number and content of RFDs,
+while the seeded runs are perfectly reproducible.
+""")
 
-rfds2 = algo2.get_rfds()
-print(f"С пользовательской метрикой найдено {len(rfds2)} зависимостей")
-for i, rfd in enumerate(rfds2):
-    print(f"{i+1}. {rfd}")
+# ------------------------------------------------------------
+# Summary and next steps
+# ------------------------------------------------------------
+print("\n" + "=" * 70)
+print("Summary")
+print("=" * 70)
+print("""
+- Default settings give exact FDs;
+- Lower minconf => AFDs;
+- Lower min_similarity + absolute difference metric => RFDs;
+- You can pass any Python function as a custom metric;
+- Use seed for reproducibility, tune population/generations for better results.
+""")
@@ -103,7 +103,7 @@ BETTER_ENUM(AlgorithmType, char,
 /* Frequent subgraph mining algorithm */
     gspan,
 
-/* RFD mining algorithms */
+/* RFD mining algorithm */
     garfd
 )
 
 
@@ -18,7 +18,7 @@
 #include "core/util/logger.h"
 #include "core/util/timed_invoke.h"
 
-bool DEBUG = false;
+bool debug = false;
 
 namespace {
 
@@ -31,14 +31,6 @@ template <typename T>
     return std::bitset<32>(mask).to_string().substr(32 - num_bits);
 }
 
-[[nodiscard]] inline int FirstSetBit(uint32_t x) {
-    return x ? static_cast<int>(__builtin_ctz(x)) : -1;
-}
-
-[[nodiscard]] std::string bit_representation(uint32_t mask, int num_bits = 31) {
-    return std::bitset<32>(mask).to_string().substr(32 - num_bits);
-}
-
 [[nodiscard]] inline int FirstSetBit(uint32_t x) noexcept {
     return x ? __builtin_ctz(x) : -1;
 }
@@ -134,10 +126,6 @@ void GaRfd::LoadDataInternal() {
         }
         num_rows_++;
     }
-    res += "] -> " + std::to_string(rhs_index) + " (conf=" + std::to_string(confidence) +
-           ", supp=" + std::to_string(support) + ")";
-    return res;
-}
 
     if (num_rows_ < 2) [[unlikely]]
         throw std::runtime_error("Input table must contain at least 2 rows");
@@ -146,7 +134,7 @@ void GaRfd::LoadDataInternal() {
         throw std::runtime_error("Table too large, total pairs would overflow size_t");
     total_pairs_ = num_rows_ * (num_rows_ - 1) / 2;
 
-    if (DEBUG)
+    if (debug)
         LOG_INFO("Loaded {} rows, {} attributes, {} total pairs", num_rows_, num_attrs_,
                  total_pairs_);
 
@@ -165,7 +153,7 @@ void GaRfd::BuildSimilarityBitsets() {
     if (!support_cache_) {
         support_cache_ = std::make_unique<util::LRUCache<uint32_t, std::size_t>>(cache_max_size_);
     }
-    if (DEBUG)
+    if (debug)
         LOG_INFO("BuildSimilarityBitsets: total_pairs_ = {}, num_attrs_ = {}, num_rows_ = {}",
                  total_pairs_, num_attrs_, num_rows_);
     std::size_t const num_uint64_per_attr = (total_pairs_ + 63) / 64;
@@ -188,21 +176,21 @@ void GaRfd::BuildSimilarityBitsets() {
                         bits[word_idx] |= (1ULL << bit_idx);
                     }
                 } catch (std::exception const& e) {
-                    if (DEBUG)
+                    if (debug)
                         LOG_ERROR("Exception in Compare: attr={}, i={}, j={}, what={}", a, i, j,
                                   e.what());
                     throw;
                 } catch (...) {
-                    if (DEBUG)
+                    if (debug)
                         LOG_ERROR("Unknown exception in Compare: attr={}, i={}, j={}", a, i, j);
                     throw;
                 }
                 pair_idx++;
             }
         }
-        if (DEBUG) LOG_INFO("Finished attribute {} similarity bitset", a);
+        if (debug) LOG_INFO("Finished attribute {} similarity bitset", a);
     }
-    if (DEBUG) LOG_INFO("Similarity bitsets built for {} attributes", num_attrs_);
+    if (debug) LOG_INFO("Similarity bitsets built for {} attributes", num_attrs_);
 }
 
 std::size_t GaRfd::ComputeSupport(uint32_t attrs_mask) const noexcept {
@@ -224,7 +212,7 @@ std::size_t GaRfd::ComputeSupport(uint32_t attrs_mask) const noexcept {
         std::size_t s = 0;
         for (uint64_t w : first_vec) s += std::popcount(w);
         support_cache_->put(attrs_mask, s);
-        if (DEBUG)
+        if (debug)
             LOG_DEBUG("Support for mask {} = {}", BitRepresentation(attrs_mask, num_attrs_), s);
         return s;
     }
@@ -249,7 +237,7 @@ std::size_t GaRfd::ComputeSupport(uint32_t attrs_mask) const noexcept {
     std::size_t support = 0;
     for (uint64_t w : buffer) support += std::popcount(w);
     support_cache_->put(attrs_mask, support);
-    if (DEBUG)
+    if (debug)
         LOG_DEBUG("Support for mask {} = {}", BitRepresentation(attrs_mask, num_attrs_), support);
     return support;
 }
@@ -454,13 +442,13 @@ std::unordered_set<RFD, RFDHash> GaRfd::Finalize(
         if (ind.confidence < eps_) continue;
         res.insert(RFD{ind.lhs_mask, ind.rhs_index, ind.support, ind.confidence});
     }
-    if (DEBUG) LOG_DEBUG("Finalized {} unique RFDs", res.size());
+    if (debug) LOG_DEBUG("Finalized {} unique RFDs", res.size());
     return res;
 }
 
 unsigned long long GaRfd::ExecuteInternal() {
     return ::util::TimedInvoke([&]() {
-        if (DEBUG) LOG_INFO("Build similarity bitsets...");
+        if (debug) LOG_INFO("Build similarity bitsets...");
         BuildSimilarityBitsets();
 
         std::mt19937 rng(seed_);
@@ -469,16 +457,16 @@ unsigned long long GaRfd::ExecuteInternal() {
         EvaluatePopulation(pop);
 
         for (size_t gen = 0; gen < max_generations_; gen++) {
-            if (DEBUG)
+            if (debug)
                 LOG_INFO("Generation {}/{} (pop size: {})", gen + 1, max_generations_, pop.size());
             if (AllOf(pop)) {
-                if (DEBUG)
+                if (debug)
                     LOG_DEBUG("All individuals satisfy confidence threshold – stopping early");
                 break;
             }
 
             if (pop.empty()) [[unlikely]] {
-                if (DEBUG) LOG_DEBUG("Population is empty, stopping evolution");
+                if (debug) LOG_DEBUG("Population is empty, stopping evolution");
                 break;
             }
Original file line number	Diff line number	Diff line change
`@@ -103,7 +103,7 @@ BETTER_ENUM(AlgorithmType, char,`
`103`	`103`	`/* Frequent subgraph mining algorithm */`
`104`	`104`	`gspan,`
`105`	`105`
`106`		`-/* RFD mining algorithms */`
	`106`	`+/* RFD mining algorithm */`
`107`	`107`	`garfd`
`108`	`108`	`)`
`109`	`109`