Skip to content

Commit 14a9f23

Browse files
committed
fix CI and rebase bags
1 parent 45959c2 commit 14a9f23

5 files changed

Lines changed: 175 additions & 115 deletions

File tree

examples/basic/mining_ga_rfd.py

Lines changed: 116 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -167,91 +167,127 @@ def jaccard_sim(a, b) -> float:
167167
return 1.0
168168
return len(set_a & set_b) / len(set_a | set_b)
169169

170-
# Create a new algorithm instance
171-
algo2 = desbordante.rfd.algorithms.GaRfd()
172-
algo2.load_data(table=(TABLE_PATH, ',', False))
173-
174-
# Use set_metrics_py to include the Python function.
175-
# Keep abs_diff for the numeric columns and use our Jaccard function for the species column.
176-
algo2.set_metrics_py([abs_diff, abs_diff, abs_diff, abs_diff, jaccard_sim])
177-
178-
# Use slightly different parameters for variety
179-
algo2.set_option('rfd_min_similarity', 0.8)
180-
algo2.set_option('minconf', 0.9)
181-
algo2.set_option('population_size', 15)
182-
algo2.set_option('rfd_max_generations', 10)
183-
algo2.set_option('seed', 123)
184-
algo2.execute()
185-
186-
rfds2 = algo2.get_rfds()
187-
print(f"С пользовательской метрикой найдено {len(rfds2)} зависимостей")
188-
for i, rfd in enumerate(rfds2):
189-
print(f"{i+1}. {rfd}")
190-
191-
import desbordante
192-
import pandas
193-
import logging
194-
195-
# logging.basicConfig(level=logging.DEBUG, format='%(asctime)s [%(name)s] %(message)s')
196-
197-
# Чтобы запустить
198-
# PYTHONPATH=build/src/python_bindings python3 examples/basic/mining_ga_rfd.py
199-
200-
# ===================================================
201-
# Пример использования алгоритма GaRfd для поиска
202-
# релаксированных функциональных зависимостей (RFD)
203-
# ===================================================
204-
205-
TABLE = 'examples/datasets/iris.csv'
206-
207-
# Загружаем таблицу для просмотра
208-
df = pandas.read_csv(TABLE)
209-
print("Таблица Iris:")
210-
print(df.head(), "\n")
211-
212-
# ---------- Первый пример (встроенные метрики) ----------
213-
algo = desbordante.rfd.algorithms.GaRfd()
170+
algo_custom = desbordante.rfd.algorithms.GaRfd()
171+
algo_custom.load_data(table=(DATA_PATH, ',', True))
172+
# equality for Height and Weight, Jaccard for Shoe size
173+
algo_custom.set_metrics([eq, eq, jaccard_sim])
174+
algo_custom.set_option('min_similarity', 0.3)
175+
algo_custom.set_option('minconf', 0.4)
176+
algo_custom.set_option('max_generations', 100)
177+
algo_custom.set_option('seed', 42)
178+
algo_custom.execute()
179+
custom_rfds = algo_custom.get_rfds()
180+
print(f"Found {len(custom_rfds)} RFD(s) with custom metric:")
181+
for rfd in sorted(custom_rfds, key=lambda r: (r.rhs_index, r.lhs_mask)):
182+
print(f" {rfd}")
214183

215-
lev = desbordante.rfd.levenshtein_metric()
216-
eq = desbordante.rfd.equality_metric()
217-
abs_diff = desbordante.rfd.abs_diff_metric()
218-
algo.set_metrics([abs_diff, abs_diff, abs_diff, lev, eq])
184+
# ------------------------------------------------------------
185+
# Error detection and cleaning scenario
186+
# ------------------------------------------------------------
187+
print("\n" + "=" * 70)
188+
print("Error detection and data cleaning")
189+
print("=" * 70)
190+
print("""
191+
We deliberately introduce a mistake: change the Shoe size of the first person
192+
from 40 to 47 (a likely typo). This breaks the exact FD [Height,Weight] -> Shoe_size.
193+
After fixing it back, the FD is restored.
194+
""")
219195

220-
algo.load_data(table=(TABLE, ',', False))
196+
typo_df = df.copy()
197+
typo_df.loc[0, 'shoe_size'] = 47
198+
typo_path = 'typo_data.csv'
199+
typo_df.to_csv(typo_path, index=False, header=True)
200+
201+
algo_typo = desbordante.rfd.algorithms.GaRfd()
202+
algo_typo.load_data(table=(typo_path, ',', True))
203+
algo_typo.set_option('max_generations', 100)
204+
algo_typo.set_option('seed', 42)
205+
algo_typo.execute()
206+
typo_fds = algo_typo.get_rfds()
207+
print("Exact FDs on data with typo (fewer than original):")
208+
for rfd in typo_fds:
209+
print(f" {rfd}")
221210

222-
algo.set_option('rfd_min_similarity', 0.8)
223-
algo.set_option('minconf', 0.9)
224-
algo.set_option('population_size', 22)
225-
algo.set_option('rfd_max_generations', 10)
226-
algo.set_option('seed', 42)
227-
algo.execute()
211+
# Fix the error back to original
212+
typo_df.loc[0, 'shoe_size'] = 40
213+
fixed_path = 'fixed_data.csv'
214+
typo_df.to_csv(fixed_path, index=False, header=True)
215+
216+
algo_fixed = desbordante.rfd.algorithms.GaRfd()
217+
algo_fixed.load_data(table=(fixed_path, ',', True))
218+
algo_fixed.set_option('max_generations', 100)
219+
algo_fixed.set_option('seed', 42)
220+
algo_fixed.execute()
221+
fixed_fds = algo_fixed.get_rfds()
222+
print("\nAfter fixing the typo (exact FDs should be restored):")
223+
for rfd in fixed_fds:
224+
print(f" {rfd}")
228225

229-
rfds = algo.get_rfds()
230-
print(f"Найдено {len(rfds)} релаксированных функциональных зависимостей:")
231-
for i, rfd in enumerate(rfds):
232-
print(f"{i+1}. {rfd}")
226+
os.remove(typo_path)
227+
os.remove(fixed_path)
233228

234-
# ---------- Второй пример (пользовательская метрика) ----------
235-
print("\n--- Пример с пользовательской метрикой (Jaccard) ---")
236-
def jaccard_sim(a: str, b: str) -> float:
237-
set_a = set(a)
238-
set_b = set(b)
239-
if not set_a and not set_b:
240-
return 1.0
241-
return len(set_a & set_b) / len(set_a | set_b)
229+
# ------------------------------------------------------------
230+
# Parameter tuning and reproducibility
231+
# ------------------------------------------------------------
232+
print("\n" + "=" * 70)
233+
print("The importance of seed for reproducibility")
234+
print("=" * 70)
235+
print("""
236+
GaRfd is a genetic algorithm - it uses random numbers to initialise the
237+
population and to perform crossover/mutation. Therefore, two consecutive runs
238+
with exactly the same parameters may yield different sets of RFDs.
239+
"Reproducible results" means that if you fix a seed, the sequence of random
240+
numbers is always the same, and the algorithm produces identical output
241+
on any computer and at any time.
242+
243+
Below we first demonstrate two runs WITHOUT a seed (the results may differ).
244+
Then we run the algorithm twice WITH the same seed - we will see the same output.
245+
""")
242246

243-
algo2 = desbordante.rfd.algorithms.GaRfd()
244-
algo2.load_data(table=(TABLE, ',', False))
247+
# Two runs without seed
248+
print("--- Two runs without seed (results may vary) ---")
249+
for run in [1, 2]:
250+
algo_noseed = desbordante.rfd.algorithms.GaRfd()
251+
algo_noseed.load_data(table=(DATA_PATH, ',', True))
252+
algo_noseed.set_option('minconf', 0.6)
253+
algo_noseed.set_option('max_generations', 100)
254+
# no seed set
255+
algo_noseed.execute()
256+
res = algo_noseed.get_rfds()
257+
print(f"Run {run}: {len(res)} RFD(s)")
258+
for rfd in sorted(res, key=lambda r: (r.rhs_index, r.lhs_mask)):
259+
print(f" {rfd}")
260+
261+
# Two runs with seed = 42
262+
print("\n--- Two runs with seed = 42 (results must be identical) ---")
263+
for run in [1, 2]:
264+
algo_seed = desbordante.rfd.algorithms.GaRfd()
265+
algo_seed.load_data(table=(DATA_PATH, ',', True))
266+
algo_seed.set_metrics([eq, eq, eq])
267+
algo_seed.set_option('minconf', 0.6)
268+
algo_seed.set_option('max_generations', 100)
269+
algo_seed.set_option('seed', 42)
270+
algo_seed.execute()
271+
res = algo_seed.get_rfds()
272+
print(f"Run {run}: {len(res)} RFD(s)")
273+
for rfd in sorted(res, key=lambda r: (r.rhs_index, r.lhs_mask)):
274+
print(f" {rfd}")
245275

246-
algo2.set_metrics_py([jaccard_sim, lev, lev, jaccard_sim, eq]) # ← set_metrics_py!
247-
algo2.set_option('rfd_min_similarity', 0.8)
248-
algo2.set_option('minconf', 0.9)
249-
algo2.set_option('population_size', 10)
250-
algo2.set_option('rfd_max_generations', 10)
251-
algo2.set_option('seed', 42)
252-
algo2.execute()
276+
print("""
277+
As you can see, the random runs may differ in the number and content of RFDs,
278+
while the seeded runs are perfectly reproducible.
279+
""")
253280

254-
rfds2 = algo2.get_rfds()
255-
print(f"С пользовательской метрикой найдено {len(rfds2)} зависимостей")
256-
for i, rfd in enumerate(rfds2):
257-
print(f"{i+1}. {rfd}")
281+
# ------------------------------------------------------------
282+
# Summary and next steps
283+
# ------------------------------------------------------------
284+
print("\n" + "=" * 70)
285+
print("Summary")
286+
print("=" * 70)
287+
print("""
288+
- Default settings give exact FDs;
289+
- Lower minconf => AFDs;
290+
- Lower min_similarity + absolute difference metric => RFDs;
291+
- You can pass any Python function as a custom metric;
292+
- Use seed for reproducibility, tune population/generations for better results.
293+
""")

src/core/algorithms/algorithm_types.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ BETTER_ENUM(AlgorithmType, char,
103103
/* Frequent subgraph mining algorithm */
104104
gspan,
105105

106-
/* RFD mining algorithms */
106+
/* RFD mining algorithm */
107107
garfd
108108
)
109109

src/core/algorithms/rfd/ga_rfd/ga_rfd.cpp

Lines changed: 14 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
#include "core/util/logger.h"
1919
#include "core/util/timed_invoke.h"
2020

21-
bool DEBUG = false;
21+
bool debug = false;
2222

2323
namespace {
2424

@@ -31,14 +31,6 @@ template <typename T>
3131
return std::bitset<32>(mask).to_string().substr(32 - num_bits);
3232
}
3333

34-
[[nodiscard]] inline int FirstSetBit(uint32_t x) {
35-
return x ? static_cast<int>(__builtin_ctz(x)) : -1;
36-
}
37-
38-
[[nodiscard]] std::string bit_representation(uint32_t mask, int num_bits = 31) {
39-
return std::bitset<32>(mask).to_string().substr(32 - num_bits);
40-
}
41-
4234
[[nodiscard]] inline int FirstSetBit(uint32_t x) noexcept {
4335
return x ? __builtin_ctz(x) : -1;
4436
}
@@ -134,10 +126,6 @@ void GaRfd::LoadDataInternal() {
134126
}
135127
num_rows_++;
136128
}
137-
res += "] -> " + std::to_string(rhs_index) + " (conf=" + std::to_string(confidence) +
138-
", supp=" + std::to_string(support) + ")";
139-
return res;
140-
}
141129

142130
if (num_rows_ < 2) [[unlikely]]
143131
throw std::runtime_error("Input table must contain at least 2 rows");
@@ -146,7 +134,7 @@ void GaRfd::LoadDataInternal() {
146134
throw std::runtime_error("Table too large, total pairs would overflow size_t");
147135
total_pairs_ = num_rows_ * (num_rows_ - 1) / 2;
148136

149-
if (DEBUG)
137+
if (debug)
150138
LOG_INFO("Loaded {} rows, {} attributes, {} total pairs", num_rows_, num_attrs_,
151139
total_pairs_);
152140

@@ -165,7 +153,7 @@ void GaRfd::BuildSimilarityBitsets() {
165153
if (!support_cache_) {
166154
support_cache_ = std::make_unique<util::LRUCache<uint32_t, std::size_t>>(cache_max_size_);
167155
}
168-
if (DEBUG)
156+
if (debug)
169157
LOG_INFO("BuildSimilarityBitsets: total_pairs_ = {}, num_attrs_ = {}, num_rows_ = {}",
170158
total_pairs_, num_attrs_, num_rows_);
171159
std::size_t const num_uint64_per_attr = (total_pairs_ + 63) / 64;
@@ -188,21 +176,21 @@ void GaRfd::BuildSimilarityBitsets() {
188176
bits[word_idx] |= (1ULL << bit_idx);
189177
}
190178
} catch (std::exception const& e) {
191-
if (DEBUG)
179+
if (debug)
192180
LOG_ERROR("Exception in Compare: attr={}, i={}, j={}, what={}", a, i, j,
193181
e.what());
194182
throw;
195183
} catch (...) {
196-
if (DEBUG)
184+
if (debug)
197185
LOG_ERROR("Unknown exception in Compare: attr={}, i={}, j={}", a, i, j);
198186
throw;
199187
}
200188
pair_idx++;
201189
}
202190
}
203-
if (DEBUG) LOG_INFO("Finished attribute {} similarity bitset", a);
191+
if (debug) LOG_INFO("Finished attribute {} similarity bitset", a);
204192
}
205-
if (DEBUG) LOG_INFO("Similarity bitsets built for {} attributes", num_attrs_);
193+
if (debug) LOG_INFO("Similarity bitsets built for {} attributes", num_attrs_);
206194
}
207195

208196
std::size_t GaRfd::ComputeSupport(uint32_t attrs_mask) const noexcept {
@@ -224,7 +212,7 @@ std::size_t GaRfd::ComputeSupport(uint32_t attrs_mask) const noexcept {
224212
std::size_t s = 0;
225213
for (uint64_t w : first_vec) s += std::popcount(w);
226214
support_cache_->put(attrs_mask, s);
227-
if (DEBUG)
215+
if (debug)
228216
LOG_DEBUG("Support for mask {} = {}", BitRepresentation(attrs_mask, num_attrs_), s);
229217
return s;
230218
}
@@ -249,7 +237,7 @@ std::size_t GaRfd::ComputeSupport(uint32_t attrs_mask) const noexcept {
249237
std::size_t support = 0;
250238
for (uint64_t w : buffer) support += std::popcount(w);
251239
support_cache_->put(attrs_mask, support);
252-
if (DEBUG)
240+
if (debug)
253241
LOG_DEBUG("Support for mask {} = {}", BitRepresentation(attrs_mask, num_attrs_), support);
254242
return support;
255243
}
@@ -454,13 +442,13 @@ std::unordered_set<RFD, RFDHash> GaRfd::Finalize(
454442
if (ind.confidence < eps_) continue;
455443
res.insert(RFD{ind.lhs_mask, ind.rhs_index, ind.support, ind.confidence});
456444
}
457-
if (DEBUG) LOG_DEBUG("Finalized {} unique RFDs", res.size());
445+
if (debug) LOG_DEBUG("Finalized {} unique RFDs", res.size());
458446
return res;
459447
}
460448

461449
unsigned long long GaRfd::ExecuteInternal() {
462450
return ::util::TimedInvoke([&]() {
463-
if (DEBUG) LOG_INFO("Build similarity bitsets...");
451+
if (debug) LOG_INFO("Build similarity bitsets...");
464452
BuildSimilarityBitsets();
465453

466454
std::mt19937 rng(seed_);
@@ -469,16 +457,16 @@ unsigned long long GaRfd::ExecuteInternal() {
469457
EvaluatePopulation(pop);
470458

471459
for (size_t gen = 0; gen < max_generations_; gen++) {
472-
if (DEBUG)
460+
if (debug)
473461
LOG_INFO("Generation {}/{} (pop size: {})", gen + 1, max_generations_, pop.size());
474462
if (AllOf(pop)) {
475-
if (DEBUG)
463+
if (debug)
476464
LOG_DEBUG("All individuals satisfy confidence threshold – stopping early");
477465
break;
478466
}
479467

480468
if (pop.empty()) [[unlikely]] {
481-
if (DEBUG) LOG_DEBUG("Population is empty, stopping evolution");
469+
if (debug) LOG_DEBUG("Population is empty, stopping evolution");
482470
break;
483471
}
484472

0 commit comments

Comments
 (0)