Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
1fbc6dc
Implement FastDD
MichaelS239 Apr 6, 2025
5a51287
Add main target
MichaelS239 Dec 27, 2025
cec626b
Refactor algorithm for more general definition
MichaelS239 Feb 21, 2026
97e92bc
Add children bitset
MichaelS239 Mar 24, 2026
a9b4cc1
Use std::vector instead of std::unordered_map
MichaelS239 Mar 24, 2026
e126b3a
Add basic path compression
MichaelS239 Mar 24, 2026
cfdf476
Use std::vector instead of std::unordered_set
MichaelS239 Mar 26, 2026
92c4a75
Use ForEach instead of iterator
MichaelS239 Mar 26, 2026
d626480
Move MinimizeDifferentialSet() to HybridEvidenceInverter
MichaelS239 Apr 3, 2026
c224fd1
Small optimizations (sort + reserve)
MichaelS239 Apr 5, 2026
99a37b6
Implement MMCS algorithm for hitting set enumeration
MichaelS239 Apr 5, 2026
a8be4cd
Use std::unordered_set for building DiffSet
MichaelS239 Apr 5, 2026
86e03e0
Optimize SetNumMask in SingleISNBuilder
MichaelS239 Apr 6, 2026
88cdc27
Use boost::unordered::unordered_flat_set for clues
MichaelS239 Apr 6, 2026
ee8ed5c
Add custom dynamic bitset versions
MichaelS239 Dec 7, 2025
ed8df42
Choose bitset type in the algorithm
MichaelS239 Apr 7, 2026
d89b017
Optimize RemoveTransitive
MichaelS239 May 4, 2026
f86bb36
Fallback to bitsets when ISNs overflow
MichaelS239 May 5, 2026
bea0e0d
Optimize MinMaxDifCalculator
MichaelS239 May 5, 2026
02863bf
Set default shard length to 800
MichaelS239 May 5, 2026
da6b928
Treat mixed-type value as strings
MichaelS239 May 9, 2026
2cc0ca0
Move optimized Levenshtein distance to util
MichaelS239 May 10, 2026
c2747d7
Use optimized version of Levenshtein distance
MichaelS239 May 9, 2026
065314e
Move min_max_dif calculation to DiffSetBuilder
MichaelS239 May 13, 2026
fb6eae1
Optimize `is_subset_of` in `StaticBitset`
MichaelS239 May 15, 2026
27e006d
Optimize MinimizeTree
MichaelS239 May 15, 2026
a095dfe
Optimize removal of trivial DDs
MichaelS239 May 16, 2026
0999607
Cleanup
MichaelS239 May 24, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,7 @@ endif()

if(DESBORDANTE_GDB_SYMBOLS)
add_compile_options(-ggdb3)
add_compile_options(-gdwarf)
endif()

add_compile_options("$<$<CONFIG:Debug>:${DEBUG_BUILD_OPTS}>")
Expand Down
2 changes: 2 additions & 0 deletions build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ Possible options:
EOF
}

GDB_DEBUG=true

# TODO: use getopts or something else instead of bash
for i in "$@"; do
case $i in
Expand Down
1 change: 1 addition & 0 deletions src/core/algorithms/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ target_link_libraries(
${DESBORDANTE_PREFIX}::cfd::verifier
${DESBORDANTE_PREFIX}::cind::verifier
${DESBORDANTE_PREFIX}::datastats
${DESBORDANTE_PREFIX}::dd::fastdd
${DESBORDANTE_PREFIX}::dd::split
${DESBORDANTE_PREFIX}::fd::aid
${DESBORDANTE_PREFIX}::fd::depminer
Expand Down
3 changes: 2 additions & 1 deletion src/core/algorithms/algorithm_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ using AlgorithmTypes =
fd_verifier::FDVerifier, HyUCC, PyroUCC, HPIValid, cfd::FDFirstAlgorithm,
ACAlgorithm, UCCVerifier, Faida, Spider, Mind, INDVerifier, cind::CINDVerifier,
Fastod, GfdValidator, EGfdValidator, NaiveGfdValidator, order::Order, dd::Split,
Cords, hymd::HyMD, PFDVerifier, cfd_verifier::CFDVerifier,
dd::FastDD, Cords, hymd::HyMD, PFDVerifier, cfd_verifier::CFDVerifier,
ar_verifier::ARVerifier, GSpan>;

// clang-format off
Expand Down Expand Up @@ -91,6 +91,7 @@ enum class AlgorithmType : char {

/* Differential dependencies mining algorithm */
kSplit,
kFastDD,

/* SFD mining algorithm */
kCords,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include <stdexcept>

#include "core/algorithms/dc/FastADC/model/predicate.h"
#include "core/util/logger.h"

namespace algos::fastadc {

Expand Down
1 change: 1 addition & 0 deletions src/core/algorithms/dd/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
add_subdirectory(fastdd)
add_subdirectory(split)
add_subdirectory(dd_verifier)

Expand Down
35 changes: 35 additions & 0 deletions src/core/algorithms/dd/dd_algorithm.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#pragma once

#include <list>
#include <utility>
#include <vector>

#include "core/algorithms/algorithm.h"
#include "core/algorithms/dd/dd.h"
#include "core/util/primitive_collection.h"

namespace algos::dd {

/// Base class for differential dependency (DD) mining algorithms.
///
/// Owns the collection of mined dependencies: derived algorithms add results
/// through RegisterDD() and callers read them back through DDList().
class DDAlgorithm : public Algorithm {
public:
    /// Returns the dependencies registered so far, as a list.
    std::list<model::DDString> const& DDList() const noexcept {
        return dd_collection_.AsList();
    }

protected:
    /// Adds a mined dependency to the result collection (the argument is moved in).
    void RegisterDD(model::DDString dd_to_register) {
        dd_collection_.Register(std::move(dd_to_register));
    }

private:
    /// Hook for derived classes; invoked by ResetState() right after the
    /// result collection has been cleared.
    virtual void ResetStateDD() = 0;

    /// Clears the stored dependencies, then lets the derived class reset itself.
    void ResetState() override {
        dd_collection_.Clear();
        ResetStateDD();
    }

    /// Storage for the dependencies registered via RegisterDD().
    util::PrimitiveCollection<model::DDString> dd_collection_;
};

} // namespace algos::dd
34 changes: 34 additions & 0 deletions src/core/algorithms/dd/fastdd/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Build description for the FastDD differential dependency mining algorithm.
# Three targets are declared: supporting structures, the algorithm library and
# a standalone runner executable.

# --- Supporting structures -------------------------------------------------
# Sources for PLI shards, tree search and helper utilities, registered through
# desbordante_add_lib with the OBJECT keyword.
# NOTE(review): `NAME` is passed literally rather than as `${NAME}`; presumably
# desbordante_add_lib dereferences the variable itself - confirm against its definition.
set(NAME dd.fastdd.structs)
desbordante_add_lib(NAME OBJECT)
target_sources(
${NAME}
PRIVATE model/pli_shard.cpp trees/translating_tree_search.cpp
util/differential_function_builder.cpp util/isn_info.cpp util/threshold_calculator.cpp
)
# magic_enum is PUBLIC, so it propagates to targets linking against this one;
# spdlog and Boost headers stay private to this target.
target_link_libraries(${NAME} PUBLIC magic_enum::magic_enum)
target_link_libraries(${NAME} PRIVATE spdlog::spdlog_header_only Boost::headers)

# --- Algorithm library -----------------------------------------------------
# The FastDD algorithm itself (fastdd.cpp), linked against the structures
# target declared above and the project libraries it relies on.
set(NAME dd.fastdd)
desbordante_add_lib(NAME)
target_sources(${NAME} PRIVATE fastdd.cpp)
target_link_libraries(
${NAME}
PRIVATE ${DESBORDANTE_PREFIX}::dd::fastdd::structs
${DESBORDANTE_PREFIX}::dd
${DESBORDANTE_PREFIX}::config
${DESBORDANTE_PREFIX}::model::table
${DESBORDANTE_PREFIX}::model::types
spdlog::spdlog_header_only
${DESBORDANTE_PREFIX}::algos
magic_enum::magic_enum
Boost::headers
)

# --- Standalone runner -----------------------------------------------------
# Executable built from main.cpp and linked against the FastDD library.
# LINK_LIBRARIES_ONLY_TARGETS makes CMake reject link items that are not targets.
set(NAME dd.fastdd_run)
add_executable(${NAME})
set_target_properties(${NAME} PROPERTIES LINK_LIBRARIES_ONLY_TARGETS ON)
target_sources(${NAME} PRIVATE main.cpp)
target_link_libraries(
${NAME} PRIVATE ${DESBORDANTE_PREFIX}::dd::fastdd spdlog::spdlog_header_only
magic_enum::magic_enum Boost::headers
)
192 changes: 192 additions & 0 deletions src/core/algorithms/dd/fastdd/fastdd.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
#include "core/algorithms/dd/fastdd/fastdd.h"

#include <algorithm>
#include <chrono>
#include <cstddef>
#include <iterator>
#include <limits>
#include <list>
#include <memory>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

#include "core/algorithms/dd/fastdd/model/pli_shard.h"
#include "core/algorithms/dd/fastdd/util/differential_function_builder.h"
#include "core/algorithms/dd/fastdd/util/distance_calculator.h"
#include "core/algorithms/dd/fastdd/util/static_bitset.h"
#include "core/algorithms/dd/fastdd/util/threshold_calculator.h"
#include "core/config/names_and_descriptions.h"
#include "core/config/option_using.h"
#include "core/config/tabular_data/input_table/option.h"
#include "core/model/table/column_index.h"
#include "core/util/logger.h"

namespace algos::dd {

/// Registers all options of the algorithm; only the input-table option is made
/// available at construction time.
FastDD::FastDD() : DDAlgorithm() {
    RegisterOptions();

    auto const table_option_name = config::kTableOpt.GetName();
    MakeOptionsAvailable({table_option_name});
}

/// Declares every option FastDD understands, together with its default value.
void FastDD::RegisterOptions() {
    DESBORDANTE_OPTION_USING;

    // Data source to mine.
    RegisterOption(config::kTableOpt(&input_table_));

    // Optional difference table; the default is a default-constructed (absent) table.
    config::InputTable absent_difference_table;
    RegisterOption(Option{&difference_table_, kDifferenceTable, kDDifferenceTable,
                          absent_difference_table});

    // Row/column limits: SetLimits() replaces a value of 0 with the full table size.
    RegisterOption(Option{&num_rows_, kNumRows, kDNumRows, 0U});
    RegisterOption(Option{&num_columns_, kNumColumns, kDNumColumns, 0U});

    // Shard length handed to PliShardBuilder during execution.
    RegisterOption(Option{&shard_length_, kShardLength, kDShardLength, 800U});
}

/// Makes the execution-stage options (difference table, row/column limits and
/// shard length) available.
void FastDD::MakeExecuteOptsAvailable() {
    using config::names::kDifferenceTable;
    using config::names::kNumColumns;
    using config::names::kNumRows;
    using config::names::kShardLength;

    MakeOptionsAvailable({kDifferenceTable, kNumRows, kNumColumns, kShardLength});
}

/// Reads the input table into a typed, column-oriented relation.
///
/// @throws std::runtime_error if the resulting relation has no columns.
void FastDD::LoadDataInternal() {
    // nulls are ignored
    // NOTE(review): the meaning of the two boolean flags is defined by
    // ColumnLayoutTypedRelationData::CreateFrom - confirm against its declaration.
    typed_relation_ =
            model::ColumnLayoutTypedRelationData::CreateFrom(*input_table_, false, true);

    bool const has_no_columns = typed_relation_->GetColumnData().empty();
    if (has_no_columns) {
        throw std::runtime_error("Got an empty dataset: DD mining is meaningless.");
    }
}

void FastDD::SetLimits() {
unsigned all_rows_num = typed_relation_->GetNumRows();
model::ColumnIndex all_columns_num = typed_relation_->GetNumColumns();
if (num_rows_ > all_rows_num) {
throw std::invalid_argument(
"'num_rows' must be less or equal to the number of rows in the table (total "
"rows: " +
std::to_string(all_rows_num) + ")");
}
if (num_columns_ > all_columns_num) {
throw std::invalid_argument(
"'num_columns' must be less or equal to the number of columns in the table (total "
"columns: " +
std::to_string(all_columns_num) + ")");
}
if (num_rows_ == 0) num_rows_ = all_rows_num;
if (num_columns_ == 0) num_columns_ = all_columns_num;
}

void FastDD::CheckTypes() {
type_ids_.resize(num_columns_, model::TypeId::kUndefined);
for (model::ColumnIndex column_index = 0; column_index < num_columns_; column_index++) {
model::TypedColumnData const& column = typed_relation_->GetColumnData(column_index);
model::TypeId type_id = column.GetTypeId();

if (type_id == model::TypeId::kUndefined) {
throw std::invalid_argument("Column with index \"" + std::to_string(column_index) +
"\" type undefined.");
}
if (type_id == model::TypeId::kMixed) {
LOG_WARN(
"Column with index \"{}\" contains values of different types. Those values "
"will be "
"treated as strings.",
column_index);
}

type_ids_[column_index] = type_id;

for (std::size_t row_index = 0; row_index < num_rows_; row_index++) {
if (column.IsNull(row_index)) {
throw std::runtime_error("Some of the value coordinates are nulls.");
}
if (column.IsEmpty(row_index)) {
throw std::runtime_error("Some of the value coordinates are empty.");
}
}
}
}

/// Loads the optional difference table and validates its width.
///
/// Does nothing when no difference table was supplied. Otherwise the table is
/// read into `difference_typed_relation_`, and its column count must match
/// `num_columns_` (already resolved by SetLimits()).
///
/// @throws std::invalid_argument if the difference table's column count
///         differs from `num_columns_`.
void FastDD::ParseDifferenceTable() {
    if (!difference_table_) {
        return;
    }

    difference_typed_relation_ =
            model::ColumnLayoutTypedRelationData::CreateFrom(*difference_table_,
                                                             false);  // nulls are ignored

    // Validate the table that was just parsed. The previous code inspected
    // `typed_relation_` (the input table), so the difference table was never
    // checked and runs with a reduced 'num_columns' were rejected by mistake.
    if (difference_typed_relation_->GetNumColumns() != num_columns_) {
        throw std::invalid_argument(
                "The number of columns in the difference table must be equal to the number of "
                "columns in the loaded table or to 'num_columns' if specified");
    }
}

unsigned long long FastDD::ExecuteInternal() {
auto const start_time = std::chrono::system_clock::now();
LOG_INFO("Start");

SetLimits();
CheckTypes();
ParseDifferenceTable();

std::shared_ptr<DistanceCalculator> distance_calculator =
std::make_shared<DistanceCalculator>(typed_relation_);
ThresholdCalculator threshold_calculator(difference_typed_relation_, distance_calculator,
num_rows_, num_columns_);
std::vector<std::vector<model::DFConstraint>> const& thresholds =
threshold_calculator.GetThresholds();
distance_calculator->SetMaxThresholds(thresholds);

PliShardBuilder pli_shard_builder(shard_length_);
std::vector<PliShard> pli_shards =
pli_shard_builder.BuildPliShards(typed_relation_->GetColumnData());
LOG_INFO("Built PLIs");
LOG_DEBUG("Number of PLI shards: {}", pli_shards.size());

DifferentialFunctionBuilder df_builder(typed_relation_, num_columns_);
df_builder.BuildDFList(thresholds);
LOG_INFO("Built DF set");
LOG_INFO("Initial search space size: {}", df_builder.GetDifFuncNum());

auto elapsed_milliseconds = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::system_clock::now() - start_time);
LOG_DEBUG("Current time: {}", elapsed_milliseconds.count());

std::size_t dif_func_num = df_builder.GetDifFuncNum();

if (dif_func_num <= 32) {
RunAlgo<StaticBitset<32>>(df_builder, distance_calculator, std::move(pli_shards),
start_time);
} else if (dif_func_num <= 64) {
RunAlgo<StaticBitset<64>>(df_builder, distance_calculator, std::move(pli_shards),
start_time);
} else if (dif_func_num <= 128) {
RunAlgo<StaticBitset<128>>(df_builder, distance_calculator, std::move(pli_shards),
start_time);
} else {
RunAlgo<boost::dynamic_bitset<>>(df_builder, distance_calculator, std::move(pli_shards),
start_time);
}

std::ranges::for_each(dds_, [this](auto const& dd) {
auto df_to_constraint = [](DifferentialFunction const& df) {
return model::DFStringConstraint{df.GetColumn()->GetName(), df.GetConstraint()};
};

std::list<model::DFStringConstraint> lhs;
std::ranges::transform(dd.GetLhs(), std::back_inserter(lhs), df_to_constraint);

std::list<model::DFStringConstraint> rhs = {df_to_constraint(dd.GetRhs())};

RegisterDD(model::DDString{std::move(lhs), std::move(rhs)});
});

LOG_INFO("Built DDs: {}", DDList().size());
if (DDList().size() <= 100) {
for (auto const& dd : DDList()) {
LOG_DEBUG(dd.ToString());
}
}

elapsed_milliseconds = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::system_clock::now() - start_time);
LOG_INFO("Algorithm time: {}", elapsed_milliseconds.count());
return elapsed_milliseconds.count();
}

} // namespace algos::dd
Loading
Loading