Skip to content

Commit bbaa84a

Browse files
committed
refactor: delete useless CFDRelationData::CreateFrom
1 parent b9d2441 commit bbaa84a

13 files changed

Lines changed: 123 additions & 8382 deletions

File tree

src/core/algorithms/cfd/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ target_sources(
1515
)
1616
target_link_libraries(
1717
${NAME} PRIVATE ${DESBORDANTE_PREFIX}::cfd::util ${DESBORDANTE_PREFIX}::model::table
18-
Boost::headers
18+
spdlog::spdlog_header_only Boost::headers
1919
)
2020

2121
set(NAME cfd)

src/core/algorithms/cfd/cfd_discovery.cpp

Lines changed: 2 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,11 @@ namespace algos::cfd {
1717
CFDDiscovery::CFDDiscovery() : Algorithm() {
1818
using namespace config::names;
1919
RegisterOptions();
20-
MakeOptionsAvailable({kTable, kCfdColumnsNumber, kCfdTuplesNumber});
20+
MakeOptionsAvailable({kTable});
2121
}
2222

2323
void CFDDiscovery::LoadDataInternal() {
24-
relation_ = CFDRelationData::CreateFrom(*input_table_, columns_number_, tuples_number_);
24+
relation_ = CFDRelationData::CreateFrom(*input_table_);
2525

2626
if (relation_->GetColumnData().empty()) {
2727
throw std::runtime_error("Got an empty .csv file: CFD mining is meaningless.");
@@ -36,32 +36,7 @@ void CFDDiscovery::ResetState() {
3636
void CFDDiscovery::RegisterOptions() {
3737
DESBORDANTE_OPTION_USING;
3838

39-
auto check_num_cols = [this](unsigned int val) {
40-
if (OptionIsSet(kCfdTuplesNumber) && (tuples_number_ != 0 && val == 0)) {
41-
throw config::ConfigurationError(
42-
"Tuples number and columns number options must be both 0 or both greater than "
43-
"0.");
44-
}
45-
};
46-
47-
auto check_num_tuples = [this](unsigned int val) {
48-
if (OptionIsSet(kCfdColumnsNumber) && (columns_number_ != 0 && val == 0)) {
49-
throw config::ConfigurationError(
50-
"Tuples number and columns number options must be both 0 or both greater than "
51-
"0.");
52-
}
53-
};
54-
5539
RegisterOption(config::kTableOpt(&input_table_));
56-
RegisterOption(
57-
Option{&columns_number_, kCfdColumnsNumber, kDCfdColumnsNumber, 0u}.SetValueCheck(
58-
std::move(check_num_cols)));
59-
RegisterOption(Option{&tuples_number_, kCfdTuplesNumber, kDCfdTuplesNumber, 0u}.SetValueCheck(
60-
std::move(check_num_tuples)));
61-
}
62-
63-
int CFDDiscovery::NrCfds() const {
64-
return static_cast<int>(cfd_list_.size());
6540
}
6641

6742
ItemsetCFDList const& CFDDiscovery::GetItemsetCfds() const {

src/core/algorithms/cfd/cfd_discovery.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,15 +26,12 @@ class CFDDiscovery : public Algorithm {
2626
protected:
2727
config::InputTable input_table_;
2828

29-
unsigned columns_number_;
30-
unsigned tuples_number_;
3129
ItemsetCFDList cfd_list_;
3230
std::shared_ptr<CFDRelationData> relation_;
3331

3432
public:
3533
explicit CFDDiscovery();
3634
void LoadDataInternal() final;
37-
int NrCfds() const;
3835
ItemsetCFDList const& GetItemsetCfds() const;
3936
CFDList GetCfds() const;
4037
std::string GetRelationString(char delim = ' ') const;

src/core/algorithms/cfd/fd_first_algorithm.cpp

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,7 @@ void FDFirstAlgorithm::RegisterOptions() {
3333
auto check_supp = [this](unsigned int val) {
3434
if (val == 0) {
3535
throw config::ConfigurationError("Minimum support must be more than 0.");
36-
} else if (columns_number_ != 0 && tuples_number_ != 0 &&
37-
val > tuples_number_) { // At this point, the columns_number_ and tuples_number_
38-
// options are guaranteed to be set.
36+
} else if (val > relation_->GetNumRows()) {
3937
throw config::ConfigurationError(
4038
"Minimum support must be less than or equal to the number of tuples.");
4139
}
@@ -167,7 +165,7 @@ void FDFirstAlgorithm::FdsFirstDFS() {
167165
free_itemsets_.insert(Itemset{a.item});
168166
}
169167
cand_store_ = PrefixTree<Itemset, Itemset>();
170-
store_[Itemset()] = PartitionTIdList(Iota(relation_->Size()));
168+
store_[Itemset()] = PartitionTIdList(Iota(relation_->GetNumRows()));
171169
cand_store_.Insert(Itemset(), all_attrs_);
172170
FdsFirstDFS(Itemset(), items, substrategy_);
173171
}
@@ -529,7 +527,7 @@ FDFirstAlgorithm::PIdListMiners FDFirstAlgorithm::GetPartitionSingletons() {
529527
attr_indices[dom[i]] = std::make_pair(a, i);
530528
}
531529
}
532-
for (size_t row = 0; row < relation_->Size(); row++) {
530+
for (size_t row = 0; row < relation_->GetNumRows(); row++) {
533531
auto const& tup = relation_->GetRow(row);
534532
for (int item : tup) {
535533
auto const& attr_node_ix = attr_indices.at(item);
@@ -542,7 +540,7 @@ FDFirstAlgorithm::PIdListMiners FDFirstAlgorithm::GetPartitionSingletons() {
542540
int attr_item = -1 - static_cast<int>(a);
543541
auto new_node = MinerNode<PartitionTIdList>(attr_item);
544542
auto const& dom = relation_->GetDomain(a);
545-
new_node.tids.tids.reserve(relation_->Size() + dom.size() - 1);
543+
new_node.tids.tids.reserve(relation_->GetNumRows() + dom.size() - 1);
546544
new_node.tids.sets_number = dom.size();
547545
for (unsigned i = 0; i < dom.size(); i++) {
548546
auto& ts = new_node.tids.tids;

src/core/algorithms/cfd/model/cfd_column_data.h

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
#include <utility>
44
#include <vector>
55

6+
#include <boost/unordered_map.hpp>
7+
68
#include "core/model/table/abstract_column_data.h"
79

810
// see algorithms/cfd/LICENSE
@@ -11,20 +13,31 @@ namespace algos::cfd {
1113

1214
// Column presentation class for CFDRelationData.
1315
class CFDColumnData : model::AbstractColumnData {
16+
public:
1417
using NumToken = int;
18+
using ItemDictionary = boost::unordered_map<std::string, int>;
1519

20+
private:
1621
std::vector<NumToken> values_;
22+
// maps a value string to corresponding item id
23+
ItemDictionary values_dict_;
1724

1825
public:
19-
CFDColumnData(Column const* col, std::vector<int> col_values)
20-
: AbstractColumnData(col), values_(std::move(col_values)) {}
26+
CFDColumnData(Column const* col, std::vector<int> col_values, ItemDictionary values_dict)
27+
: AbstractColumnData(col),
28+
values_(std::move(col_values)),
29+
values_dict_(std::move(values_dict)) {}
2130

2231
explicit CFDColumnData(Column const* col) : AbstractColumnData(col) {}
2332

2433
std::vector<NumToken> const& GetValues() const {
2534
return values_;
2635
}
2736

37+
ItemDictionary const& GetValueDict() const {
38+
return values_dict_;
39+
}
40+
2841
std::string ToString() const final {
2942
return "Cfd data for " + column_->ToString();
3043
}

src/core/algorithms/cfd/model/cfd_relation_data.cpp

Lines changed: 36 additions & 144 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#include <random>
77

88
#include "core/algorithms/cfd/util/set_util.h"
9+
#include "core/util/logger.h"
910

1011
// see algorithms/cfd/LICENSE
1112

@@ -15,148 +16,54 @@ size_t CFDRelationData::GetNumRows() const {
1516
return data_rows_.size();
1617
}
1718

18-
void CFDRelationData::AddNewItemsInFullTable(ItemDictionary& item_dictionary,
19-
ColumnsValuesDict& columns_values_dict,
20-
std::vector<ItemInfo>& items,
21-
std::vector<std::string> const& string_row,
22-
std::vector<int>& int_row,
23-
std::vector<Transaction>& data_rows,
24-
int& unique_elems_number, unsigned num_columns) {
25-
int it;
26-
for (size_t i = 0; i < num_columns; i++) {
27-
auto ptr = item_dictionary.find(std::make_pair(i, string_row[i]));
28-
if (ptr != item_dictionary.end()) {
29-
it = ptr->second;
30-
} else {
31-
items.emplace_back(string_row[i], i);
32-
columns_values_dict[static_cast<AttributeIndex>(i)].push_back(unique_elems_number);
33-
item_dictionary[std::make_pair(i, string_row[i])] = unique_elems_number;
34-
it = unique_elems_number++;
35-
}
36-
int_row[i] = it;
37-
items[it - 1].frequency++;
38-
}
39-
data_rows.push_back(int_row);
40-
}
19+
std::unique_ptr<CFDRelationData> CFDRelationData::CreateFrom(model::IDatasetStream& data_stream) {
20+
auto schema = std::make_unique<RelationalSchema>(data_stream.GetRelationName());
21+
size_t num_columns = data_stream.GetNumberOfColumns();
22+
std::vector<CFDColumnData::ItemDictionary> item_dictionaries(num_columns);
23+
std::vector<std::vector<int>> columns_values(num_columns);
4124

42-
std::unique_ptr<CFDRelationData> CFDRelationData::CreateFrom(model::IDatasetStream& parser,
43-
unsigned columns_number,
44-
unsigned tuples_number,
45-
double c_sample, double r_sample) {
46-
if (columns_number == 0 || tuples_number == 0) {
47-
return CFDRelationData::CreateFrom(parser, c_sample, r_sample);
48-
}
49-
50-
// Fields of CFDRelationData class
51-
auto schema = std::make_unique<RelationalSchema>(parser.GetRelationName());
5225
std::vector<Transaction> data_rows;
53-
ItemDictionary item_dictionary;
5426
std::vector<ItemInfo> items;
55-
ColumnsValuesDict columns_values_dict;
56-
int unique_elems_number = 1;
57-
58-
unsigned num_columns = parser.GetNumberOfColumns();
59-
std::vector<std::string> line;
60-
num_columns = std::min(num_columns, columns_number);
61-
std::vector<std::string> string_row(num_columns);
62-
while (parser.HasNextRow() && data_rows.size() < tuples_number) {
63-
line = parser.GetNextRow();
64-
std::vector<int> int_row(num_columns);
65-
for (size_t k = 0; k < num_columns; k++) {
66-
string_row[k] = line[k];
67-
}
68-
69-
AddNewItemsInFullTable(item_dictionary, columns_values_dict, items, string_row, int_row,
70-
data_rows, unique_elems_number, num_columns);
71-
}
7227

73-
std::vector<CFDColumnData> column_data;
74-
for (AttributeIndex i = 0; static_cast<size_t>(i) < num_columns; ++i) {
75-
auto column = Column(schema.get(), parser.GetColumnName(i), i);
76-
schema->AppendColumn(std::move(column));
77-
column_data.emplace_back(schema->GetColumn(i), columns_values_dict[i]);
78-
}
28+
std::vector<std::string> row;
29+
int next_value_id = 1;
7930

80-
return std::make_unique<CFDRelationData>(std::move(schema), std::move(column_data),
81-
std::move(data_rows), std::move(item_dictionary),
82-
std::move(items));
83-
}
31+
while (data_stream.HasNextRow()) {
32+
row = data_stream.GetNextRow();
8433

85-
void CFDRelationData::AddNewItemsInPartialTable(ItemDictionary& item_dictionary,
86-
ColumnsValuesDict& columns_values_dict,
87-
std::vector<ItemInfo>& items,
88-
std::vector<std::string> const& string_row,
89-
std::vector<int> const& columns_numbers_list,
90-
std::vector<Transaction>& data_rows,
91-
int& unique_elems_number, int size) {
92-
std::vector<int> int_row(size);
93-
AttributeIndex j = 0;
94-
int it;
95-
for (size_t i = 0; i < string_row.size(); i++) {
96-
if (!std::binary_search(columns_numbers_list.begin(), columns_numbers_list.end(), i)) {
34+
if (row.size() != num_columns) {
35+
LOG_WARN(
36+
"Unexpected number of columns for a row, "
37+
"skipping (expected {}, got {})",
38+
num_columns, row.size());
9739
continue;
9840
}
99-
auto ptr = item_dictionary.find(std::make_pair(j, string_row[i]));
100-
if (ptr != item_dictionary.end()) {
101-
it = ptr->second;
102-
} else {
103-
items.emplace_back(string_row[i], j);
104-
columns_values_dict[j].push_back(unique_elems_number);
105-
item_dictionary[std::make_pair(j, string_row[i])] = unique_elems_number;
106-
it = unique_elems_number++;
107-
}
108-
items[it - 1].frequency++;
109-
int_row[j] = it;
110-
j++;
111-
}
112-
if (j > 0) {
113-
data_rows.push_back(int_row);
114-
}
115-
}
11641

117-
std::unique_ptr<CFDRelationData> CFDRelationData::CreateFrom(model::IDatasetStream& file_input,
118-
double c_sample, double r_sample) {
119-
// Fields of CFDRelationData class
120-
auto schema = std::make_unique<RelationalSchema>(file_input.GetRelationName());
121-
std::vector<Transaction> data_rows;
122-
ItemDictionary item_dictionary;
123-
std::vector<ItemInfo> items;
124-
ColumnsValuesDict columns_values_dict;
125-
int unique_elems_number = 1;
126-
std::random_device rd; // only used once to initialise (seed) engine
127-
std::mt19937 rng(rd()); // random-number engine used (Mersenne-Twister in this case)
128-
std::uniform_real_distribution<double> uni(0.0, 1.0); // guaranteed unbiased
129-
int num_columns = static_cast<int>(file_input.GetNumberOfColumns());
130-
std::vector<std::string> line;
131-
std::vector<int> columns_numbers_list = Range(0, num_columns);
132-
int size = static_cast<int>(static_cast<double>(columns_numbers_list.size()) * c_sample);
133-
Shuffle(columns_numbers_list);
134-
columns_numbers_list =
135-
std::vector<int>(columns_numbers_list.begin(), columns_numbers_list.begin() + size);
136-
std::sort(columns_numbers_list.begin(), columns_numbers_list.end());
137-
while (file_input.HasNextRow()) {
138-
if (uni(rng) >= r_sample) {
139-
continue;
140-
}
141-
line = file_input.GetNextRow();
42+
std::vector<int> row_data(num_columns);
14243

143-
AddNewItemsInPartialTable(item_dictionary, columns_values_dict, items, line,
144-
columns_numbers_list, data_rows, unique_elems_number, size);
44+
for (size_t index = 0; index < num_columns; ++index) {
45+
auto [it, inserted] = item_dictionaries[index].try_emplace(row[index], next_value_id);
46+
47+
if (inserted) {
48+
items.emplace_back(std::move(row[index]), index);
49+
columns_values[index].push_back(next_value_id++);
50+
}
51+
52+
row_data[index] = it->second;
53+
items[it->second - 1].frequency++;
54+
}
55+
data_rows.push_back(std::move(row_data));
14556
}
14657

14758
std::vector<CFDColumnData> column_data;
148-
for (AttributeIndex i = 0; i < num_columns; ++i) {
149-
auto column = Column(schema.get(), file_input.GetColumnName(i), i);
59+
for (size_t i = 0; i < num_columns; ++i) {
60+
auto column = Column(schema.get(), data_stream.GetColumnName(i), i);
15061
schema->AppendColumn(std::move(column));
151-
column_data.emplace_back(schema->GetColumn(i), columns_values_dict[i]);
62+
column_data.emplace_back(schema->GetColumn(i), std::move(columns_values[i]),
63+
std::move(item_dictionaries[i]));
15264
}
15365
return std::make_unique<CFDRelationData>(std::move(schema), std::move(column_data),
154-
std::move(data_rows), std::move(item_dictionary),
155-
std::move(items));
156-
}
157-
158-
unsigned CFDRelationData::Size() const {
159-
return data_rows_.size();
66+
std::move(data_rows), std::move(items));
16067
}
16168

16269
unsigned CFDRelationData::GetAttrsNumber() const {
@@ -204,25 +111,10 @@ int CFDRelationData::GetAttr(std::string const& s) const {
204111
}
205112

206113
int CFDRelationData::GetItem(int attr, std::string const& str_value) const {
207-
auto it = item_dictionary_.find(std::make_pair(attr, str_value));
208-
if (it != item_dictionary_.end()) {
209-
return it->second;
210-
} else {
211-
return -1;
212-
}
213-
}
214-
215-
void CFDRelationData::Sort() {
216-
std::sort(data_rows_.begin(), data_rows_.end(),
217-
[](std::vector<int> const& a, std::vector<int> const& b) {
218-
return std::lexicographical_compare(a.begin(), b.begin(), a.end(), b.end());
219-
});
220-
}
114+
auto const& item_dict = column_data_.at(attr).GetValueDict();
115+
auto it = item_dict.find(str_value);
221116

222-
void CFDRelationData::ToFront(SimpleTIdList const& tids) {
223-
for (size_t i = 0; i < tids.size(); i++) {
224-
std::swap(data_rows_[i], data_rows_[tids[i]]);
225-
}
117+
return it != item_dict.end() ? it->second : -1;
226118
}
227119

228120
unsigned CFDRelationData::Frequency(int i) const {

0 commit comments

Comments
 (0)