66#include < random>
77
88#include " core/algorithms/cfd/util/set_util.h"
9+ #include " core/util/logger.h"
910
1011// see algorithms/cfd/LICENSE
1112
@@ -15,148 +16,54 @@ size_t CFDRelationData::GetNumRows() const {
1516 return data_rows_.size ();
1617}
1718
// (All lines of this helper carry the `-` marker: it is deleted by this change.)
// Encodes one row of strings as integer item ids and appends it to data_rows.
// For every column i < num_columns the pair (i, string_row[i]) is looked up in
// item_dictionary; an unseen pair gets the current unique_elems_number as its
// id, a new entry in items, and the id is recorded in columns_values_dict[i].
// The id is written to int_row[i] and the frequency of the matching item is
// incremented.
18- void CFDRelationData::AddNewItemsInFullTable (ItemDictionary& item_dictionary,
19- ColumnsValuesDict& columns_values_dict,
20- std::vector<ItemInfo>& items,
21- std::vector<std::string> const & string_row,
22- std::vector<int >& int_row,
23- std::vector<Transaction>& data_rows,
24- int & unique_elems_number, unsigned num_columns) {
25- int it;
26- for (size_t i = 0 ; i < num_columns; i++) {
27- auto ptr = item_dictionary.find (std::make_pair (i, string_row[i]));
28- if (ptr != item_dictionary.end ()) {
29- it = ptr->second ;
30- } else {
31- items.emplace_back (string_row[i], i);
32- columns_values_dict[static_cast <AttributeIndex>(i)].push_back (unique_elems_number);
33- item_dictionary[std::make_pair (i, string_row[i])] = unique_elems_number;
// Post-increment: this item takes the current id, the counter moves on.
34- it = unique_elems_number++;
35- }
36- int_row[i] = it;
// items is indexed with id - 1; the visible callers start the counter at 1.
37- items[it - 1 ].frequency ++;
38- }
39- data_rows.push_back (int_row);
40- }
// Lines marked `+` (together with the unmarked context lines) form the new
// single-argument CreateFrom; lines marked `-` are the code it replaces.
//
// New CreateFrom: reads every row from data_stream and builds a
// CFDRelationData. Each distinct string seen in a column is given an integer
// id taken from one counter (next_value_id, starting at 1) that is shared by
// all columns; each column keeps its own string -> id dictionary and its own
// list of ids. Rows are stored in data_rows as vectors of these ids.
19+ std::unique_ptr<CFDRelationData> CFDRelationData::CreateFrom (model::IDatasetStream& data_stream) {
20+ auto schema = std::make_unique<RelationalSchema>(data_stream.GetRelationName ());
21+ size_t num_columns = data_stream.GetNumberOfColumns ();
22+ std::vector<CFDColumnData::ItemDictionary> item_dictionaries (num_columns);
23+ std::vector<std::vector<int >> columns_values (num_columns);
4124
// Removed overload: kept at most columns_number leading columns and at most
// tuples_number rows; if either limit was 0 it delegated to the sampling
// overload further below.
42- std::unique_ptr<CFDRelationData> CFDRelationData::CreateFrom (model::IDatasetStream& parser,
43- unsigned columns_number,
44- unsigned tuples_number,
45- double c_sample, double r_sample) {
46- if (columns_number == 0 || tuples_number == 0 ) {
47- return CFDRelationData::CreateFrom (parser, c_sample, r_sample);
48- }
49-
50- // Fields of CFDRelationData class
51- auto schema = std::make_unique<RelationalSchema>(parser.GetRelationName ());
5225 std::vector<Transaction> data_rows;
53- ItemDictionary item_dictionary;
5426 std::vector<ItemInfo> items;
55- ColumnsValuesDict columns_values_dict;
56- int unique_elems_number = 1 ;
57-
58- unsigned num_columns = parser.GetNumberOfColumns ();
59- std::vector<std::string> line;
60- num_columns = std::min (num_columns, columns_number);
61- std::vector<std::string> string_row (num_columns);
62- while (parser.HasNextRow () && data_rows.size () < tuples_number) {
63- line = parser.GetNextRow ();
64- std::vector<int > int_row (num_columns);
65- for (size_t k = 0 ; k < num_columns; k++) {
66- string_row[k] = line[k];
67- }
68-
69- AddNewItemsInFullTable (item_dictionary, columns_values_dict, items, string_row, int_row,
70- data_rows, unique_elems_number, num_columns);
71- }
7227
73- std::vector<CFDColumnData> column_data;
74- for (AttributeIndex i = 0 ; static_cast <size_t >(i) < num_columns; ++i) {
75- auto column = Column (schema.get (), parser.GetColumnName (i), i);
76- schema->AppendColumn (std::move (column));
77- column_data.emplace_back (schema->GetColumn (i), columns_values_dict[i]);
78- }
28+ std::vector<std::string> row;
// Ids start at 1, so the item with id k is stored at items[k - 1].
29+ int next_value_id = 1 ;
7930
80- return std::make_unique<CFDRelationData>(std::move (schema), std::move (column_data),
81- std::move (data_rows), std::move (item_dictionary),
82- std::move (items));
83- }
31+ while (data_stream.HasNextRow ()) {
32+ row = data_stream.GetNextRow ();
8433
// Removed helper: same encoding as AddNewItemsInFullTable, but only for the
// source columns present in columns_numbers_list (searched with
// std::binary_search); j counts the columns kept so far and the row is stored
// only when j > 0.
85- void CFDRelationData::AddNewItemsInPartialTable (ItemDictionary& item_dictionary,
86- ColumnsValuesDict& columns_values_dict,
87- std::vector<ItemInfo>& items,
88- std::vector<std::string> const & string_row,
89- std::vector<int > const & columns_numbers_list,
90- std::vector<Transaction>& data_rows,
91- int & unique_elems_number, int size) {
92- std::vector<int > int_row (size);
93- AttributeIndex j = 0 ;
94- int it;
95- for (size_t i = 0 ; i < string_row.size (); i++) {
96- if (!std::binary_search (columns_numbers_list.begin (), columns_numbers_list.end (), i)) {
// New code: a row whose field count differs from num_columns is logged and
// skipped. The `continue` / `}` context lines below are shared by the old
// and the new condition.
34+ if (row.size () != num_columns) {
35+ LOG_WARN (
36+ " Unexpected number of columns for a row, "
37+ " skipping (expected {}, got {})" ,
38+ num_columns, row.size ());
9739 continue ;
9840 }
99- auto ptr = item_dictionary.find (std::make_pair (j, string_row[i]));
100- if (ptr != item_dictionary.end ()) {
101- it = ptr->second ;
102- } else {
103- items.emplace_back (string_row[i], j);
104- columns_values_dict[j].push_back (unique_elems_number);
105- item_dictionary[std::make_pair (j, string_row[i])] = unique_elems_number;
106- it = unique_elems_number++;
107- }
108- items[it - 1 ].frequency ++;
109- int_row[j] = it;
110- j++;
111- }
112- if (j > 0 ) {
113- data_rows.push_back (int_row);
114- }
115- }
11641
// Removed overload: picked a random subset of columns (c_sample fraction of
// the column count, via Shuffle, then sorted) and drew a random number per
// loop iteration to decide whether to take a row.
117- std::unique_ptr<CFDRelationData> CFDRelationData::CreateFrom (model::IDatasetStream& file_input,
118- double c_sample, double r_sample) {
119- // Fields of CFDRelationData class
120- auto schema = std::make_unique<RelationalSchema>(file_input.GetRelationName ());
121- std::vector<Transaction> data_rows;
122- ItemDictionary item_dictionary;
123- std::vector<ItemInfo> items;
124- ColumnsValuesDict columns_values_dict;
125- int unique_elems_number = 1 ;
126- std::random_device rd; // only used once to initialise (seed) engine
127- std::mt19937 rng (rd ()); // random-number engine used (Mersenne-Twister in this case)
128- std::uniform_real_distribution<double > uni (0.0 , 1.0 ); // guaranteed unbiased
129- int num_columns = static_cast <int >(file_input.GetNumberOfColumns ());
130- std::vector<std::string> line;
131- std::vector<int > columns_numbers_list = Range (0 , num_columns);
132- int size = static_cast <int >(static_cast <double >(columns_numbers_list.size ()) * c_sample);
133- Shuffle (columns_numbers_list);
134- columns_numbers_list =
135- std::vector<int >(columns_numbers_list.begin (), columns_numbers_list.begin () + size);
136- std::sort (columns_numbers_list.begin (), columns_numbers_list.end ());
137- while (file_input.HasNextRow ()) {
// NOTE(review): this `continue` runs before GetNextRow(), so a rejected draw
// does not appear to consume a row from the stream - confirm against
// IDatasetStream semantics.
138- if (uni (rng) >= r_sample) {
139- continue ;
140- }
141- line = file_input.GetNextRow ();
42+ std::vector<int > row_data (num_columns);
14243
143- AddNewItemsInPartialTable (item_dictionary, columns_values_dict, items, line,
144- columns_numbers_list, data_rows, unique_elems_number, size);
// New code: try_emplace only inserts when the string is new for this column;
// `inserted` tells whether next_value_id was actually handed out.
44+ for (size_t index = 0 ; index < num_columns; ++index) {
45+ auto [it, inserted] = item_dictionaries[index].try_emplace (row[index], next_value_id);
46+
47+ if (inserted) {
// row[index] is moved from here, after its last read in try_emplace above.
48+ items.emplace_back (std::move (row[index]), index);
49+ columns_values[index].push_back (next_value_id++);
50+ }
51+
52+ row_data[index] = it->second ;
// Ids are 1-based while items is 0-based, hence the - 1.
53+ items[it->second - 1 ].frequency ++;
54+ }
55+ data_rows.push_back (std::move (row_data));
14556 }
14657
// Register every column in the schema and hand its id list (and, in the new
// code, its dictionary) over to a CFDColumnData.
14758 std::vector<CFDColumnData> column_data;
148- for (AttributeIndex i = 0 ; i < num_columns; ++i) {
149- auto column = Column (schema.get (), file_input .GetColumnName (i), i);
59+ for (size_t i = 0 ; i < num_columns; ++i) {
60+ auto column = Column (schema.get (), data_stream .GetColumnName (i), i);
15061 schema->AppendColumn (std::move (column));
151- column_data.emplace_back (schema->GetColumn (i), columns_values_dict[i]);
62+ column_data.emplace_back (schema->GetColumn (i), std::move (columns_values[i]),
63+ std::move (item_dictionaries[i]));
15264 }
15365 return std::make_unique<CFDRelationData>(std::move (schema), std::move (column_data),
154- std::move (data_rows), std::move (item_dictionary),
155- std::move (items));
156- }
157-
// Removed accessor: returned data_rows_.size(), the same expression that
// GetNumRows returns earlier in this file.
158- unsigned CFDRelationData::Size () const {
159- return data_rows_.size ();
66+ std::move (data_rows), std::move (items));
16067}
16168
16269unsigned CFDRelationData::GetAttrsNumber () const {
@@ -204,25 +111,10 @@ int CFDRelationData::GetAttr(std::string const& s) const {
204111}
205112
// Returns the integer id stored for str_value in column attr, or -1 when the
// value is not found. The removed body (`-` lines) searched one dictionary
// keyed by the (attr, value) pair; the new body (`+` lines) searches the
// dictionary returned by column_data_.at(attr).GetValueDict().
// NOTE(review): .at(attr) presumably throws for an out-of-range attr if
// column_data_ is a standard container - confirm that callers expect this.
206113int CFDRelationData::GetItem (int attr, std::string const & str_value) const {
207- auto it = item_dictionary_.find (std::make_pair (attr, str_value));
208- if (it != item_dictionary_.end ()) {
209- return it->second ;
210- } else {
211- return -1 ;
212- }
213- }
214-
// Removed: sorted data_rows_ with a lexicographical comparator.
// NOTE(review): the arguments are passed as (a.begin, b.begin, a.end, b.end),
// whereas std::lexicographical_compare takes (first1, last1, first2, last2).
215- void CFDRelationData::Sort () {
216- std::sort (data_rows_.begin (), data_rows_.end (),
217- [](std::vector<int > const & a, std::vector<int > const & b) {
218- return std::lexicographical_compare (a.begin (), b.begin (), a.end (), b.end ());
219- });
220- }
114+ auto const & item_dict = column_data_.at (attr).GetValueDict ();
115+ auto it = item_dict.find (str_value);
221116
// Removed: for each position i, swapped data_rows_[i] with data_rows_[tids[i]].
222- void CFDRelationData::ToFront (SimpleTIdList const & tids) {
223- for (size_t i = 0 ; i < tids.size (); i++) {
224- std::swap (data_rows_[i], data_rows_[tids[i]]);
225- }
117+ return it != item_dict.end () ? it->second : -1 ;
226118}
227119
228120unsigned CFDRelationData::Frequency (int i) const {
0 commit comments