desbordante-core/examples/basic/mining_cind_2.py at 69f47fd10473ba5a1ea5fbbcb2f7984acaf9de4c · Desbordante/desbordante-core · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
import desbordante
import pandas as pd
from tabulate import tabulate
import textwrap
import pprint

YELLOW = "\033[1;33m"
CYAN = "\033[1;36m"
RESET = "\033[0m"


def prints(s):
    print(textwrap.fill(s, 80))


def printlns(s):
    prints(s)
    print()


def banner(title):
    print("=" * 80)
    print(f"{CYAN}{title}{RESET}")
    print("=" * 80)


def print_table(df, title):
    print(title)
    print(tabulate(df, headers="keys", tablefmt="psql"))
    print()


def short_attr(attr):
    return attr.replace(".csv", "").replace("cind_test_", "")


def fmt_attrs(attrs):
    cleaned = [short_attr(a) for a in attrs]
    if all("." in a for a in cleaned):
        tables = {a.split(".", 1)[0] for a in cleaned}
        if len(tables) == 1:
            t = tables.pop()
            cols = [a.split(".", 1)[1] for a in cleaned]
            return f"{t}.[{', '.join(cols)}]"
    return ", ".join(cleaned)


def print_attrs_line(label, attrs_str, indent):
    pad = " " * indent
    line = f"{pad}{label}: "
    wrapped = textwrap.fill(attrs_str, width=80,
                            initial_indent=line, subsequent_indent=" " * len(line))
    print(wrapped)


def print_condition(cond, num, indent):
    pad = " " * indent
    label = f"{pad}{num}. data = "
    data_str = pprint.pformat(cond.data(), width=80 - len(label), compact=True)
    lines = data_str.splitlines()
    print(f"{label}{lines[0]}")
    cont_pad = " " * len(label)
    for line in lines[1:]:
        print(f"{cont_pad}{line}")
    print(f"{cont_pad}validity = {cond.validity():.3f}, "
          f"completeness = {cond.completeness():.3f}")


def fmt_ind(ind_string):
    import re
    m = re.match(
        r"\(([^,]+),\s*\[([^\]]*)\]\)\s*->\s*\(([^,]+),\s*\[([^\]]*)\]\)",
        ind_string,
    )
    if not m:
        return ind_string
    lhs_t, lhs_c, rhs_t, rhs_c = m.groups()
    lhs_t = lhs_t.replace("cind_test_", "").replace(".csv", "")
    rhs_t = rhs_t.replace("cind_test_", "").replace(".csv", "")
    return f"{lhs_t}.[{lhs_c}] subseteq {rhs_t}.[{rhs_c}]"


def mine_cure(error, support):
    algo = desbordante.cind.algorithms.Default()
    algo.load_data(tables=TABLES, algo_type="cure_cind")
    algo.execute(error=error, support=support)
    return algo, algo.get_cinds()


def mine_cinderella(error, validity, completeness, condition_type="row"):
    algo = desbordante.cind.algorithms.Default()
    algo.load_data(tables=TABLES, algo_type="cinderella")
    algo.execute(error=error, validity=validity, completeness=completeness,
                 condition_type=condition_type)
    return algo, algo.get_cinds()


banner("Discovering CINDs with the Cure algorithm")

printlns(
    "In this example we discover Conditional Inclusion Dependencies with the "
    "Cure algorithm [3], an alternative to the CINDERELLA / PLI-CIND miners "
    "shown in mining_cind.py. Cure is controlled by a single support "
    "threshold and produces compact patterns with disjunctive RHS values."
)

print(f"{YELLOW}>>> Definition (CIND) [1, 2].{RESET}")
prints(
    "A Conditional Inclusion Dependency restricts an IND R1[X] subseteq R2[Y] "
    "to R1 rows that match a pattern on the remaining columns of R1 (the "
    "conditional attributes). Pattern entries are concrete values or a "
    "wildcard (\"_\" or \"-\") meaning any value. See mining_cind.py for the "
    "full definition and the validity/completeness metrics used by other "
    "miners."
)
print()

print(f"{YELLOW}>>> Cure pipeline [3].{RESET}")
prints(
    "Mining works in two stages. Spider first searches for Approximate "
    "Inclusion Dependencies (AINDs) - inclusions that hold up to a small "
    "fraction of mismatches, controlled by the error threshold. Then, for "
    "each AIND, Cure runs in two phases:"
)
print()
print("  1. Discovery: for every pair of (LHS conditional attribute, RHS")
print("     conditional attribute), hash-join the rows on the inclusion key")
print("     and count co-occurrences of (LHS value, RHS value) pairs. Keep")
print("     pairs whose count is at least the support threshold.")
print()
print("  2. Minimal cover: merge patterns sharing the same (LHS attribute,")
print("     LHS value) into one tableau row. When several RHS values appear")
print("     for the same LHS key, they are folded into a comma-separated")
print("     disjunction in the corresponding RHS slot.")
print()
prints(
    "Validity and completeness for the resulting patterns are derived from "
    "the per-pattern support and the total number of joined tuples; they are "
    "informational only - Cure does not filter by them."
)
print()


print(f"{CYAN}Datasets{RESET}")
print("-" * 80)

printlns(
    "Two toy tables, en and de, with the same people taken from the English "
    "and German editions of Wikipedia. Columns: pid, cent (century), "
    "birthplace, deathplace, desc (description, profession)."
)

en = pd.read_csv("examples/datasets/ind_datasets/cind_test_en.csv")
de = pd.read_csv("examples/datasets/ind_datasets/cind_test_de.csv")
print_table(en, "en:")
print_table(de, "de:")

TABLES = [(f"examples/datasets/ind_datasets/{name}.csv", ",", True)
          for name in ["cind_test_de", "cind_test_en"]]


print(f"{CYAN}Algorithm parameters{RESET}")
print("-" * 80)

print("  * error    AIND error threshold; the IND finder accepts INDs")
print("             whose error is at most this value.")
print()
print("  * support  minimum number of joined (LHS, RHS) tuple pairs that")
print("             must back a (LHS value, RHS value) pattern for it to")
print("             be kept. Larger support -> fewer, more reliable")
print("             patterns; smaller support -> wider exploration.")
print()


banner("Scenario 1. Mine CINDs with relaxed support")

printlns(
    "Start with support=1 so any pattern observed at least once is kept. "
    "This is the broadest setting and shows what kinds of patterns Cure "
    "produces on this dataset."
)

algo1, cinds = mine_cure(error=0.5, support=1)
print(f"  found {len(cinds)} CIND(s):")
for i, c in enumerate(cinds, start=1):
    print(f"    #{i:<2}  {fmt_ind(c.get_ind_string()):<48} "
          f"{c.conditions_number():>3} cond.")
print()

if not cinds:
    printlns(
        "No CINDs were found. Lower support or raise error to widen the search."
    )
else:
    printlns(
        "Each line shows the underlying inclusion dependency the CIND "
        "refines (LHS table and columns subseteq RHS table and columns) "
        "and the number of concrete patterns Cure found over the remaining "
        "LHS columns - the conditional attributes."
    )

    sample = cinds[0]
    sample_conds = sample.get_conditions()
    sample_attrs = fmt_attrs(sample.get_condition_attributes())
    if sample_conds:
        print(f"First few conditions of CIND #1 ({fmt_ind(sample.get_ind_string())}):")
        print_attrs_line("conditional attributes (in column order)",
                         sample_attrs, indent=2)
        print()
        for j, cond in enumerate(sample_conds[:3], start=1):
            print_condition(cond, j, indent=2)
        if len(sample_conds) > 3:
            print(f"  ... ({len(sample_conds) - 3} more)")
        print()
        printlns(
            "Each Condition lists concrete values for the conditional "
            "attributes (in column order); '-' means a wildcard. A "
            "comma-separated value (\"a, b\") is a disjunction produced by "
            "the minimal-cover phase: the same LHS key matched several RHS "
            "values, all of which satisfy the pattern."
        )


banner("Scenario 2. Tighter support")

printlns(
    "Raising support keeps only patterns backed by more joined tuples. "
    "Higher thresholds produce fewer but more reliable conditions."
)

support_levels = (1, 2, 3, 5, 8)
support_runs = [mine_cure(error=0.5, support=s) for s in support_levels]
for sup, (_, cs) in zip(support_levels, support_runs):
    total = sum(c.conditions_number() for c in cs)
    print(f"  support={sup}:  {len(cs)} CIND(s), {total} condition(s)")
print()

printlns(
    "In practice the workflow is iterative: start with a small support to "
    "see what patterns exist, then raise it to focus on the patterns that "
    "are well-supported by the data."
)


banner("Scenario 3. Cure vs CINDERELLA")

printlns(
    "On the same AINDs the two miners use different scoring and produce "
    "different sets of conditions. CINDERELLA filters multi-attribute "
    "patterns by validity and completeness; Cure mines pairwise patterns "
    "with a support threshold and merges them into a minimal cover."
)

algo_cind, cinderella_cinds = mine_cinderella(error=0.5, validity=0.75, completeness=0.25)
total_cinderella = sum(c.conditions_number() for c in cinderella_cinds)

algo_cure, cure_cinds = mine_cure(error=0.5, support=2)
total_cure = sum(c.conditions_number() for c in cure_cinds)

print(f"  CINDERELLA (validity>=0.75, completeness>=0.25): "
      f"{len(cinderella_cinds)} CIND(s), {total_cinderella} condition(s)")
print(f"  Cure       (support>=2):                          "
      f"{len(cure_cinds)} CIND(s), {total_cure} condition(s)")
print()

printlns(
    "Below is a side-by-side look at one CIND - the conditional patterns "
    "the two miners produce for the same underlying IND."
)


def find_by_ind(cinds, ind_string):
    for c in cinds:
        if c.get_ind_string() == ind_string:
            return c
    return None


def pick_demo_ind(cinderella_cinds, cure_cinds):
    """Pick an IND that has conditions from both miners,
    preferring one where Cure produces a disjunction."""
    cure_inds = {c.get_ind_string(): c for c in cure_cinds if c.conditions_number()}
    cind_inds = {c.get_ind_string(): c for c in cinderella_cinds if c.conditions_number()}
    common = [k for k in cure_inds if k in cind_inds]
    for k in common:
        if any("," in v for cond in cure_inds[k].get_conditions() for v in cond.data()):
            return k
    return common[0] if common else None


def show_conditions(cind, prefer_disjunction=False, max_show=3):
    if cind is None:
        print("    (no CIND with conditions)")
        return
    print(f"    IND: {fmt_ind(cind.get_ind_string())}")
    attrs = fmt_attrs(cind.get_condition_attributes())
    print_attrs_line("conditional attributes", attrs, indent=4)
    conditions = list(cind.get_conditions())
    if prefer_disjunction:
        conditions.sort(key=lambda c: not any("," in v for v in c.data()))
    for j, cond in enumerate(conditions[:max_show], start=1):
        print_condition(cond, j, indent=6)
    if len(conditions) > max_show:
        print(f"      ... ({len(conditions) - max_show} more)")


demo_ind = pick_demo_ind(cinderella_cinds, cure_cinds)
print("  CINDERELLA (validity>=0.75, completeness>=0.25):")
show_conditions(find_by_ind(cinderella_cinds, demo_ind))
print()
print("  Cure (support>=2):")
show_conditions(find_by_ind(cure_cinds, demo_ind), prefer_disjunction=True)
print()

printlns(
    "Two structural differences are visible. First, CINDERELLA's conditional "
    "attributes are taken only from the LHS table, while Cure also includes "
    "the RHS table's columns - patterns can constrain values on both sides "
    "of the inclusion. Second, Cure can collapse several RHS values for the "
    "same LHS key into one comma-separated slot (e.g. \"a, b\"), while "
    "CINDERELLA emits one explicit pattern per distinct value combination."
)


banner("See also")

print("Related primitives in Desbordante:")
print("  * CIND mining (CINDERELLA, PLI-CIND) -  examples/basic/mining_cind.py")
print("  * CIND verification                  -  examples/basic/verifying_cind.py")
print("  * IND mining                         -  examples/basic/mining_ind.py")
print("  * AIND mining                        -  examples/basic/mining_aind.py")
print("  * IND/AIND verification              -  examples/basic/verifying_ind_aind.py")
print()

print("References:")
print("  [1] L. Bravo, W. Fan, S. Ma. Extending Dependencies with Conditions.")
print("      VLDB 2007, pp. 243-254.  -- introduces CINDs.")
print("  [2] J. Bauckmann, Z. Abedjan, U. Leser, H. Muller, F. Naumann.")
print("      Discovering Conditional Inclusion Dependencies. CIKM 2012,")
print("      pp. 2094-2098.  -- CINDERELLA and PLI-CIND.")
print("  [3] O. Cure. Improving the Data Quality of Drug Databases using")
print("      Conditional Dependencies and Ontologies. ACM JDIQ 4(1):20, 2012.")
print("      -- the Cure algorithm used in this example.")
print()