|
64 | 64 | # Re-export synth-shipped datasets (unchanged DGPs; this is the |
65 | 65 | # consolidated namespace); california_tobacco is wrapped below. |
66 | 66 | from ..synth.datasets import ( |
67 | | - california_tobacco as california_prop99, |
| 67 | + california_tobacco as _california_tobacco_simulated, |
68 | 68 | basque_terrorism, |
69 | 69 | german_reunification, |
70 | 70 | ) |
| 71 | +from ._canonical import _load_bundled_csv |
| 72 | + |
| 73 | + |
| 74 | +def california_prop99(simulated: bool = True) -> pd.DataFrame: |
| 75 | + """California Proposition 99 panel (Abadie-Diamond-Hainmueller 2010). |
| 76 | +
|
| 77 | + Parameters |
| 78 | + ---------- |
| 79 | + simulated : bool, default True |
| 80 | + If True, return the simulated covariate-rich replica from |
| 81 | + ``synth.california_tobacco`` (39 states × 31 years, 1970-2000, |
| 82 | + ADH-shaped DGP). Default for backward compatibility. |
| 83 | + If False, load the real ADH (2010) panel bundled in |
| 84 | + ``statspai/datasets/data/california_prop99.csv`` (39 states × |
| 85 | + 31 years, with covariates ``cigsale, retprice, lnincome, |
| 86 | + age15to24, beer``; identical to tidysynth's smoking dataset). |
| 87 | + Use this for exact paper replication. |
| 88 | +
|
| 89 | + Returns |
| 90 | + ------- |
| 91 | + pd.DataFrame |
| 92 | + Columns (both branches): ``state, year, cigsale, retprice, |
| 93 | + lnincome, age15to24, beer, treated``. The simulated branch |
| 94 | + ships ``treated`` itself; the real branch derives it, when the |
| 95 | + CSV lacks it, as ``(state == 'California') & (year >= 1989)``. |
| 96 | +
|
| 97 | + References |
| 98 | + ---------- |
| 99 | + Abadie, A., Diamond, A. & Hainmueller, J. (2010). |
| 100 | + Synthetic Control Methods for Comparative Case Studies. |
| 101 | + Journal of the American Statistical Association 105(490), 493-505. |
| 102 | + [@abadie2010synthetic] |
| 103 | + """ |
| 104 | + if simulated: |
| 105 | + return _california_tobacco_simulated() |
| 106 | + |
| 107 | + df = _load_bundled_csv("california_prop99.csv") |
| 108 | + # The bundled real CSV does not carry a 'treated' indicator; derive |
| 109 | + # it so downstream callers (synth, synthdid, plotting) work uniformly. |
| 110 | + if 'treated' not in df.columns: |
| 111 | + df = df.copy() |
| 112 | + df['treated'] = ( |
| 113 | + (df['state'] == 'California') & (df['year'] >= 1989) |
| 114 | + ).astype(int) |
| 115 | + df.attrs['paper'] = ( |
| 116 | + "Abadie, A., Diamond, A. & Hainmueller, J. (2010). " |
| 117 | + "Synthetic Control Methods for Comparative Case Studies. " |
| 118 | + "JASA 105(490), 493-505." |
| 119 | + ) |
| 120 | + df.attrs['data_source'] = 'real' |
| 121 | + df.attrs['simulated'] = False |
| 122 | + df.attrs['source_origin'] = ( |
| 123 | + "Public-domain ADH (2010) California Prop 99 panel; " |
| 124 | + "byte-identical to tidysynth's smoking dataset (1970-2000)." |
| 125 | + ) |
| 126 | + df.attrs['notes'] = ( |
| 127 | + "Real ADH panel for exact paper replication. Use the full " |
| 128 | + "ADH (2010) predictor recipe via sp.synth(method='classic', " |
| 129 | + "special_predictors=...) for canonical numbers; the headline " |
| 130 | + "1989-2000 average gap is roughly -19 packs/capita per ADH " |
| 131 | + "(2010) Figure 2." |
| 132 | + ) |
| 133 | + return df |
71 | 134 |
|
72 | 135 | # Convenience alias |
73 | 136 | teen_employment = mpdta |
|
76 | 139 | def list_datasets() -> pd.DataFrame: |
77 | 140 | """Return a DataFrame describing all available datasets. |
78 | 141 |
|
79 | | - Columns: name, design, n_obs, paper, expected_main. |
| 142 | + Columns: name, design, n_obs, paper, paper_original, expected_main. |
| 143 | +
|
| 144 | + - ``paper_original`` is the headline number from the published paper on the |
| 145 | + ORIGINAL data (what readers expect to see). |
| 146 | + - ``expected_main`` is what the canonical estimator recovers on this |
| 147 | + simulated replica (what users will actually observe). The two differ |
| 148 | + because the bundled replicas are deterministic DGPs calibrated to the |
| 149 | + neighbourhood of the published values, not the original data. |
| 150 | +
|
| 151 | + For the strict numerical neighbourhood proofs see |
| 152 | + ``tests/external_parity/test_published_replications.py`` and |
| 153 | + ``tests/external_parity/PUBLISHED_REFERENCE_VALUES.md``. |
80 | 154 | """ |
81 | 155 | registry = [ |
| 156 | + # (name, design, n_obs, paper, paper_original, expected_main) |
82 | 157 | ('mpdta', 'DID', 2500, |
83 | 158 | "Callaway-Sant'Anna (2021)", |
84 | | - "Simple ATT ≈ -0.040 (teen employment effect of min-wage)"), |
| 159 | + "Simple ATT ≈ -0.0454 (R did::att_gt on original mpdta)", |
| 160 | + "Simple ATT ≈ -0.033, dynamic ATT ≈ -0.034 on this replica"), |
85 | 161 | ('card_1995', 'IV', 3010, |
86 | 162 | "Card (1995)", |
87 | | - "IV returns-to-schooling ≈ 0.132 (OLS ≈ 0.075)"), |
| 163 | + "IV β_educ ≈ 0.132, OLS ≈ 0.075 (Table 3, NLSYM)", |
| 164 | + "IV β_educ ≈ 0.142, OLS ≈ 0.110 on this replica"), |
88 | 165 | ('nsw_lalonde', 'RCT / matching', 445, |
89 | 166 | "LaLonde (1986) / Dehejia-Wahba (1999)", |
90 | | - "Experimental ATT ≈ $1,794 (re78)"), |
| 167 | + "Experimental ATT ≈ $1,794 (DW 1999, re78)", |
| 168 | + "Naive OLS ≈ $1,556 on this replica (calibrated to $1,794)"), |
91 | 169 | ('nsw_dw', 'SOO', 2675, |
92 | 170 | "Dehejia-Wahba (1999)", |
93 | | - "Naive OLS ≈ -$8,498; PSM ≈ $1,794"), |
| 171 | + "Naive OLS ≈ -$8,498; PSM ≈ $1,794 (DW 1999)", |
| 172 | + "Naive OLS ≈ -$8,387; covariate-adjusted ≈ $2,313 on replica"), |
94 | 173 | ('lee_2008_senate', 'RD', 6558, |
95 | 174 | "Lee (2008)", |
96 | | - "Incumbent advantage ≈ 0.08 voteshare points"), |
| 175 | + "Incumbent advantage ≈ 0.077 voteshare pts (Table 4)", |
| 176 | + "Conventional ≈ 0.073, CCT robust ≈ 0.062 on this replica"), |
97 | 177 | ('angrist_krueger_1991', 'IV', 5000, |
98 | 178 | "Angrist-Krueger (1991)", |
99 | | - "QOB IV returns-to-schooling ≈ 0.08–0.11"), |
| 179 | + "QOB IV β_educ ≈ 0.08–0.11 (Table V, range)", |
| 180 | + "IV β_educ ≈ 0.10 by construction on this replica"), |
100 | 181 | ('california_prop99', 'SCM', 1200, |
101 | 182 | "Abadie-Diamond-Hainmueller (2010)", |
102 | | - "ATT ≈ -15 packs/capita (1988-2000)"), |
| 183 | + "Mean 1989-2000 ATT ≈ -19 packs/capita (JASA Fig. 2)", |
| 184 | + "Classic ADH ≈ -13.1, ASCM ≈ -13.3 packs/capita on this replica"), |
103 | 185 | ('basque_terrorism', 'SCM', 774, |
104 | 186 | "Abadie-Gardeazabal (2003)", |
105 | | - "GDP gap ≈ -0.855 (mean 1975-1997)"), |
| 187 | + "GDP gap ≈ -0.855 (mean 1975-1997)", |
| 188 | + "GDP gap ≈ -0.855 on this replica (calibrated)"), |
106 | 189 | ('german_reunification', 'SCM', 748, |
107 | 190 | "Abadie-Diamond-Hainmueller (2015)", |
108 | | - "West Germany GDPpc gap ≈ -1,500 (post-1990)"), |
| 191 | + "West Germany GDPpc gap ≈ -1,500 (post-1990)", |
| 192 | + "GDPpc gap ≈ -1,500 on this replica (calibrated)"), |
109 | 193 | ] |
110 | 194 | return pd.DataFrame(registry, |
111 | | - columns=['name', 'design', 'n_obs', |
112 | | - 'paper', 'expected_main']) |
| 195 | + columns=['name', 'design', 'n_obs', 'paper', |
| 196 | + 'paper_original', 'expected_main']) |
113 | 197 |
|
114 | 198 |
|
115 | 199 | __all__ = [ |
|
0 commit comments