Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion src/rail/estimation/algos/k_nearneigh.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,9 +84,22 @@ def run(self):
training_data = self.get_data('input')[self.config.hdf5_groupname]
else: # pragma: no cover
training_data = self.get_data('input')
# check that bands are present in the data before creating dataframe
for band in self.config.bands:
if band not in training_data.keys():
raise KeyError(f"specified band {band} not found in input data")
knndf = pd.DataFrame(training_data, columns=self.config.bands)
self.zgrid = np.linspace(self.config.zmin, self.config.zmax, self.config.nzbins)

# check that ref band present in data
if self.config.ref_band not in knndf.keys():
raise ValueError(f"ref_band {self.config.ref_band} not found in input data")
# check that mag_limit dict keys are in input data
for mkey in self.config.mag_limits.keys():
if mkey not in knndf.keys():
raise KeyError(f"mag_limits dict key {mkey} not present in input data, make sure that you"
"have specified the mag_limits dict with the same names as your bands")

# replace nondetects
# will fancy this up later with a flow to sample from truth
for col in self.config.bands:
Expand Down Expand Up @@ -192,7 +205,7 @@ def _process_chunk(self, start, end, data, first):
dists, idxs = self.kdtree.query(testcolordata, k=self.numneigh)
dists += TEENY
test_ens = _makepdf(dists, idxs, self.trainszs, self.sigma)

zmode = test_ens.mode(grid=self.zgrid)
test_ens.set_ancil(dict(zmode=zmode))
self._do_chunk_output(test_ens, start, end, first)
89 changes: 64 additions & 25 deletions tests/sklearn/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
# Module-level handle on the data store exposed by RailStage.
# NOTE(review): allow_overwrite is set on the store's class (not the instance),
# presumably so repeated test runs may replace entries that already exist --
# confirm against the data store implementation.
DS = RailStage.data_store
DS.__class__.allow_overwrite = True


def test_simple_nn():
train_config_dict = {
"width": 0.025,
Expand All @@ -36,7 +37,6 @@ def test_simple_nn():
assert np.isclose(results.ancil["zmode"], rerun_results.ancil["zmode"]).all()



@pytest.mark.skipif(
int(sci_ver_str[0]) < 2 and int(sci_ver_str[1]) < 8,
reason="mixmod parameterization known to break for scipy<1.8 due to array broadcast change",
Expand Down Expand Up @@ -83,6 +83,7 @@ def test_KNearNeigh():
# assert np.isclose(results.ancil['zmode'], zb_expected).all()
assert np.isclose(results.ancil["zmode"], rerun_results.ancil["zmode"]).all()


# test for k=1 when data point has same value, used to cause errors because of
# a divide by zero, should be fixed now but add a test
def test_same_data_knn():
Expand All @@ -103,21 +104,59 @@ def test_same_data_knn():
assert ~(np.isnan(modes).all())
os.remove(pz.get_output(pz.get_aliased_tag('output'), final_name=True))



def test_bad_inputs_knn():
    """Check that the KNN stages reject a band list naming a missing column.

    The band list holds one entry per band: four column names that follow the
    ``mag_<band>_lsst`` pattern of the magnitude-limit keys, plus one made-up
    name.  The run is expected to fail with a ``KeyError`` whose message names
    the made-up band, so the test only passes when that specific entry is the
    one being rejected.
    """
    train_algo = k_nearneigh.KNearNeighInformer
    pz_algo = k_nearneigh.KNearNeighEstimator
    def_maglims = dict(
        mag_u_lsst=27.79,
        mag_g_lsst=29.04,
        mag_r_lsst=29.06,
        mag_i_lsst=28.62,
        mag_z_lsst=27.98,
        mag_y_lsst=27.05,
    )
    # Each band must be its own list element; a single comma-joined string
    # would be treated as one (nonexistent) column name and hide which band
    # is actually at fault.
    # NOTE(review): assumes the test catalogue carries the mag_*_lsst columns
    # used as magnitude-limit keys above -- confirm against the test data.
    bad_bands = ["mag_u_lsst", "mag_g_lsst", "mag_r_lsst", "mag_i_lsst", "fakeband"]
    params = dict(
        bands=bad_bands,
        ref_band="mag_i_lsst",
        mag_limits=def_maglims,
    )
    # Only the call that is expected to fail lives inside the raises block.
    with pytest.raises(KeyError, match="fakeband"):
        one_algo("KNN", train_algo, pz_algo, params, params)


def test_bad_ref_band_knn():
    """Check that a reference band missing from the data raises ValueError."""
    informer_cls = k_nearneigh.KNearNeighInformer
    estimator_cls = k_nearneigh.KNearNeighEstimator
    # The same configuration is handed to both the training and estimation stage.
    bad_config = {"ref_band": "fakeband"}
    with pytest.raises(ValueError):
        _first, _second, _third = one_algo(
            "KNN", informer_cls, estimator_cls, bad_config, bad_config
        )


def test_bad_mag_lims_knn():
    """Check that magnitude-limit keys absent from the data raise KeyError."""
    informer_cls = k_nearneigh.KNearNeighInformer
    estimator_cls = k_nearneigh.KNearNeighEstimator
    # Neither key below is one of the configured band names.
    bogus_limits = {"fakeband": 29., "xband": 30.}
    bad_config = {"mag_limits": bogus_limits}
    with pytest.raises(KeyError):
        _first, _second, _third = one_algo(
            "KNN", informer_cls, estimator_cls, bad_config, bad_config
        )


def test_catch_bad_bands():
    """Check that both neural-net stages reject ``bands`` given as one string.

    ``bands`` is passed as a single comma-separated string; building either the
    informer or the estimator stage with it is expected to raise ValueError.
    """
    bad_config = {"bands": "u,g,r,i,z,y"}
    # Informer first, then estimator -- same order as the stages are exercised.
    stage_classes = (
        sklearn_neurnet.SklNeurNetInformer,
        sklearn_neurnet.SklNeurNetEstimator,
    )
    for stage_cls in stage_classes:
        with pytest.raises(ValueError):
            stage_cls.make_stage(hdf5_groupname="", **bad_config)


def test_randomForestClassifier():
class_bands = [ "r","i","z"]
class_bands = ["r", "i", "z"]
bands = {"r": "mag_r_lsst", "i": "mag_i_lsst", "z": "mag_z_lsst"}
bin_edges=[0,0.2,0.5]
train_config_dict=dict(
bin_edges = [0, 0.2, 0.5]

train_config_dict = dict(
class_bands=class_bands,
bands=bands,
redshift_col="redshift",
Expand All @@ -126,25 +165,25 @@ def test_randomForestClassifier():
hdf5_groupname="photometry",
model="model.tmp",
)
estim_config_dict=dict(hdf5_groupname="photometry", model="model.tmp", id_name="")

estim_config_dict = dict(hdf5_groupname="photometry", model="model.tmp", id_name="")

train_algo = random_forest.RandomForestInformer
tomo_algo = random_forest.RandomForestClassifier
results, rerun_results, rerun3_results = one_algo(
"randomForestClassifier", train_algo, tomo_algo, train_config_dict, estim_config_dict,
is_classifier=True,
)
assert np.isclose(results["data"]["class_id"], rerun_results["data"]["class_id"]).all()
assert len(results["data"]["class_id"])==len(results["data"]["row_index"])
assert len(results["data"]["class_id"]) == len(results["data"]["row_index"])


def test_randomForestClassifier_id():
class_bands = [ "r","i","z"]
class_bands = ["r", "i", "z"]
bands = {"r": "mag_r_lsst", "i": "mag_i_lsst", "z": "mag_z_lsst"}
bin_edges=[0,0.2,0.5]
train_config_dict=dict(
bin_edges = [0, 0.2, 0.5]

train_config_dict = dict(
class_bands=class_bands,
bands=bands,
redshift_col="redshift",
Expand All @@ -153,32 +192,32 @@ def test_randomForestClassifier_id():
hdf5_groupname="photometry",
model="model.tmp",
)
estim_config_dict=dict(hdf5_groupname="photometry", model="model.tmp", id_name="id")
estim_config_dict = dict(hdf5_groupname="photometry", model="model.tmp", id_name="id")

train_algo = random_forest.RandomForestInformer
tomo_algo = random_forest.RandomForestClassifier

traindata = os.path.join(RAILDIR, 'rail/examples_data/testdata/training_100gal.hdf5')
validdata = os.path.join(RAILDIR, 'rail/examples_data/testdata/validation_10gal.hdf5')

DS = RailStage.data_store
DS.__class__.allow_overwrite = True
DS.clear()
training_data = DS.read_file('training_data', TableHandle, traindata)
validation_data = DS.read_file('validation_data', TableHandle, validdata)

train_pz = train_algo.make_stage(**train_config_dict)
train_pz.inform(training_data)
pz = tomo_algo.make_stage(name="randomForestClassifier", **estim_config_dict)
estim = pz.classify(training_data)
results=estim.data
results = estim.data

os.remove(pz.get_output(pz.get_aliased_tag('output'), final_name=True))
model_file = estim_config_dict.get('model', 'None')
if model_file != 'None':
try:
os.remove(model_file)
except FileNotFoundError: #pragma: no cover
except FileNotFoundError: # pragma: no cover
pass
assert len(results["data"]["class_id"])==len(results["data"]["id"])

assert len(results["data"]["class_id"]) == len(results["data"]["id"])