Merge branch 'main' of github.com:sidratresearch/rail_tpz into sidrat-staging

jscora · jscora · commit ac41e444d9f9 · 2025-12-18T18:35:20.000-05:00
diff --git a/examples/TPZ_example_notebook.ipynb b/examples/TPZ_example_notebook.ipynb
@@ -8,7 +8,7 @@
     "# TPZ: Trees for Photo-Z's\n",
     "\n",
     "Author: Sam Schmidt <br>\n",
-    "Last successfully run: March 24, 2025 <br>\n",
+    "Last successfully run: December 11, 2025 <br>\n",
     "\n",
     "TPZ is one of the codes implemented in the MLZ (Machine Learning PhotoZ) package by Matias Carraso-Kind, some documentation for the algorithm is included in Matias' website for the package:\n",
     "http://matias-ck.com/mlz/\n",
@@ -137,11 +137,11 @@
     "\n",
     "The other configuration parameters for TPZ are:\n",
     "- `seed` (int): the random seed used by numpy for this stage <br>\n",
-    "- `nrandom` (int): the number of random training catalogs with Gaussian scatter to create. <br>\n",
-    "- `ntrees` (int): the number of bootstrap samples for a given random catalog to create. <br>\n",
-    "REMINDER: the total number of trees trained will be `nrandom` * `ntrees`, and if `nrandom` is set to 1, then no random catalogs are created, only the original training sample is used.<br>\n",
-    "- `minleaf` (int): the mininum number of galaxies in a terminal leaf. <br>\n",
-    "- `natt` (int): the number of attributes to split. <br>\n",
+    "- `n_random` (int): the number of random training catalogs with Gaussian scatter to create. <br>\n",
+    "- `n_trees` (int): the number of bootstrap samples for a given random catalog to create. <br>\n",
+    "REMINDER: the total number of trees trained will be `n_random` * `n_trees`, and if `n_random` is set to 1, then no random catalogs are created, only the original training sample is used.<br>\n",
+    "- `min_leaf` (int): the minimum number of galaxies in a terminal leaf. <br>\n",
+    "- `n_att` (int): the number of attributes to split. <br>\n",
     "- `sigmafactor` (float): Gaussian smoothing with kernel Sigma1*Resolution. <br>\n",
     "- `rmsfactor` (float): MS for zconf calculation. <br>\n",
     "- `tree_strategy` (string): see paragraph below.<br>\n",
@@ -197,8 +197,8 @@
     "                err_bands=error_list,\n",
     "                hdf5_groupname='photometry',\n",
     "                err_dict=new_err_dict,\n",
-    "                nrandom=3, \n",
-    "                ntrees=5,\n",
+    "                n_random=3, \n",
+    "                n_trees=5,\n",
     "                #tree_strategy='native')  # uncomment this line and comment out the line below to switch to using \"native\" trees \n",
     "                tree_strategy='sklearn')"
    ]
@@ -208,7 +208,7 @@
    "id": "638f6f40-e60d-47f4-b639-6bb4560b1631",
    "metadata": {},
    "source": [
-    "Now, lets create our stage and run `inform`.  We specified `nrandom = 3` and `ntrees = 5`, so we will get 15 trained trees that constitute our model.  For our 10k training galaxy sample this takes about 0.5 seconds for \"sklearn\", or about 90 seconds using \"native\" on my Mac desktop for a rough guide for how long this should take to train:"
+    "Now, let's create our stage and run `inform`.  We specified `n_random = 3` and `n_trees = 5`, so we will get 15 trained trees that constitute our model.  For our 10k training galaxy sample this takes about 0.5 seconds for \"sklearn\", or about 90 seconds using \"native\" on my Mac desktop, as a rough guide to how long training should take:"
    ]
   },
   {
@@ -387,7 +387,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "which=5355\n",
+    "which=7187\n",
     "fig, axs = plt.subplots()\n",
     "results().plot_native(key=which,axes=axs, label=f\"PDF for galaxy {which}\")\n",
     "axs.axvline(sz[which],c='r',ls='--', label=\"true redshift\")\n",
@@ -428,7 +428,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.13"
+   "version": "3.12.12"
   }
  },
  "nbformat": 4,
diff --git a/src/rail/estimation/algos/mlz_utils/data.py b/src/rail/estimation/algos/mlz_utils/data.py
@@ -287,7 +287,7 @@ def make_random(self, ntimes=-1):
         :param int ntimes: taken from class Pars unless otherwise indicated
         """
         if ntimes == -1:
-            ntimes = int(self.Pars.nrandom)
+            ntimes = int(self.Pars.n_random)
         # if outfileran == '': outfileran = self.Pars.randomcatname
         self.BigRan = create_random_realizations(
             self.AT, self.cat, ntimes, self.Pars.keyatt, self.rng
diff --git a/src/rail/estimation/algos/tpz_lite.py b/src/rail/estimation/algos/tpz_lite.py
@@ -99,16 +99,16 @@ class TPZliteInformer(CatInformer):
     data to train up a set of decision trees that are then stored
     as a pickled model file for use by the Estimator stage.
 
-    ntrees controls how many bootstrap realizations are created from a
+    n_trees controls how many bootstrap realizations are created from a
     single catalog realization to train one tree.
-    nransom controls how many catalog realizations are created. Each
+    n_random controls how many catalog realizations are created. Each
     random catalog consists of adding Gaussian scatter to each attribute
     based on its associated error column.  If the error column `eind` is
     -1 then a small error of 0.00005 is hardcoded into TPZ. The key
     attribute is not included in this random catalog creation.
 
-    So, a total of nrandom*ntrees trees are trained and stored in the
-    final model i.e. if nrandom=3 and ntrees=5 then 15 total trees
+    So, a total of n_random*n_trees trees are trained and stored in the
+    final model i.e. if n_random=3 and n_trees=5 then 15 total trees
     are trained and stored.
     """
 
@@ -129,12 +129,12 @@ class TPZliteInformer(CatInformer):
         # use_atts=Param(list, def_train_atts,
         #               msg="attributes to use in training trees"),
         err_dict=SHARED_PARAMS,
-        nrandom=Param(
+        n_random=Param(
             int, 8, msg="number of random bootstrap samples of training data to create"
         ),
-        ntrees=Param(int, 5, msg="number of trees to create"),
-        minleaf=Param(int, 5, msg="minimum number in terminal leaf"),
-        natt=Param(int, 3, msg="number of attributes to split for TPZ"),
+        n_trees=Param(int, 5, msg="number of trees to create"),
+        min_leaf=Param(int, 5, msg="minimum number in terminal leaf"),
+        n_att=Param(int, 3, msg="number of attributes to split for TPZ"),
         sigmafactor=Param(
             float, 3.0, msg="Gaussian smoothing with kernel Sigma1*Resolution"
         ),
@@ -240,12 +240,12 @@ def run(self):
         #####
         # make random data
         # So make_random takes the error columns and just adds Gaussian scatter to the input (or 0.00005 if no error supplied)
-        # it saves `nrandom` copies of this in a dictionary for each attribute for each galaxy
+        # it saves `n_random` copies of this in a dictionary for each attribute for each galaxy
         # not how I would have done things, but we're keeping it to try to duplicate MLZ's code exactly.
-        if self.config.nrandom > 1:
+        if self.config.n_random > 1:
             if self._rank == 0:
-                print(f"creating {self.config.nrandom} random realizations...")
-                traindata.make_random(ntimes=int(self.config.nrandom))
+                print(f"creating {self.config.n_random} random realizations...")
+                traindata.make_random(ntimes=int(self.config.n_random))
                 temprandos = traindata.BigRan
             else:  # pragma: no cover
                 temprandos = None
@@ -255,17 +255,17 @@ def run(self):
         # Matias writes out randoms from make_random for rank=0, then reads them all back in from file so that all ranks have access,
         # that seems slow so, instead, let's just assign them here (after broadcasting to all):
         if self._parallel == MPI_PARALLEL:
-            if self.config.nrandom > 1:
+            if self.config.n_random > 1:
                 temprandos = self._comm.bcast(temprandos, root=0)
-        if self.config.nrandom > 1:
+        if self.config.n_random > 1:
             traindata.BigRan = temprandos
         if self._parallel == MPI_PARALLEL:
             self._comm.Barrier()
 
-        ntot = int(self.config.nrandom * self.config.ntrees)
+        ntot = int(self.config.n_random * self.config.n_trees)
         if self._rank == 0:
             print(
-                f"making a total of {ntot} trees for {self.config.nrandom} random realizations * {self.config.ntrees} bootstraps"
+                f"making a total of {ntot} trees for {self.config.n_random} random realizations * {self.config.n_trees} bootstraps"
             )
 
         zfine, zfine2, resz, resz2, wzin = analysis.get_zbins(self.config)
@@ -287,9 +287,9 @@ def run(self):
             self._comm.Barrier()
         # copy some stuff from the runMLZ script:
         for kss in range(s0, s1):
-            print(f"making {kss+1} of {ntot}...")
-            if self.config.nrandom > 1:
-                ir = kss // int(self.config.ntrees)
+            print(f"making {kss + 1} of {ntot}...")
+            if self.config.n_random > 1:
+                ir = kss // int(self.config.n_trees)
                 if ir != 0:
                     traindata.newcat(ir)
             DD = "all"
@@ -300,16 +300,16 @@ def run(self):
                     traindata.X,
                     traindata.Y,
                     forest="yes",
-                    minleaf=int(self.config.minleaf),
-                    mstar=int(self.config.natt),
+                    minleaf=int(self.config.min_leaf),
+                    mstar=int(self.config.n_att),
                     dict_dim=DD,
                 )
             elif self.config.tree_strategy == "sklearn":
                 randx = rng.integers(low=0, high=25000, size=1)[0]
                 T = DecisionTreeRegressor(
                     random_state=randx,
-                    min_samples_leaf=self.config.minleaf,
-                    max_features=int(self.config.natt),
+                    min_samples_leaf=self.config.min_leaf,
+                    max_features=int(self.config.n_att),
                 )
                 T.fit(traindata.X, traindata.Y)
             else:  # pragma: no cover  already tested above
@@ -341,10 +341,10 @@ def run(self):
                 redshift_col=self.config.redshift_col,
                 att_dict=train_att_dict,
                 keyatt=self.config.keyatt,
-                nrandom=self.config.nrandom,
-                ntrees=self.config.ntrees,
-                minleaf=self.config.minleaf,
-                natt=self.config.natt,
+                n_random=self.config.n_random,
+                n_trees=self.config.n_trees,
+                min_leaf=self.config.min_leaf,
+                n_att=self.config.n_att,
                 sigmafactor=self.config.sigmafactor,
                 bands=self.config.bands,
                 rmsfactor=self.config.rmsfactor,
@@ -415,7 +415,7 @@ def _process_chunk(self, start, end, inputdata, first):
         test_att_dict = make_index_dict(self.config.err_dict, testkeys)
         zfine, zfine2, resz, resz2, wzin = analysis.get_zbins(self.attPars)
         zfine2 = zfine2[wzin]
-        ntot = int(self.attPars.nrandom * self.attPars.ntrees)
+        ntot = int(self.attPars.n_random * self.attPars.n_trees)
 
         Ng_temp = np.array(list(inputdata.values()))
         # Ng = np.array(Ng_temp, 'i')
diff --git a/tests/test_tpz.py b/tests/test_tpz.py
@@ -14,8 +14,8 @@
 def test_tpz_larger_training(treestrat, nrand):
     train_config_dict = {
         "hdf5_groupname": "photometry",
-        "nrandom": nrand,
-        "ntrees": 5,
+        "n_random": nrand,
+        "n_trees": 5,
         "model": "tpz_tests.pkl",
         "tree_strategy": treestrat,
     }
@@ -88,8 +88,8 @@ def test_tpz_input_data_format():
     nrand = 1
     train_config_dict = {
         "hdf5_groupname": "",
-        "nrandom": nrand,
-        "ntrees": 5,
+        "n_random": nrand,
+        "n_trees": 5,
         "model": "tpz_tests.pkl",
         "tree_strategy": treestrat,
     }