LSSTDESC · eacharles · Apr 10, 2026 · Jun 5, 2026 · Jun 5, 2026 · Jun 11, 2026
diff --git a/.github/workflows/submit_example.yaml b/.github/workflows/submit_example.yaml
@@ -0,0 +1,38 @@
+---
+# This workflow installs the package (with its dev extras and any per-submission requirements file) and runs that submission's pytest module
+
+name: Unit test and code coverage
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ['3.13']
+        submission: ['example']
+
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          sudo apt-get update
+          sudo apt install libbz2-dev
+          python -m pip install --upgrade pip
+          pip install wheel
+          pip install .
+          pip install .[dev]
+          if [ -f requirements_${{ matrix.submission }}.txt ]; then pip install -r requirements_${{ matrix.submission }}.txt; fi
+      - name: Run unit tests with pytest
+        run: |
+          python -m pytest tests/test_${{ matrix.submission }}.py
diff --git a/examples/do_example.py b/examples/do_example.py
@@ -1,3 +1,5 @@
+import tables_io
+import numpy as np
 from rail.core.data import TableHandle
 from rail.estimation.algos import sklearn_neurnet
 from rail.utils import catalog_utils
@@ -17,8 +19,17 @@ def train_and_estimate(
     scenario: str,
 ) -> None:
 
+    # Drop training rows whose redshift is NaN; if any were dropped, write a "_cleaned" copy and train from that file instead
+    train_data_path = f"public/pz_challenge_{taskset}_{sim}_training_{scenario}.hdf5"
+    uncleaned_training_data = tables_io.read(train_data_path)
+    bad_mask = np.isnan(uncleaned_training_data['redshift'])
+    if bad_mask.any():
+        cleaned_training_data = {key: val[~bad_mask] for key, val in uncleaned_training_data.items()}
+        train_data_path = train_data_path.replace('.hdf5', '_cleaned.hdf5')
+        tables_io.write(cleaned_training_data, train_data_path)
+
     train_data = TableHandle(
-        "train", path=f"public/pz_challenge_{taskset}_{sim}_training_{scenario}.hdf5"
+        "train", path=train_data_path,
     )
     test_data = TableHandle(
         "test", path=f"public/pz_challenge_{taskset}_{sim}_test_{scenario}.hdf5"
@@ -33,7 +44,7 @@ def train_and_estimate(
         os.makedirs("evaluation")
     except:
         pass
-    
+
     model_path = f"submission/pz_challenge_{taskset}_{sim}_pz_model_{scenario}.pkl"
     output_path = f"submission/pz_challenge_{taskset}_{sim}_pz_estimate_{scenario}.hdf5"
     evaluate_path = f"evaluation/pz_challenge_{taskset}_{sim}_pz_evaluation_{scenario}.hdf5"
@@ -66,7 +77,7 @@ def train_and_estimate(
     pz_evaluate.data.ancil["object_id"] = train_data()["object_id"].astype(int)
     pz_evaluate.path = evaluate_path
     pz_evaluate.write()
-    
+
 
 
 if __name__ == "__main__":

diff --git a/requirements_example.txt b/requirements_example.txt
@@ -0,0 +1,2 @@
+pz-rail-base
+pz-rail-sklearn
diff --git a/scripts/download_public.py b/scripts/download_public.py
@@ -2,7 +2,7 @@
 from pz_data_challenge import submit_utils
 
 # don't change these
-PUBLIC_URL: str = "https://portal.nersc.gov/cfs/lsst/PZ/data_challenge/public.tgz"
+PUBLIC_URL: str = "https://portal.nersc.gov/cfs/lsst/PZ/data_challenge/public_test.tgz"
 
 
 def setup_public_area() -> None: