wq2012
diff --git a/‎README.md‎
Lines changed: 5 additions & 1 deletion b/‎README.md‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎kneeseg/bone_rf.py‎
Lines changed: 24 additions & 13 deletions b/‎kneeseg/bone_rf.py‎
Lines changed: 24 additions & 13 deletions
diff --git a/‎kneeseg/configs/config_schema.json‎
Lines changed: 9 additions & 0 deletions b/‎kneeseg/configs/config_schema.json‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎kneeseg/features.py‎
Lines changed: 41 additions & 22 deletions b/‎kneeseg/features.py‎
Lines changed: 41 additions & 22 deletions
diff --git a/‎kneeseg/pipeline/inference.py‎
Lines changed: 8 additions & 5 deletions b/‎kneeseg/pipeline/inference.py‎
Lines changed: 8 additions & 5 deletions
@@ -202,7 +202,8 @@ A valid configuration file has three main sections:
     },
     "model_config": {
         "target_bones": ["femur", "tibia", "patella"],
-        "model_directory": "/path/to/save/models"
+        "model_directory": "/path/to/save/models",
+        "dtype": "bfloat16"
     },
     "output_config": {
         "prediction_directory": "/path/to/save/predictions"
@@ -215,6 +216,9 @@ A valid configuration file has three main sections:
     -   Default for SKI10: `["femur", "tibia"]`
     -   Default for OAI: `["femur", "tibia", "patella"]`
     -   *Cartilage is closely coupled: "femur" includes "femoral cartilage".*
+-   **`dtype`**: (Optional) Data type for feature extraction matrices.
+    -   Options: `"float32"` (default), `"bfloat16"`.
+    -   **Recommendation**: Use `"bfloat16"` to reduce memory usage by ~50%. Requires `ml_dtypes`.
 
 > **Note**: The `split_file` should be a JSON containing `{"train": ["file1.mhd", ...], "eval": ["file2.mhd", ...]}`.
 
 
@@ -5,6 +5,7 @@
 from .features import extract_features, compute_rsid_features
 from scipy.ndimage import gaussian_filter
 from tqdm import tqdm
+import ml_dtypes
 
 class BoneClassifier:
     def __init__(self, n_estimators=100, max_depth=25, n_jobs=-1):
@@ -16,47 +17,57 @@ def __init__(self, n_estimators=100, max_depth=25, n_jobs=-1):
             class_weight='balanced'
         )
 
-    def extract_bone_features(self, image, prob_map=None, spacing=None):
+    def extract_bone_features(self, image, prob_map=None, spacing=None, target_dtype='float32'):
         """
         Extract features suitable for Bone Segmentation.
         Focuses on larger context and spatial coordinates.
         """
+        # Resolve dtype
+        if isinstance(target_dtype, str):
+            if target_dtype == 'bfloat16':
+                dtype = ml_dtypes.bfloat16
+            else:
+                dtype = np.float32
+        else:
+            dtype = target_dtype
+
         features = []
 
         # Normalize
         img_mean = image.mean()
         img_std = image.std()
         img_norm = (image.astype(np.float32) - img_mean) / (img_std + 1e-6)
+        img_norm = img_norm.astype(dtype)
 
         # 1. Intensity & Smooth (Multi-scale)
         features.append(img_norm.flatten())
-        features.append(gaussian_filter(img_norm, sigma=2.0).flatten())
-        features.append(gaussian_filter(img_norm, sigma=4.0).flatten())
+        features.append(gaussian_filter(img_norm.astype(np.float32), sigma=2.0).astype(dtype).flatten())
+        features.append(gaussian_filter(img_norm.astype(np.float32), sigma=4.0).astype(dtype).flatten())
 
         # 2. Spatial Coordinates (Normalized 0-1)
         z, y, x = np.mgrid[0:image.shape[0], 0:image.shape[1], 0:image.shape[2]]
-        features.append(z.flatten() / image.shape[0])
-        features.append(y.flatten() / image.shape[1])
-        features.append(x.flatten() / image.shape[2])
+        features.append((z.flatten() / image.shape[0]).astype(dtype))
+        features.append((y.flatten() / image.shape[1]).astype(dtype))
+        features.append((x.flatten() / image.shape[2]).astype(dtype))
 
         # 3. RSID (Texture) - Sparse but helpful
         # Keep shift small-ish but larger than cartilage
         # Downsample image for RSID to save memory? No, standard RSID.
-        rsid = compute_rsid_features(img_norm, num_shifts=20, max_shift=20, seed=42)
+        rsid = compute_rsid_features(img_norm, num_shifts=20, max_shift=20, seed=42, dtype=dtype)
         for i in range(rsid.shape[-1]):
              features.append(rsid[..., i].flatten())
 
         # 4. Auto-Context Probabilities
         if prob_map is not None:
             # prob_map is (Z, Y, X, C)
             for c in range(prob_map.shape[-1]):
-                p_ch = prob_map[..., c]
+                p_ch = prob_map[..., c].astype(dtype)
                 features.append(p_ch.flatten())
-                features.append(gaussian_filter(p_ch, sigma=2.0).flatten())
+                features.append(gaussian_filter(p_ch.astype(np.float32), sigma=2.0).astype(dtype).flatten())
 
         return np.stack(features, axis=1)
 
-    def train(self, images, labels, prob_maps=None, subsample=50000):
+    def train(self, images, labels, prob_maps=None, subsample=50000, dtype='float32'):
         """
         images: list of 3D arrays
         labels: list of 3D arrays (0=bg, 1=Femur, 2=FemCart, 3=Tibia, 4=TibCart)
@@ -113,7 +124,7 @@ def train(self, images, labels, prob_maps=None, subsample=50000):
             ])
 
             pm = prob_maps[i] if prob_maps else None
-            feats_flat = self.extract_bone_features(img, prob_map=pm) # (N_all, F)
+            feats_flat = self.extract_bone_features(img, prob_map=pm, target_dtype=dtype) # (N_all, F)
 
             # Convert 3D coords to 1D indices
             flat_indices = np.ravel_multi_index(coords.T, img.shape)
@@ -127,8 +138,8 @@ def train(self, images, labels, prob_maps=None, subsample=50000):
         print("    Fitting Bone Random Forest...")
         self.clf.fit(np.vstack(X_all), np.concatenate(y_all))
 
-    def predict(self, image, prob_map=None):
-        feats_flat = self.extract_bone_features(image, prob_map)
+    def predict(self, image, prob_map=None, dtype='float32'):
+        feats_flat = self.extract_bone_features(image, prob_map, target_dtype=dtype)
 
         # Predict in chunks to be safe? Or full.
         # 100GB RAM. Image ~10M voxels. Features ~30 floats -> 300MB * 4 = 1.2GB.
 
@@ -142,6 +142,15 @@
                     "type": "string",
                     "description": "Optional override for model directory."
                 },
+                "dtype": {
+                    "type": "string",
+                    "enum": [
+                        "float32",
+                        "bfloat16"
+                    ],
+                    "default": "float32",
+                    "description": "Data type for feature extraction and training. Use 'bfloat16' to reduce memory usage."
+                },
                 "n_jobs": {
                     "type": "integer",
                     "default": -1,
 
@@ -39,7 +39,7 @@ def compute_dts_from_landmarks(image_shape, landmarks_dict, spacing=None):
             dts[name] = np.full(image_shape, 200.0, dtype=np.float32)
     return dts
 
-def compute_rsid_features(image, num_shifts=30, max_shift=10, seed=42, mask=None):
+def compute_rsid_features(image, num_shifts=30, max_shift=10, seed=42, mask=None, dtype=np.float32):
     """
     Computes Random Shift Intensity Difference (RSID) features only at mask locations.
     Returns (num_masked_voxels, num_shifts) if mask provided, else full volume.
@@ -58,7 +58,7 @@ def compute_rsid_features(image, num_shifts=30, max_shift=10, seed=42, mask=None
             features.append(base_slice - padded[max_shift+dz : max_shift+z+dz,
                                                max_shift+dy : max_shift+y+dy,
                                                max_shift+dx : max_shift+x+dx])
-        return np.stack(features, axis=-1).astype(np.float32)
+        return np.stack(features, axis=-1).astype(dtype)
 
     # Masked computation
     if mask.ndim == 3:
@@ -70,7 +70,7 @@ def compute_rsid_features(image, num_shifts=30, max_shift=10, seed=42, mask=None
         raise ValueError("RSID requires a 3D mask for spatial optimization.")
 
     n_vox = len(mask_indices)
-    rsid_features = np.zeros((n_vox, num_shifts), dtype=np.float32)
+    rsid_features = np.zeros((n_vox, num_shifts), dtype=dtype)
 
     # Pad image once
     padded = np.pad(image, max_shift, mode='edge')
@@ -87,7 +87,7 @@ def compute_rsid_features(image, num_shifts=30, max_shift=10, seed=42, mask=None
 
     return rsid_features
 
-def compute_landmark_features(image_shape, landmarks_dict, indices_dict, mask=None, spacing=None):
+def compute_landmark_features(image_shape, landmarks_dict, indices_dict, mask=None, spacing=None, dtype=np.float32):
     """
     Computes distance to landmarks only at mask locations.
     Operates in Physical Space (MM) if spacing provided.
@@ -115,12 +115,17 @@ def compute_landmark_features(image_shape, landmarks_dict, indices_dict, mask=No
             if idx < len(points):
                 p = points[idx]
                 dist = np.sqrt(np.sum((coords - p)**2, axis=1))
-                features.append(dist.astype(np.float32))
+                features.append(dist.astype(dtype))
             else:
-                features.append(np.full(len(coords), 200.0, dtype=np.float32))
+                features.append(np.full(len(coords), 200.0, dtype=dtype))
     return features
 
 def compute_dt_arithmetic_features(dts, mask=None):
+    # Returns a list of float arrays; no dtype argument is taken here.
+    # The caller (extract_features) casts each returned array to the
+    # target dtype before appending it to the feature list.
+    # A dtype parameter would only save intermediate memory,
+    # so this function is intentionally left unchanged.
     features = []
     if 'femur' in dts and 'tibia' in dts:
         f = dts['femur']
@@ -139,32 +144,46 @@ def compute_dt_arithmetic_features(dts, mask=None):
         features.append(f - t) 
     return features
 
-def extract_features(image, dts, sigma=1.0, mask=None, r_shifts=30, landmarks_dict=None, landmark_indices=None, prob_map=None, spacing=None, sorted_bones_override=None):
+def extract_features(image, dts, sigma=1.0, mask=None, r_shifts=30, landmarks_dict=None, landmark_indices=None, prob_map=None, spacing=None, sorted_bones_override=None, target_dtype='float32'):
     """
-    Optimized feature extraction that avoids 4D intermediate arrays and respects masks.
+    Computes features with an optimized extraction that avoids 4D intermediate arrays and respects masks.
     """
+    import ml_dtypes
+    
+    # Resolve dtype
+    if isinstance(target_dtype, str):
+        if target_dtype == 'bfloat16':
+            dtype = ml_dtypes.bfloat16
+        else:
+            dtype = np.float32
+    else:
+        dtype = target_dtype
+        
     img_mean = image.mean()
     img_std = image.std()
+    
+    # Normalize and cast immediately to target dtype
     image_norm = (image.astype(np.float32) - img_mean) / (img_std + 1e-6)
+    image_norm = image_norm.astype(dtype)
 
     def get_masked(arr):
         if mask is not None:
             if mask.ndim == arr.ndim:
-                return arr[mask].astype(np.float32)
+                return arr[mask].astype(dtype)
             else:
-                return arr.flatten()[mask].astype(np.float32)
-        return arr.flatten().astype(np.float32)
+                return arr.flatten()[mask].astype(dtype)
+        return arr.flatten().astype(dtype)
 
     features = []
 
     # 1. Intensity
     features.append(get_masked(image_norm))
 
     # 2. Gaussian
-    features.append(get_masked(gaussian_filter(image_norm, sigma=sigma)))
+    features.append(get_masked(gaussian_filter(image_norm.astype(np.float32), sigma=sigma)))
 
     # 3. Gradient
-    features.append(get_masked(gaussian_gradient_magnitude(image_norm, sigma=sigma)))
+    features.append(get_masked(gaussian_gradient_magnitude(image_norm.astype(np.float32), sigma=sigma)))
 
     # 5. DTs
     if sorted_bones_override is None:
@@ -185,31 +204,31 @@ def get_masked(arr):
             # get_masked returns flat array of size N_masked
             # We can use shape from an existing feature (like Intensity)
             ref_shape = features[0].shape
-            features.append(np.full(ref_shape, 100.0, dtype=np.float32))
+            features.append(np.full(ref_shape, 100.0, dtype=dtype))
 
     # 6. DT Arithmetic
     # Only compute if 'femur' and 'tibia' are present/logic applies
     dt_arith = compute_dt_arithmetic_features(dts, mask=mask)
     for f in dt_arith:
         if mask is None:
-            features.append(f.flatten().astype(np.float32))
+            features.append(f.flatten().astype(dtype))
         else:
-            features.append(f.astype(np.float32))
+            features.append(f.astype(dtype))
 
     # 7. RSID (Mask-aware)
     if mask is not None and mask.ndim == 3:
-        rsid_masked = compute_rsid_features(image_norm, num_shifts=r_shifts, max_shift=10, mask=mask)
+        rsid_masked = compute_rsid_features(image_norm, num_shifts=r_shifts, max_shift=10, mask=mask, dtype=dtype)
         for i in range(rsid_masked.shape[1]):
             features.append(rsid_masked[:, i])
     else:
         # Fallback to full then mask (wasteful)
-        rsid_full = compute_rsid_features(image_norm, num_shifts=r_shifts, max_shift=10)
+        rsid_full = compute_rsid_features(image_norm, num_shifts=r_shifts, max_shift=10, dtype=dtype)
         for i in range(rsid_full.shape[-1]):
             features.append(get_masked(rsid_full[..., i]))
 
     # 8. Landmarks
     if landmarks_dict and landmark_indices:
-        lm_features = compute_landmark_features(image.shape, landmarks_dict, landmark_indices, mask=mask, spacing=spacing)
+        lm_features = compute_landmark_features(image.shape, landmarks_dict, landmark_indices, mask=mask, spacing=spacing, dtype=dtype)
         for f in lm_features:
             features.append(f)
 
@@ -220,15 +239,15 @@ def get_masked(arr):
 
         for p_ch in channels:
             features.append(get_masked(p_ch))
-            features.append(get_masked(gaussian_filter(p_ch, sigma=sigma)))
+            features.append(get_masked(gaussian_filter(p_ch.astype(np.float32), sigma=sigma)))
 
             # Context RSID
             if mask is not None and mask.ndim == 3:
-                rsid_p = compute_rsid_features(p_ch, num_shifts=15, max_shift=15, mask=mask)
+                rsid_p = compute_rsid_features(p_ch, num_shifts=15, max_shift=15, mask=mask, dtype=dtype)
                 for i in range(rsid_p.shape[1]):
                     features.append(rsid_p[:, i])
             else:
-                rsid_p_full = compute_rsid_features(p_ch, num_shifts=15, max_shift=15)
+                rsid_p_full = compute_rsid_features(p_ch, num_shifts=15, max_shift=15, dtype=dtype)
                 for i in range(rsid_p_full.shape[-1]):
                     features.append(get_masked(rsid_p_full[..., i]))
 
 
@@ -30,6 +30,9 @@ def inference_improved(config=None):
     target_bones = model_cfg.get('target_bones', ['femur', 'tibia'])
     out_cfg = config['output_config']
 
+    target_dtype = model_cfg.get('dtype', 'float32')
+    print(f"Target Bones: {target_bones}, dtype: {target_dtype}")
+    
     image_dir = data_cfg['image_directory']
     label_dir = data_cfg.get('label_directory') # Optional
     split_file = data_cfg.get('split_file') # Optional
@@ -110,10 +113,10 @@ def inference_improved(config=None):
         img, spacing = load_volume(img_path, return_spacing=True)
 
         # 1. Predict Bones (Pass 1)
-        _, prob1 = bone_rf_p1.predict(img)
+        _, prob1 = bone_rf_p1.predict(img, dtype=target_dtype)
 
         # 2. Predict Bones (Pass 2 - Auto-Context)
-        bone_pred_flat, _ = bone_rf_p2.predict(img, prob_map=prob1)
+        bone_pred_flat, _ = bone_rf_p2.predict(img, prob_map=prob1, dtype=target_dtype)
         bone_pred = bone_pred_flat.reshape(img.shape)
 
         bone_masks = {}
@@ -127,14 +130,14 @@ def inference_improved(config=None):
         if cart_rf_p1 is not None and cart_rf_p2 is not None:
              # 2. Predict Cartilage
              # Pass 1: Prob Map
-             c_prob1 = cart_rf_p1.predict_proba_map(img, bone_masks, proximity_mm=20.0)
+             c_prob1 = cart_rf_p1.predict_proba_map(img, bone_masks, proximity_mm=20.0, dtype=target_dtype)
 
              # Pass 2: Final Prediction (Auto-Context)
-             cart_pred, _ = cart_rf_p2.predict(img, bone_masks, proximity_mm=20.0, prob_map=c_prob1)
+             cart_pred, _ = cart_rf_p2.predict(img, bone_masks, proximity_mm=20.0, prob_map=c_prob1, dtype=target_dtype)
         elif cart_rf_p1 is not None:
              # Fallback to single pass if p2 missing (compatibility)
              print("Warning: Only P1 model found. Running single pass.")
-             cart_pred, _ = cart_rf_p1.predict(img, bone_masks, proximity_mm=20.0)
+             cart_pred, _ = cart_rf_p1.predict(img, bone_masks, proximity_mm=20.0, dtype=target_dtype)
 
         # 3. Evaluate
         if lbl is not None: