experiment with larger penalties (#19)

codekansas · web-flow · commit f4cf46ea7a59 · 2025-05-06T01:44:06.000-04:00
* experiment with larger penalties

* also penalize hip roll and hip yaw

* revert to previous scales

* bigger rewards

* reduce max speed

* update name

* misc cleanup

* update checkpoint
diff --git a/assets/ckpt.bin b/assets/ckpt.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4938765b0e9776e7800557bd118b209034afffc403959f15386b903c82d7d1b6
-size 12137848
+oid sha256:d68aabccd60572ebd0b3225e41c68da8da948d6d77737875ade95f2dfa21a636
+size 12129864
diff --git a/train.py b/train.py
@@ -49,45 +49,72 @@
 
 
 @attrs.define(frozen=True, kw_only=True)
-class BentArmPenalty(ksim.Reward):
-    arm_indices: tuple[int, ...] = attrs.field()
-    arm_targets: tuple[float, ...] = attrs.field()
+class JointPositionPenalty(ksim.JointDeviationPenalty):
+    @classmethod
+    def create_from_names(
+        cls,
+        names: list[str],
+        physics_model: ksim.PhysicsModel,
+        scale: float = -1.0,
+        scale_by_curriculum: bool = False,
+    ) -> Self:
+        zeros = {k: v for k, v in ZEROS}
+        joint_targets = [zeros[name] for name in names]
+
+        return cls.create(
+            physics_model=physics_model,
+            joint_names=tuple(names),
+            joint_targets=tuple(joint_targets),
+            scale=scale,
+            scale_by_curriculum=scale_by_curriculum,
+        )
 
-    def get_reward(self, trajectory: ksim.Trajectory) -> Array:
-        qpos = trajectory.qpos[..., self.arm_indices]
-        qpos_targets = jnp.array(self.arm_targets)
-        qpos_diff = qpos - qpos_targets
-        return xax.get_norm(qpos_diff, "l1").mean(axis=-1)
 
+@attrs.define(frozen=True, kw_only=True)
+class BentArmPenalty(JointPositionPenalty):
     @classmethod
-    def create(
+    def create_penalty(
         cls,
-        model: ksim.PhysicsModel,
-        scale: float,
+        physics_model: ksim.PhysicsModel,
+        scale: float = -1.0,
         scale_by_curriculum: bool = False,
     ) -> Self:
-        qpos_mapping = ksim.get_qpos_data_idxs_by_name(model)
-
-        names = [
-            "dof_right_shoulder_pitch_03",
-            "dof_right_shoulder_roll_03",
-            "dof_right_shoulder_yaw_02",
-            "dof_right_elbow_02",
-            "dof_right_wrist_00",
-            "dof_left_shoulder_pitch_03",
-            "dof_left_shoulder_roll_03",
-            "dof_left_shoulder_yaw_02",
-            "dof_left_elbow_02",
-            "dof_left_wrist_00",
-        ]
+        return cls.create_from_names(
+            names=[
+                "dof_right_shoulder_pitch_03",
+                "dof_right_shoulder_roll_03",
+                "dof_right_shoulder_yaw_02",
+                "dof_right_elbow_02",
+                "dof_right_wrist_00",
+                "dof_left_shoulder_pitch_03",
+                "dof_left_shoulder_roll_03",
+                "dof_left_shoulder_yaw_02",
+                "dof_left_elbow_02",
+                "dof_left_wrist_00",
+            ],
+            physics_model=physics_model,
+            scale=scale,
+            scale_by_curriculum=scale_by_curriculum,
+        )
 
-        zeros = {k: v for k, v in ZEROS}
-        arm_indices = [qpos_mapping[name][0] for name in names]
-        arm_targets = [zeros[name] for name in names]
 
-        return cls(
-            arm_indices=tuple(arm_indices),
-            arm_targets=tuple(arm_targets),
+@attrs.define(frozen=True, kw_only=True)
+class StraightLegPenalty(JointPositionPenalty):
+    @classmethod
+    def create_penalty(
+        cls,
+        physics_model: ksim.PhysicsModel,
+        scale: float = -1.0,
+        scale_by_curriculum: bool = False,
+    ) -> Self:
+        return cls.create_from_names(
+            names=[
+                "dof_left_hip_roll_03",
+                "dof_left_hip_yaw_03",
+                "dof_right_hip_roll_03",
+                "dof_right_hip_yaw_03",
+            ],
+            physics_model=physics_model,
             scale=scale,
             scale_by_curriculum=scale_by_curriculum,
         )
@@ -436,16 +463,14 @@ def get_rewards(self, physics_model: ksim.PhysicsModel) -> list[ksim.Reward]:
             ksim.UprightReward(index="x", inverted=False, scale=0.1),
             # Normalization penalties.
             ksim.ActionInBoundsReward.create(physics_model, scale=0.01),
-            ksim.ActionSmoothnessPenalty(scale=-0.01),
-            ksim.ActuatorJerkPenalty(ctrl_dt=self.config.ctrl_dt, scale=-0.001),
-            ksim.ActuatorRelativeForcePenalty.create(physics_model, scale=-0.001),
-            ksim.AngularVelocityPenalty(index="x", scale=-0.0005),
-            ksim.AngularVelocityPenalty(index="y", scale=-0.0005),
-            ksim.AngularVelocityPenalty(index="z", scale=-0.0005),
-            ksim.LinearVelocityPenalty(index="y", scale=-0.0005),
-            ksim.LinearVelocityPenalty(index="z", scale=-0.0005),
+            ksim.AngularVelocityPenalty(index="x", scale=-0.005),
+            ksim.AngularVelocityPenalty(index="y", scale=-0.005),
+            ksim.AngularVelocityPenalty(index="z", scale=-0.005),
+            ksim.LinearVelocityPenalty(index="y", scale=-0.005),
+            ksim.LinearVelocityPenalty(index="z", scale=-0.005),
             # Bespoke rewards.
-            BentArmPenalty.create(physics_model, scale=-0.01),
+            BentArmPenalty.create_penalty(physics_model, scale=-0.1),
+            StraightLegPenalty.create_penalty(physics_model, scale=-0.01),
         ]
 
     def get_terminations(self, physics_model: ksim.PhysicsModel) -> list[ksim.Termination]: