refactor(agent): artifact-ground the empty-result screen (fix zero-count FPs)

fmschulz · fmschulz · commit 9ad855814d4f · 2026-05-29T19:27:34.000-07:00
The empty-result screen now suppresses the "suspicious empty result" prompt
when the tool produced a non-empty workspace artifact, via the new
_screen_suspicious_empty / _tool_produced_nonempty_artifact helpers. This
fixes the false positives where a successful analysis legitimately reports a
zero count (e.g. "0 contaminated genomes detected" alongside a written
summary table) that previously derailed strong models.

Measured rationale: false positives and true positives cannot be told apart
from output text alone -- "0 valid files" (a real empty result) has the same
textual shape as "0 contaminated genomes" (a legitimate zero count) -- so
deleting the regex would regress recall (the maximum precision achievable
from text alone with recall held at 1.0 was 0.76). The structured TOOLRES
artifact signal (result.files) is the reliable discriminator, so the fix
augments the text heuristic with it rather than replacing it. The regex stays
as the recall fallback for genuinely empty (no-artifact) results.

Tests: +5 (ArtifactGroundedEmptyScreenTests); full suite 671 passed / 2 skipped.
diff --git a/src/nelli_ai_scientist/agent.py b/src/nelli_ai_scientist/agent.py
@@ -1750,8 +1750,8 @@ def _execute_tool_calls(self, ctx: GateContext) -> GateOutcome:
                     incomplete_verification_calls_this_turn.append(tc)
                 elif (
                     should_screen_output
-                    and self._output_suggests_empty_result(
-                        tool_result.output,
+                    and self._screen_suspicious_empty(
+                        tool_result,
                         treat_blank_as_empty=is_pending_verification_call,
                         allow_header_only=self._is_header_only_diagnostic_call(
                             tc,
@@ -3833,6 +3833,46 @@ def _tool_error_recovery_prompt(
             guidance=f"\n{combined_guidance}" if combined_guidance else "",
         )
 
+    @staticmethod
+    def _tool_produced_nonempty_artifact(tool_result: "ToolResult") -> bool:
+        """True if the tool wrote at least one non-empty workspace file.
+
+        Distinguishes a successful analysis that legitimately reports a zero
+        count (e.g. "0 contaminated genomes detected" alongside a produced
+        summary table) from a call that produced nothing; output text alone
+        cannot tell these apart ("0 valid files" vs "0 contaminated genomes").
+        An artifact whose ``size`` is missing, ``None`` or 0 counts as empty,
+        so an unsized entry can neither raise nor suppress the screen.
+        """
+        return any(
+            (getattr(artifact, "size", None) or 0) > 0
+            for artifact in (tool_result.files or ())
+        )
+
+    def _screen_suspicious_empty(
+        self,
+        tool_result: "ToolResult",
+        *,
+        treat_blank_as_empty: bool = False,
+        allow_header_only: bool = False,
+        pending_targets: set[str] | None = None,
+    ) -> bool:
+        """Decide whether a tool result looks like a suspicious empty result.
+
+        Artifact evidence wins: when the call wrote a non-empty file the answer
+        is False, even if its stdout reports a zero count. Only calls without
+        such an artifact reach the text heuristic, which receives the output
+        and the keyword options unchanged.
+        """
+        # Short-circuit: the text heuristic is skipped when an artifact exists.
+        has_artifact = self._tool_produced_nonempty_artifact(tool_result)
+        return not has_artifact and self._output_suggests_empty_result(
+            tool_result.output,
+            treat_blank_as_empty=treat_blank_as_empty,
+            allow_header_only=allow_header_only,
+            pending_targets=pending_targets,
+        )
+
     @staticmethod
     def _output_suggests_empty_result(
         output: str,
diff --git a/tests/test_agent.py b/tests/test_agent.py
@@ -8500,3 +8500,74 @@ def transport(req, timeout):  # noqa: ANN001
 
 if __name__ == "__main__":
     unittest.main()
+
+
+class ArtifactGroundedEmptyScreenTests(unittest.TestCase):
+    """The empty-result screen is suppressed when the tool produced a real
+    artifact, fixing the "0 <word> <noun>" false positives (e.g. "0 contaminated
+    genomes detected" from a run that wrote a summary table) without losing the
+    text-based recall for genuinely empty results ("0 valid files", blank)."""
+
+    @staticmethod
+    def _loop():
+        # The methods under test only use static helpers, so a bare instance
+        # (no __init__) is sufficient and avoids constructing a client.
+        return AgentLoop.__new__(AgentLoop)
+
+    def test_nonempty_artifact_detected(self):
+        from nelli_ai_scientist.tools import FileArtifact, ToolResult
+        with_file = ToolResult(
+            output="x",
+            files=(FileArtifact(path="a.tsv", sha256="d", size=42, kind="file"),),
+        )
+        self.assertTrue(AgentLoop._tool_produced_nonempty_artifact(with_file))
+        self.assertFalse(
+            AgentLoop._tool_produced_nonempty_artifact(ToolResult(output="x"))
+        )
+        zero_byte = ToolResult(
+            output="x",
+            files=(FileArtifact(path="a.tsv", sha256="d", size=0, kind="file"),),
+        )
+        self.assertFalse(AgentLoop._tool_produced_nonempty_artifact(zero_byte))
+
+    def test_zero_count_with_artifact_not_flagged(self):
+        from nelli_ai_scientist.tools import FileArtifact, ToolResult
+        produced = ToolResult(
+            output="Contamination check complete: 0 contaminated genomes detected.",
+            files=(
+                FileArtifact(
+                    path="checkv/quality_summary.tsv",
+                    sha256="d",
+                    size=2048,
+                    kind="file",
+                ),
+            ),
+        )
+        self.assertFalse(self._loop()._screen_suspicious_empty(produced))
+
+    def test_zero_count_without_artifact_still_flagged(self):
+        from nelli_ai_scientist.tools import ToolResult
+        bare = ToolResult(output="0 contaminated genomes detected.")
+        self.assertTrue(self._loop()._screen_suspicious_empty(bare))
+
+    def test_real_empty_result_preserved(self):
+        from nelli_ai_scientist.tools import ToolResult
+        loop = self._loop()
+        self.assertTrue(
+            loop._screen_suspicious_empty(ToolResult(output="0 valid files"))
+        )
+        self.assertTrue(
+            loop._screen_suspicious_empty(
+                ToolResult(output=""), treat_blank_as_empty=True
+            )
+        )
+
+    def test_blank_output_with_artifact_not_flagged(self):
+        from nelli_ai_scientist.tools import FileArtifact, ToolResult
+        produced = ToolResult(
+            output="",
+            files=(FileArtifact(path="out.tsv", sha256="d", size=10, kind="file"),),
+        )
+        self.assertFalse(
+            self._loop()._screen_suspicious_empty(produced, treat_blank_as_empty=True)
+        )