Skip to content

Commit 9ad8558

Browse files
committed
refactor(agent): artifact-ground the empty-result screen (fix zero-count FPs)
The empty-result screen now suppresses the "suspicious empty result" prompt when the tool produced a non-empty workspace artifact, via the new _screen_suspicious_empty / _tool_produced_nonempty_artifact helpers. This fixes the false positives where a successful analysis legitimately reports a zero count (e.g. "0 contaminated genomes detected" alongside a written summary table) that previously derailed strong models. Measured rationale: the FP-vs-TP distinction is impossible from output text alone -- "0 valid files" (a real empty result) is textually identical to "0 contaminated genomes" -- so deleting the regex would regress recall (max text precision with recall held at 1.0 was 0.76). The structured TOOLRES artifact signal (result.files) is the reliable discriminator, so the fix augments the text heuristic with it rather than replacing it. The regex stays as the recall fallback for genuinely empty (no-artifact) results. Tests: +5 (ArtifactGroundedEmptyScreenTests); full suite 671 passed / 2 skipped.
1 parent 20c9fc0 commit 9ad8558

2 files changed

Lines changed: 113 additions & 2 deletions

File tree

src/nelli_ai_scientist/agent.py

Lines changed: 42 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1750,8 +1750,8 @@ def _execute_tool_calls(self, ctx: GateContext) -> GateOutcome:
17501750
incomplete_verification_calls_this_turn.append(tc)
17511751
elif (
17521752
should_screen_output
1753-
and self._output_suggests_empty_result(
1754-
tool_result.output,
1753+
and self._screen_suspicious_empty(
1754+
tool_result,
17551755
treat_blank_as_empty=is_pending_verification_call,
17561756
allow_header_only=self._is_header_only_diagnostic_call(
17571757
tc,
@@ -3833,6 +3833,46 @@ def _tool_error_recovery_prompt(
38333833
guidance=f"\n{combined_guidance}" if combined_guidance else "",
38343834
)
38353835

3836+
@staticmethod
def _tool_produced_nonempty_artifact(tool_result: "ToolResult") -> bool:
    """Return True if the tool wrote at least one non-empty workspace file.

    Distinguishes a successful analysis that legitimately reports a zero
    count (e.g. "0 contaminated genomes detected" alongside a produced
    summary table) from a call that produced nothing. Output-text screening
    cannot tell these apart -- "0 valid files" is textually identical to
    "0 contaminated genomes" -- so the structured artifact signal is the
    only reliable discriminator.

    Args:
        tool_result: Result whose ``files`` attribute is an iterable of
            artifacts, or a falsy value (``None``/empty) when nothing was
            recorded.

    Returns:
        True when any artifact reports a positive numeric ``size``. An
        artifact whose ``size`` is missing, ``None`` or otherwise
        non-numeric counts as empty, so the caller falls back to the text
        heuristic instead of failing on the comparison.
    """
    for artifact in tool_result.files or ():
        size = getattr(artifact, "size", 0)
        # The getattr default only covers a missing attribute; an explicit
        # ``size=None`` would make ``size > 0`` raise TypeError, so check the
        # type before comparing.
        if isinstance(size, (int, float)) and size > 0:
            return True
    return False
3851+
3852+
def _screen_suspicious_empty(
    self,
    tool_result: "ToolResult",
    *,
    treat_blank_as_empty: bool = False,
    allow_header_only: bool = False,
    pending_targets: set[str] | None = None,
) -> bool:
    """Decide whether a tool result should be screened as suspiciously empty.

    The structured artifact signal is consulted first: when the call wrote
    a non-empty artifact the result is never flagged, even if its stdout
    reports a zero count. Only when no such artifact exists is the decision
    delegated to the text heuristic, which receives the result's output and
    the keyword options unchanged.

    Args:
        tool_result: Result of the tool call being screened.
        treat_blank_as_empty: Forwarded to the text heuristic.
        allow_header_only: Forwarded to the text heuristic.
        pending_targets: Forwarded to the text heuristic.

    Returns:
        False when a non-empty artifact was produced; otherwise whatever
        the text heuristic returns for ``tool_result.output``.
    """
    has_backing_artifact = self._tool_produced_nonempty_artifact(tool_result)
    if not has_backing_artifact:
        # No artifact to vouch for the run: fall back to output-text screening.
        heuristic_options = {
            "treat_blank_as_empty": treat_blank_as_empty,
            "allow_header_only": allow_header_only,
            "pending_targets": pending_targets,
        }
        return self._output_suggests_empty_result(
            tool_result.output,
            **heuristic_options,
        )
    return False
3875+
38363876
@staticmethod
38373877
def _output_suggests_empty_result(
38383878
output: str,

tests/test_agent.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8500,3 +8500,74 @@ def transport(req, timeout): # noqa: ANN001
85008500

85018501
class ArtifactGroundedEmptyScreenTests(unittest.TestCase):
    """The empty-result screen is suppressed when the tool produced a real
    artifact, fixing the "0 <word> <noun>" false positives (e.g. "0 contaminated
    genomes detected" from a run that wrote a summary table) without losing the
    text-based recall for genuinely empty results ("0 valid files", blank)."""

    @staticmethod
    def _loop():
        # The methods under test only use static helpers, so a bare instance
        # (no __init__) is sufficient and avoids constructing a client.
        return AgentLoop.__new__(AgentLoop)

    def test_nonempty_artifact_detected(self):
        from nelli_ai_scientist.tools import FileArtifact, ToolResult

        with_file = ToolResult(
            output="x",
            files=(FileArtifact(path="a.tsv", sha256="d", size=42, kind="file"),),
        )
        self.assertTrue(AgentLoop._tool_produced_nonempty_artifact(with_file))
        # No files recorded at all.
        self.assertFalse(
            AgentLoop._tool_produced_nonempty_artifact(ToolResult(output="x"))
        )
        # A file was recorded but it is zero bytes long.
        zero_byte = ToolResult(
            output="x",
            files=(FileArtifact(path="a.tsv", sha256="d", size=0, kind="file"),),
        )
        self.assertFalse(AgentLoop._tool_produced_nonempty_artifact(zero_byte))

    def test_zero_count_with_artifact_not_flagged(self):
        from nelli_ai_scientist.tools import FileArtifact, ToolResult

        produced = ToolResult(
            output="Contamination check complete: 0 contaminated genomes detected.",
            files=(
                FileArtifact(
                    path="checkv/quality_summary.tsv",
                    sha256="d",
                    size=2048,
                    kind="file",
                ),
            ),
        )
        self.assertFalse(self._loop()._screen_suspicious_empty(produced))

    def test_zero_count_without_artifact_still_flagged(self):
        from nelli_ai_scientist.tools import ToolResult

        bare = ToolResult(output="0 contaminated genomes detected.")
        self.assertTrue(self._loop()._screen_suspicious_empty(bare))

    def test_real_empty_result_preserved(self):
        from nelli_ai_scientist.tools import ToolResult

        loop = self._loop()
        self.assertTrue(
            loop._screen_suspicious_empty(ToolResult(output="0 valid files"))
        )
        self.assertTrue(
            loop._screen_suspicious_empty(
                ToolResult(output=""), treat_blank_as_empty=True
            )
        )

    def test_blank_output_with_artifact_not_flagged(self):
        from nelli_ai_scientist.tools import FileArtifact, ToolResult

        produced = ToolResult(
            output="",
            files=(FileArtifact(path="out.tsv", sha256="d", size=10, kind="file"),),
        )
        self.assertFalse(
            self._loop()._screen_suspicious_empty(produced, treat_blank_as_empty=True)
        )


# Keep the script entry point last: unittest.main() exits the interpreter once
# the run finishes, so a TestCase defined below this guard would never be
# created (or run) when the file is executed directly.
if __name__ == "__main__":
    unittest.main()

0 commit comments

Comments
 (0)