From 33f38ca147a9dad35c739a6eeae09008a94e6eed Mon Sep 17 00:00:00 2001 From: yangfei222666-9 <261852489+yangfei222666-9@users.noreply.github.com> Date: Wed, 24 Jun 2026 23:52:02 +0800 Subject: [PATCH 1/2] docs: add openai codex application drafts --- .../cover-note-one-page.md | 12 +++ .../openai-applied-ai-codex/cover-note.md | 50 +++++++++++ .../openai-applied-ai-codex/resume-draft.md | 88 +++++++++++++++++++ .../resume-one-page.md | 53 +++++++++++ 4 files changed, 203 insertions(+) create mode 100644 docs/career/openai-applied-ai-codex/cover-note-one-page.md create mode 100644 docs/career/openai-applied-ai-codex/cover-note.md create mode 100644 docs/career/openai-applied-ai-codex/resume-draft.md create mode 100644 docs/career/openai-applied-ai-codex/resume-one-page.md diff --git a/docs/career/openai-applied-ai-codex/cover-note-one-page.md b/docs/career/openai-applied-ai-codex/cover-note-one-page.md new file mode 100644 index 0000000..2f2bf91 --- /dev/null +++ b/docs/career/openai-applied-ai-codex/cover-note-one-page.md @@ -0,0 +1,12 @@ +# OpenAI Applied AI Engineer, Codex Core Agent - Cover Note + +Dear OpenAI Codex team, + +I am applying for the Applied AI Engineer, Codex Core Agent role because my strongest recent work is about agent reliability: preventing AI-agent workflows from claiming "done" without evidence. + +My current public proof is an Agent Reliability False-Pass Gate. It blocks unsupported success claims when passing-evidence pointers or explicit `cannot_claim` boundaries are missing. While building it, I found and fixed a real false-pass issue: an empty or missing fixture directory could make a self-test appear successful with `self_test=PASS cases=0`. I hardened the gate, added negative tests, and published the evidence path through merged PRs and a reviewer-readable proof page. + +This experience is relevant to Codex work on evals, failure modes, edge cases, and dependable completion of software-engineering tasks. 
My work is local and narrow in scope; I am not claiming production-scale eval infrastructure, but it shows how I think about turning model behavior into dependable systems. + +Best, +Yang Fei (Xiaojiu) diff --git a/docs/career/openai-applied-ai-codex/cover-note.md b/docs/career/openai-applied-ai-codex/cover-note.md new file mode 100644 index 0000000..21b84de --- /dev/null +++ b/docs/career/openai-applied-ai-codex/cover-note.md @@ -0,0 +1,50 @@ +# OpenAI Applied AI Engineer, Codex Core Agent - Cover Note Draft + +Target role: Applied AI Engineer, Codex Core Agent +Candidate positioning: AI Agent Reliability / Evals / Developer Tools Engineer +Contact: [insert private contact details only in final application copy] + +## Short Cover Note + +Dear OpenAI Codex team, + +I am applying for the Applied AI Engineer, Codex Core Agent role because my strongest recent work is about agent reliability: preventing AI-agent workflows from claiming "done" without evidence. + +My current public proof is an Agent Reliability False-Pass Gate. It blocks unsupported success claims when passing-evidence pointers or explicit `cannot_claim` boundaries are missing. While building it, I found and fixed a real false-pass issue: an empty or missing fixture directory could make a self-test appear successful with `self_test=PASS cases=0`. I hardened the gate, added negative tests, and published the evidence path through merged PRs and a reviewer-readable proof page. + +This experience is relevant to Codex work on evals, failure modes, edge cases, and dependable completion of software-engineering tasks. My work is local and narrow in scope; I am not claiming production-scale eval infrastructure, but it shows how I think about turning model behavior into dependable systems. 
+ +Best, +Yang Fei (Xiaojiu) + +## Longer Version + +Dear OpenAI Codex team, + +I am interested in the Applied AI Engineer, Codex Core Agent role because I have been working on a concrete reliability problem: how to make AI-agent closeouts inspectable enough that a reviewer can trust the evidence instead of the claim. + +My recent public work is an Agent Reliability False-Pass Gate. The premise is simple: an agent saying "done" is not evidence. The gate checks whether a success claim has passing-evidence pointers and explicit `cannot_claim` boundaries. If those are missing, it blocks the closeout instead of allowing unsupported success language. + +The most useful part of the project was catching a false-pass inside the gate itself. A missing or empty fixture directory could produce a fake pass with zero cases. I changed the behavior to fail closed, added regression coverage, and kept the proof narrow: local validation, merged PR evidence, remote CI status, provider output, and production claims are not collapsed into one generic success state. + +I also built a candidate-review bridge for sanitized agent summaries and locked its provider/model boundary. The goal was not to add another model for its own sake. The goal was to preserve model review as advisory while keeping local verification, GitHub evidence, and human approval as separate gates. + +I believe this experience is relevant to Codex agent work because reliability is not only about model capability. It is also about eval design, failure modes, edge cases, closeout discipline, and the systems that make the difference between an impressive demo and a dependable tool. + +Best, +Yang Fei (Xiaojiu) + +## Role Match Notes + +- OpenAI role theme: improve Codex agents from impressive demos into dependable tools. +- Candidate proof theme: false-pass prevention for AI-agent closeouts. +- OpenAI role theme: evals, failure modes, edge cases, robustness, real-world coding tasks. 
+- Candidate proof theme: fail-closed validation, negative tests, zero-case fixture fix, explicit `cannot_claim` state. +- OpenAI role theme: define what good completion looks like for agents handling complex tasks. +- Candidate proof theme: a task is not complete until evidence, validation, and limitations are visible. + +## Final Submission Gaps + +- Insert private contact details only in the final application copy. +- Confirm the exact application destination before transmitting any personal details. +- Keep the proof language narrow: local and GitHub-level evidence, not production readiness or customer validation. diff --git a/docs/career/openai-applied-ai-codex/resume-draft.md b/docs/career/openai-applied-ai-codex/resume-draft.md new file mode 100644 index 0000000..7474abb --- /dev/null +++ b/docs/career/openai-applied-ai-codex/resume-draft.md @@ -0,0 +1,88 @@ +# Yang Fei (Xiaojiu) - OpenAI Codex Applied AI Resume Draft + +Johor Bahru, Malaysia | Email: [private email] | Phone: [private phone] | LinkedIn: [add LinkedIn] | GitHub: https://github.com/yangfei222666-9 +AI Agent Reliability / Evals / Developer Tools Engineer + +## Target Role + +Applied AI Engineer, Codex Core Agent +Official role page: https://openai.com/careers/applied-ai-engineer-codex-core-agent-san-francisco/ + +This draft is local prep material. It should be converted into a private final application copy before submission. + +## Summary + +AI agent reliability engineer focused on evals, failure analysis, and evidence-gated workflows. I build reproducible checks that prevent unsupported "done" claims, preserve explicit uncertainty boundaries, and separate local validation, remote CI, provider output, and canonical truth. + +My current public proof is narrow and inspectable: a False-Pass Gate, a zero-case validation fix, a provider-locked candidate-review bridge, and a reviewer-readable proof page merged to GitHub main with CI evidence. 
I am interested in Codex work that turns agent capability into dependable completion of real software-engineering tasks. + +## Selected Engineering Work + +### Agent Reliability False-Pass Gate | Python, Pytest, GitHub Actions | 2026 + +- Built a schema-level evidence gate that rejects AI-agent success claims when required passing-evidence pointers or explicit `cannot_claim` boundaries are missing. +- Identified and fixed a zero-case validation flaw where missing or empty fixtures could incorrectly produce `self_test=PASS cases=0`; added fail-closed regression coverage. +- Designed a provider-locked candidate-review bridge with sanitized stdin-only inputs, credential isolation, no repository reads, and explicitly non-canonical model output. +- Published the implementation, reviewer guide, limitations, and reproducible validation path to GitHub main with remote CI passing. + +### Product Spine / Reliability Tooling | Python, TypeScript, CI/CD | 2026 + +- Built and maintained evidence-first workflow artifacts that distinguish local evidence, remote CI, provider output, and canonical truth before making completion claims. +- Added proof-index and reviewer-facing documentation so claims can be inspected by status, evidence command, limitation, and `cannot_claim` boundary. +- Used GitHub PR, CI, local regression tests, and closeout records as separate gates instead of treating a local pass as final truth. +- Practiced fail-closed review behavior: blocked or partial states are preserved instead of being rewritten into unsupported success language. + +## Technical Skills + +- Languages and tools: Python, TypeScript, Bash, Git, GitHub CLI, JSON, Markdown. +- Testing and validation: Pytest, CLI self-tests, regression fixtures, CI/CD, GitHub Actions, proof-index validation. +- AI-agent reliability: false-pass prevention, evidence gates, agent closeout review, provider-output boundaries, `cannot_claim` handling. 
+- Developer tooling: command-line validators, reviewer docs, reproducible local setup, Git evidence hygiene. + +## Experience + +### Independent AI Systems Engineer | March 2026 - Present + +- Built public proof around AI-agent reliability, false-pass prevention, and evidence-gated task closeouts. +- Converted agent workflow failures into testable validation rules, proof documents, and reviewer-readable engineering artifacts. +- Maintained strict boundaries around credentials, provider output, local validation, remote CI, and human approval. + +### Prior Experience + +[Add verified prior company, role, dates, and measurable bullets before final submission. Do not invent experience.] + +## Education + +[Add verified school, degree, field, and dates before final submission.] + +## Selected Links + +- Agent Reliability proof: https://github.com/yangfei222666-9/taiji/blob/main/docs/portfolio/agent-reliability-proof.md +- Main repository: https://github.com/yangfei222666-9/taiji +- One-line technical case: Agent said done. Where is the evidence? + +## Interview Stories + +### 1. Zero-case false-pass bug + +A validator designed to block false-pass behavior could itself pass with zero cases when fixtures were missing or empty. I changed the behavior to fail closed, added regression coverage, and documented the limitation so reviewers could inspect the evidence. + +### 2. Local pass is not remote truth + +The project keeps local validation, remote CI, provider output, and canonical truth as separate states. This prevents a common AI-agent failure mode where a local check becomes an unsupported completion claim. + +### 3. Provider output remains advisory + +The candidate-review bridge uses sanitized input and explicit provider/model boundaries, but model output is not treated as canonical. The final gate remains evidence, CI, and human approval. + +## Final Submission Gaps + +- Add private contact details in the final application copy only. +- Add verified education. 
+- Add verified prior work experience, if applicable. +- Confirm work authorization and relocation/sponsorship language for the specific application form. +- Convert to a clean PDF only after private details and final links are confirmed. + +## Boundaries + +This material should not claim deployed-system readiness, external customer adoption, third-party endorsement, fleet-scale sandboxing, production eval infrastructure, or self-updating agent authority. diff --git a/docs/career/openai-applied-ai-codex/resume-one-page.md b/docs/career/openai-applied-ai-codex/resume-one-page.md new file mode 100644 index 0000000..9503ca9 --- /dev/null +++ b/docs/career/openai-applied-ai-codex/resume-one-page.md @@ -0,0 +1,53 @@ +# Yang Fei (Xiaojiu) + +Johor Bahru, Malaysia | Email: [private email] | LinkedIn: [add LinkedIn] | GitHub: https://github.com/yangfei222666-9 +AI Agent Reliability / Evals / Developer Tools Engineer + +## Summary + +AI agent reliability engineer focused on evals, failure analysis, and evidence-gated workflows. I build reproducible checks that prevent unsupported "done" claims, preserve explicit uncertainty boundaries, and separate local validation, remote CI, provider output, and canonical truth. Recent public work includes a merged False-Pass Gate with fail-closed regression coverage and reviewer-readable proof. + +## Selected Engineering Work + +### Agent Reliability False-Pass Gate | Python, Pytest, GitHub Actions | 2026 + +- Built a schema-level evidence gate that rejects AI-agent success claims when required passing-evidence pointers or explicit `cannot_claim` boundaries are missing. +- Identified and fixed a zero-case validation flaw where missing or empty fixtures could incorrectly produce `self_test=PASS cases=0`; added fail-closed regression coverage. +- Designed a provider-locked candidate-review bridge with sanitized stdin-only inputs, credential isolation, no repository reads, and explicitly non-canonical model output. 
+- Published the implementation, reviewer guide, limitations, and reproducible validation path to GitHub main with remote CI passing. + +### Product Spine / Reliability Tooling | Python, TypeScript, CI/CD | 2026 + +- Built and maintained evidence-first workflow artifacts that distinguish local evidence, remote CI, provider output, and canonical truth before making completion claims. +- Added proof-index and reviewer-facing documentation so project claims can be inspected by status, command, limitation, and `cannot_claim` boundary. +- Used GitHub PR, CI, local regression tests, and closeout records as separate gates instead of treating a local pass as final truth. + +## Technical Skills + +Python, Pytest, CLI validation tools, TypeScript, GitHub Actions, CI/CD, Git, JSON evidence manifests, LLM/API boundary design, agent workflow auditing, eval-style regression checks, developer documentation. + +## Experience + +### Independent AI Systems Engineer | March 2026 - Present + +- Built public proof around AI-agent reliability, false-pass prevention, and evidence-gated task closeouts. +- Converted agent workflow failures into testable validation rules, proof documents, and reviewer-readable engineering artifacts. +- Maintained strict boundaries around credentials, provider output, local validation, remote CI, and human approval. + +### Prior Experience + +[Add verified prior company, role, dates, and 2-3 measurable bullets before final submission. Do not invent experience.] + +## Education + +[Add verified school, degree, field, and dates before final submission.] + +## Selected Links + +- Portfolio proof: https://github.com/yangfei222666-9/taiji/blob/main/docs/portfolio/agent-reliability-proof.md +- Main repository: https://github.com/yangfei222666-9/taiji +- Target technical case: Agent said done. Where is the evidence? + +## Boundaries + +Current public proof supports local and GitHub-level evidence for false-pass prevention. 
It does not claim deployed-system readiness, external customer adoption, third-party endorsement, fleet-scale sandboxing, or production eval infrastructure. From 88ef79662f3066b8d608c2c3023c4fd69a5e892c Mon Sep 17 00:00:00 2001 From: yangfei222666-9 <261852489+yangfei222666-9@users.noreply.github.com> Date: Thu, 25 Jun 2026 01:04:24 +0800 Subject: [PATCH 2/2] docs: add codex reliability gap map --- data/codex-reliability-gap-map-01.json | 923 ++++++++++++++++++ docs/research/codex-reliability-gap-map-01.md | 136 +++ scripts/check_codex_gap_map.py | 129 +++ tests/test_codex_gap_map.py | 46 + 4 files changed, 1234 insertions(+) create mode 100644 data/codex-reliability-gap-map-01.json create mode 100644 docs/research/codex-reliability-gap-map-01.md create mode 100755 scripts/check_codex_gap_map.py create mode 100644 tests/test_codex_gap_map.py diff --git a/data/codex-reliability-gap-map-01.json b/data/codex-reliability-gap-map-01.json new file mode 100644 index 0000000..bf17dcb --- /dev/null +++ b/data/codex-reliability-gap-map-01.json @@ -0,0 +1,923 @@ +{ + "schema": "codex_reliability_gap_map_01_v1", + "title": "Codex Reliability Gap Map #01", + "subtitle": "A Scoped Review of Public Coding-Agent Failure Reports", + "snapshot_date": "2026-06-24", + "snapshot_cutoff": "2026-06-24T23:59:59Z", + "source_repo": "openai/codex", + "source_query": "https://api.github.com/repos/openai/codex/issues?state=open&per_page=100&sort=created&direction=desc", + "sample_size": 30, + "sample_type": "deterministic_non_random_snapshot", + "sampling_rule": "Take the first 30 open, non-PR GitHub issues with created_at <= 2026-06-24T23:59:59Z after sorting by created_at descending.", + "purpose": "exploratory_failure_mode_mapping", + "sources": [ + "https://openai.com/index/introducing-codex/", + "https://github.com/openai/codex/issues", + "https://arxiv.org/abs/2603.20847", + "https://arxiv.org/abs/2605.18583", + "https://arxiv.org/abs/2606.22721" + ], + "cannot_establish": [ + 
"prevalence across all Codex users", + "maintainer-confirmed product defects", + "root causes", + "whether reports remain valid after later releases", + "security impact", + "current Codex product quality" + ], + "failure_modes": { + "completion_evidence_integrity": { + "label": "Completion evidence integrity", + "gate": "Closeout Evidence Gate", + "description": "Completion claims need inspectable evidence rather than partial, stale, or UI-only success signals." + }, + "session_state_continuity": { + "label": "Session and state continuity", + "gate": "State Resume Receipt", + "description": "Long-running tasks need receipts that preserve resumable state, connection assumptions, and stale-context boundaries." + }, + "authorization_scope_control": { + "label": "Authorization and scope control", + "gate": "Exact-Scope Authorization Gate", + "description": "Agent actions need scoped permission records before touching files, config, tools, credentials, or persistent settings." + }, + "tool_sandbox_execution_integrity": { + "label": "Tool and sandbox execution integrity", + "gate": "Tool-Call Execution Receipt", + "description": "Command, tool-call, sandbox, ACL, and cross-platform execution results need explicit receipts." + }, + "cost_runaway_loop_visibility": { + "label": "Cost and runaway-loop visibility", + "gate": "Run Budget / Stop-Condition Gate", + "description": "Users need visible budgets, loop stop conditions, and cost or performance degradation signals." 
+ } + }, + "statistics": { + "sample_size": 30, + "report_type_counts": { + "bug_report": 27, + "feature_request": 3 + }, + "failure_mode_counts": { + "authorization_scope_control": 4, + "completion_evidence_integrity": 10, + "cost_runaway_loop_visibility": 4, + "session_state_continuity": 6, + "tool_sandbox_execution_integrity": 6 + }, + "top_label_counts": { + "bug": 27, + "app": 21, + "windows-os": 13, + "CLI": 9, + "performance": 5, + "sandbox": 5, + "skills": 4, + "session": 4, + "enhancement": 3, + "app-server": 3, + "exec": 3, + "model-behavior": 2, + "safety-check": 2, + "extension": 2, + "connectivity": 2 + } + }, + "issues": [ + { + "issue_id": 29882, + "issue_url": "https://github.com/openai/codex/issues/29882", + "snapshot_date": "2026-06-24", + "sample_rule": "openai/codex open issues; created_at <= 2026-06-24T23:59:59Z; sorted by created_at desc; first 30 non-PR issues", + "issue_status": "open", + "created_at": "2026-06-24T16:58:10Z", + "updated_at": "2026-06-24T16:58:49Z", + "report_type": "bug_report", + "labels": [ + "bug", + "windows-os", + "app", + "performance" + ], + "platform": "Windows", + "user_reported_symptom": "Windows Codex Desktop may trigger full system freeze after Modern Standby resume; prior LiveKernelEvent 193 dxgkrnl watchdog", + "maintainer_confirmation": "not_observed", + "independent_reproduction": "not_performed", + "failure_mode": "session_state_continuity", + "mapped_gate": "State Resume Receipt", + "cannot_claim": [ + "Cannot claim product-wide prevalence from this sample.", + "Cannot claim root cause is confirmed by maintainers.", + "Cannot claim independent reproduction was performed.", + "Cannot claim the issue remains present in later versions." 
+ ] + }, + { + "issue_id": 29881, + "issue_url": "https://github.com/openai/codex/issues/29881", + "snapshot_date": "2026-06-24", + "sample_rule": "openai/codex open issues; created_at <= 2026-06-24T23:59:59Z; sorted by created_at desc; first 30 non-PR issues", + "issue_status": "open", + "created_at": "2026-06-24T16:57:32Z", + "updated_at": "2026-06-24T16:58:12Z", + "report_type": "bug_report", + "labels": [ + "bug", + "windows-os", + "sandbox", + "CLI", + "app" + ], + "platform": "Windows", + "user_reported_symptom": "[Windows] Switching App Agent Environment WSL → Native initializes the sandbox and fixes both App and CLI", + "maintainer_confirmation": "not_observed", + "independent_reproduction": "not_performed", + "failure_mode": "tool_sandbox_execution_integrity", + "mapped_gate": "Tool-Call Execution Receipt", + "cannot_claim": [ + "Cannot claim product-wide prevalence from this sample.", + "Cannot claim root cause is confirmed by maintainers.", + "Cannot claim independent reproduction was performed.", + "Cannot claim the issue remains present in later versions." 
+ ] + }, + { + "issue_id": 29880, + "issue_url": "https://github.com/openai/codex/issues/29880", + "snapshot_date": "2026-06-24", + "sample_rule": "openai/codex open issues; created_at <= 2026-06-24T23:59:59Z; sorted by created_at desc; first 30 non-PR issues", + "issue_status": "open", + "created_at": "2026-06-24T16:53:50Z", + "updated_at": "2026-06-24T16:54:50Z", + "report_type": "bug_report", + "labels": [ + "bug", + "app" + ], + "platform": "Desktop app", + "user_reported_symptom": "Codex app image thumbnails fail to render after feedback upload", + "maintainer_confirmation": "not_observed", + "independent_reproduction": "not_performed", + "failure_mode": "completion_evidence_integrity", + "mapped_gate": "Closeout Evidence Gate", + "cannot_claim": [ + "Cannot claim product-wide prevalence from this sample.", + "Cannot claim root cause is confirmed by maintainers.", + "Cannot claim independent reproduction was performed.", + "Cannot claim the issue remains present in later versions." + ] + }, + { + "issue_id": 29879, + "issue_url": "https://github.com/openai/codex/issues/29879", + "snapshot_date": "2026-06-24", + "sample_rule": "openai/codex open issues; created_at <= 2026-06-24T23:59:59Z; sorted by created_at desc; first 30 non-PR issues", + "issue_status": "open", + "created_at": "2026-06-24T16:52:46Z", + "updated_at": "2026-06-24T16:53:25Z", + "report_type": "bug_report", + "labels": [ + "bug", + "model-behavior", + "app" + ], + "platform": "Desktop app", + "user_reported_symptom": "Model not respecting agents instructions", + "maintainer_confirmation": "not_observed", + "independent_reproduction": "not_performed", + "failure_mode": "authorization_scope_control", + "mapped_gate": "Exact-Scope Authorization Gate", + "cannot_claim": [ + "Cannot claim product-wide prevalence from this sample.", + "Cannot claim root cause is confirmed by maintainers.", + "Cannot claim independent reproduction was performed.", + "Cannot claim the issue remains present in later 
versions." + ] + }, + { + "issue_id": 29878, + "issue_url": "https://github.com/openai/codex/issues/29878", + "snapshot_date": "2026-06-24", + "sample_rule": "openai/codex open issues; created_at <= 2026-06-24T23:59:59Z; sorted by created_at desc; first 30 non-PR issues", + "issue_status": "open", + "created_at": "2026-06-24T15:56:27Z", + "updated_at": "2026-06-24T15:57:06Z", + "report_type": "feature_request", + "labels": [ + "enhancement", + "CLI", + "safety-check", + "skills" + ], + "platform": "CLI", + "user_reported_symptom": "open source codex-security plugin", + "maintainer_confirmation": "not_observed", + "independent_reproduction": "not_performed", + "failure_mode": "authorization_scope_control", + "mapped_gate": "Exact-Scope Authorization Gate", + "cannot_claim": [ + "Cannot claim product-wide prevalence from this sample.", + "Cannot claim root cause is confirmed by maintainers.", + "Cannot claim independent reproduction was performed.", + "Cannot claim the issue remains present in later versions." 
+ ] + }, + { + "issue_id": 29876, + "issue_url": "https://github.com/openai/codex/issues/29876", + "snapshot_date": "2026-06-24", + "sample_rule": "openai/codex open issues; created_at <= 2026-06-24T23:59:59Z; sorted by created_at desc; first 30 non-PR issues", + "issue_status": "open", + "created_at": "2026-06-24T15:50:31Z", + "updated_at": "2026-06-24T15:57:27Z", + "report_type": "bug_report", + "labels": [ + "bug", + "extension", + "app", + "performance" + ], + "platform": "IDE extension", + "user_reported_symptom": "Excessive disk writes / SSD wear concern on macOS Codex app and JetBrains ACP", + "maintainer_confirmation": "not_observed", + "independent_reproduction": "not_performed", + "failure_mode": "cost_runaway_loop_visibility", + "mapped_gate": "Run Budget / Stop-Condition Gate", + "cannot_claim": [ + "Cannot claim product-wide prevalence from this sample.", + "Cannot claim root cause is confirmed by maintainers.", + "Cannot claim independent reproduction was performed.", + "Cannot claim the issue remains present in later versions." 
+ ] + }, + { + "issue_id": 29873, + "issue_url": "https://github.com/openai/codex/issues/29873", + "snapshot_date": "2026-06-24", + "sample_rule": "openai/codex open issues; created_at <= 2026-06-24T23:59:59Z; sorted by created_at desc; first 30 non-PR issues", + "issue_status": "open", + "created_at": "2026-06-24T15:39:44Z", + "updated_at": "2026-06-24T16:31:20Z", + "report_type": "bug_report", + "labels": [ + "bug", + "app" + ], + "platform": "Desktop app", + "user_reported_symptom": "Office files open from output links but show unsupported preview from workspace file picker", + "maintainer_confirmation": "not_observed", + "independent_reproduction": "not_performed", + "failure_mode": "completion_evidence_integrity", + "mapped_gate": "Closeout Evidence Gate", + "cannot_claim": [ + "Cannot claim product-wide prevalence from this sample.", + "Cannot claim root cause is confirmed by maintainers.", + "Cannot claim independent reproduction was performed.", + "Cannot claim the issue remains present in later versions." 
+ ] + }, + { + "issue_id": 29872, + "issue_url": "https://github.com/openai/codex/issues/29872", + "snapshot_date": "2026-06-24", + "sample_rule": "openai/codex open issues; created_at <= 2026-06-24T23:59:59Z; sorted by created_at desc; first 30 non-PR issues", + "issue_status": "open", + "created_at": "2026-06-24T15:39:25Z", + "updated_at": "2026-06-24T15:40:08Z", + "report_type": "bug_report", + "labels": [ + "bug", + "CLI", + "app-server" + ], + "platform": "CLI", + "user_reported_symptom": "\"app-server/src/lib.rs\" has unused mutable \"loader_overrides\"", + "maintainer_confirmation": "not_observed", + "independent_reproduction": "not_performed", + "failure_mode": "completion_evidence_integrity", + "mapped_gate": "Closeout Evidence Gate", + "cannot_claim": [ + "Cannot claim product-wide prevalence from this sample.", + "Cannot claim root cause is confirmed by maintainers.", + "Cannot claim independent reproduction was performed.", + "Cannot claim the issue remains present in later versions." 
+ ] + }, + { + "issue_id": 29871, + "issue_url": "https://github.com/openai/codex/issues/29871", + "snapshot_date": "2026-06-24", + "sample_rule": "openai/codex open issues; created_at <= 2026-06-24T23:59:59Z; sorted by created_at desc; first 30 non-PR issues", + "issue_status": "open", + "created_at": "2026-06-24T15:31:59Z", + "updated_at": "2026-06-24T15:32:42Z", + "report_type": "bug_report", + "labels": [ + "bug", + "windows-os", + "app", + "safety-check", + "skills" + ], + "platform": "Windows", + "user_reported_symptom": "[Codex Security] Workspace fails on Japanese Windows when Git commit subject contains non-ASCII characters", + "maintainer_confirmation": "not_observed", + "independent_reproduction": "not_performed", + "failure_mode": "authorization_scope_control", + "mapped_gate": "Exact-Scope Authorization Gate", + "cannot_claim": [ + "Cannot claim product-wide prevalence from this sample.", + "Cannot claim root cause is confirmed by maintainers.", + "Cannot claim independent reproduction was performed.", + "Cannot claim the issue remains present in later versions." 
+ ] + }, + { + "issue_id": 29868, + "issue_url": "https://github.com/openai/codex/issues/29868", + "snapshot_date": "2026-06-24", + "sample_rule": "openai/codex open issues; created_at <= 2026-06-24T23:59:59Z; sorted by created_at desc; first 30 non-PR issues", + "issue_status": "open", + "created_at": "2026-06-24T15:14:47Z", + "updated_at": "2026-06-24T15:16:14Z", + "report_type": "bug_report", + "labels": [ + "bug", + "app", + "session", + "app-server" + ], + "platform": "Desktop app", + "user_reported_symptom": "Codex Desktop exposes stale ghost conversations that cannot be resumed or archived", + "maintainer_confirmation": "not_observed", + "independent_reproduction": "not_performed", + "failure_mode": "session_state_continuity", + "mapped_gate": "State Resume Receipt", + "cannot_claim": [ + "Cannot claim product-wide prevalence from this sample.", + "Cannot claim root cause is confirmed by maintainers.", + "Cannot claim independent reproduction was performed.", + "Cannot claim the issue remains present in later versions." 
+ ] + }, + { + "issue_id": 29867, + "issue_url": "https://github.com/openai/codex/issues/29867", + "snapshot_date": "2026-06-24", + "sample_rule": "openai/codex open issues; created_at <= 2026-06-24T23:59:59Z; sorted by created_at desc; first 30 non-PR issues", + "issue_status": "open", + "created_at": "2026-06-24T15:12:39Z", + "updated_at": "2026-06-24T15:14:41Z", + "report_type": "bug_report", + "labels": [ + "bug", + "windows-os", + "sandbox", + "app" + ], + "platform": "Windows", + "user_reported_symptom": "Windows sandbox can leave workspace owned by CodexSandboxOnline, then setup refresh fails with SetNamedSecurityInfoW 5", + "maintainer_confirmation": "not_observed", + "independent_reproduction": "not_performed", + "failure_mode": "tool_sandbox_execution_integrity", + "mapped_gate": "Tool-Call Execution Receipt", + "cannot_claim": [ + "Cannot claim product-wide prevalence from this sample.", + "Cannot claim root cause is confirmed by maintainers.", + "Cannot claim independent reproduction was performed.", + "Cannot claim the issue remains present in later versions." 
+ ] + }, + { + "issue_id": 29866, + "issue_url": "https://github.com/openai/codex/issues/29866", + "snapshot_date": "2026-06-24", + "sample_rule": "openai/codex open issues; created_at <= 2026-06-24T23:59:59Z; sorted by created_at desc; first 30 non-PR issues", + "issue_status": "open", + "created_at": "2026-06-24T15:06:23Z", + "updated_at": "2026-06-24T15:13:50Z", + "report_type": "bug_report", + "labels": [ + "bug", + "windows-os", + "extension", + "sandbox", + "connectivity", + "session", + "remote" + ], + "platform": "Windows", + "user_reported_symptom": "Codex IDE chat loses SSH/network access after idle/resume in same conversation", + "maintainer_confirmation": "not_observed", + "independent_reproduction": "not_performed", + "failure_mode": "session_state_continuity", + "mapped_gate": "State Resume Receipt", + "cannot_claim": [ + "Cannot claim product-wide prevalence from this sample.", + "Cannot claim root cause is confirmed by maintainers.", + "Cannot claim independent reproduction was performed.", + "Cannot claim the issue remains present in later versions." 
+ ] + }, + { + "issue_id": 29864, + "issue_url": "https://github.com/openai/codex/issues/29864", + "snapshot_date": "2026-06-24", + "sample_rule": "openai/codex open issues; created_at <= 2026-06-24T23:59:59Z; sorted by created_at desc; first 30 non-PR issues", + "issue_status": "open", + "created_at": "2026-06-24T14:55:08Z", + "updated_at": "2026-06-24T14:55:48Z", + "report_type": "bug_report", + "labels": [ + "bug", + "app", + "session" + ], + "platform": "Desktop app", + "user_reported_symptom": "Codex Desktop resume can fail when shell snapshot tmp file disappears during validation", + "maintainer_confirmation": "not_observed", + "independent_reproduction": "not_performed", + "failure_mode": "session_state_continuity", + "mapped_gate": "State Resume Receipt", + "cannot_claim": [ + "Cannot claim product-wide prevalence from this sample.", + "Cannot claim root cause is confirmed by maintainers.", + "Cannot claim independent reproduction was performed.", + "Cannot claim the issue remains present in later versions." 
+ ] + }, + { + "issue_id": 29860, + "issue_url": "https://github.com/openai/codex/issues/29860", + "snapshot_date": "2026-06-24", + "sample_rule": "openai/codex open issues; created_at <= 2026-06-24T23:59:59Z; sorted by created_at desc; first 30 non-PR issues", + "issue_status": "open", + "created_at": "2026-06-24T14:30:13Z", + "updated_at": "2026-06-24T14:30:52Z", + "report_type": "bug_report", + "labels": [ + "bug", + "code-review", + "app" + ], + "platform": "Desktop app", + "user_reported_symptom": "Review comments are not cleared after being addressed", + "maintainer_confirmation": "not_observed", + "independent_reproduction": "not_performed", + "failure_mode": "completion_evidence_integrity", + "mapped_gate": "Closeout Evidence Gate", + "cannot_claim": [ + "Cannot claim product-wide prevalence from this sample.", + "Cannot claim root cause is confirmed by maintainers.", + "Cannot claim independent reproduction was performed.", + "Cannot claim the issue remains present in later versions." 
+ ] + }, + { + "issue_id": 29859, + "issue_url": "https://github.com/openai/codex/issues/29859", + "snapshot_date": "2026-06-24", + "sample_rule": "openai/codex open issues; created_at <= 2026-06-24T23:59:59Z; sorted by created_at desc; first 30 non-PR issues", + "issue_status": "open", + "created_at": "2026-06-24T13:58:56Z", + "updated_at": "2026-06-24T13:59:44Z", + "report_type": "feature_request", + "labels": [ + "enhancement", + "exec", + "CLI", + "skills", + "session" + ], + "platform": "CLI", + "user_reported_symptom": "TypeScript SDK: Expand feature coverage — fork, session management, messages, metadata", + "maintainer_confirmation": "not_observed", + "independent_reproduction": "not_performed", + "failure_mode": "session_state_continuity", + "mapped_gate": "State Resume Receipt", + "cannot_claim": [ + "Cannot claim product-wide prevalence from this sample.", + "Cannot claim root cause is confirmed by maintainers.", + "Cannot claim independent reproduction was performed.", + "Cannot claim the issue remains present in later versions." 
+ ] + }, + { + "issue_id": 29858, + "issue_url": "https://github.com/openai/codex/issues/29858", + "snapshot_date": "2026-06-24", + "sample_rule": "openai/codex open issues; created_at <= 2026-06-24T23:59:59Z; sorted by created_at desc; first 30 non-PR issues", + "issue_status": "open", + "created_at": "2026-06-24T13:53:43Z", + "updated_at": "2026-06-24T14:07:28Z", + "report_type": "bug_report", + "labels": [ + "bug", + "windows-os", + "app", + "performance" + ], + "platform": "Windows", + "user_reported_symptom": "Windows: Opening Codex tab in non-Git workspace causes continuous git.exe spawning and high Defender CPU", + "maintainer_confirmation": "not_observed", + "independent_reproduction": "not_performed", + "failure_mode": "cost_runaway_loop_visibility", + "mapped_gate": "Run Budget / Stop-Condition Gate", + "cannot_claim": [ + "Cannot claim product-wide prevalence from this sample.", + "Cannot claim root cause is confirmed by maintainers.", + "Cannot claim independent reproduction was performed.", + "Cannot claim the issue remains present in later versions." 
+ ] + }, + { + "issue_id": 29857, + "issue_url": "https://github.com/openai/codex/issues/29857", + "snapshot_date": "2026-06-24", + "sample_rule": "openai/codex open issues; created_at <= 2026-06-24T23:59:59Z; sorted by created_at desc; first 30 non-PR issues", + "issue_status": "open", + "created_at": "2026-06-24T13:37:00Z", + "updated_at": "2026-06-24T13:37:42Z", + "report_type": "bug_report", + "labels": [ + "bug", + "mcp", + "exec", + "CLI", + "config" + ], + "platform": "CLI", + "user_reported_symptom": "codex exec silently auto-cancels MCP tool calls regardless of default_tools_approval_mode", + "maintainer_confirmation": "not_observed", + "independent_reproduction": "not_performed", + "failure_mode": "tool_sandbox_execution_integrity", + "mapped_gate": "Tool-Call Execution Receipt", + "cannot_claim": [ + "Cannot claim product-wide prevalence from this sample.", + "Cannot claim root cause is confirmed by maintainers.", + "Cannot claim independent reproduction was performed.", + "Cannot claim the issue remains present in later versions." 
+ ] + }, + { + "issue_id": 29855, + "issue_url": "https://github.com/openai/codex/issues/29855", + "snapshot_date": "2026-06-24", + "sample_rule": "openai/codex open issues; created_at <= 2026-06-24T23:59:59Z; sorted by created_at desc; first 30 non-PR issues", + "issue_status": "open", + "created_at": "2026-06-24T13:25:26Z", + "updated_at": "2026-06-24T13:27:01Z", + "report_type": "bug_report", + "labels": [ + "bug", + "windows-os", + "app" + ], + "platform": "Windows", + "user_reported_symptom": "Spell checker incorrectly flags common English words in Codex Desktop 26.616.81150 (Windows 11)", + "maintainer_confirmation": "not_observed", + "independent_reproduction": "not_performed", + "failure_mode": "completion_evidence_integrity", + "mapped_gate": "Closeout Evidence Gate", + "cannot_claim": [ + "Cannot claim product-wide prevalence from this sample.", + "Cannot claim root cause is confirmed by maintainers.", + "Cannot claim independent reproduction was performed.", + "Cannot claim the issue remains present in later versions." 
+ ] + }, + { + "issue_id": 29854, + "issue_url": "https://github.com/openai/codex/issues/29854", + "snapshot_date": "2026-06-24", + "sample_rule": "openai/codex open issues; created_at <= 2026-06-24T23:59:59Z; sorted by created_at desc; first 30 non-PR issues", + "issue_status": "open", + "created_at": "2026-06-24T13:08:37Z", + "updated_at": "2026-06-24T13:12:40Z", + "report_type": "bug_report", + "labels": [ + "bug", + "windows-os", + "app", + "connectivity", + "app-server", + "performance" + ], + "platform": "Windows", + "user_reported_symptom": "Codex Windows app-server saturates upload bandwidth and causes packet loss", + "maintainer_confirmation": "not_observed", + "independent_reproduction": "not_performed", + "failure_mode": "session_state_continuity", + "mapped_gate": "State Resume Receipt", + "cannot_claim": [ + "Cannot claim product-wide prevalence from this sample.", + "Cannot claim root cause is confirmed by maintainers.", + "Cannot claim independent reproduction was performed.", + "Cannot claim the issue remains present in later versions." 
+ ] + }, + { + "issue_id": 29849, + "issue_url": "https://github.com/openai/codex/issues/29849", + "snapshot_date": "2026-06-24", + "sample_rule": "openai/codex open issues; created_at <= 2026-06-24T23:59:59Z; sorted by created_at desc; first 30 non-PR issues", + "issue_status": "open", + "created_at": "2026-06-24T12:01:23Z", + "updated_at": "2026-06-24T12:02:03Z", + "report_type": "bug_report", + "labels": [ + "bug", + "app" + ], + "platform": "Desktop app", + "user_reported_symptom": "PR status panel shows GitHub CLI unavailable in non-GitHub workspace", + "maintainer_confirmation": "not_observed", + "independent_reproduction": "not_performed", + "failure_mode": "completion_evidence_integrity", + "mapped_gate": "Closeout Evidence Gate", + "cannot_claim": [ + "Cannot claim product-wide prevalence from this sample.", + "Cannot claim root cause is confirmed by maintainers.", + "Cannot claim independent reproduction was performed.", + "Cannot claim the issue remains present in later versions." 
+ ] + }, + { + "issue_id": 29848, + "issue_url": "https://github.com/openai/codex/issues/29848", + "snapshot_date": "2026-06-24", + "sample_rule": "openai/codex open issues; created_at <= 2026-06-24T23:59:59Z; sorted by created_at desc; first 30 non-PR issues", + "issue_status": "open", + "created_at": "2026-06-24T11:30:20Z", + "updated_at": "2026-06-24T13:16:41Z", + "report_type": "bug_report", + "labels": [ + "bug", + "windows-os", + "app" + ], + "platform": "Windows", + "user_reported_symptom": "BUG Codex Windows 404 Not Found Model not found gpt-5.5", + "maintainer_confirmation": "not_observed", + "independent_reproduction": "not_performed", + "failure_mode": "completion_evidence_integrity", + "mapped_gate": "Closeout Evidence Gate", + "cannot_claim": [ + "Cannot claim product-wide prevalence from this sample.", + "Cannot claim root cause is confirmed by maintainers.", + "Cannot claim independent reproduction was performed.", + "Cannot claim the issue remains present in later versions." + ] + }, + { + "issue_id": 29847, + "issue_url": "https://github.com/openai/codex/issues/29847", + "snapshot_date": "2026-06-24", + "sample_rule": "openai/codex open issues; created_at <= 2026-06-24T23:59:59Z; sorted by created_at desc; first 30 non-PR issues", + "issue_status": "open", + "created_at": "2026-06-24T11:13:05Z", + "updated_at": "2026-06-24T11:13:48Z", + "report_type": "bug_report", + "labels": [ + "bug", + "windows-os", + "app" + ], + "platform": "Windows", + "user_reported_symptom": "codex pc app bug", + "maintainer_confirmation": "not_observed", + "independent_reproduction": "not_performed", + "failure_mode": "completion_evidence_integrity", + "mapped_gate": "Closeout Evidence Gate", + "cannot_claim": [ + "Cannot claim product-wide prevalence from this sample.", + "Cannot claim root cause is confirmed by maintainers.", + "Cannot claim independent reproduction was performed.", + "Cannot claim the issue remains present in later versions." 
+ ] + }, + { + "issue_id": 29846, + "issue_url": "https://github.com/openai/codex/issues/29846", + "snapshot_date": "2026-06-24", + "sample_rule": "openai/codex open issues; created_at <= 2026-06-24T23:59:59Z; sorted by created_at desc; first 30 non-PR issues", + "issue_status": "open", + "created_at": "2026-06-24T11:11:10Z", + "updated_at": "2026-06-24T11:11:51Z", + "report_type": "bug_report", + "labels": [ + "bug", + "CLI", + "skills", + "subagent", + "config" + ], + "platform": "CLI", + "user_reported_symptom": "skills.config enabled=false cannot be overridden by project or custom subagent config", + "maintainer_confirmation": "not_observed", + "independent_reproduction": "not_performed", + "failure_mode": "authorization_scope_control", + "mapped_gate": "Exact-Scope Authorization Gate", + "cannot_claim": [ + "Cannot claim product-wide prevalence from this sample.", + "Cannot claim root cause is confirmed by maintainers.", + "Cannot claim independent reproduction was performed.", + "Cannot claim the issue remains present in later versions." 
+ ] + }, + { + "issue_id": 29843, + "issue_url": "https://github.com/openai/codex/issues/29843", + "snapshot_date": "2026-06-24", + "sample_rule": "openai/codex open issues; created_at <= 2026-06-24T23:59:59Z; sorted by created_at desc; first 30 non-PR issues", + "issue_status": "open", + "created_at": "2026-06-24T10:55:29Z", + "updated_at": "2026-06-24T13:01:08Z", + "report_type": "bug_report", + "labels": [ + "bug", + "model-behavior", + "TUI", + "CLI" + ], + "platform": "CLI", + "user_reported_symptom": "Codex CLI scroll view jump on type and queued messages steering too hard", + "maintainer_confirmation": "not_observed", + "independent_reproduction": "not_performed", + "failure_mode": "completion_evidence_integrity", + "mapped_gate": "Closeout Evidence Gate", + "cannot_claim": [ + "Cannot claim product-wide prevalence from this sample.", + "Cannot claim root cause is confirmed by maintainers.", + "Cannot claim independent reproduction was performed.", + "Cannot claim the issue remains present in later versions." 
+ ] + }, + { + "issue_id": 29840, + "issue_url": "https://github.com/openai/codex/issues/29840", + "snapshot_date": "2026-06-24", + "sample_rule": "openai/codex open issues; created_at <= 2026-06-24T23:59:59Z; sorted by created_at desc; first 30 non-PR issues", + "issue_status": "open", + "created_at": "2026-06-24T10:26:32Z", + "updated_at": "2026-06-24T10:27:15Z", + "report_type": "bug_report", + "labels": [ + "bug", + "tool-calls", + "app" + ], + "platform": "Desktop app", + "user_reported_symptom": "Gmail connector _create_draft fails for threaded replies with reply_message_id: Subject does not match", + "maintainer_confirmation": "not_observed", + "independent_reproduction": "not_performed", + "failure_mode": "tool_sandbox_execution_integrity", + "mapped_gate": "Tool-Call Execution Receipt", + "cannot_claim": [ + "Cannot claim product-wide prevalence from this sample.", + "Cannot claim root cause is confirmed by maintainers.", + "Cannot claim independent reproduction was performed.", + "Cannot claim the issue remains present in later versions." 
+ ] + }, + { + "issue_id": 29838, + "issue_url": "https://github.com/openai/codex/issues/29838", + "snapshot_date": "2026-06-24", + "sample_rule": "openai/codex open issues; created_at <= 2026-06-24T23:59:59Z; sorted by created_at desc; first 30 non-PR issues", + "issue_status": "open", + "created_at": "2026-06-24T10:14:57Z", + "updated_at": "2026-06-24T10:20:12Z", + "report_type": "feature_request", + "labels": [ + "enhancement", + "rate-limits", + "CLI" + ], + "platform": "CLI", + "user_reported_symptom": "Add /usage command to codex cli", + "maintainer_confirmation": "not_observed", + "independent_reproduction": "not_performed", + "failure_mode": "cost_runaway_loop_visibility", + "mapped_gate": "Run Budget / Stop-Condition Gate", + "cannot_claim": [ + "Cannot claim product-wide prevalence from this sample.", + "Cannot claim root cause is confirmed by maintainers.", + "Cannot claim independent reproduction was performed.", + "Cannot claim the issue remains present in later versions." + ] + }, + { + "issue_id": 29836, + "issue_url": "https://github.com/openai/codex/issues/29836", + "snapshot_date": "2026-06-24", + "sample_rule": "openai/codex open issues; created_at <= 2026-06-24T23:59:59Z; sorted by created_at desc; first 30 non-PR issues", + "issue_status": "open", + "created_at": "2026-06-24T09:58:56Z", + "updated_at": "2026-06-24T10:38:10Z", + "report_type": "bug_report", + "labels": [ + "bug", + "windows-os", + "sandbox", + "exec", + "CLI" + ], + "platform": "Windows", + "user_reported_symptom": "Can't run WinGet executables in the sandbox when Windows Developer mode is enabled", + "maintainer_confirmation": "not_observed", + "independent_reproduction": "not_performed", + "failure_mode": "tool_sandbox_execution_integrity", + "mapped_gate": "Tool-Call Execution Receipt", + "cannot_claim": [ + "Cannot claim product-wide prevalence from this sample.", + "Cannot claim root cause is confirmed by maintainers.", + "Cannot claim independent reproduction was 
performed.", + "Cannot claim the issue remains present in later versions." + ] + }, + { + "issue_id": 29834, + "issue_url": "https://github.com/openai/codex/issues/29834", + "snapshot_date": "2026-06-24", + "sample_rule": "openai/codex open issues; created_at <= 2026-06-24T23:59:59Z; sorted by created_at desc; first 30 non-PR issues", + "issue_status": "open", + "created_at": "2026-06-24T09:49:59Z", + "updated_at": "2026-06-24T11:02:30Z", + "report_type": "bug_report", + "labels": [ + "bug", + "app" + ], + "platform": "Desktop app", + "user_reported_symptom": "codex crash bug when I enter information in a running task", + "maintainer_confirmation": "not_observed", + "independent_reproduction": "not_performed", + "failure_mode": "completion_evidence_integrity", + "mapped_gate": "Closeout Evidence Gate", + "cannot_claim": [ + "Cannot claim product-wide prevalence from this sample.", + "Cannot claim root cause is confirmed by maintainers.", + "Cannot claim independent reproduction was performed.", + "Cannot claim the issue remains present in later versions." 
+ ] + }, + { + "issue_id": 29832, + "issue_url": "https://github.com/openai/codex/issues/29832", + "snapshot_date": "2026-06-24", + "sample_rule": "openai/codex open issues; created_at <= 2026-06-24T23:59:59Z; sorted by created_at desc; first 30 non-PR issues", + "issue_status": "open", + "created_at": "2026-06-24T09:25:44Z", + "updated_at": "2026-06-24T09:27:26Z", + "report_type": "bug_report", + "labels": [ + "bug", + "windows-os", + "app", + "performance" + ], + "platform": "Windows", + "user_reported_symptom": "Windows Codex app updated to 26.616.10790.0, but continuous disk writes persist and appear worse.", + "maintainer_confirmation": "not_observed", + "independent_reproduction": "not_performed", + "failure_mode": "cost_runaway_loop_visibility", + "mapped_gate": "Run Budget / Stop-Condition Gate", + "cannot_claim": [ + "Cannot claim product-wide prevalence from this sample.", + "Cannot claim root cause is confirmed by maintainers.", + "Cannot claim independent reproduction was performed.", + "Cannot claim the issue remains present in later versions." 
+ ] + }, + { + "issue_id": 29830, + "issue_url": "https://github.com/openai/codex/issues/29830", + "snapshot_date": "2026-06-24", + "sample_rule": "openai/codex open issues; created_at <= 2026-06-24T23:59:59Z; sorted by created_at desc; first 30 non-PR issues", + "issue_status": "open", + "created_at": "2026-06-24T09:17:09Z", + "updated_at": "2026-06-24T09:19:07Z", + "report_type": "bug_report", + "labels": [ + "bug", + "windows-os", + "sandbox", + "tool-calls", + "app" + ], + "platform": "Windows", + "user_reported_symptom": "The Codex App always requires my approval, and I've granted it all permissions.", + "maintainer_confirmation": "not_observed", + "independent_reproduction": "not_performed", + "failure_mode": "tool_sandbox_execution_integrity", + "mapped_gate": "Tool-Call Execution Receipt", + "cannot_claim": [ + "Cannot claim product-wide prevalence from this sample.", + "Cannot claim root cause is confirmed by maintainers.", + "Cannot claim independent reproduction was performed.", + "Cannot claim the issue remains present in later versions." + ] + } + ] +} diff --git a/docs/research/codex-reliability-gap-map-01.md b/docs/research/codex-reliability-gap-map-01.md new file mode 100644 index 0000000..32e94b8 --- /dev/null +++ b/docs/research/codex-reliability-gap-map-01.md @@ -0,0 +1,136 @@ +# Codex Reliability Gap Map #01 + +A Scoped Review of Public Coding-Agent Failure Reports + +## Research Question + +What recurring reliability risks can be observed in a dated, deterministic sample of public `openai/codex` issue reports, and how can those reported symptoms be mapped to evidence-gate design patterns without overstating them as confirmed product defects? 
+ +## Scope and Sampling Method + +- `snapshot_date`: `2026-06-24` +- `sample_size`: `30` public GitHub issues +- `sample_type`: `deterministic_non_random_snapshot` +- `source_repo`: `openai/codex` +- `sampling_rule`: Take the first 30 open, non-PR GitHub issues with created_at <= 2026-06-24T23:59:59Z after sorting by created_at descending. +- `report_type_counts`: bug_report=27, feature_request=3 + +The sample was not randomly drawn and should not be used to infer product-wide prevalence. GitHub labels are not mutually exclusive; one issue can count toward multiple label buckets, so label totals are descriptive metadata rather than incidence rates. + +## Sources + +- https://openai.com/index/introducing-codex/ +- https://github.com/openai/codex/issues +- https://arxiv.org/abs/2603.20847 +- https://arxiv.org/abs/2605.18583 +- https://arxiv.org/abs/2606.22721 + +## What This Study Cannot Establish + +This is a scoped review of public user reports, not a prevalence study, security audit, or assessment of current Codex product quality. An open issue is treated as a reported symptom, not a confirmed defect. + +- prevalence across all Codex users +- maintainer-confirmed product defects +- root causes +- whether reports remain valid after later releases +- security impact +- current Codex product quality + +## Five Failure Modes + +| Failure mode | Records | Evidence gate | Why it matters | +|---|---:|---|---| +| Completion evidence integrity | 10 | Closeout Evidence Gate | Completion claims need inspectable evidence rather than partial, stale, or UI-only success signals. | +| Session and state continuity | 6 | State Resume Receipt | Long-running tasks need receipts that preserve resumable state, connection assumptions, and stale-context boundaries. | +| Authorization and scope control | 4 | Exact-Scope Authorization Gate | Agent actions need scoped permission records before touching files, config, tools, credentials, or persistent settings. 
| +| Tool and sandbox execution integrity | 6 | Tool-Call Execution Receipt | Command, tool-call, sandbox, ACL, and cross-platform execution results need explicit receipts. | +| Cost and runaway-loop visibility | 4 | Run Budget / Stop-Condition Gate | Users need visible budgets, loop stop conditions, and cost or performance degradation signals. | + +## Thirty-Issue Evidence Table + +| Issue | Type | Labels | Reported symptom | Failure mode | Evidence gate | +|---|---|---|---|---|---| +| [#29882](https://github.com/openai/codex/issues/29882) | bug_report | bug, windows-os, app, performance | Windows Codex Desktop may trigger full system freeze after Modern Standby resume; prior LiveKernelEvent 193 dxgkrnl watchdog | session_state_continuity | State Resume Receipt | +| [#29881](https://github.com/openai/codex/issues/29881) | bug_report | bug, windows-os, sandbox, CLI, app | [Windows] Switching App Agent Environment WSL → Native initializes the sandbox and fixes both App and CLI | tool_sandbox_execution_integrity | Tool-Call Execution Receipt | +| [#29880](https://github.com/openai/codex/issues/29880) | bug_report | bug, app | Codex app image thumbnails fail to render after feedback upload | completion_evidence_integrity | Closeout Evidence Gate | +| [#29879](https://github.com/openai/codex/issues/29879) | bug_report | bug, model-behavior, app | Model not respecting agents instructions | authorization_scope_control | Exact-Scope Authorization Gate | +| [#29878](https://github.com/openai/codex/issues/29878) | feature_request | enhancement, CLI, safety-check, skills | open source codex-security plugin | authorization_scope_control | Exact-Scope Authorization Gate | +| [#29876](https://github.com/openai/codex/issues/29876) | bug_report | bug, extension, app, performance | Excessive disk writes / SSD wear concern on macOS Codex app and JetBrains ACP | cost_runaway_loop_visibility | Run Budget / Stop-Condition Gate | +| 
[#29873](https://github.com/openai/codex/issues/29873) | bug_report | bug, app | Office files open from output links but show unsupported preview from workspace file picker | completion_evidence_integrity | Closeout Evidence Gate | +| [#29872](https://github.com/openai/codex/issues/29872) | bug_report | bug, CLI, app-server | "app-server/src/lib.rs" has unused mutable "loader_overrides" | completion_evidence_integrity | Closeout Evidence Gate | +| [#29871](https://github.com/openai/codex/issues/29871) | bug_report | bug, windows-os, app, safety-check, skills | [Codex Security] Workspace fails on Japanese Windows when Git commit subject contains non-ASCII characters | authorization_scope_control | Exact-Scope Authorization Gate | +| [#29868](https://github.com/openai/codex/issues/29868) | bug_report | bug, app, session, app-server | Codex Desktop exposes stale ghost conversations that cannot be resumed or archived | session_state_continuity | State Resume Receipt | +| [#29867](https://github.com/openai/codex/issues/29867) | bug_report | bug, windows-os, sandbox, app | Windows sandbox can leave workspace owned by CodexSandboxOnline, then setup refresh fails with SetNamedSecurityInfoW 5 | tool_sandbox_execution_integrity | Tool-Call Execution Receipt | +| [#29866](https://github.com/openai/codex/issues/29866) | bug_report | bug, windows-os, extension, sandbox, connectivity, session, remote | Codex IDE chat loses SSH/network access after idle/resume in same conversation | session_state_continuity | State Resume Receipt | +| [#29864](https://github.com/openai/codex/issues/29864) | bug_report | bug, app, session | Codex Desktop resume can fail when shell snapshot tmp file disappears during validation | session_state_continuity | State Resume Receipt | +| [#29860](https://github.com/openai/codex/issues/29860) | bug_report | bug, code-review, app | Review comments are not cleared after being addressed | completion_evidence_integrity | Closeout Evidence Gate | +| 
[#29859](https://github.com/openai/codex/issues/29859) | feature_request | enhancement, exec, CLI, skills, session | TypeScript SDK: Expand feature coverage — fork, session management, messages, metadata | session_state_continuity | State Resume Receipt | +| [#29858](https://github.com/openai/codex/issues/29858) | bug_report | bug, windows-os, app, performance | Windows: Opening Codex tab in non-Git workspace causes continuous git.exe spawning and high Defender CPU | cost_runaway_loop_visibility | Run Budget / Stop-Condition Gate | +| [#29857](https://github.com/openai/codex/issues/29857) | bug_report | bug, mcp, exec, CLI, config | codex exec silently auto-cancels MCP tool calls regardless of default_tools_approval_mode | tool_sandbox_execution_integrity | Tool-Call Execution Receipt | +| [#29855](https://github.com/openai/codex/issues/29855) | bug_report | bug, windows-os, app | Spell checker incorrectly flags common English words in Codex Desktop 26.616.81150 (Windows 11) | completion_evidence_integrity | Closeout Evidence Gate | +| [#29854](https://github.com/openai/codex/issues/29854) | bug_report | bug, windows-os, app, connectivity, app-server, performance | Codex Windows app-server saturates upload bandwidth and causes packet loss | session_state_continuity | State Resume Receipt | +| [#29849](https://github.com/openai/codex/issues/29849) | bug_report | bug, app | PR status panel shows GitHub CLI unavailable in non-GitHub workspace | completion_evidence_integrity | Closeout Evidence Gate | +| [#29848](https://github.com/openai/codex/issues/29848) | bug_report | bug, windows-os, app | BUG Codex Windows 404 Not Found Model not found gpt-5.5 | completion_evidence_integrity | Closeout Evidence Gate | +| [#29847](https://github.com/openai/codex/issues/29847) | bug_report | bug, windows-os, app | codex pc app bug | completion_evidence_integrity | Closeout Evidence Gate | +| [#29846](https://github.com/openai/codex/issues/29846) | bug_report | bug, CLI, skills, 
subagent, config | skills.config enabled=false cannot be overridden by project or custom subagent config | authorization_scope_control | Exact-Scope Authorization Gate | +| [#29843](https://github.com/openai/codex/issues/29843) | bug_report | bug, model-behavior, TUI, CLI | Codex CLI scroll view jump on type and queued messages steering too hard | completion_evidence_integrity | Closeout Evidence Gate | +| [#29840](https://github.com/openai/codex/issues/29840) | bug_report | bug, tool-calls, app | Gmail connector _create_draft fails for threaded replies with reply_message_id: Subject does not match | tool_sandbox_execution_integrity | Tool-Call Execution Receipt | +| [#29838](https://github.com/openai/codex/issues/29838) | feature_request | enhancement, rate-limits, CLI | Add /usage command to codex cli | cost_runaway_loop_visibility | Run Budget / Stop-Condition Gate | +| [#29836](https://github.com/openai/codex/issues/29836) | bug_report | bug, windows-os, sandbox, exec, CLI | Can't run WinGet executables in the sandbox when Windows Developer mode is enabled | tool_sandbox_execution_integrity | Tool-Call Execution Receipt | +| [#29834](https://github.com/openai/codex/issues/29834) | bug_report | bug, app | codex crash bug when I enter information in a running task | completion_evidence_integrity | Closeout Evidence Gate | +| [#29832](https://github.com/openai/codex/issues/29832) | bug_report | bug, windows-os, app, performance | Windows Codex app updated to 26.616.10790.0, but continuous disk writes persist and appear worse. | cost_runaway_loop_visibility | Run Budget / Stop-Condition Gate | +| [#29830](https://github.com/openai/codex/issues/29830) | bug_report | bug, windows-os, sandbox, tool-calls, app | The Codex App always requires my approval, and I've granted it all permissions. 
| tool_sandbox_execution_integrity | Tool-Call Execution Receipt | + +## Failure Mode To Evidence Gate Map + +### Completion Evidence Integrity -> Closeout Evidence Gate + +A completion claim should cite inspectable evidence: commands run, exit status, test output, changed files, unresolved blockers, and `cannot_claim`. UI-level success, stale status, or partial evidence should not be enough to mark work complete. + +### Session And State Continuity -> State Resume Receipt + +Long-running tasks need a resume packet that records current branch, HEAD, dirty state, pending user choices, active tool sessions, remote connections, and stale assumptions. A resumed session should explicitly prove continuity before acting. + +### Authorization And Scope Control -> Exact-Scope Authorization Gate + +Benign requests can still trigger out-of-scope work. The gate should bind each write, command, network call, file upload, provider call, branch operation, and submit action to a narrow user-approved scope. + +### Tool And Sandbox Execution Integrity -> Tool-Call Execution Receipt + +Tool and sandbox failures should leave typed evidence: command, cwd, exit code, stdout/stderr summary, sandbox permission state, platform, and retry outcome. Silent auto-cancel or UI-only reporting is insufficient. + +### Cost And Runaway-Loop Visibility -> Run Budget / Stop-Condition Gate + +Long-running agent work needs visible usage, loop counters, stop conditions, retry limits, and degradation state. A task should not keep spending tokens or repeatedly editing without an explicit budget and closeout. + +## Existing False-Pass Gate Demonstration + +The existing False-Pass Gate proof is aligned with the first failure mode: it blocks unsupported success language when passing evidence pointers or explicit `cannot_claim` boundaries are missing. This Gap Map extends that idea from one proof into a broader failure-mode taxonomy. 
+ +## Cross-Layer Risk: Review Habituation + +Even when evidence is present, repeated AI-generated reviews may reduce active human scrutiny. Reviewer-facing gates should therefore surface evidence deltas, negative tests, scope changes, and `cannot_claim` boundaries rather than burying them in long summaries. + +## Open Questions + +- Which reported symptoms can be independently reproduced in a clean environment? +- Which reports remain current after later Codex releases? +- Which gates should be implemented as local validators, UI affordances, or reviewer checklist items? +- Which risks are best handled by product changes versus user workflow conventions? + +## Reproduction Instructions + +1. Query `https://api.github.com/repos/openai/codex/issues?state=open&per_page=100&sort=created&direction=desc`, following pagination until 30 qualifying records have been collected (issues created after the snapshot date will occupy the first results on any later run). +2. Exclude pull requests. +3. Keep issues with `created_at <= 2026-06-24T23:59:59Z`. +4. Take the first 30 records in descending creation order. Because the query returns only currently open issues, a rerun after the snapshot date can yield a different set once sampled issues have been closed. +5. Store issue id, URL, status, labels, title, snapshot date, report type, mapped failure mode, mapped evidence gate, and `cannot_claim`. +6. Run `python3 scripts/check_codex_gap_map.py` to verify the JSON and Markdown stay aligned. + +## Cannot Claim + +- Cannot claim Codex is missing all mapped capabilities. +- Cannot claim the open issues are maintainer-confirmed defects. +- Cannot claim these reports are statistically representative. +- Cannot claim the failure modes remain present in later versions. +- Cannot claim the existing gates solve the reported issues in production.
#!/usr/bin/env python3
"""Validate Codex Reliability Gap Map #01 without judging issue truth.

The validator checks that the JSON payload is structurally sound (required
fields, unique ids, consistent counts) and that the Markdown report links
every issue and carries its limitation statements. It never certifies that a
reported issue is real: records claiming maintainer confirmation or an
independent reproduction are rejected.

Malformed input is always reported as validation errors; the validators do
not raise on missing keys or unexpected JSON types.
"""

from __future__ import annotations

import argparse
import json
import re
import sys
from collections import Counter
from pathlib import Path
from typing import Any

ROOT = Path(__file__).resolve().parents[1]
DEFAULT_JSON = ROOT / "data" / "codex-reliability-gap-map-01.json"
DEFAULT_MD = ROOT / "docs" / "research" / "codex-reliability-gap-map-01.md"
REQUIRED_FIELDS = {
    "issue_id",
    "issue_url",
    "snapshot_date",
    "issue_status",
    "report_type",
    "labels",
    "user_reported_symptom",
    "maintainer_confirmation",
    "independent_reproduction",
    "failure_mode",
    "mapped_gate",
    "cannot_claim",
}
VALID_REPORT_TYPES = {"bug_report", "feature_request"}
EXPECTED_SAMPLE_SIZE = 30
ISSUE_URL_RE = re.compile(r"^https://github\.com/openai/codex/issues/\d+$")
# (text that must appear in the Markdown, error reported when it is absent)
REQUIRED_STATEMENTS = (
    (
        "This is a scoped review of public user reports",
        "markdown missing scope limitation statement",
    ),
    (
        "An open issue is treated as a reported symptom, not a confirmed defect.",
        "markdown missing reported-symptom limitation",
    ),
)


def load_json(path: Path) -> dict[str, Any]:
    """Read *path* as UTF-8 and return the parsed JSON document.

    Raises:
        OSError: if the file cannot be read.
        ValueError: if the content is not valid UTF-8 or not valid JSON.
    """
    return json.loads(path.read_text(encoding="utf-8"))


def fail(message: str) -> int:
    """Print the machine-readable FAIL line and return exit status 1."""
    print(f"gap_map=FAIL reason={message}")
    return 1


def _check_issue(
    idx: int,
    issue: dict[str, Any],
    snapshot_date: Any,
    modes: set[str],
) -> list[str]:
    """Return the field-level errors for one record that has all required fields."""
    errors: list[str] = []
    issue_id = issue["issue_id"]
    # bool is a subclass of int, so JSON `true`/`false` must be rejected explicitly.
    if isinstance(issue_id, bool) or not isinstance(issue_id, int):
        errors.append(f"issue[{idx}] issue_id must be int")
    issue_url = issue["issue_url"]
    if not isinstance(issue_url, str) or not ISSUE_URL_RE.match(issue_url):
        errors.append(f"issue[{idx}] issue_url must be an openai/codex issue URL")
    if issue["snapshot_date"] != snapshot_date:
        errors.append(f"issue[{idx}] snapshot_date mismatch")
    # The isinstance guards keep unhashable JSON values (lists, objects) from
    # raising TypeError in the set-membership tests.
    report_type = issue["report_type"]
    if not isinstance(report_type, str) or report_type not in VALID_REPORT_TYPES:
        errors.append(f"issue[{idx}] invalid report_type")
    failure_mode = issue["failure_mode"]
    if not isinstance(failure_mode, str) or failure_mode not in modes:
        errors.append(f"issue[{idx}] unknown failure_mode")
    if not isinstance(issue["labels"], list):
        errors.append(f"issue[{idx}] labels must be a list")
    cannot_claim = issue["cannot_claim"]
    if not isinstance(cannot_claim, list) or not cannot_claim:
        errors.append(f"issue[{idx}] cannot_claim must be non-empty list")
    if (
        issue["maintainer_confirmation"] == "confirmed"
        or issue["independent_reproduction"] == "performed"
    ):
        errors.append(f"issue[{idx}] validator cannot certify confirmation or reproduction")
    return errors


def validate_payload(payload: dict[str, Any]) -> list[str]:
    """Return every structural problem found in the gap-map JSON payload.

    Args:
        payload: the parsed JSON document (normally from ``load_json``).

    Returns:
        Human-readable error strings; an empty list means the payload is valid.
        Malformed payloads produce errors rather than exceptions.
    """
    if not isinstance(payload, dict):
        return ["payload must be a JSON object"]
    issues = payload.get("issues")
    if not isinstance(issues, list):
        return ["issues must be a list"]

    errors: list[str] = []
    if payload.get("sample_size") != len(issues):
        errors.append("sample_size does not match issue count")
    if len(issues) != EXPECTED_SAMPLE_SIZE:
        errors.append(f"expected exactly {EXPECTED_SAMPLE_SIZE} issue records")

    declared_modes = payload.get("failure_modes")
    modes = set(declared_modes) if isinstance(declared_modes, dict) else set()
    snapshot_date = payload.get("snapshot_date")

    ids: list[Any] = []
    mode_counts: Counter[str] = Counter()
    for idx, issue in enumerate(issues):
        if not isinstance(issue, dict):
            errors.append(f"issue[{idx}] must be an object")
            continue
        # Count here, before the missing-field check, so an incomplete record
        # can never trigger a KeyError in the statistics comparison below.
        failure_mode = issue.get("failure_mode")
        if isinstance(failure_mode, str):
            mode_counts[failure_mode] += 1
        missing = sorted(REQUIRED_FIELDS - set(issue))
        if missing:
            errors.append(f"issue[{idx}] missing fields: {', '.join(missing)}")
            continue
        issue_id = issue["issue_id"]
        # Unhashable ids cannot take part in the duplicate check; they are
        # already reported as non-int by _check_issue.
        if not isinstance(issue_id, (list, dict)):
            ids.append(issue_id)
        errors.extend(_check_issue(idx, issue, snapshot_date, modes))

    if len(ids) != len(set(ids)):
        errors.append("issue records contain duplicate issue_id values")

    stats = payload.get("statistics")
    reported_counts = stats.get("failure_mode_counts") if isinstance(stats, dict) else None
    if reported_counts != dict(mode_counts):
        errors.append("failure_mode_counts does not match issue records")
    return errors


def validate_markdown(payload: dict[str, Any], md_text: str) -> list[str]:
    """Return every mismatch between the Markdown report and the JSON payload.

    The report must contain both limitation statements and one
    ``[#<issue_id>](<issue_url>)`` link per issue record.

    Args:
        payload: the parsed JSON document.
        md_text: the full text of the Markdown report.

    Returns:
        Human-readable error strings; an empty list means the report is aligned.
    """
    errors = [message for needle, message in REQUIRED_STATEMENTS if needle not in md_text]

    issues = payload.get("issues") if isinstance(payload, dict) else None
    if not isinstance(issues, list):
        errors.append("issues must be a list; markdown links not checked")
        return errors

    for idx, issue in enumerate(issues):
        if not isinstance(issue, dict) or "issue_id" not in issue or "issue_url" not in issue:
            errors.append(f"issue[{idx}] missing issue_id or issue_url; cannot check markdown link")
            continue
        token = f"[#{issue['issue_id']}]({issue['issue_url']})"
        if token not in md_text:
            errors.append(f"markdown missing issue link #{issue['issue_id']}")
    return errors


def build_parser() -> argparse.ArgumentParser:
    """Build the CLI parser; both paths default to the repository artifacts."""
    parser = argparse.ArgumentParser(description="Validate Codex Reliability Gap Map #01 files.")
    parser.add_argument("--json", type=Path, default=DEFAULT_JSON)
    parser.add_argument("--markdown", type=Path, default=DEFAULT_MD)
    return parser


def main(argv: list[str] | None = None) -> int:
    """Run the validator and return a process exit status (0 pass, 1 fail).

    Args:
        argv: command-line arguments without the program name; ``None`` makes
            argparse read ``sys.argv[1:]``.
    """
    args = build_parser().parse_args(argv)
    try:
        payload = load_json(args.json)
        md_text = args.markdown.read_text(encoding="utf-8")
    except (OSError, ValueError) as exc:
        # Missing file, undecodable bytes, or invalid JSON: report it through
        # the same FAIL channel instead of a traceback.
        print(f"error={exc}")
        return fail("unreadable_input")

    errors = validate_payload(payload) + validate_markdown(payload, md_text)
    if errors:
        for error in errors:
            print(f"error={error}")
        return fail("validation_errors")
    print(
        "gap_map=PASS "
        f"sample_size={payload.get('sample_size')} "
        f"snapshot_date={payload.get('snapshot_date')} "
        f"issues={len(payload['issues'])}"
    )
    return 0


if __name__ == "__main__":
    sys.exit(main())
"""Regression tests for the Codex Reliability Gap Map validator script."""

from __future__ import annotations

import copy
import importlib.util
from pathlib import Path

ROOT = Path(__file__).resolve().parents[1]
SCRIPT = ROOT / "scripts" / "check_codex_gap_map.py"

# The validator is loaded from its file path rather than imported by name
# (presumably because scripts/ is not an importable package -- confirm).
spec = importlib.util.spec_from_file_location("check_codex_gap_map", SCRIPT)
assert spec and spec.loader
checker = importlib.util.module_from_spec(spec)
spec.loader.exec_module(checker)


def load_payload() -> dict:
    """Parse the committed gap-map JSON using the validator's own loader."""
    json_path = ROOT / "data" / "codex-reliability-gap-map-01.json"
    return checker.load_json(json_path)


def test_gap_map_payload_passes_validator() -> None:
    """The committed JSON and Markdown artifacts validate with no errors."""
    data = load_payload()
    md_path = ROOT / "docs" / "research" / "codex-reliability-gap-map-01.md"
    markdown = md_path.read_text(encoding="utf-8")
    assert checker.validate_payload(data) == []
    assert checker.validate_markdown(data, markdown) == []


def test_duplicate_issue_ids_fail() -> None:
    """Two records sharing one issue_id are reported as duplicates."""
    mutated = copy.deepcopy(load_payload())
    records = mutated["issues"]
    records[1]["issue_id"] = records[0]["issue_id"]
    messages = checker.validate_payload(mutated)
    assert any("duplicate issue_id" in message for message in messages)


def test_missing_cannot_claim_fails() -> None:
    """An empty cannot_claim list on a record is rejected."""
    mutated = copy.deepcopy(load_payload())
    mutated["issues"][0]["cannot_claim"] = []
    messages = checker.validate_payload(mutated)
    assert any("cannot_claim" in message for message in messages)


def test_markdown_must_include_scope_limitation() -> None:
    """Markdown lacking the scope statement yields a scope-limitation error."""
    data = load_payload()
    messages = checker.validate_markdown(data, "# Missing the limitation")
    assert any("scope limitation" in message for message in messages)