From 339e79a712f21adbcee82c44f1f35ff0a524c32c Mon Sep 17 00:00:00 2001 From: Danilo Reyes Date: Tue, 31 Mar 2026 22:18:15 -0600 Subject: [PATCH] codex update --- .codex/rules/default.rules | 1 + .../.system/.codex-system-skills.marker | 2 +- .codex/skills/.system/openai-docs/LICENSE.txt | 201 ++++++++ .codex/skills/.system/openai-docs/SKILL.md | 69 +++ .../.system/openai-docs/agents/openai.yaml | 14 + .../openai-docs/assets/openai-small.svg | 3 + .../.system/openai-docs/assets/openai.png | Bin 0 -> 1429 bytes .../references/gpt-5p4-prompting-guide.md | 433 ++++++++++++++++++ .../openai-docs/references/latest-model.md | 35 ++ .../references/upgrading-to-gpt-5p4.md | 164 +++++++ .codex/skills/.system/skill-creator/SKILL.md | 60 ++- .../skill-creator/scripts/init_skill.py | 3 + 12 files changed, 978 insertions(+), 7 deletions(-) create mode 100644 .codex/skills/.system/openai-docs/LICENSE.txt create mode 100644 .codex/skills/.system/openai-docs/SKILL.md create mode 100644 .codex/skills/.system/openai-docs/agents/openai.yaml create mode 100644 .codex/skills/.system/openai-docs/assets/openai-small.svg create mode 100644 .codex/skills/.system/openai-docs/assets/openai.png create mode 100644 .codex/skills/.system/openai-docs/references/gpt-5p4-prompting-guide.md create mode 100644 .codex/skills/.system/openai-docs/references/latest-model.md create mode 100644 .codex/skills/.system/openai-docs/references/upgrading-to-gpt-5p4.md diff --git a/.codex/rules/default.rules b/.codex/rules/default.rules index a227a09..8651f88 100644 --- a/.codex/rules/default.rules +++ b/.codex/rules/default.rules @@ -1,3 +1,4 @@ prefix_rule(pattern=["deadnix", "-e"], decision="allow") prefix_rule(pattern=["nix", "eval"], decision="allow") prefix_rule(pattern=["nix", "develop", "path:.#nix", "--command", "bash", "-lc", "deadnix -e && nix run nixpkgs#nixfmt-tree && statix fix"], decision="allow") +prefix_rule(pattern=["nix", "flake", "lock", "--update-input"], decision="allow") diff --git 
a/.codex/skills/.system/.codex-system-skills.marker b/.codex/skills/.system/.codex-system-skills.marker index 1c19104..ae09150 100644 --- a/.codex/skills/.system/.codex-system-skills.marker +++ b/.codex/skills/.system/.codex-system-skills.marker @@ -1 +1 @@ -6277a8b83820afd8 +96d79780d5bf8bcb diff --git a/.codex/skills/.system/openai-docs/LICENSE.txt b/.codex/skills/.system/openai-docs/LICENSE.txt new file mode 100644 index 0000000..13e25df --- /dev/null +++ b/.codex/skills/.system/openai-docs/LICENSE.txt @@ -0,0 +1,201 @@ +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf of + any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don\'t include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/.codex/skills/.system/openai-docs/SKILL.md b/.codex/skills/.system/openai-docs/SKILL.md new file mode 100644 index 0000000..5a67772 --- /dev/null +++ b/.codex/skills/.system/openai-docs/SKILL.md @@ -0,0 +1,69 @@ +--- +name: "openai-docs" +description: "Use when the user asks how to build with OpenAI products or APIs and needs up-to-date official documentation with citations, help choosing the latest model for a use case, or explicit GPT-5.4 upgrade and prompt-upgrade guidance; prioritize OpenAI docs MCP tools, use bundled references only as helper context, and restrict any fallback browsing to official OpenAI domains." +--- + + +# OpenAI Docs + +Provide authoritative, current guidance from OpenAI developer docs using the developers.openai.com MCP server. Always prioritize the developer docs MCP tools over web.run for OpenAI-related questions. This skill may also load targeted files from `references/` for model-selection and GPT-5.4-specific requests, but current OpenAI docs remain authoritative. Only if the MCP server is installed and returns no meaningful results should you fall back to web search. + +## Quick start + +- Use `mcp__openaiDeveloperDocs__search_openai_docs` to find the most relevant doc pages. +- Use `mcp__openaiDeveloperDocs__fetch_openai_doc` to pull exact sections and quote/paraphrase accurately. 
+- Use `mcp__openaiDeveloperDocs__list_openai_docs` only when you need to browse or discover pages without a clear query. +- Load only the relevant file from `references/` when the question is about model selection or a GPT-5.4 upgrade. + +## OpenAI product snapshots + +1. Apps SDK: Build ChatGPT apps by providing a web component UI and an MCP server that exposes your app's tools to ChatGPT. +2. Responses API: A unified endpoint designed for stateful, multimodal, tool-using interactions in agentic workflows. +3. Chat Completions API: Generate a model response from a list of messages comprising a conversation. +4. Codex: OpenAI's coding agent for software development that can write, understand, review, and debug code. +5. gpt-oss: Open-weight OpenAI reasoning models (gpt-oss-120b and gpt-oss-20b) released under the Apache 2.0 license. +6. Realtime API: Build low-latency, multimodal experiences including natural speech-to-speech conversations. +7. Agents SDK: A toolkit for building agentic apps where a model can use tools and context, hand off to other agents, stream partial results, and keep a full trace. + +## If MCP server is missing + +If MCP tools fail or no OpenAI docs resources are available: + +1. Run the install command yourself: `codex mcp add openaiDeveloperDocs --url https://developers.openai.com/mcp` +2. If it fails due to permissions/sandboxing, immediately retry the same command with escalated permissions and include a 1-sentence justification for approval. Do not ask the user to run it yet. +3. Only if the escalated attempt fails, ask the user to run the install command. +4. Ask the user to restart Codex. +5. Re-run the doc search/fetch after restart. + +## Workflow + +1. Clarify the product scope and whether the request is general docs lookup, model selection, a GPT-5.4 upgrade, or a GPT-5.4 prompt upgrade. +2. If it is a model-selection request, load `references/latest-model.md`. +3. 
If it is an explicit GPT-5.4 upgrade request, load `references/upgrading-to-gpt-5p4.md`. +4. If the upgrade may require prompt changes, or the workflow is research-heavy, tool-heavy, coding-oriented, multi-agent, or long-running, also load `references/gpt-5p4-prompting-guide.md`. +5. Search docs with a precise query. +6. Fetch the best page and the exact section needed (use `anchor` when possible). +7. For GPT-5.4 upgrade reviews, always make the per-usage-site output explicit: target model, starting reasoning recommendation, `phase` assessment when relevant, prompt blocks, and compatibility status. +8. Answer with concise guidance and cite the doc source, using the reference files only as helper context. + +## Reference map + +Read only what you need: + +- `references/latest-model.md` -> model-selection and "best/latest/current model" questions; verify every recommendation against current OpenAI docs before answering. +- `references/upgrading-to-gpt-5p4.md` -> only for explicit GPT-5.4 upgrade and upgrade-planning requests; verify the checklist and compatibility guidance against current OpenAI docs before answering. +- `references/gpt-5p4-prompting-guide.md` -> prompt rewrites and prompt-behavior upgrades for GPT-5.4; verify prompting guidance against current OpenAI docs before answering. + +## Quality rules + +- Treat OpenAI docs as the source of truth; avoid speculation. +- Keep quotes short and within policy limits; prefer paraphrase with citations. +- If multiple pages differ, call out the difference and cite both. +- Reference files are convenience guides only; for volatile guidance such as recommended models, upgrade instructions, or prompting advice, current OpenAI docs always win. +- If docs do not cover the user’s need, say so and offer next steps. + +## Tooling notes + +- Always use MCP doc tools before any web search for OpenAI-related questions. +- If the MCP server is installed but returns no meaningful results, then use web search as a fallback. 
+- When falling back to web search, restrict to official OpenAI domains (developers.openai.com, platform.openai.com) and cite sources. diff --git a/.codex/skills/.system/openai-docs/agents/openai.yaml b/.codex/skills/.system/openai-docs/agents/openai.yaml new file mode 100644 index 0000000..d72b601 --- /dev/null +++ b/.codex/skills/.system/openai-docs/agents/openai.yaml @@ -0,0 +1,14 @@ +interface: + display_name: "OpenAI Docs" + short_description: "Reference official OpenAI docs, including upgrade guidance" + icon_small: "./assets/openai-small.svg" + icon_large: "./assets/openai.png" + default_prompt: "Look up official OpenAI docs, load relevant GPT-5.4 upgrade references when applicable, and answer with concise, cited guidance." + +dependencies: + tools: + - type: "mcp" + value: "openaiDeveloperDocs" + description: "OpenAI Developer Docs MCP server" + transport: "streamable_http" + url: "https://developers.openai.com/mcp" diff --git a/.codex/skills/.system/openai-docs/assets/openai-small.svg b/.codex/skills/.system/openai-docs/assets/openai-small.svg new file mode 100644 index 0000000..1d075dc --- /dev/null +++ b/.codex/skills/.system/openai-docs/assets/openai-small.svg @@ -0,0 +1,3 @@ + + + diff --git a/.codex/skills/.system/openai-docs/assets/openai.png b/.codex/skills/.system/openai-docs/assets/openai.png new file mode 100644 index 0000000000000000000000000000000000000000..e9b9eb80cd90ccdfc7e276b07f4046aa9c9d1887 GIT binary patch literal 1429 zcmeAS@N?(olHy`uVBq!ia0y~yU`PRB4kiW$hW`yr%nS^yF`h1tAr*7p-Z`6pJ4NQe z$K(C+N$RJsxkT^Yw&HF^>y~$}Nl`mDddip;E;ILw`DwnR*lCstr($8FvwER}wCiOT zYnDr=CbIHnP7mFZIsMwX*>BbR<{y_~3XR(8|F?LvO2_@@H-3El{N(oXm7*uZ7HKGH z$XUH${WCp5hoOhHVIzZq7{f890}%`o+ziQ#=z_Oi`U|=`eb^?DDDv^Ie9$Ukrx4wx z8h`)Pc6hY7v;?Yrt6vh(s}ZvB*2{9YLlJ>0OxGSH3bf68u<(gT>#9{*8M{`#oVG0b z|UUFeBr9RC7aT$#6MREhEz{l=^S`&ReHi~?L3`_yL+#Se*b>; z!rHlipIml1Z7mi0)u+toYS5%)rE?ayRh_;)%?IzUaovZrDAvB-_t(N`)?;Lx+{>ge*5IIA4i+wbu7HQcUE&3PMRxoIx=d@ zbg}A$CnrWEBneGm}N)b 
zo$hhxcHQNr-)`FLgoVqU?4Nj)>#f&)uhI`2p6>qt=evNe^`kZOH(Pu2x4!q{n{f2g zb*{Z}=T9pf{J!^ZJEs}XSlPYluVS}q&HC%z zl4S>{y{U>8>;C^g>Ey$36YO!0H6f5K=y0ieiR9x_^*frv_r-P9R)sostdQNI>tyJw zntE}f;4Gu7-NId&S&JVpkN>E+@G;BnryH`kj~!a&%<=Fl^XUNhq85Q%6?flY>65{i zO^&Ir$W)ubE#rMf=J)6GO4B}wsg&l#%-^@hfA8AfxFyP-d9flrmGyfv@~1~;?!ByW z(DRo_x4@(wB8n~Vr~LGrY5TubU{PQ1?CbYU@6GHE_BFWn<7;|YRQl4dM>dzw*FCsG z&gQzJ;lALq@>B07yZ^fK^h)pSmD1+@Ha&Y|*8lyoM7`I~Q_OqorM-JJe^$Kho&L6W z`n}D|Uw*fL;rHxsbkSn!{okK*{TbeF8}sae5>90U;00DpKf}slU;Uh*4)=$tJ=c9 z?X|wv^XBio4>9ZycTau4FHdgI($K9&QU9E>K20*TTf5|P)w`y7`wI%D9Ip-kk(cp) z+uZ91ueoiAe01q{zrXWVqhn80lasv8_nx|b`Bs%^PA+KRYDUXPQ=N6>|Nn(GOG{ki)r1zXRim6DF41V@kh9^$VMxv^V6mG zuPv$n!soxNPqFK&;H|0W#Jqj|_&?~s^Kcf=p7C&cJyA*b!2LB>JRF0zPPSrTU|{fc L^>bP0l+XkKtSza4 literal 0 HcmV?d00001 diff --git a/.codex/skills/.system/openai-docs/references/gpt-5p4-prompting-guide.md b/.codex/skills/.system/openai-docs/references/gpt-5p4-prompting-guide.md new file mode 100644 index 0000000..dc4ebde --- /dev/null +++ b/.codex/skills/.system/openai-docs/references/gpt-5p4-prompting-guide.md @@ -0,0 +1,433 @@ +# GPT-5.4 prompting upgrade guide + +Use this guide when prompts written for older models need to be adapted for GPT-5.4 during an upgrade. Start lean: keep the model-string change narrow, preserve the original task intent, and add only the smallest prompt changes needed to recover behavior. + +## Default upgrade posture + +- Start with `model string only` whenever the old prompt is already short, explicit, and task-bounded. +- Move to `model string + light prompt rewrite` only when regressions appear in completeness, persistence, citation quality, verification, or verbosity. +- Prefer one or two targeted prompt additions over a broad rewrite. +- Treat reasoning effort as a last-mile knob. Start lower, then increase only after prompt-level fixes and evals. 
+- Before increasing reasoning effort, first add a completeness contract, a verification loop, and tool persistence rules - depending on the usage case. +- If the workflow clearly depends on implementation changes rather than prompt changes, treat it as blocked for prompt-only upgrade guidance. +- Do not classify a case as blocked just because the workflow uses tools; block only if the upgrade requires changing tool definitions, wiring, or other implementation details. + +## Behavioral differences to account for + +Current GPT-5.4 upgrade guidance suggests these strengths: + +- stronger personality and tone adherence, with less drift over long answers +- better long-horizon and agentic workflow stamina +- stronger spreadsheet, finance, and formatting tasks +- more efficient tool selection and fewer unnecessary calls by default +- stronger structured generation and classification reliability + +The main places where prompt guidance still helps are: + +- retrieval-heavy workflows that need persistent tool use and explicit completeness +- research and citation discipline +- verification before irreversible or high-impact actions +- terminal and tool workflow hygiene +- defaults and implied follow-through +- verbosity control for compact, information-dense answers + +Start with the smallest set of instructions that preserves correctness. Add the prompt blocks below only for workflows that actually need them. 
+ +## Prompt rewrite patterns + +| Older prompt pattern | GPT-5.4 adjustment | Why | Example addition | +| --- | --- | --- | --- | +| Long, repetitive instructions that compensate for weaker instruction following | Remove duplicate scaffolding and keep only the constraints that materially change behavior | GPT-5.4 usually needs less repeated steering | Replace repeated reminders with one concise rule plus a verification block | +| Fast assistant prompt with no verbosity control | Keep the prompt as-is first; add a verbosity clamp only if outputs become too long | Many GPT-4o or GPT-4.1 upgrades work with just a model-string swap | Add `output_verbosity_spec` only after a verbosity regression | +| Tool-heavy agent prompt that assumes the model will keep searching until complete | Add persistence and verification rules | GPT-5.4 may use fewer tool calls by default for efficiency | Add `tool_persistence_rules` and `verification_loop` | +| Tool-heavy workflow where later actions depend on earlier lookup or retrieval | Add prerequisite and missing-context rules before action steps | GPT-5.4 benefits from explicit dependency-aware routing when context is still thin | Add `dependency_checks` and `missing_context_gating` | +| Retrieval workflow with several independent lookups | Add selective parallelism guidance | GPT-5.4 is strong at parallel tool use, but should not parallelize dependent steps | Add `parallel_tool_calling` | +| Batch workflow prompt that often misses items | Add an explicit completeness contract | Item accounting benefits from direct instruction | Add `completeness_contract` | +| Research prompt that needs grounding and citation discipline | Add research, citation, and empty-result recovery blocks | Multi-pass retrieval is stronger when the model is told how to react to weak or empty search results | Add `research_mode`, `citation_rules`, and `empty_result_handling`; add `tool_persistence_rules` when retrieval tools are already in use | +| Coding or 
terminal prompt with shell misuse or early stop failures | Keep the same tool surface and add terminal hygiene and verification instructions | Tool-using coding workflows are not blocked just because tools exist; they usually need better prompt steering, not host rewiring | Add `terminal_tool_hygiene` and `verification_loop`, optionally `tool_persistence_rules` | +| Multi-agent or support-triage workflow with escalation or completeness requirements | Add one lightweight control block for persistence, completeness, or verification | GPT-5.4 can be more efficient by default, so multi-step support flows benefit from an explicit completion or verification contract | Add at least one of `tool_persistence_rules`, `completeness_contract`, or `verification_loop` | + +## Prompt blocks + +Use these selectively. Do not add all of them by default. + +### `output_verbosity_spec` + +Use when: + +- the upgraded model gets too wordy +- the host needs compact, information-dense answers +- the workflow benefits from a short overview plus a checklist + +```text + +- Default: 3-6 sentences or up to 6 bullets. +- If the user asked for a doc or report, use headings with short bullets. +- For multi-step tasks: + - Start with 1 short overview paragraph. + - Then provide a checklist with statuses: [done], [todo], or [blocked]. +- Avoid repeating the user's request. +- Prefer compact, information-dense writing. + +``` + +### `default_follow_through_policy` + +Use when: + +- the host expects the model to proceed on reversible, low-risk steps +- the upgraded model becomes too conservative or asks for confirmation too often + +```text + +- If the user's intent is clear and the next step is reversible and low-risk, proceed without asking permission. +- Only ask permission if the next step is: + (a) irreversible, + (b) has external side effects, or + (c) requires missing sensitive information or a choice that materially changes outcomes. 
+- If proceeding, state what you did and what remains optional. + +``` + +### `instruction_priority` + +Use when: + +- users often change task shape, format, or tone mid-conversation +- the host needs an explicit override policy instead of relying on defaults + +```text + +- User instructions override default style, tone, formatting, and initiative preferences. +- Safety, honesty, privacy, and permission constraints do not yield. +- If a newer user instruction conflicts with an earlier one, follow the newer instruction. +- Preserve earlier instructions that do not conflict. + +``` + +### `tool_persistence_rules` + +Use when: + +- the workflow needs multiple retrieval or verification steps +- the model starts stopping too early because it is trying to save tool calls + +```text + +- Use tools whenever they materially improve correctness, completeness, or grounding. +- Do not stop early just to save tool calls. +- Keep calling tools until: + (1) the task is complete, and + (2) verification passes. +- If a tool returns empty or partial results, retry with a different strategy. + +``` + +### `dig_deeper_nudge` + +Use when: + +- the model is too literal or stops at the first plausible answer +- the task is safety- or accuracy-sensitive and needs a small initiative nudge before raising reasoning effort + +```text + +- Do not stop at the first plausible answer. +- Look for second-order issues, edge cases, and missing constraints. +- If the task is safety- or accuracy-critical, perform at least one verification step. + +``` + +### `dependency_checks` + +Use when: + +- later actions depend on prerequisite lookup, memory retrieval, or discovery steps +- the model may be tempted to skip prerequisite work because the intended end state seems obvious + +```text + +- Before taking an action, check whether prerequisite discovery, lookup, or memory retrieval is required. +- Do not skip prerequisite steps just because the intended final action seems obvious. 
+- If a later step depends on the output of an earlier one, resolve that dependency first. + +``` + +### `parallel_tool_calling` + +Use when: + +- the workflow has multiple independent retrieval steps +- wall-clock time matters but some steps still need sequencing + +```text + +- When multiple retrieval or lookup steps are independent, prefer parallel tool calls to reduce wall-clock time. +- Do not parallelize steps with prerequisite dependencies or where one result determines the next action. +- After parallel retrieval, pause to synthesize before making more calls. +- Prefer selective parallelism: parallelize independent evidence gathering, not speculative or redundant tool use. + +``` + +### `completeness_contract` + +Use when: + +- the task involves batches, lists, enumerations, or multiple deliverables +- missing items are a common failure mode + +```text + +- Deliver all requested items. +- Maintain an itemized checklist of deliverables. +- For lists or batches: + - state the expected count, + - enumerate items 1..N, + - confirm that none are missing before finalizing. +- If any item is blocked by missing data, mark it [blocked] and state exactly what is missing. + +``` + +### `empty_result_handling` + +Use when: + +- the workflow frequently performs search, CRM, logs, or retrieval steps +- no-results failures are often false negatives + +```text + +If a lookup returns empty or suspiciously small results: +- Do not conclude that no results exist immediately. +- Try at least 2 fallback strategies, such as a broader query, alternate filters, or another source. +- Only then report that no results were found, along with what you tried. + +``` + +### `verification_loop` + +Use when: + +- the workflow has downstream impact +- accuracy, formatting, or completeness regressions matter + +```text + +Before finalizing: +- Check correctness: does the output satisfy every requirement? +- Check grounding: are factual claims backed by retrieved sources or tool output? 
+- Check formatting: does the output match the requested schema or style? +- Check safety and irreversibility: if the next step has external side effects, ask permission first. + +``` + +### `missing_context_gating` + +Use when: + +- required context is sometimes missing early in the workflow +- the model should prefer retrieval over guessing + +```text + +- If required context is missing, do not guess. +- Prefer the appropriate lookup tool when the context is retrievable; ask a minimal clarifying question only when it is not. +- If you must proceed, label assumptions explicitly and choose a reversible action. + +``` + +### `action_safety` + +Use when: + +- the agent will actively take actions through tools +- the host benefits from a short pre-flight and post-flight execution frame + +```text + +- Pre-flight: summarize the intended action and parameters in 1-2 lines. +- Execute via tool. +- Post-flight: confirm the outcome and any validation that was performed. + +``` + +### `citation_rules` + +Use when: + +- the workflow produces cited answers +- fabricated citations or wrong citation formats are costly + +```text + +- Only cite sources that were actually retrieved in this session. +- Never fabricate citations, URLs, IDs, or quote spans. +- If you cannot find a source for a claim, say so and either: + - soften the claim, or + - explain how to verify it with tools. +- Use exactly the citation format required by the host application. + +``` + +### `research_mode` + +Use when: + +- the workflow is research-heavy +- the host uses web search or retrieval tools + +```text + +- Do research in 3 passes: + 1) Plan: list 3-6 sub-questions to answer. + 2) Retrieve: search each sub-question and follow 1-2 second-order leads. + 3) Synthesize: resolve contradictions and write the final answer with citations. +- Stop only when more searching is unlikely to change the conclusion. 
+ +``` + +If your host environment uses a specific research tool or requires a submit step, combine this with the host's finalization contract. + +### `structured_output_contract` + +Use when: + +- the host depends on strict JSON, SQL, or other structured output + +```text + +- Output only the requested format. +- Do not add prose or markdown fences unless they were requested. +- Validate that parentheses and brackets are balanced. +- Do not invent tables or fields. +- If required schema information is missing, ask for it or return an explicit error object. + +``` + +### `bbox_extraction_spec` + +Use when: + +- the workflow extracts OCR boxes, document regions, or other coordinates +- layout drift or missed dense regions are common failure modes + +```text + +- Use the specified coordinate format exactly, such as [x1,y1,x2,y2] normalized to 0..1. +- For each box, include page, label, text snippet, and confidence. +- Add a vertical-drift sanity check so boxes stay aligned with the correct line of text. +- If the layout is dense, process page by page and do a second pass for missed items. + +``` + +### `terminal_tool_hygiene` + +Use when: + +- the prompt belongs to a terminal-based or coding-agent workflow +- tool misuse or shell misuse has been observed + +```text + +- Only run shell commands through the terminal tool. +- Never try to "run" tool names as shell commands. +- If a patch or edit tool exists, use it directly instead of emulating it in bash. +- After changes, run a lightweight verification step such as ls, tests, or a build before declaring the task done. + +``` + +### `user_updates_spec` + +Use when: + +- the workflow is long-running and user updates matter + +```text + +- Only update the user when starting a new major phase or when the plan changes. +- Each update should contain: + - 1 sentence on what changed, + - 1 sentence on the next step. +- Do not narrate routine tool calls. 
+- Keep the user-facing update short, even when the actual work is exhaustive. + +``` + +If you are using [Compaction](https://developers.openai.com/api/docs/guides/compaction) in the Responses API, compact after major milestones, treat compacted items as opaque state, and keep prompts functionally identical after compaction. + +## Responses `phase` guidance + +For long-running Responses workflows, preambles, or tool-heavy agents that replay assistant items, review whether `phase` is already preserved. + +- If the host already round-trips `phase`, keep it intact during the upgrade. +- If the host uses `previous_response_id` and does not manually replay assistant items, note that this may reduce manual `phase` handling needs. +- If reliable GPT-5.4 behavior would require adding or preserving `phase` and that would need code edits, treat the case as blocked for prompt-only or model-string-only migration guidance. + +## Example upgrade profiles + +### GPT-5.2 + +- Use `gpt-5.4` +- Match the current reasoning effort first +- Preserve the existing latency and quality profile before tuning prompt blocks +- If the repo does not expose the exact setting, emit `same` as the starting recommendation + +### GPT-5.3-Codex + +- Use `gpt-5.4` +- Match the current reasoning effort first +- If you need Codex-style speed and efficiency, add verification blocks before increasing reasoning effort +- If the repo does not expose the exact setting, emit `same` as the starting recommendation + +### GPT-4o or GPT-4.1 assistant + +- Use `gpt-5.4` +- Start with `none` reasoning effort +- Add `output_verbosity_spec` only if output becomes too verbose + +### Long-horizon agent + +- Use `gpt-5.4` +- Start with `medium` reasoning effort +- Add `tool_persistence_rules` +- Add `completeness_contract` +- Add `verification_loop` + +### Research workflow + +- Use `gpt-5.4` +- Start with `medium` reasoning effort +- Add `research_mode` +- Add `citation_rules` +- Add `empty_result_handling` +- Add 
`tool_persistence_rules` when the host already uses web or retrieval tools +- Add `parallel_tool_calling` when the retrieval steps are independent + +### Support triage or multi-agent workflow + +- Use `gpt-5.4` +- Prefer `model string + light prompt rewrite` over `model string only` +- Add at least one of `tool_persistence_rules`, `completeness_contract`, or `verification_loop` +- Add more only if evals show a real regression + +### Coding or terminal workflow + +- Use `gpt-5.4` +- Keep the model-string change narrow +- Match the current reasoning effort first if you are upgrading from GPT-5.3-Codex +- Add `terminal_tool_hygiene` +- Add `verification_loop` +- Add `dependency_checks` when actions depend on prerequisite lookup or discovery +- Add `tool_persistence_rules` if the agent stops too early +- Review whether `phase` is already preserved for long-running Responses flows or assistant preambles +- Do not classify this as blocked just because the workflow uses tools; block only if the upgrade requires changing tool definitions or wiring +- If the repo already uses Responses plus tools and no required host-side change is shown, prefer `model_string_plus_light_prompt_rewrite` over `blocked` + +## Prompt regression checklist + +- Check whether the upgraded prompt still preserves the original task intent. +- Check whether the new prompt is leaner, not just longer. +- Check completeness, citation quality, dependency handling, verification behavior, and verbosity. +- For long-running Responses agents, check whether `phase` handling is already in place or needs implementation work. +- Confirm that each added prompt block addresses an observed regression. +- Remove prompt blocks that are not earning their keep. 
diff --git a/.codex/skills/.system/openai-docs/references/latest-model.md b/.codex/skills/.system/openai-docs/references/latest-model.md new file mode 100644 index 0000000..91a787e --- /dev/null +++ b/.codex/skills/.system/openai-docs/references/latest-model.md @@ -0,0 +1,35 @@ +# Latest model guide + +This file is a curated helper. Every recommendation here must be verified against current OpenAI docs before it is repeated to a user. + +## Current model map + +| Model ID | Use for | +| --- | --- | +| `gpt-5.4` | Default text plus reasoning for most new apps | +| `gpt-5.4-pro` | Only when the user explicitly asks for maximum reasoning or quality; substantially slower and more expensive | +| `gpt-5-mini` | Cheaper and faster reasoning with good quality | +| `gpt-5-nano` | High-throughput simple tasks and classification | +| `gpt-5.4` | Explicit no-reasoning text path via `reasoning.effort: none` | +| `gpt-4.1-mini` | Cheaper no-reasoning text | +| `gpt-4.1-nano` | Fastest and cheapest no-reasoning text | +| `gpt-5.3-codex` | Agentic coding, code editing, and tool-heavy coding workflows | +| `gpt-5.1-codex-mini` | Cheaper coding workflows | +| `gpt-image-1.5` | Best image generation and edit quality | +| `gpt-image-1-mini` | Cost-optimized image generation | +| `gpt-4o-mini-tts` | Text-to-speech | +| `gpt-4o-mini-transcribe` | Speech-to-text, fast and cost-efficient | +| `gpt-realtime-1.5` | Realtime voice and multimodal sessions | +| `gpt-realtime-mini` | Cheaper realtime sessions | +| `gpt-audio` | Chat Completions audio input and output | +| `gpt-audio-mini` | Cheaper Chat Completions audio workflows | +| `sora-2` | Faster iteration and draft video generation | +| `sora-2-pro` | Higher-quality production video | +| `omni-moderation-latest` | Text and image moderation | +| `text-embedding-3-large` | Higher-quality retrieval embeddings; default in this skill because no best-specific row exists | +| `text-embedding-3-small` | Lower-cost embeddings | + +## Maintenance 
notes + +- This file will drift unless it is periodically re-verified against current OpenAI docs. +- If this file conflicts with current docs, the docs win. diff --git a/.codex/skills/.system/openai-docs/references/upgrading-to-gpt-5p4.md b/.codex/skills/.system/openai-docs/references/upgrading-to-gpt-5p4.md new file mode 100644 index 0000000..7a6775f --- /dev/null +++ b/.codex/skills/.system/openai-docs/references/upgrading-to-gpt-5p4.md @@ -0,0 +1,164 @@ +# Upgrading to GPT-5.4 + +Use this guide when the user explicitly asks to upgrade an existing integration to GPT-5.4. Pair it with current OpenAI docs lookups. The default target string is `gpt-5.4`. + +## Upgrade posture + +Upgrade with the narrowest safe change set: + +- replace the model string first +- update only the prompts that are directly tied to that model usage +- prefer prompt-only upgrades when possible +- if the upgrade would require API-surface changes, parameter rewrites, tool rewiring, or broader code edits, mark it as blocked instead of stretching the scope + +## Upgrade workflow + +1. Inventory current model usage. + - Search for model strings, client calls, and prompt-bearing files. + - Include inline prompts, prompt templates, YAML or JSON configs, Markdown docs, and saved prompts when they are clearly tied to a model usage site. +2. Pair each model usage with its prompt surface. + - Prefer the closest prompt surface first: inline system or developer text, then adjacent prompt files, then shared templates. + - If you cannot confidently tie a prompt to the model usage, say so instead of guessing. +3. Classify the source model family. + - Common buckets: `gpt-4o` or `gpt-4.1`, `o1` or `o3` or `o4-mini`, early `gpt-5`, later `gpt-5.x`, or mixed and unclear. +4. Decide the upgrade class. + - `model string only` + - `model string + light prompt rewrite` + - `blocked without code changes` +5. Run the no-code compatibility gate. 
+ - Check whether the current integration can accept `gpt-5.4` without API-surface changes or implementation changes. + - For long-running Responses or tool-heavy agents, check whether `phase` is already preserved or round-tripped when the host replays assistant items or uses preambles. + - If compatibility depends on code changes, return `blocked`. + - If compatibility is unclear, return `unknown` rather than improvising. +6. Recommend the upgrade. + - Default replacement string: `gpt-5.4` + - Keep the intervention small and behavior-preserving. +7. Deliver a structured recommendation. + - `Current model usage` + - `Recommended model-string updates` + - `Starting reasoning recommendation` + - `Prompt updates` + - `Phase assessment` when the flow is long-running, replayed, or tool-heavy + - `No-code compatibility check` + - `Validation plan` + - `Launch-day refresh items` + +Output rule: + +- Always emit a starting `reasoning_effort_recommendation` for each usage site. +- If the repo exposes the current reasoning setting, preserve it first unless the source guide says otherwise. +- If the repo does not expose the current setting, use the source-family starting mapping instead of returning `null`. 
+ +## Upgrade outcomes + +### `model string only` + +Choose this when: + +- the existing prompts are already short, explicit, and task-bounded +- the workflow is not strongly research-heavy, tool-heavy, multi-agent, batch or completeness-sensitive, or long-horizon +- there are no obvious compatibility blockers + +Default action: + +- replace the model string with `gpt-5.4` +- keep prompts unchanged +- validate behavior with existing evals or spot checks + +### `model string + light prompt rewrite` + +Choose this when: + +- the old prompt was compensating for weaker instruction following +- the workflow needs more persistence than the default tool-use behavior will likely provide +- the task needs stronger completeness, citation discipline, or verification +- the upgraded model becomes too verbose or under-complete unless instructed otherwise +- the workflow is research-heavy and needs stronger handling of sparse or empty retrieval results +- the workflow is coding-oriented, tool-heavy, or multi-agent, but the existing API surface and tool definitions can remain unchanged + +Default action: + +- replace the model string with `gpt-5.4` +- add one or two targeted prompt blocks +- read `references/gpt-5p4-prompting-guide.md` to choose the smallest prompt changes that recover the old behavior +- avoid broad prompt cleanup unrelated to the upgrade +- for research workflows, default to `research_mode` + `citation_rules` + `empty_result_handling`; add `tool_persistence_rules` when the host already uses retrieval tools +- for dependency-aware or tool-heavy workflows, default to `tool_persistence_rules` + `dependency_checks` + `verification_loop`; add `parallel_tool_calling` only when retrieval steps are truly independent +- for coding or terminal workflows, default to `terminal_tool_hygiene` + `verification_loop` +- for multi-agent support or triage workflows, default to at least one of `tool_persistence_rules`, `completeness_contract`, or `verification_loop` +- for 
long-running Responses agents with preambles or multiple assistant messages, explicitly review whether `phase` is already handled; if adding or preserving `phase` would require code edits, mark the path as `blocked` +- do not classify a coding or tool-using Responses workflow as `blocked` just because the visible snippet is minimal; prefer `model string + light prompt rewrite` unless the repo clearly shows that a safe GPT-5.4 path would require host-side code changes + +### `blocked` + +Choose this when: + +- the upgrade appears to require API-surface changes +- the upgrade appears to require parameter rewrites or reasoning-setting changes that are not exposed outside implementation code +- the upgrade would require changing tool definitions, tool handler wiring, or schema contracts +- you cannot confidently identify the prompt surface tied to the model usage + +Default action: + +- do not improvise a broader upgrade +- report the blocker and explain that the fix is out of scope for this guide + +## No-code compatibility checklist + +Before recommending a no-code upgrade, check: + +1. Can the current host accept the `gpt-5.4` model string without changing client code or API surface? +2. Are the related prompts identifiable and editable? +3. Does the host depend on behavior that likely needs API-surface changes, parameter rewrites, or tool rewiring? +4. Would the likely fix be prompt-only, or would it need implementation changes? +5. Is the prompt surface close enough to the model usage that you can make a targeted change instead of a broad cleanup? +6. For long-running Responses or tool-heavy agents, is `phase` already preserved if the host relies on preambles, replayed assistant items, or multiple assistant messages? + +If item 1 is no, items 3 through 4 point to implementation work, or item 6 is no and the fix needs code changes, return `blocked`. + +If item 2 is no, return `unknown` unless the user can point to the prompt location. 
+ +Important: + +- Existing use of tools, agents, or multiple usage sites is not by itself a blocker. +- If the current host can keep the same API surface and the same tool definitions, prefer `model string + light prompt rewrite` over `blocked`. +- Reserve `blocked` for cases that truly require implementation changes, not cases that only need stronger prompt steering. + +## Scope boundaries + +This guide may: + +- update or recommend updated model strings +- update or recommend updated prompts +- inspect code and prompt files to understand where those changes belong +- inspect whether existing Responses flows already preserve `phase` +- flag compatibility blockers + +This guide may not: + +- move Chat Completions code to Responses +- move Responses code to another API surface +- rewrite parameter shapes +- change tool definitions or tool-call handling +- change structured-output wiring +- add or retrofit `phase` handling in implementation code +- edit business logic, orchestration logic, or SDK usage beyond a literal model-string replacement + +If a safe GPT-5.4 upgrade requires any of those changes, mark the path as blocked and out of scope. + +## Validation plan + +- Validate each upgraded usage site with existing evals or realistic spot checks. +- Check whether the upgraded model still matches expected latency, output shape, and quality. +- If prompt edits were added, confirm each block is doing real work instead of adding noise. +- If the workflow has downstream impact, add a lightweight verification pass before finalization. + +## Launch-day refresh items + +When final GPT-5.4 guidance changes: + +1. Replace release-candidate assumptions with final GPT-5.4 guidance where appropriate. +2. Re-check whether the default target string should stay `gpt-5.4` for all source families. +3. Re-check any prompt-block recommendations whose semantics may have changed. +4. Re-check research, citation, and compatibility guidance against the final model behavior. +5. 
Re-run the same upgrade scenarios and confirm the blocked-versus-viable boundaries still hold. diff --git a/.codex/skills/.system/skill-creator/SKILL.md b/.codex/skills/.system/skill-creator/SKILL.md index 72bc0b9..57f4e58 100644 --- a/.codex/skills/.system/skill-creator/SKILL.md +++ b/.codex/skills/.system/skill-creator/SKILL.md @@ -45,6 +45,14 @@ Match the level of specificity to the task's fragility and variability: Think of Codex as exploring a path: a narrow bridge with cliffs needs specific guardrails (low freedom), while an open field allows many routes (high freedom). +### Protect Validation Integrity + +You may use subagents during iteration to validate whether a skill works on realistic tasks or whether a suspected problem is real. This is most useful when you want an independent pass on the skill's behavior, outputs, or failure modes after a revision. Only do this when it is possible to start new subagents. + +When using subagents for validation, treat that as an evaluation surface. The goal is to learn whether the skill generalizes, not whether another agent can reconstruct the answer from leaked context. + +Prefer raw artifacts such as example prompts, outputs, diffs, logs, or traces. Give the minimum task-local context needed to perform the validation. Avoid passing the intended answer, suspected bug, intended fix, or your prior conclusions unless the validation explicitly requires them. + ### Anatomy of a Skill Every skill consists of a required SKILL.md file and optional bundled resources: @@ -221,7 +229,7 @@ Skill creation involves these steps: 3. Initialize the skill (run init_skill.py) 4. Edit the skill (implement resources and write SKILL.md) 5. Validate the skill (run quick_validate.py) -6. Iterate based on real usage +6. Iterate based on real usage and forward-test complex skills. Follow these steps in order, skipping only if there is a clear reason why they are not applicable. 
@@ -245,6 +253,7 @@ For example, when building an image-editor skill, relevant questions include: - "Can you give some examples of how this skill would be used?" - "I can imagine users asking for things like 'Remove the red-eye from this image' or 'Rotate this image'. Are there other ways you imagine this skill being used?" - "What would a user say that should trigger this skill?" +- "Where should I create this skill? If you do not have a preference, I will place it in `$CODEX_HOME/skills` (or `~/.codex/skills` when `CODEX_HOME` is unset) so Codex can discover it automatically." To avoid overwhelming users, avoid asking too many questions in a single message. Start with the most important questions and follow up as needed for better effectiveness. @@ -280,6 +289,8 @@ At this point, it is time to actually create the skill. Skip this step only if the skill being developed already exists. In this case, continue to the next step. +Before running `init_skill.py`, ask where the user wants the skill created. If they do not specify a location, default to `$CODEX_HOME/skills`; when `CODEX_HOME` is unset, fall back to `~/.codex/skills` so the skill is auto-discovered. + When creating a new skill from scratch, always run the `init_skill.py` script. The script conveniently generates a new template skill directory that automatically includes everything a skill requires, making the skill creation process much more efficient and reliable. 
Usage: @@ -291,9 +302,9 @@ scripts/init_skill.py --path [--resources script Examples: ```bash -scripts/init_skill.py my-skill --path skills/public -scripts/init_skill.py my-skill --path skills/public --resources scripts,references -scripts/init_skill.py my-skill --path skills/public --resources scripts --examples +scripts/init_skill.py my-skill --path "${CODEX_HOME:-$HOME/.codex}/skills" +scripts/init_skill.py my-skill --path "${CODEX_HOME:-$HOME/.codex}/skills" --resources scripts,references +scripts/init_skill.py my-skill --path ~/work/skills --resources scripts --examples ``` The script: @@ -318,6 +329,8 @@ Only include other optional interface fields when the user explicitly provides t When editing the (newly-generated or existing) skill, remember that the skill is being created for another instance of Codex to use. Include information that would be beneficial and non-obvious to Codex. Consider what procedural knowledge, domain-specific details, or reusable assets would help another Codex instance execute these tasks more effectively. +After substantial revisions, or if the skill is particularly tricky, you should use subagents to forward-test the skill on realistic tasks or artifacts. When doing so, pass the artifact under validation rather than your diagnosis of what is wrong, and keep the prompt generic enough that success depends on transferable reasoning rather than hidden ground truth. + #### Start with Reusable Skill Contents To begin implementation, start with the reusable resources identified above: `scripts/`, `references/`, and `assets/` files. Note that this step may require user input. For example, when implementing a `brand-guidelines` skill, the user may need to provide brand assets or templates to store in `assets/`, or documentation to store in `references/`. @@ -358,11 +371,46 @@ The validation script checks YAML frontmatter format, required fields, and namin ### Step 6: Iterate -After testing the skill, users may request improvements. 
Often this happens right after using the skill, with fresh context of how the skill performed. +After testing the skill, you may detect the skill is complex enough that it requires forward-testing; or users may request improvements. -**Iteration workflow:** +User testing often happens right after using the skill, with fresh context of how the skill performed. + +**Forward-testing and iteration workflow:** 1. Use the skill on real tasks 2. Notice struggles or inefficiencies 3. Identify how SKILL.md or bundled resources should be updated 4. Implement changes and test again +5. Forward-test if it is reasonable and appropriate + +## Forward-testing + +To forward-test, launch subagents as a way to stress-test the skill with minimal context. +Subagents should *not* know that they are being asked to test the skill. They should be treated as +an agent asked to perform a task by the user. Prompts to subagents should look like: + `Use $skill-x at /path/to/skill-x to solve problem y` +Not: + `Review the skill at /path/to/skill-x; pretend a user asks you to...` + +Decision rule for forward-testing: + - Err on the side of forward-testing + - Ask for approval if you think there's a risk that forward-testing would: + * take a long time, + * require additional approvals from the user, or + * modify live production systems + + In these cases, show the user your proposed prompt and request (1) a yes/no decision, and + (2) any suggested modifications. + +Considerations when forward-testing: + - use fresh threads for independent passes + - pass the skill and a request phrased the way a user would phrase it + - pass raw artifacts, not your conclusions + - avoid showing expected answers or intended fixes + - rebuild context from source artifacts after each iteration + - review the subagent's output and reasoning and emitted artifacts + - avoid leaving artifacts the agent can find on disk between iterations; + clean up subagents' artifacts to avoid additional contamination.
+ +If forward-testing only succeeds when subagents see leaked context, tighten the skill or the +forward-testing setup before trusting the result. diff --git a/.codex/skills/.system/skill-creator/scripts/init_skill.py b/.codex/skills/.system/skill-creator/scripts/init_skill.py index f90703e..69673ea 100644 --- a/.codex/skills/.system/skill-creator/scripts/init_skill.py +++ b/.codex/skills/.system/skill-creator/scripts/init_skill.py @@ -326,6 +326,9 @@ def init_skill(skill_name, path, resources, include_examples, interface_override print("2. Create resource directories only if needed (scripts/, references/, assets/)") print("3. Update agents/openai.yaml if the UI metadata should differ") print("4. Run the validator when ready to check the skill structure") + print( + "5. Forward-test complex skills with realistic user requests to ensure they work as intended" + ) return skill_dir