diff --git a/openkb/agent/chat.py b/openkb/agent/chat.py index 9b1afc2f..9e9907ff 100644 --- a/openkb/agent/chat.py +++ b/openkb/agent/chat.py @@ -415,6 +415,12 @@ def _start_live() -> Any: def _save_transcript(kb_dir: Path, session: ChatSession, name: str | None) -> Path: + from openkb.lint import ( + build_norm_index, + list_existing_wiki_targets, + strip_ghost_wikilinks, + ) + explore_dir = kb_dir / "wiki" / "explorations" explore_dir.mkdir(parents=True, exist_ok=True) @@ -423,6 +429,15 @@ def _save_transcript(kb_dir: Path, session: ChatSession, name: str | None) -> Pa date = session.created_at[:10].replace("-", "") path = explore_dir / f"{slug}-{date}.md" + # Strip ghost wikilinks from assistant responses (the agent's + # instructions encourage [[wikilinks]] but it can reference pages + # that don't exist on disk). User turns are written verbatim — they + # represent intentional user input, not LLM hallucination. + # Build the normalized index once and reuse for every turn — the + # whitelist is the same across the whole session. + known = list_existing_wiki_targets(kb_dir / "wiki") + norm_index = build_norm_index(known) + lines: list[str] = [ "---", f'session: "{session.id}"', @@ -436,7 +451,11 @@ def _save_transcript(kb_dir: Path, session: ChatSession, name: str | None) -> Pa for i, (u, a) in enumerate(zip(session.user_turns, session.assistant_texts), 1): lines.append(f"## [{i}] {u}") lines.append("") - lines.append(a or "_(no response recorded)_") + if a: + cleaned_a, _ = strip_ghost_wikilinks(a, known, norm_index=norm_index) + lines.append(cleaned_a) + else: + lines.append("_(no response recorded)_") lines.append("") path.write_text("\n".join(lines), encoding="utf-8") diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index b5695d58..6fee638e 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -28,6 +28,7 @@ import litellm +from openkb.lint import list_existing_wiki_targets, strip_ghost_wikilinks from openkb.schema import get_agents_md logger = logging.getLogger(__name__) @@ -89,6 +90,19 @@ Return ONLY valid JSON, no fences, no explanation. """ +_KNOWN_TARGETS_USER = """\ +The wiki currently contains these pages, and they are the COMPLETE list of \ +valid [[wikilink]] targets you may use in the responses that follow: + +{known_targets} + +Rules for [[wikilinks]] in all subsequent responses: +- For [[concepts/X]]: X must appear in the whitelist above. +- For [[summaries/Y]]: Y must appear in the whitelist above. +- Do NOT invent new wikilink targets. If you want to mention a concept \ +that is not in the whitelist, write it as plain text without brackets. +""" + _CONCEPT_PAGE_USER = """\ Write the concept page for: {title} @@ -99,7 +113,8 @@ - "brief": A single sentence (under 100 chars) defining this concept - "content": The full concept page in Markdown. Include clear explanation, \ key details from the source document, and [[wikilinks]] to related concepts \ -and [[summaries/{doc_name}]] +and [[summaries/{doc_name}]] — subject to the wikilink rules from the \ +whitelist message above. Return ONLY valid JSON, no fences. """ @@ -112,8 +127,13 @@ New information from document "{doc_name}" (summarized above) should be \ integrated into this page. Rewrite the full page incorporating the new \ -information naturally — do not just append. Maintain existing \ -[[wikilinks]] and add new ones where appropriate. +information naturally — do not just append. Preserve the existing structure \ +and intent of the page. + +For [[wikilinks]] in the rewrite, follow the whitelist rules from the \ +message above: keep links whose target is in the whitelist, convert any \ +existing links whose target is NOT in the whitelist to plain text, and do \ +not invent new wikilink targets. Return a JSON object with two keys: - "brief": A single sentence (under 100 chars) defining this concept (may differ from before) @@ -122,6 +142,25 @@ Return ONLY valid JSON, no fences. """ +_SUMMARY_REWRITE_USER = """\ +Task: Rewrite the summary you wrote above into a final version that is \ +consistent with the concept pages now in the wiki (per the whitelist message \ +above). + +STRICT rules: +- Preserve every factual claim, finding, and detail from your draft. Do \ +NOT add or remove technical content, examples, or claims. +- For [[wikilinks]], follow the whitelist message above: keep valid links, \ +replace targets not in the whitelist with plain text, do not invent new \ +wikilink targets. +- You MAY upgrade plain-text mentions to [[wikilinks]] when the concept \ +appears in the whitelist — this is encouraged. +- Keep the headings, paragraph structure, and approximately the same length \ +as the draft. + +Return ONLY the rewritten Markdown content (no JSON, no fences, no frontmatter). +""" + _LONG_DOC_SUMMARY_USER = """\ This is a PageIndex summary for long document "{doc_name}" (doc_id: {doc_id}): @@ -650,6 +689,13 @@ def _update_index( DEFAULT_COMPILE_CONCURRENCY = 5 +def _format_known_targets(targets: set[str]) -> str: + """Format the whitelist as a bulleted Markdown list for prompt injection.""" + if not targets: + return "(none yet — do not use any [[wikilinks]] in your output)" + return "\n".join(f"- {t}" for t in sorted(targets)) + + async def _compile_concepts( wiki_dir: Path, kb_dir: Path, @@ -661,11 +707,16 @@ async def _compile_concepts( max_concurrency: int, doc_brief: str = "", doc_type: str = "short", + rewrite_summary: bool = False, ) -> None: """Shared Steps 2-4: concepts plan → generate/update → index. Uses ``_CONCEPTS_PLAN_USER`` to get a plan with create/update/related - actions, then executes each action type accordingly. + actions, then executes each action type accordingly. Concept bodies are + generated in memory, scrubbed of unresolved wikilinks, and only then + written to disk. When ``rewrite_summary=True`` (short-doc path), the + summary is rewritten by the LLM after concepts are finalized so its + wikilinks reflect the actual concept pages on disk. """ source_file = f"summaries/{doc_name}.md" @@ -685,11 +736,32 @@ async def _compile_concepts( )}, ], "concepts-plan", max_tokens=1024) + def _write_v1_summary_stripped() -> None: + """Fallback writer for the v1 summary on early-return paths. + + Strips against the set of wikilink targets currently on disk before + writing, so the v1 summary's LLM-hallucinated links don't slip past + the ghost-link defense when plan parsing fails or the plan is empty. + ``plan.create`` slugs are unknown at this point, so the whitelist + is just what physically exists. + """ + fallback_targets = list_existing_wiki_targets(wiki_dir) + fallback_targets.add(f"summaries/{doc_name}") + cleaned, ghosts = strip_ghost_wikilinks(summary, fallback_targets) + if ghosts: + logger.info( + "stripped %d ghost wikilink(s) from fallback v1 summary %s: %s", + len(ghosts), doc_name, ghosts[:5], + ) + _write_summary(wiki_dir, doc_name, cleaned) + try: parsed = _parse_json(plan_raw) except (json.JSONDecodeError, ValueError) as exc: logger.warning("Failed to parse concepts plan: %s", exc) logger.debug("Raw: %s", plan_raw) + if rewrite_summary: + _write_v1_summary_stripped() _update_index(wiki_dir, doc_name, [], doc_brief=doc_brief, doc_type=doc_type) return @@ -708,9 +780,43 @@ async def _compile_concepts( related_items = plan["related"] if not create_items and not update_items and not related_items: + if rewrite_summary: + _write_v1_summary_stripped() _update_index(wiki_dir, doc_name, [], doc_brief=doc_brief, doc_type=doc_type) return + # Build the whitelist of valid wikilink targets the LLM may emit. It + # combines what already exists on disk with what *this* round will + # produce (plan.create + plan.update + plan.related), plus the + # summary about to be written for this document. + planned_slugs = { + _sanitize_concept_name(c["name"]) for c in create_items + update_items + } | { + _sanitize_concept_name(s) for s in related_items + } + known_targets: set[str] = ( + list_existing_wiki_targets(wiki_dir) + | {f"concepts/{s}" for s in planned_slugs} + | {f"summaries/{doc_name}"} + ) + known_targets_str = _format_known_targets(known_targets) + + # Third cache breakpoint: the whitelist of valid wikilink targets. By + # carrying this list in its own cached user message — placed between + # summary_msg (BP2) and each per-concept user turn — every concept + # generation call and the summary-rewrite call reuses the whitelist + # tokens from cache instead of re-billing them on every request. This + # matters as the KB grows (the list can reach 5-10k tokens for a + # 500-concept wiki). Plan call deliberately omits this message — at + # plan time the whitelist isn't known yet, and plan uses concept_briefs + # via _CONCEPTS_PLAN_USER instead. + known_targets_msg = { + "role": "user", + "content": _cached_text(_KNOWN_TARGETS_USER.format( + known_targets=known_targets_str, + )), + } + # --- Step 3: Generate/update concept pages concurrently (A cached) --- semaphore = asyncio.Semaphore(max_concurrency) @@ -720,8 +826,9 @@ async def _gen_create(concept: dict) -> tuple[str, str, bool, str]: async with semaphore: raw = await _llm_call_async(model, [ system_msg, - doc_msg, - summary_msg, + doc_msg, # cached (BP1) + summary_msg, # cached (BP2) + known_targets_msg, # cached (BP3) — whitelist {"role": "user", "content": _CONCEPT_PAGE_USER.format( title=title, doc_name=doc_name, update_instruction="", @@ -751,8 +858,9 @@ async def _gen_update(concept: dict) -> tuple[str, str, bool, str]: async with semaphore: raw = await _llm_call_async(model, [ system_msg, - doc_msg, - summary_msg, + doc_msg, # cached (BP1) + summary_msg, # cached (BP2) + known_targets_msg, # cached (BP3) — whitelist {"role": "user", "content": _CONCEPT_UPDATE_USER.format( title=title, doc_name=doc_name, existing_content=existing_content, @@ -772,6 +880,7 @@ async def _gen_update(concept: dict) -> tuple[str, str, bool, str]: concept_names: list[str] = [] concept_briefs_map: dict[str, str] = {} + pending_writes: list[tuple[str, str, bool, str]] = [] if tasks: total = len(tasks) @@ -785,12 +894,93 @@ async def _gen_update(concept: dict) -> tuple[str, str, bool, str]: logger.warning("Concept generation failed: %s", r) continue name, page_content, is_update, brief = r - _write_concept(wiki_dir, name, page_content, source_file, is_update, brief=brief) + pending_writes.append((name, page_content, is_update, brief)) safe_name = _sanitize_concept_name(name) concept_names.append(safe_name) if brief: concept_briefs_map[safe_name] = brief + # Strip unresolved wikilinks from concept bodies before writing. The + # whitelist includes existing files + this round's planned slugs + + # the summary for this document. + for i, (name, page_content, is_update, brief) in enumerate(pending_writes): + cleaned, ghosts = strip_ghost_wikilinks(page_content, known_targets) + if ghosts: + logger.info( + "stripped %d ghost wikilink(s) from concept %s: %s", + len(ghosts), name, ghosts[:5], + ) + pending_writes[i] = (name, cleaned, is_update, brief) + + # --- Optional Step 3a: LLM rewrite the summary with full whitelist --- + # Only for the short-doc path. The long-doc path leaves the indexer- + # written summary untouched. + # + # The rewrite call is best-effort: on any failure (API error, empty + # response, exception) we fall back to the v1 summary stripped against + # the full whitelist, so the summary is always written and never wiped. + if rewrite_summary: + candidate: str | None = None + try: + # No max_tokens cap — matches the v1 summary call. The rewrite + # prompt asks the model to keep length within ±20% of the v1. + rewrite_raw = _llm_call(model, [ + system_msg, + doc_msg, # cached (BP1) + summary_msg, # cached (BP2) — contains the v1 summary text + known_targets_msg, # cached (BP3) — whitelist + {"role": "user", "content": _SUMMARY_REWRITE_USER}, + ], "summary-rewrite") + candidate = rewrite_raw.strip() + # Strip frontmatter if the model added one anyway. + if candidate.startswith("---"): + end = candidate.find("---", 3) + if end != -1: + candidate = candidate[end + 3:].lstrip("\n") + # Safety net: strip any wikilink the rewrite emitted that is + # not in the whitelist. + candidate, summary_ghosts = strip_ghost_wikilinks( + candidate, known_targets + ) + if summary_ghosts: + logger.info( + "stripped %d ghost wikilink(s) from summary %s: %s", + len(summary_ghosts), doc_name, summary_ghosts[:5], + ) + except Exception as exc: + logger.warning( + "summary-rewrite failed for %s: %s. Falling back to v1.", + doc_name, exc, + ) + candidate = None + + if candidate: + final_summary = candidate + else: + # Rewrite produced no content (empty response or exception). + # Strip the v1 summary against the same whitelist so the + # fallback doesn't reintroduce ghost links. + if candidate is not None: + logger.warning( + "summary-rewrite returned empty for %s; using v1 fallback.", + doc_name, + ) + final_summary, fallback_ghosts = strip_ghost_wikilinks( + summary, known_targets, + ) + if fallback_ghosts: + logger.info( + "stripped %d ghost wikilink(s) from v1 fallback summary %s: %s", + len(fallback_ghosts), doc_name, fallback_ghosts[:5], + ) + _write_summary(wiki_dir, doc_name, final_summary) + + # --- Write concept pages to disk --- + for name, page_content, is_update, brief in pending_writes: + _write_concept( + wiki_dir, name, page_content, source_file, is_update, brief=brief, + ) + # --- Step 3b: Process related items (code only, no LLM) --- sanitized_related = [_sanitize_concept_name(s) for s in related_items] for slug in sanitized_related: @@ -840,7 +1030,11 @@ async def compile_short_doc( doc_name=doc_name, content=content, ))} - # --- Step 1: Generate summary --- + # --- Step 1: Generate summary (v1, held in memory) --- + # The summary is NOT written to disk yet — it's used as cache context + # for the plan + concept-generation calls, then rewritten into a final + # v2 (with a whitelist of known wikilink targets) inside + # _compile_concepts before being written to disk. summary_raw = _llm_call(model, [system_msg, doc_msg], "summary") try: summary_parsed = _parse_json(summary_raw) @@ -849,13 +1043,12 @@ async def compile_short_doc( except (json.JSONDecodeError, ValueError): doc_brief = "" summary = summary_raw - _write_summary(wiki_dir, doc_name, summary) - # --- Steps 2-4: Concept plan → generate/update → index --- + # --- Steps 2-4: Concept plan → generate/update → summary rewrite → index --- await _compile_concepts( wiki_dir, kb_dir, model, system_msg, doc_msg, summary, doc_name, max_concurrency, doc_brief=doc_brief, - doc_type="short", + doc_type="short", rewrite_summary=True, ) diff --git a/openkb/cli.py b/openkb/cli.py index 8b08d520..7e833f4a 100644 --- a/openkb/cli.py +++ b/openkb/cli.py @@ -405,12 +405,20 @@ def query(ctx, question, save, raw): if save and answer: import re + from openkb.lint import list_existing_wiki_targets, strip_ghost_wikilinks slug = re.sub(r"[^a-z0-9]+", "-", question.lower()).strip("-")[:60] explore_dir = kb_dir / "wiki" / "explorations" explore_dir.mkdir(parents=True, exist_ok=True) explore_path = explore_dir / f"{slug}.md" + # Strip ghost wikilinks the agent may have emitted to non-existent + # concept/summary pages — the schema_md in the agent's instructions + # encourages [[wikilinks]] but the agent's view of "which pages + # exist" can drift from disk reality. + known = list_existing_wiki_targets(kb_dir / "wiki") + cleaned_answer, _ = strip_ghost_wikilinks(answer, known) explore_path.write_text( - f"---\nquery: \"{question}\"\n---\n\n{answer}\n", encoding="utf-8" + f"---\nquery: \"{question}\"\n---\n\n{cleaned_answer}\n", + encoding="utf-8", ) click.echo(f"\nSaved to {explore_path}") @@ -601,16 +609,25 @@ async def run_lint(kb_dir: Path) -> Path | None: @cli.command() -@click.option("--fix", is_flag=True, default=False, help="Automatically fix lint issues (not yet implemented).") +@click.option("--fix", is_flag=True, default=False, + help="Rewrite broken [[wikilinks]] in place (fuzzy match) or " + "strip to plain text when no match. Runs before the report.") @click.pass_context def lint(ctx, fix): """Lint the knowledge base for structural and semantic inconsistencies.""" - if fix: - click.echo("Warning: --fix is not yet implemented. Running lint in report-only mode.") kb_dir = _find_kb_dir(ctx.obj.get("kb_dir_override")) if kb_dir is None: click.echo("No knowledge base found. Run `openkb init` first.") return + if fix: + from openkb.lint import fix_broken_links + files_changed, ghosts = fix_broken_links(kb_dir / "wiki") + if files_changed: + click.echo( + f"Fixed {ghosts} wikilink(s) across {files_changed} file(s)." + ) + else: + click.echo("Nothing to fix — all wikilinks resolve.") asyncio.run(run_lint(kb_dir)) diff --git a/openkb/lint.py b/openkb/lint.py index 78b22e5f..db5af226 100644 --- a/openkb/lint.py +++ b/openkb/lint.py @@ -9,6 +9,7 @@ from __future__ import annotations import re +import unicodedata from pathlib import Path # Matches [[wikilink]] or [[subdir/link]] @@ -18,6 +19,106 @@ _EXCLUDED_FILES = {"AGENTS.md", "SCHEMA.md", "log.md"} +def _normalize_target(target: str) -> str: + """Normalize a wikilink target for fuzzy comparison. + + Applies, in order: + - NFKC unicode normalization (e.g. full-width ')' → ASCII ')') + - Lowercase + - Underscore → hyphen + - Collapse repeated hyphens + - Strip leading/trailing hyphens (per segment when path-like) + + Path separators are preserved so ``concepts/Gist_Memory`` normalizes to + ``concepts/gist-memory``. + """ + s = unicodedata.normalize("NFKC", target) + s = s.lower().replace("_", "-") + # Normalize each path segment independently to avoid collapsing the '/' + parts = [re.sub(r"-+", "-", p).strip("-") for p in s.split("/")] + return "/".join(parts) + + +def build_norm_index(known_targets: set[str]) -> dict[str, str]: + """Build the normalized-form → canonical-target index used by + :func:`strip_ghost_wikilinks`. + + Useful when calling ``strip_ghost_wikilinks`` repeatedly with the same + ``known_targets`` (e.g. ``fix_broken_links`` scanning N wiki files, or + ``_save_transcript`` stripping N assistant turns) — build the index + once and pass it via the ``norm_index`` parameter to avoid O(N·M) + redundant rebuilds. + """ + return {_normalize_target(t): t for t in known_targets} + + +def strip_ghost_wikilinks( + content: str, + known_targets: set[str], + *, + norm_index: dict[str, str] | None = None, +) -> tuple[str, list[str]]: + """Strip [[wikilinks]] whose targets do not exist in ``known_targets``. + + For each ``[[X]]`` or ``[[X|alias]]`` in ``content``: + + - If ``X`` is in ``known_targets`` exactly, the link is kept as-is. + - Otherwise, ``X`` is normalized (see :func:`_normalize_target`) and + matched against the normalized form of each known target. On a hit, + the link is rewritten to the canonical target form. + - Otherwise, the brackets are removed and the link becomes plain text + (the alias if provided, otherwise the slug rendered as words). + + Args: + content: Markdown text containing zero or more ``[[wikilinks]]``. + known_targets: Valid link targets, e.g. + ``{"concepts/attention", "summaries/paper"}``. + norm_index: Optional pre-built normalized index from + :func:`build_norm_index`. Pass this when calling in a loop + with the same ``known_targets`` to skip redundant rebuilds. + + Returns: + Tuple of ``(rewritten_content, ghost_targets)`` where + ``ghost_targets`` is the list of unresolved targets that were + stripped (one entry per occurrence, in document order). + """ + if norm_index is None: + norm_index = build_norm_index(known_targets) + + ghosts: list[str] = [] + + def _repl(m: re.Match) -> str: + raw = m.group(1) + if "|" in raw: + target, alias = raw.split("|", 1) + target = target.strip() + alias = alias.strip() + else: + target = raw.strip() + alias = None + + # Direct hit + if target in known_targets: + return m.group(0) + + # Fuzzy normalized hit → rewrite to canonical + canonical = norm_index.get(_normalize_target(target)) + if canonical is not None: + if alias: + return f"[[{canonical}|{alias}]]" + return f"[[{canonical}]]" + + # Ghost — strip brackets, leave readable display + ghosts.append(target) + if alias: + return alias + stem = target.rsplit("/", 1)[-1] + return stem.replace("-", " ").replace("_", " ") + + cleaned = _WIKILINK_RE.sub(_repl, content) + return cleaned, ghosts + + def _read_md(path: Path) -> str: """Read a Markdown file safely, returning empty string on error.""" try: @@ -51,6 +152,73 @@ def _extract_wikilinks(text: str) -> list[str]: return [link.split("|")[0].strip() for link in raw] +def list_existing_wiki_targets(wiki_dir: Path) -> set[str]: + """Return the set of currently-existing wikilink targets on disk. + + Includes every ``concepts/{stem}`` and ``summaries/{stem}`` for .md files + actually present in the wiki, plus ``index`` when ``index.md`` exists. + Used to seed the whitelist passed to :func:`strip_ghost_wikilinks` from + both the compile pipeline and any other code path that writes + LLM-generated content to the wiki (e.g. ``openkb query --save``). + """ + targets: set[str] = set() + concepts_dir = wiki_dir / "concepts" + summaries_dir = wiki_dir / "summaries" + if concepts_dir.is_dir(): + targets.update(f"concepts/{p.stem}" for p in concepts_dir.glob("*.md")) + if summaries_dir.is_dir(): + targets.update(f"summaries/{p.stem}" for p in summaries_dir.glob("*.md")) + if (wiki_dir / "index.md").exists(): + targets.add("index") + return targets + + +def fix_broken_links(wiki: Path) -> tuple[int, int]: + """Rewrite or strip broken [[wikilinks]] across the wiki in place. + + For each Markdown page under ``wiki`` (excluding ``reports/`` and + ``sources/`` and excluded files), runs :func:`strip_ghost_wikilinks` + against the set of valid targets currently on disk. Targets that match + fuzzily (case, ``_`` vs ``-``, NFKC) are rewritten to canonical form; + targets that have no match are demoted to plain text. + + Args: + wiki: Path to the wiki root directory. + + Returns: + Tuple of ``(files_changed, ghosts_stripped)``. + """ + pages = _all_wiki_pages(wiki) + # The same fuzzy normalization _all_wiki_pages stores both the full + # relative path (e.g. ``concepts/attention``) and the bare stem + # (``attention``). Use the full-path keys so that links like + # ``[[concepts/foo]]`` resolve against ``concepts/`` files only. + known_targets: set[str] = { + key for key in pages if "/" in key or key == "index" + } + # Build the normalized index once and reuse across every file — + # otherwise strip_ghost_wikilinks would rebuild it per file (O(F·M)). + norm_index = build_norm_index(known_targets) + + files_changed = 0 + ghosts_stripped = 0 + for md in wiki.rglob("*.md"): + if md.name in _EXCLUDED_FILES: + continue + rel_parts = md.relative_to(wiki).parts + if rel_parts and rel_parts[0] in ("reports", "sources"): + continue + text = _read_md(md) + cleaned, ghosts = strip_ghost_wikilinks( + text, known_targets, norm_index=norm_index, + ) + if cleaned != text: + md.write_text(cleaned, encoding="utf-8") + files_changed += 1 + ghosts_stripped += len(ghosts) + return files_changed, ghosts_stripped + + def find_broken_links(wiki: Path) -> list[str]: """Scan all wiki pages for [[wikilinks]] pointing to non-existent targets. diff --git a/tests/test_chat_slash_commands.py b/tests/test_chat_slash_commands.py index 56337901..2a781da5 100644 --- a/tests/test_chat_slash_commands.py +++ b/tests/test_chat_slash_commands.py @@ -216,3 +216,41 @@ async def test_slash_clear(tmp_path): with p: result = await _handle_slash("/clear", kb_dir, session, _STYLE) assert result == "new_session" + + +def test_save_transcript_strips_ghost_wikilinks(tmp_path): + """`/save` writes the chat transcript to wiki/explorations/. Assistant + responses may contain [[wikilinks]] to pages that don't exist on disk + (the agent's instructions encourage wikilinks but its view of which + pages exist can drift). The save path strips them before writing so + `openkb lint` doesn't surface them as broken links on the next run. + """ + from openkb.agent.chat import _save_transcript + + kb_dir = _setup_kb(tmp_path) + # A real concept page on disk → valid wikilink target. + (kb_dir / "wiki" / "concepts" / "attention.md").write_text( + "# Attention\n", encoding="utf-8", + ) + + session = _make_session(kb_dir) + session.user_turns = ["What is a transformer?"] + session.assistant_texts = [ + "A transformer uses [[concepts/attention]] to model relationships. " + "Unlike [[concepts/rnn]], it processes tokens in parallel and relies " + "on [[concepts/positional-encoding]] for order." + ] + session.title = "transformer-q" + + path = _save_transcript(kb_dir, session, name=None) + + text = path.read_text(encoding="utf-8") + # Valid link preserved + assert "[[concepts/attention]]" in text + # Ghost links stripped to plain text + assert "[[concepts/rnn]]" not in text + assert "rnn" in text + assert "[[concepts/positional-encoding]]" not in text + assert "positional encoding" in text + # User turn preserved verbatim (not stripped) + assert "What is a transformer?" in text diff --git a/tests/test_cli.py b/tests/test_cli.py index 83c3415b..8c4e108c 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -113,3 +113,48 @@ def test_query_enables_stream_when_stdout_is_tty(self, kb_dir): # Stream branch should NOT echo the answer again — run_query already # wrote tokens to stdout as they arrived. assert "the answer" not in result.output + + +class TestQuerySaveGhostStrip: + """`openkb query --save` writes the LLM answer to wiki/explorations/. + The agent's instructions encourage [[wikilinks]], but its view of which + pages exist can drift from disk. Ghost wikilinks in the saved file + would then surface as broken links the next time `openkb lint` runs. + The save path strips them before writing. + """ + + def test_save_strips_ghost_wikilinks(self, kb_dir): + # A real concept page exists on disk → valid wikilink target. + (kb_dir / "wiki" / "concepts" / "attention.md").write_text( + "# Attention\n", encoding="utf-8", + ) + + # The agent's answer includes one valid + two ghost wikilinks. + answer = ( + "Transformers rely on [[concepts/attention]] over the input. " + "They differ from [[concepts/rnn]] which processes sequentially, " + "and use [[concepts/multi-head-attention]] as a key building block." + ) + + async def fake_run_query(*_args, **_kwargs): + return answer + + with patch("openkb.cli._stream_to_tty", return_value=False), \ + patch("openkb.agent.query.run_query", side_effect=fake_run_query), \ + patch("openkb.cli._setup_llm_key"), \ + patch("openkb.cli.append_log"): + result = CliRunner().invoke( + cli, ["--kb-dir", str(kb_dir), "query", "transformers?", "--save"] + ) + + assert result.exit_code == 0, result.output + explore_files = list((kb_dir / "wiki" / "explorations").glob("*.md")) + assert len(explore_files) == 1 + saved = explore_files[0].read_text() + # Valid link preserved + assert "[[concepts/attention]]" in saved + # Ghost links stripped to plain text + assert "[[concepts/rnn]]" not in saved + assert "rnn" in saved + assert "[[concepts/multi-head-attention]]" not in saved + assert "multi head attention" in saved diff --git a/tests/test_compiler.py b/tests/test_compiler.py index 19fb6fcb..aebb3213 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -700,6 +700,10 @@ async def test_full_pipeline(self, tmp_path): "update": [], "related": [], }) + # The rewrite step (third sync call) returns raw Markdown. + summary_rewrite_response = ( + "# Summary\n\nThis document discusses [[concepts/transformer]]." + ) concept_page_response = json.dumps({ "brief": "NN architecture using self-attention", "content": "# Transformer\n\nA neural network architecture.", @@ -707,7 +711,11 @@ async def test_full_pipeline(self, tmp_path): with patch("openkb.agent.compiler.litellm") as mock_litellm: mock_litellm.completion = MagicMock( - side_effect=_mock_completion([summary_response, concepts_list_response]) + side_effect=_mock_completion([ + summary_response, + concepts_list_response, + summary_rewrite_response, + ]) ) mock_litellm.acompletion = AsyncMock( side_effect=_mock_acompletion([concept_page_response]) @@ -717,7 +725,10 @@ async def test_full_pipeline(self, tmp_path): # Verify summary written summary_path = wiki / "summaries" / "test-doc.md" assert summary_path.exists() - assert "full_text: sources/test-doc.md" in summary_path.read_text() + summary_text = summary_path.read_text() + assert "full_text: sources/test-doc.md" in summary_text + # Summary body comes from the rewrite step + assert "[[concepts/transformer]]" in summary_text # Verify concept written concept_path = wiki / "concepts" / "transformer.md" @@ -753,6 +764,169 @@ async def test_handles_bad_json(self, tmp_path): assert (wiki / "summaries" / "doc.md").exists() +class TestCompileShortDocFallbacks: + """Regression tests for the summary-rewrite resilience path. + + The rewrite call can fail (API error, empty response, parse error). + In every failure mode the v1 summary should be written to disk — + stripped against the current whitelist so it doesn't reintroduce + ghost wikilinks — never an empty file or missing file. + """ + + @staticmethod + def _setup_kb(tmp_path): + wiki = tmp_path / "wiki" + (wiki / "sources").mkdir(parents=True) + (wiki / "summaries").mkdir(parents=True) + (wiki / "concepts").mkdir(parents=True) + (wiki / "index.md").write_text( + "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n", + encoding="utf-8", + ) + (tmp_path / ".openkb").mkdir() + source_path = wiki / "sources" / "doc.md" + source_path.write_text("Body.", encoding="utf-8") + return wiki, source_path + + @pytest.mark.asyncio + async def test_rewrite_empty_response_falls_back_to_v1(self, tmp_path): + wiki, source_path = self._setup_kb(tmp_path) + + v1_summary_content = ( + "# Summary\n\nDiscusses [[concepts/transformer]] and [[concepts/ghost]]." + ) + summary_response = json.dumps({ + "brief": "B", "content": v1_summary_content, + }) + plan_response = json.dumps({ + "create": [{"name": "transformer", "title": "Transformer"}], + "update": [], "related": [], + }) + # Rewrite returns an empty string → must fall back to v1 + rewrite_response = "" + concept_response = json.dumps({"brief": "C", "content": "# T\n\nBody."}) + + with patch("openkb.agent.compiler.litellm") as mock_litellm: + mock_litellm.completion = MagicMock( + side_effect=_mock_completion([ + summary_response, plan_response, rewrite_response, + ]) + ) + mock_litellm.acompletion = AsyncMock( + side_effect=_mock_acompletion([concept_response]) + ) + await compile_short_doc("doc", source_path, tmp_path, "gpt-4o-mini") + + summary_path = wiki / "summaries" / "doc.md" + assert summary_path.exists() + text = summary_path.read_text() + # The v1 content should be on disk (fallback) — stripped of ghosts. + assert "Discusses" in text + assert "[[concepts/transformer]]" in text # valid link kept + assert "[[concepts/ghost]]" not in text # ghost stripped + assert "ghost" in text # but plain text remains + + @pytest.mark.asyncio + async def test_rewrite_exception_falls_back_to_v1(self, tmp_path): + wiki, source_path = self._setup_kb(tmp_path) + + v1_summary_content = ( + "# Summary\n\nUses [[concepts/transformer]] mechanism." + ) + summary_response = json.dumps({ + "brief": "B", "content": v1_summary_content, + }) + plan_response = json.dumps({ + "create": [{"name": "transformer", "title": "Transformer"}], + "update": [], "related": [], + }) + concept_response = json.dumps({"brief": "C", "content": "# T\n\nBody."}) + + # Third sync call (rewrite) raises a simulated API error. + sync_call_count = {"n": 0} + + def sync_side_effect(*args, **kwargs): + idx = sync_call_count["n"] + sync_call_count["n"] += 1 + if idx == 2: # the summary-rewrite call + raise RuntimeError("simulated API failure") + mock_resp = MagicMock() + mock_resp.choices = [MagicMock()] + mock_resp.choices[0].message.content = [ + summary_response, plan_response, + ][idx] + mock_resp.usage = MagicMock(prompt_tokens=1, completion_tokens=1) + mock_resp.usage.prompt_tokens_details = None + return mock_resp + + with patch("openkb.agent.compiler.litellm") as mock_litellm: + mock_litellm.completion = MagicMock(side_effect=sync_side_effect) + mock_litellm.acompletion = AsyncMock( + side_effect=_mock_acompletion([concept_response]) + ) + # Must NOT raise out of compile_short_doc + await compile_short_doc("doc", source_path, tmp_path, "gpt-4o-mini") + + summary_path = wiki / "summaries" / "doc.md" + assert summary_path.exists() + text = summary_path.read_text() + assert "Uses" in text + assert "[[concepts/transformer]]" in text + + @pytest.mark.asyncio + async def test_plan_parse_failure_strips_v1_summary_ghosts(self, tmp_path): + wiki, source_path = self._setup_kb(tmp_path) + + v1_summary_content = ( + "# Summary\n\nReferences [[concepts/nonexistent]] heavily." + ) + summary_response = json.dumps({ + "brief": "B", "content": v1_summary_content, + }) + # Plan call returns non-JSON garbage → triggers early return + plan_response = "not valid json at all" + + with patch("openkb.agent.compiler.litellm") as mock_litellm: + mock_litellm.completion = MagicMock( + side_effect=_mock_completion([summary_response, plan_response]) + ) + await compile_short_doc("doc", source_path, tmp_path, "gpt-4o-mini") + + summary_path = wiki / "summaries" / "doc.md" + assert summary_path.exists() + text = summary_path.read_text() + # Ghost link should be stripped to plain text on fallback path + assert "[[concepts/nonexistent]]" not in text + assert "nonexistent" in text # display text preserved + assert "References" in text + + @pytest.mark.asyncio + async def test_empty_plan_strips_v1_summary_ghosts(self, tmp_path): + wiki, source_path = self._setup_kb(tmp_path) + + v1_summary_content = ( + "# Summary\n\nMentions [[concepts/imaginary]] briefly." + ) + summary_response = json.dumps({ + "brief": "B", "content": v1_summary_content, + }) + empty_plan_response = json.dumps({ + "create": [], "update": [], "related": [], + }) + + with patch("openkb.agent.compiler.litellm") as mock_litellm: + mock_litellm.completion = MagicMock( + side_effect=_mock_completion([summary_response, empty_plan_response]) + ) + await compile_short_doc("doc", source_path, tmp_path, "gpt-4o-mini") + + summary_path = wiki / "summaries" / "doc.md" + assert summary_path.exists() + text = summary_path.read_text() + assert "[[concepts/imaginary]]" not in text + assert "imaginary" in text # plain text preserved + + class TestCacheControl: """Verify cache_control breakpoints are emitted on the right messages so Anthropic prompt caching can hit on every reuse of the base context. @@ -786,12 +960,18 @@ async def test_short_doc_marks_doc_and_summary(self, tmp_path): "create": [{"name": "topic", "title": "Topic"}], "update": [], "related": [], }) + # 3rd sync call is the summary-rewrite (raw Markdown, not JSON). + summary_rewrite_response = "# Summary\n\nrewritten body" concept_response = json.dumps({"brief": "C", "content": "page body"}) captured_sync_calls: list[list[dict]] = [] captured_async_calls: list[list[dict]] = [] - sync_responses = [summary_response, plan_response] + sync_responses = [ + summary_response, + plan_response, + summary_rewrite_response, + ] def sync_side_effect(*args, **kwargs): captured_sync_calls.append(kwargs["messages"]) @@ -817,7 +997,7 @@ async def async_side_effect(*args, **kwargs): mock_litellm.acompletion = AsyncMock(side_effect=async_side_effect) await compile_short_doc("doc", src, tmp_path, "anthropic/claude-sonnet-4-5") - # Step 1 (summary): doc_msg carries the breakpoint. + # Step 1 (summary): doc_msg carries the breakpoint (BP1). summary_call = captured_sync_calls[0] assert summary_call[0]["role"] == "system" assert summary_call[1]["role"] == "user" @@ -825,7 +1005,8 @@ async def async_side_effect(*args, **kwargs): "doc_msg in summary call must carry an ephemeral cache_control marker" ) - # Step 2 (plan): doc_msg AND assistant summary both carry breakpoints. + # Step 2 (plan): doc_msg AND assistant summary both carry breakpoints + # (BP1 + BP2). Plan does NOT include the known_targets message. plan_call = captured_sync_calls[1] assert self._has_cache_breakpoint(plan_call[1]) assert plan_call[2]["role"] == "assistant" @@ -833,11 +1014,29 @@ async def async_side_effect(*args, **kwargs): "assistant summary in plan call must carry a cache_control marker" ) - # Step 3 (concept generation): same two breakpoints reused. + # Step 3 (concept generation): BP1 + BP2 + new BP3 (known_targets msg). assert captured_async_calls, "expected at least one async concept call" concept_call = captured_async_calls[0] assert self._has_cache_breakpoint(concept_call[1]) assert self._has_cache_breakpoint(concept_call[2]) + # New: BP3 is the known_targets user message at index 3, sitting + # between summary_msg and the per-concept user prompt. + assert concept_call[3]["role"] == "user" + assert self._has_cache_breakpoint(concept_call[3]), ( + "known_targets message in concept call must carry a cache_control marker" + ) + + # Step 4 (summary rewrite): same three breakpoints reused — this is + # the whole point of the BP3 design, the whitelist is cached not + # re-billed per call. + rewrite_call = captured_sync_calls[2] + assert self._has_cache_breakpoint(rewrite_call[1]) # BP1 + assert self._has_cache_breakpoint(rewrite_call[2]) # BP2 + assert rewrite_call[3]["role"] == "user" + assert self._has_cache_breakpoint(rewrite_call[3]), ( # BP3 + "known_targets message in summary-rewrite call must carry " + "a cache_control marker" + ) @pytest.mark.asyncio async def test_long_doc_marks_doc_message(self, tmp_path): diff --git a/tests/test_lint.py b/tests/test_lint.py index 526e4a9e..b89a34c2 100644 --- a/tests/test_lint.py +++ b/tests/test_lint.py @@ -6,11 +6,14 @@ import pytest from openkb.lint import ( + _normalize_target, + build_norm_index, check_index_sync, find_broken_links, find_missing_entries, find_orphans, run_structural_lint, + strip_ghost_wikilinks, ) @@ -224,3 +227,169 @@ def test_report_includes_broken_link_details(self, tmp_path): report = run_structural_lint(tmp_path) assert "missing" in report + + +class TestNormalizeTarget: + def test_lowercases(self): + assert _normalize_target("concepts/AI-Agents") == "concepts/ai-agents" + + def test_underscore_to_hyphen(self): + assert _normalize_target("concepts/gist_memory") == "concepts/gist-memory" + + def test_nfkc_fullwidth_paren(self): + # Full-width ) normalizes to ASCII ) via NFKC + assert _normalize_target("summaries/A(B)") == _normalize_target("summaries/A(B)") + + def test_collapses_repeated_hyphens(self): + assert _normalize_target("concepts/foo--bar") == "concepts/foo-bar" + + def test_preserves_path_separator(self): + assert _normalize_target("concepts/Foo") == "concepts/foo" + # Does not collapse the slash + assert "/" in _normalize_target("concepts/Foo") + + def test_strips_trailing_hyphens_per_segment(self): + assert _normalize_target("concepts/-foo-") == "concepts/foo" + + +class TestStripGhostWikilinks: + def test_keeps_direct_match(self): + out, ghosts = strip_ghost_wikilinks( + "See [[concepts/attention]] for details.", + {"concepts/attention"}, + ) + assert out == "See [[concepts/attention]] for details." + assert ghosts == [] + + def test_strips_unknown_target(self): + out, ghosts = strip_ghost_wikilinks( + "See [[concepts/missing]] for details.", + {"concepts/attention"}, + ) + assert out == "See missing for details." + assert ghosts == ["concepts/missing"] + + def test_rewrites_fuzzy_match_underscore_to_hyphen(self): + out, ghosts = strip_ghost_wikilinks( + "See [[concepts/gist_memory]] now.", + {"concepts/gist-memory"}, + ) + assert out == "See [[concepts/gist-memory]] now." + assert ghosts == [] + + def test_rewrites_fuzzy_match_case(self): + out, ghosts = strip_ghost_wikilinks( + "See [[concepts/AI-Agents]] now.", + {"concepts/ai-agents"}, + ) + assert out == "See [[concepts/ai-agents]] now." + assert ghosts == [] + + def test_rewrites_fuzzy_match_unicode(self): + # File on disk has full-width ) + known = {"summaries/Agent 对接说明文档(已修订)"} + # LLM wrote ASCII ) + out, ghosts = strip_ghost_wikilinks( + "See [[summaries/Agent 对接说明文档(已修订)]].", + known, + ) + assert "[[summaries/Agent 对接说明文档(已修订)]]" in out + assert ghosts == [] + + def test_preserves_alias_on_direct_match(self): + out, ghosts = strip_ghost_wikilinks( + "See [[concepts/attention|the attention mechanism]].", + {"concepts/attention"}, + ) + assert "[[concepts/attention|the attention mechanism]]" in out + assert ghosts == [] + + def test_preserves_alias_on_fuzzy_match(self): + out, ghosts = strip_ghost_wikilinks( + "See [[concepts/gist_memory|gist memory]].", + {"concepts/gist-memory"}, + ) + assert "[[concepts/gist-memory|gist memory]]" in out + + def test_uses_alias_as_display_when_stripped(self): + out, ghosts = strip_ghost_wikilinks( + "See [[concepts/missing|my term]] now.", + set(), + ) + assert out == "See my term now." + assert ghosts == ["concepts/missing"] + + def test_strips_bare_link_to_readable_text(self): + # "concepts/multi_head_attention" → "multi head attention" + out, ghosts = strip_ghost_wikilinks( + "Uses [[concepts/multi_head_attention]] heavily.", + set(), + ) + assert out == "Uses multi head attention heavily." + + def test_handles_multiple_links_mixed(self): + out, ghosts = strip_ghost_wikilinks( + "[[concepts/a]] and [[concepts/b]] and [[concepts/c]]", + {"concepts/a", "concepts/c"}, + ) + assert "[[concepts/a]]" in out + assert "[[concepts/c]]" in out + assert "[[concepts/b]]" not in out + assert ghosts == ["concepts/b"] + + def test_no_wikilinks_returns_unchanged(self): + text = "Plain markdown with no wikilinks at all." + out, ghosts = strip_ghost_wikilinks(text, {"concepts/foo"}) + assert out == text + assert ghosts == [] + + def test_empty_known_set_strips_all(self): + out, ghosts = strip_ghost_wikilinks( + "[[a]] [[b/c]] [[d]]", + set(), + ) + assert "[[" not in out + assert len(ghosts) == 3 + + def test_same_ghost_appearing_multiple_times(self): + out, ghosts = strip_ghost_wikilinks( + "[[concepts/x]] and [[concepts/x]] again", + set(), + ) + # Each occurrence is recorded separately so callers can count + assert ghosts == ["concepts/x", "concepts/x"] + + def test_accepts_prebuilt_norm_index_with_identical_result(self): + """Passing a pre-built ``norm_index`` should produce the same + result as letting the function build it internally — this is the + contract that lets ``fix_broken_links`` and ``_save_transcript`` + amortize the index build across many calls. + """ + known = {"concepts/gist-memory", "concepts/attention"} + text = ( + "See [[concepts/gist_memory]] and [[concepts/attention]] and " + "[[concepts/missing]]." + ) + + # Default (no norm_index passed) + out_a, ghosts_a = strip_ghost_wikilinks(text, known) + + # With pre-built norm_index + idx = build_norm_index(known) + out_b, ghosts_b = strip_ghost_wikilinks(text, known, norm_index=idx) + + assert out_a == out_b + assert ghosts_a == ghosts_b + # Sanity: fuzzy rewrite, plus one ghost + assert "[[concepts/gist-memory]]" in out_b + assert "[[concepts/missing]]" not in out_b + + +class TestBuildNormIndex: + def test_returns_normalized_to_canonical_map(self): + idx = build_norm_index({"concepts/Gist_Memory", "summaries/Paper"}) + assert idx["concepts/gist-memory"] == "concepts/Gist_Memory" + assert idx["summaries/paper"] == "summaries/Paper" + + def test_empty_set_returns_empty_dict(self): + assert build_norm_index(set()) == {}