Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -189,8 +189,20 @@ Settings are initialized by `openkb init`, and stored in `.openkb/config.yaml`:
model: gpt-5.4 # LLM model (any LiteLLM-supported provider)
language: en # Wiki output language
pageindex_threshold: 20 # PDF pages threshold for PageIndex
storage_backend: sqlite # Storage backend: sqlite (default) or json
```

### Storage Backend

OpenKB supports two storage backends for the file hash registry:

| Backend | Description | Use Case |
|---------|-------------|----------|
| `sqlite` | SQLite database (default) | Better concurrency, scalability, recommended for production |
| `json` | JSON file | Simple, human-readable, for small installations |

Migration from JSON to SQLite happens automatically when you switch to the `sqlite` backend and a `hashes.json` file exists. The JSON file is preserved but no longer used.

Model names use `provider/model` LiteLLM [format](https://docs.litellm.ai/docs/providers) (OpenAI models can omit the prefix):

| Provider | Model example |
Expand Down
45 changes: 23 additions & 22 deletions openkb/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,14 +138,15 @@ def add_single_file(file_path: Path, kb_dir: Path) -> None:
4. Else: compile_short_doc.
"""
from openkb.agent.compiler import compile_long_doc, compile_short_doc
from openkb.state import HashRegistry
from openkb.state import get_registry

logger = logging.getLogger(__name__)
openkb_dir = kb_dir / ".openkb"
config = load_config(openkb_dir / "config.yaml")
_setup_llm_key(kb_dir)
model: str = config.get("model", DEFAULT_CONFIG["model"])
registry = HashRegistry(openkb_dir / "hashes.json")
backend = config.get("storage_backend", "sqlite")
registry = get_registry(openkb_dir, backend=backend)

# 2. Convert document
click.echo(f"Adding: {file_path.name}")
Expand Down Expand Up @@ -299,9 +300,10 @@ def init():
"model": model,
"language": DEFAULT_CONFIG["language"],
"pageindex_threshold": DEFAULT_CONFIG["pageindex_threshold"],
"storage_backend": DEFAULT_CONFIG["storage_backend"],
}
save_config(openkb_dir / "config.yaml", config)
(openkb_dir / "hashes.json").write_text(json.dumps({}), encoding="utf-8")
# The SQLite DB is created automatically by get_registry() on first access; no pre-creation is needed

# Write API key to KB-local .env (0600) if the user provided one
if api_key:
Expand Down Expand Up @@ -544,20 +546,16 @@ async def run_lint(kb_dir: Path) -> Path | None:
"""
from openkb.lint import run_structural_lint
from openkb.agent.linter import run_knowledge_lint
from openkb.state import get_registry

openkb_dir = kb_dir / ".openkb"

# Skip lint entirely when the KB has no indexed documents
hashes_file = openkb_dir / "hashes.json"
if hashes_file.exists():
hashes = json.loads(hashes_file.read_text(encoding="utf-8"))
else:
hashes = {}
config = load_config(openkb_dir / "config.yaml")
backend: str = config.get("storage_backend", "sqlite")
registry = get_registry(openkb_dir, backend=backend)
hashes = registry.all_entries()
if not hashes:
click.echo("Nothing to lint — no documents indexed yet. Run `openkb add` first.")
return

config = load_config(openkb_dir / "config.yaml")
_setup_llm_key(kb_dir)
model: str = config.get("model", DEFAULT_CONFIG["model"])

Expand Down Expand Up @@ -601,13 +599,13 @@ def lint(ctx, fix):

def print_list(kb_dir: Path) -> None:
"""Print all documents in the knowledge base. Usable from CLI and chat REPL."""
openkb_dir = kb_dir / ".openkb"
hashes_file = openkb_dir / "hashes.json"
if not hashes_file.exists():
click.echo("No documents indexed yet.")
return
from openkb.state import get_registry

hashes = json.loads(hashes_file.read_text(encoding="utf-8"))
openkb_dir = kb_dir / ".openkb"
config = load_config(openkb_dir / "config.yaml")
backend = config.get("storage_backend", "sqlite")
registry = get_registry(openkb_dir, backend=backend)
hashes = registry.all_entries()
if not hashes:
click.echo("No documents indexed yet.")
return
Expand Down Expand Up @@ -688,11 +686,14 @@ def print_status(kb_dir: Path) -> None:
click.echo(f" {'raw':<20} {raw_count:<10}")

# Hash registry summary
from openkb.state import get_registry

openkb_dir = kb_dir / ".openkb"
hashes_file = openkb_dir / "hashes.json"
if hashes_file.exists():
hashes = json.loads(hashes_file.read_text(encoding="utf-8"))
click.echo(f"\n Total indexed: {len(hashes)} document(s)")
config = load_config(openkb_dir / "config.yaml")
backend = config.get("storage_backend", "sqlite")
registry = get_registry(openkb_dir, backend=backend)
hashes = registry.all_entries()
click.echo(f"\n Total indexed: {len(hashes)} document(s)")

# Last compile time: newest file in wiki/summaries/
summaries_dir = wiki_dir / "summaries"
Expand Down
1 change: 1 addition & 0 deletions openkb/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"model": "gpt-5.4-mini",
"language": "en",
"pageindex_threshold": 20,
"storage_backend": "sqlite",
}

GLOBAL_CONFIG_DIR = Path.home() / ".config" / "openkb"
Expand Down
7 changes: 4 additions & 3 deletions openkb/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from openkb.config import load_config
from openkb.images import copy_relative_images, extract_base64_images, convert_pdf_with_images
from openkb.state import HashRegistry
from openkb.state import get_registry

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -50,12 +50,13 @@ def convert_document(src: Path, kb_dir: Path) -> ConvertResult:
openkb_dir = kb_dir / ".openkb"
config = load_config(openkb_dir / "config.yaml")
threshold: int = config.get("pageindex_threshold", 20)
registry = HashRegistry(openkb_dir / "hashes.json")
backend = config.get("storage_backend", "sqlite")
registry = get_registry(openkb_dir, backend=backend)

# ------------------------------------------------------------------
# 1. Hash check
# ------------------------------------------------------------------
file_hash = HashRegistry.hash_file(src)
file_hash = registry.hash_file(src)
if registry.is_known(file_hash):
logger.info("Skipping already-known file: %s", src.name)
return ConvertResult(skipped=True)
Expand Down
Loading