Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -189,8 +189,20 @@ Settings are initialized by `openkb init`, and stored in `.openkb/config.yaml`:
model: gpt-5.4 # LLM model (any LiteLLM-supported provider)
language: en # Wiki output language
pageindex_threshold: 20 # PDF pages threshold for PageIndex
storage_backend: sqlite # Storage backend: sqlite (default) or json
```

### Storage Backend

OpenKB supports two storage backends for the file hash registry:

| Backend | Description | Use Case |
|---------|-------------|----------|
| `sqlite` | SQLite database (default) | Better concurrency, scalability, recommended for production |
| `json` | JSON file | Simple, human-readable, for small installations |

Migration from JSON to SQLite happens automatically when you switch to the `sqlite` backend and a `hashes.json` file exists. The JSON file is preserved but no longer used.

Model names use `provider/model` LiteLLM [format](https://docs.litellm.ai/docs/providers) (OpenAI models can omit the prefix):

| Provider | Model example |
Expand Down
45 changes: 23 additions & 22 deletions openkb/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,14 +138,15 @@ def add_single_file(file_path: Path, kb_dir: Path) -> None:
4. Else: compile_short_doc.
"""
from openkb.agent.compiler import compile_long_doc, compile_short_doc
from openkb.state import HashRegistry
from openkb.state import get_registry

logger = logging.getLogger(__name__)
openkb_dir = kb_dir / ".openkb"
config = load_config(openkb_dir / "config.yaml")
_setup_llm_key(kb_dir)
model: str = config.get("model", DEFAULT_CONFIG["model"])
registry = HashRegistry(openkb_dir / "hashes.json")
backend = config.get("storage_backend", "sqlite")
registry = get_registry(openkb_dir, backend=backend)

# 2. Convert document
click.echo(f"Adding: {file_path.name}")
Expand Down Expand Up @@ -299,9 +300,10 @@ def init():
"model": model,
"language": DEFAULT_CONFIG["language"],
"pageindex_threshold": DEFAULT_CONFIG["pageindex_threshold"],
"storage_backend": DEFAULT_CONFIG["storage_backend"],
}
save_config(openkb_dir / "config.yaml", config)
(openkb_dir / "hashes.json").write_text(json.dumps({}), encoding="utf-8")
# The SQLite DB is created automatically by get_registry() on first access; no pre-creation is needed

# Write API key to KB-local .env (0600) if the user provided one
if api_key:
Expand Down Expand Up @@ -544,20 +546,16 @@ async def run_lint(kb_dir: Path) -> Path | None:
"""
from openkb.lint import run_structural_lint
from openkb.agent.linter import run_knowledge_lint
from openkb.state import get_registry

openkb_dir = kb_dir / ".openkb"

# Skip lint entirely when the KB has no indexed documents
hashes_file = openkb_dir / "hashes.json"
if hashes_file.exists():
hashes = json.loads(hashes_file.read_text(encoding="utf-8"))
else:
hashes = {}
config = load_config(openkb_dir / "config.yaml")
backend: str = config.get("storage_backend", "sqlite")
registry = get_registry(openkb_dir, backend=backend)
hashes = registry.all_entries()
if not hashes:
click.echo("Nothing to lint — no documents indexed yet. Run `openkb add` first.")
return

config = load_config(openkb_dir / "config.yaml")
_setup_llm_key(kb_dir)
model: str = config.get("model", DEFAULT_CONFIG["model"])

Expand Down Expand Up @@ -601,13 +599,13 @@ def lint(ctx, fix):

def print_list(kb_dir: Path) -> None:
"""Print all documents in the knowledge base. Usable from CLI and chat REPL."""
openkb_dir = kb_dir / ".openkb"
hashes_file = openkb_dir / "hashes.json"
if not hashes_file.exists():
click.echo("No documents indexed yet.")
return
from openkb.state import get_registry

hashes = json.loads(hashes_file.read_text(encoding="utf-8"))
openkb_dir = kb_dir / ".openkb"
config = load_config(openkb_dir / "config.yaml")
backend = config.get("storage_backend", "sqlite")
registry = get_registry(openkb_dir, backend=backend)
hashes = registry.all_entries()
if not hashes:
click.echo("No documents indexed yet.")
return
Expand Down Expand Up @@ -688,11 +686,14 @@ def print_status(kb_dir: Path) -> None:
click.echo(f" {'raw':<20} {raw_count:<10}")

# Hash registry summary
from openkb.state import get_registry

openkb_dir = kb_dir / ".openkb"
hashes_file = openkb_dir / "hashes.json"
if hashes_file.exists():
hashes = json.loads(hashes_file.read_text(encoding="utf-8"))
click.echo(f"\n Total indexed: {len(hashes)} document(s)")
config = load_config(openkb_dir / "config.yaml")
backend = config.get("storage_backend", "sqlite")
registry = get_registry(openkb_dir, backend=backend)
hashes = registry.all_entries()
click.echo(f"\n Total indexed: {len(hashes)} document(s)")

# Last compile time: newest file in wiki/summaries/
summaries_dir = wiki_dir / "summaries"
Expand Down
1 change: 1 addition & 0 deletions openkb/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"model": "gpt-5.4-mini",
"language": "en",
"pageindex_threshold": 20,
"storage_backend": "sqlite",
}

GLOBAL_CONFIG_DIR = Path.home() / ".config" / "openkb"
Expand Down
7 changes: 4 additions & 3 deletions openkb/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from openkb.config import load_config
from openkb.images import copy_relative_images, extract_base64_images, convert_pdf_with_images
from openkb.state import HashRegistry
from openkb.state import get_registry

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -50,12 +50,13 @@ def convert_document(src: Path, kb_dir: Path) -> ConvertResult:
openkb_dir = kb_dir / ".openkb"
config = load_config(openkb_dir / "config.yaml")
threshold: int = config.get("pageindex_threshold", 20)
registry = HashRegistry(openkb_dir / "hashes.json")
backend = config.get("storage_backend", "sqlite")
registry = get_registry(openkb_dir, backend=backend)

# ------------------------------------------------------------------
# 1. Hash check
# ------------------------------------------------------------------
file_hash = HashRegistry.hash_file(src)
file_hash = registry.hash_file(src)
if registry.is_known(file_hash):
logger.info("Skipping already-known file: %s", src.name)
return ConvertResult(skipped=True)
Expand Down
Loading