From 1b5fade95e9a780cd366b9fc0baaeec9792955e3 Mon Sep 17 00:00:00 2001
From: Tai An
Date: Sat, 9 May 2026 21:20:52 -0700
Subject: [PATCH 1/2] fix(embed): mark all tokens as output to suppress
 llama.cpp "overriding" INFO
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Force logits_all=True in Llama.embed() so per-token batch.logits[i]
flags are all set, regardless of pooling type.

Previously, when pooling != NONE, add_sequence flipped most tokens to
logits[i]=False, and llama.cpp printed

    init: embeddings required but some input tokens were not marked as
    outputs -> overriding

once per embed input and silently overrode the flags. Pooling type only
changes how per-token outputs are read back in decode_batch
(llama_get_embeddings vs llama_get_embeddings_seq), not whether they
are produced, so this aligns the per-token flags with what llama.cpp
already needed and removes the noisy per-input override message.

Fixes #2208.
---
 llama_cpp/llama.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 752c25dd3..2afa4c8e9 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -1040,7 +1040,13 @@ def embed(
 
         # get pooling information
         pooling_type = self.pooling_type()
-        logits_all = pooling_type == llama_cpp.LLAMA_POOLING_TYPE_NONE
+        # In embedding mode every input token must be marked as an output, regardless of
+        # pooling type. llama.cpp would otherwise override per-token `logits[i]` and emit
+        # "embeddings required but some input tokens were not marked as outputs ->
+        # overriding" once per input. Pooling NONE vs MEAN/CLS only changes how the
+        # per-token outputs are read back (see decode_batch below), not whether they are
+        # produced. See abetlen/llama-cpp-python#2208.
+        logits_all = True
 
         if self.context_params.embeddings is False:
             raise RuntimeError(

From 4609f9204e4cbfc2357e5b7e987b159349434dd0 Mon Sep 17 00:00:00 2001
From: abetlen
Date: Sun, 10 May 2026 22:40:59 -0700
Subject: [PATCH 2/2] docs: update changelog for embedding output warning fix

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5031e5808..808a3647d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - feat: Update llama.cpp to ggerganov/llama.cpp@5d6f18a63 and sync Python bindings
 - fix: Correct batched embedding outputs for multi-sequence `embed()` calls by @Anai-Guo in #2205
 - fix: Configure embedding contexts with enough sequence slots for batched `embed()` calls
+- fix: Mark all embedding input tokens as outputs to avoid llama.cpp override warnings by @Anai-Guo in #2212
 
 ## [0.3.22]
 
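A minimal usage sketch of the behavior this series fixes. The model path and
input strings below are placeholders, not anything from the patch, and it
assumes a local GGUF embedding model that uses MEAN or CLS pooling:

    from llama_cpp import Llama

    # Before this fix, with a MEAN/CLS-pooling model, each input passed to
    # embed() made llama.cpp log:
    #   init: embeddings required but some input tokens were not marked as
    #   outputs -> overriding
    llm = Llama(
        model_path="./models/embedding-model.gguf",  # placeholder path
        embedding=True,
        verbose=True,  # keep llama.cpp INFO logs visible
    )

    # With logits_all forced to True inside embed(), the per-token output
    # flags are set up front and the INFO line no longer appears. The return
    # value is unchanged: one pooled vector per input for MEAN/CLS pooling,
    # one vector per token for pooling type NONE.
    vectors = llm.embed(["first input", "second input"])
    print(len(vectors), len(vectors[0]))  # 2 pooled vectors of embedding dim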