From 1b1a320de18c14c3915ba2df59eedb7c6e7cbe69 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Mon, 13 Apr 2026 02:56:37 -0400
Subject: [PATCH 01/24] feat: Update llama.cpp to ggerganov/llama.cpp@227ed28e1
 (#2182)

---
 llama_cpp/llama_cpp.py |   8 +-
 llama_cpp/mtmd_cpp.py  | 267 +++++++++++++++++++++++++++++++++++++++++
 vendor/llama.cpp       |   2 +-
 3 files changed, 273 insertions(+), 4 deletions(-)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 0a66a5d85..e445ed66a 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -503,13 +503,15 @@ def _warn_deprecated(symbol: str, hint: str) -> None:
 
 
 # enum llama_split_mode {
-#     LLAMA_SPLIT_MODE_NONE  = 0, // single GPU
-#     LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
-#     LLAMA_SPLIT_MODE_ROW   = 2, // split layers and KV across GPUs, use tensor parallelism if supported
+#     LLAMA_SPLIT_MODE_NONE   = 0, // single GPU
+#     LLAMA_SPLIT_MODE_LAYER  = 1, // split layers and KV across GPUs
+#     LLAMA_SPLIT_MODE_ROW    = 2, // split layers and KV across GPUs, use tensor parallelism if supported
+#     LLAMA_SPLIT_MODE_TENSOR = 3,
 # };
 LLAMA_SPLIT_MODE_NONE = 0
 LLAMA_SPLIT_MODE_LAYER = 1
 LLAMA_SPLIT_MODE_ROW = 2
+LLAMA_SPLIT_MODE_TENSOR = 3
 
 
 # typedef struct llama_token_data {
diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py
index f28402775..550c9bd59 100644
--- a/llama_cpp/mtmd_cpp.py
+++ b/llama_cpp/mtmd_cpp.py
@@ -242,6 +242,55 @@ def mtmd_bitmap_init_from_audio(
 def mtmd_bitmap_free(bitmap: mtmd_bitmap_p, /): ...
 
 
+# MTMD_API uint32_t mtmd_bitmap_get_nx(const mtmd_bitmap * bitmap);
+@ctypes_function("mtmd_bitmap_get_nx", [mtmd_bitmap_p_ctypes], c_uint32)
+def mtmd_bitmap_get_nx(bitmap: mtmd_bitmap_p, /) -> int:
+    """Get the bitmap width in pixels."""
+    ...
+
+
+# MTMD_API uint32_t mtmd_bitmap_get_ny(const mtmd_bitmap * bitmap);
+@ctypes_function("mtmd_bitmap_get_ny", [mtmd_bitmap_p_ctypes], c_uint32)
+def mtmd_bitmap_get_ny(bitmap: mtmd_bitmap_p, /) -> int:
+    """Get the bitmap height in pixels."""
+    ...
+
+
+# MTMD_API const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap);
+@ctypes_function("mtmd_bitmap_get_data", [mtmd_bitmap_p_ctypes], POINTER(c_uint8))
+def mtmd_bitmap_get_data(bitmap: mtmd_bitmap_p, /) -> Optional[CtypesArray[c_uint8]]:
+    """Get the raw bitmap data buffer."""
+    ...
+
+
+# MTMD_API size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap);
+@ctypes_function("mtmd_bitmap_get_n_bytes", [mtmd_bitmap_p_ctypes], c_size_t)
+def mtmd_bitmap_get_n_bytes(bitmap: mtmd_bitmap_p, /) -> int:
+    """Get the bitmap data size in bytes."""
+    ...
+
+
+# MTMD_API bool mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap);
+@ctypes_function("mtmd_bitmap_is_audio", [mtmd_bitmap_p_ctypes], c_bool)
+def mtmd_bitmap_is_audio(bitmap: mtmd_bitmap_p, /) -> bool:
+    """Check whether the bitmap contains audio data."""
+    ...
+
+
+# MTMD_API const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap);
+@ctypes_function("mtmd_bitmap_get_id", [mtmd_bitmap_p_ctypes], c_char_p)
+def mtmd_bitmap_get_id(bitmap: mtmd_bitmap_p, /) -> Optional[bytes]:
+    """Get the optional bitmap identifier."""
+    ...
+
+
+# MTMD_API void mtmd_bitmap_set_id(mtmd_bitmap * bitmap, const char * id);
+@ctypes_function("mtmd_bitmap_set_id", [mtmd_bitmap_p_ctypes, c_char_p], None)
+def mtmd_bitmap_set_id(bitmap: mtmd_bitmap_p, id: Optional[bytes], /):
+    """Set the optional bitmap identifier."""
+    ...
+
+
 # MTMD_API mtmd_input_chunks * mtmd_input_chunks_init(void);
 @ctypes_function("mtmd_input_chunks_init", [], mtmd_input_chunks_p_ctypes)
 def mtmd_input_chunks_init() -> Optional[mtmd_input_chunks_p]: ...
@@ -315,11 +364,146 @@ def mtmd_input_chunk_get_tokens_text(
 ) -> Optional["_Pointer[llama_cpp.llama_token]"]: ...
 
 
+# MTMD_API const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd_input_chunk * chunk);
+@ctypes_function(
+    "mtmd_input_chunk_get_tokens_image",
+    [mtmd_input_chunk_p_ctypes],
+    mtmd_image_tokens_p_ctypes,
+)
+def mtmd_input_chunk_get_tokens_image(
+    chunk: mtmd_input_chunk_p, /
+) -> Optional[mtmd_image_tokens_p]: ...
+
+
+# MTMD_API const char * mtmd_input_chunk_get_id(const mtmd_input_chunk * chunk);
+@ctypes_function("mtmd_input_chunk_get_id", [mtmd_input_chunk_p_ctypes], c_char_p)
+def mtmd_input_chunk_get_id(chunk: mtmd_input_chunk_p, /) -> Optional[bytes]:
+    """Get the optional chunk identifier."""
+    ...
+
+
+# MTMD_API llama_pos mtmd_input_chunk_get_n_pos(const mtmd_input_chunk * chunk);
+@ctypes_function(
+    "mtmd_input_chunk_get_n_pos",
+    [mtmd_input_chunk_p_ctypes],
+    llama_cpp.llama_pos,
+)
+def mtmd_input_chunk_get_n_pos(chunk: mtmd_input_chunk_p, /) -> int:
+    """Get the number of positions consumed by the chunk."""
+    ...
+
+
+# MTMD_API mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk);
+@ctypes_function(
+    "mtmd_input_chunk_copy", [mtmd_input_chunk_p_ctypes], mtmd_input_chunk_p_ctypes
+)
+def mtmd_input_chunk_copy(chunk: mtmd_input_chunk_p, /) -> Optional[mtmd_input_chunk_p]:
+    """Copy an input chunk and transfer ownership to the caller."""
+    ...
+
+
+# MTMD_API void mtmd_input_chunk_free(mtmd_input_chunk * chunk);
+@ctypes_function("mtmd_input_chunk_free", [mtmd_input_chunk_p_ctypes], None)
+def mtmd_input_chunk_free(chunk: mtmd_input_chunk_p, /):
+    """Free an owned input chunk."""
+    ...
+
+
+# MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens);
+@ctypes_function(
+    "mtmd_image_tokens_get_n_tokens", [mtmd_image_tokens_p_ctypes], c_size_t
+)
+def mtmd_image_tokens_get_n_tokens(image_tokens: mtmd_image_tokens_p, /) -> int:
+    """Get the number of image tokens."""
+    ...
+
+
+# MTMD_API size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens);
+@ctypes_function("mtmd_image_tokens_get_nx", [mtmd_image_tokens_p_ctypes], c_size_t)
+def mtmd_image_tokens_get_nx(image_tokens: mtmd_image_tokens_p, /) -> int:
+    """Get the image token grid width."""
+    ...
+
+
+# MTMD_API size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens);
+@ctypes_function("mtmd_image_tokens_get_ny", [mtmd_image_tokens_p_ctypes], c_size_t)
+def mtmd_image_tokens_get_ny(image_tokens: mtmd_image_tokens_p, /) -> int:
+    """Get the image token grid height."""
+    ...
+
+
+# MTMD_API const char * mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens);
+@ctypes_function("mtmd_image_tokens_get_id", [mtmd_image_tokens_p_ctypes], c_char_p)
+def mtmd_image_tokens_get_id(image_tokens: mtmd_image_tokens_p, /) -> Optional[bytes]:
+    """Get the optional image token identifier."""
+    ...
+
+
+# MTMD_API llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens);
+@ctypes_function(
+    "mtmd_image_tokens_get_n_pos",
+    [mtmd_image_tokens_p_ctypes],
+    llama_cpp.llama_pos,
+)
+def mtmd_image_tokens_get_n_pos(image_tokens: mtmd_image_tokens_p, /) -> int:
+    """Get the number of positions consumed by the image tokens."""
+    ...
+
+
+# MTMD_API int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens);
+@ctypes_function(
+    "mtmd_encode",
+    [mtmd_context_p_ctypes, mtmd_image_tokens_p_ctypes],
+    c_int,
+)
+def mtmd_encode(ctx: mtmd_context_p, image_tokens: mtmd_image_tokens_p, /) -> int:
+    """Run an MTMD encode pass for image tokens."""
+    ...
+
+
+# MTMD_API int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk);
+@ctypes_function(
+    "mtmd_encode_chunk",
+    [mtmd_context_p_ctypes, mtmd_input_chunk_p_ctypes],
+    c_int,
+)
+def mtmd_encode_chunk(ctx: mtmd_context_p, chunk: mtmd_input_chunk_p, /) -> int:
+    """Run an MTMD encode pass for a single chunk."""
+    ...
+
+
+# MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
+@ctypes_function("mtmd_get_output_embd", [mtmd_context_p_ctypes], POINTER(c_float))
+def mtmd_get_output_embd(ctx: mtmd_context_p, /) -> Optional[CtypesArray[c_float]]:
+    """Get output embeddings from the last encode pass."""
+    ...
+
+
+# MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void);
+@ctypes_function("mtmd_test_create_input_chunks", [], mtmd_input_chunks_p_ctypes)
+def mtmd_test_create_input_chunks() -> Optional[mtmd_input_chunks_p]:
+    """Create MTMD test chunks for the C API tests."""
+    ...
+
+
 ################################################
 # mtmd-helper.h functions
 ################################################
 
 
+# MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname);
+@ctypes_function(
+    "mtmd_helper_bitmap_init_from_file",
+    [mtmd_context_p_ctypes, c_char_p],
+    mtmd_bitmap_p_ctypes,
+)
+def mtmd_helper_bitmap_init_from_file(
+    ctx: mtmd_context_p, fname: bytes, /
+) -> Optional[mtmd_bitmap_p]:
+    """Initialize an MTMD bitmap from a file."""
+    ...
+
+
 # MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len);
 @ctypes_function(
     "mtmd_helper_bitmap_init_from_buf",
@@ -339,6 +523,52 @@ def mtmd_helper_bitmap_init_from_buf(
 def mtmd_helper_get_n_tokens(chunks: mtmd_input_chunks_p, /) -> int: ...
 
 
+# MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks);
+@ctypes_function(
+    "mtmd_helper_get_n_pos",
+    [mtmd_input_chunks_p_ctypes],
+    llama_cpp.llama_pos,
+)
+def mtmd_helper_get_n_pos(chunks: mtmd_input_chunks_p, /) -> int:
+    """Count the total positions consumed by the chunks."""
+    ...
+
+
+# MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
+#                                          struct llama_context * lctx,
+#                                          const mtmd_input_chunks * chunks,
+#                                          llama_pos n_past,
+#                                          llama_seq_id seq_id,
+#                                          int32_t n_batch,
+#                                          bool logits_last,
+#                                          llama_pos * new_n_past);
+@ctypes_function(
+    "mtmd_helper_eval_chunks",
+    [
+        mtmd_context_p_ctypes,
+        llama_cpp.llama_context_p_ctypes,
+        mtmd_input_chunks_p_ctypes,
+        llama_cpp.llama_pos,
+        llama_cpp.llama_seq_id,
+        c_int,
+        c_bool,
+        POINTER(llama_cpp.llama_pos),
+    ],
+    c_int,
+)
+def mtmd_helper_eval_chunks(
+    ctx: mtmd_context_p,
+    lctx: llama_cpp.llama_context_p,
+    chunks: mtmd_input_chunks_p,
+    n_past: llama_cpp.llama_pos,
+    seq_id: llama_cpp.llama_seq_id,
+    n_batch: Union[c_int, int],
+    logits_last: Union[c_bool, bool],
+    new_n_past: "_Pointer[llama_cpp.llama_pos]",
+    /,
+) -> int: ...
+
+
 # MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
 #                                                struct llama_context * lctx,
 #                                                const mtmd_input_chunk * chunk,
@@ -374,6 +604,43 @@ def mtmd_helper_eval_chunk_single(
 ) -> int: ...
 
 
+# MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
+#                                                 struct llama_context * lctx,
+#                                                 const mtmd_input_chunk * chunk,
+#                                                 float * encoded_embd,
+#                                                 llama_pos n_past,
+#                                                 llama_seq_id seq_id,
+#                                                 int32_t n_batch,
+#                                                 llama_pos * new_n_past);
+@ctypes_function(
+    "mtmd_helper_decode_image_chunk",
+    [
+        mtmd_context_p_ctypes,
+        llama_cpp.llama_context_p_ctypes,
+        mtmd_input_chunk_p_ctypes,
+        POINTER(c_float),
+        llama_cpp.llama_pos,
+        llama_cpp.llama_seq_id,
+        c_int,
+        POINTER(llama_cpp.llama_pos),
+    ],
+    c_int,
+)
+def mtmd_helper_decode_image_chunk(
+    ctx: mtmd_context_p,
+    lctx: llama_cpp.llama_context_p,
+    chunk: mtmd_input_chunk_p,
+    encoded_embd: CtypesArray[c_float],
+    n_past: llama_cpp.llama_pos,
+    seq_id: llama_cpp.llama_seq_id,
+    n_batch: Union[c_int, int],
+    new_n_past: "_Pointer[llama_cpp.llama_pos]",
+    /,
+) -> int:
+    """Decode a pre-encoded image chunk."""
+    ...
+
+
 # MTMD_API void mtmd_log_set(ggml_log_callback log_callback, void * user_data);
 @ctypes_function(
     "mtmd_log_set",
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 3bd9aa1f9..227ed28e1 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 3bd9aa1f9250cd15f5371f3622d73d954b68a747
+Subproject commit 227ed28e128e93b4d63ae5108560c550c9ab16c8

From d87bf08871e2c2995e83f551aa61443e35fd865c Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Sun, 26 Apr 2026 21:41:32 -0700
Subject: [PATCH 02/24] feat: Update llama.cpp to ggerganov/llama.cpp@f53577432
 (#2189)

* feat: Update llama.cpp to ggerganov/llama.cpp@f53577432

* docs: Update changelog for llama.cpp f53577432

* docs: Keep one unreleased llama.cpp changelog entry
---
 CHANGELOG.md           |  2 +-
 llama_cpp/llama_cpp.py | 55 -------------------------------
 llama_cpp/mtmd_cpp.py  | 73 ++++++++++++++++++++++++++++++++++++------
 vendor/llama.cpp       |  2 +-
 4 files changed, 65 insertions(+), 67 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index fbe5b6b6f..ea7beaaa7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
-- feat: Update llama.cpp to ggerganov/llama.cpp@3bd9aa1f9 and sync Python bindings
+- feat: Update llama.cpp to ggerganov/llama.cpp@f53577432 and sync Python bindings
 
 ## [0.3.20]
 
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index e445ed66a..d03237140 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -1516,54 +1516,6 @@ def llama_free(ctx: llama_context_p, /):
     ...
 
 
-# enum llama_params_fit_status {
-#     LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0,
-#     LLAMA_PARAMS_FIT_STATUS_FAILURE = 1,
-#     LLAMA_PARAMS_FIT_STATUS_ERROR   = 2,
-# };
-LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0
-LLAMA_PARAMS_FIT_STATUS_FAILURE = 1
-LLAMA_PARAMS_FIT_STATUS_ERROR = 2
-
-
-# LLAMA_API enum llama_params_fit_status llama_params_fit(
-#                                const char   * path_model,
-#                 struct llama_model_params   * mparams,
-#                 struct llama_context_params * cparams,
-#                                       float * tensor_split,
-#     struct llama_model_tensor_buft_override * tensor_buft_overrides,
-#                                      size_t * margins,
-#                                    uint32_t   n_ctx_min,
-#                         enum ggml_log_level   log_level);
-@ctypes_function(
-    "llama_params_fit",
-    [
-        ctypes.c_char_p,
-        ctypes.POINTER(llama_model_params),
-        ctypes.POINTER(llama_context_params),
-        ctypes.POINTER(ctypes.c_float),
-        ctypes.c_void_p,
-        ctypes.POINTER(ctypes.c_size_t),
-        ctypes.c_uint32,
-        ctypes.c_int,
-    ],
-    ctypes.c_int,
-)
-def llama_params_fit(
-    path_model: bytes,
-    mparams: CtypesPointerOrRef[llama_model_params],
-    cparams: CtypesPointerOrRef[llama_context_params],
-    tensor_split: Optional[CtypesPointer[ctypes.c_float]],
-    tensor_buft_overrides: ctypes.c_void_p,
-    margins: Optional[CtypesPointer[ctypes.c_size_t]],
-    n_ctx_min: int,
-    log_level: int,
-    /,
-) -> int:
-    """Fit model and context parameters for a model path."""
-    ...
-
-
 # LLAMA_API int64_t llama_time_us(void);
 @ctypes_function(
     "llama_time_us",
@@ -4869,13 +4821,6 @@ def llama_perf_sampler_print(chain: llama_sampler_p, /): ...
 def llama_perf_sampler_reset(chain: llama_sampler_p, /): ...
 
 
-# // print a breakdown of per-device memory use via LLAMA_LOG:
-@ctypes_function("llama_memory_breakdown_print", [llama_context_p_ctypes], None)
-def llama_memory_breakdown_print(ctx: llama_context_p, /):
-    """Print a breakdown of per-device memory use."""
-    ...
-
-
 # //
 # // training
 # //
diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py
index 550c9bd59..485dc5d8c 100644
--- a/llama_cpp/mtmd_cpp.py
+++ b/llama_cpp/mtmd_cpp.py
@@ -8,9 +8,9 @@
     c_int,
     c_uint8,
     c_uint32,
+    c_size_t,
     c_float,
     c_void_p,
-    c_size_t,
     POINTER,
     _Pointer,  # type: ignore
     Structure,
@@ -123,6 +123,17 @@ class mtmd_input_text(Structure):
     ]
 
 
+class mtmd_decoder_pos(Structure):
+    """Decoder attention position for M-RoPE models."""
+
+    _fields_ = [
+        ("t", c_uint32),
+        ("x", c_uint32),
+        ("y", c_uint32),
+        ("z", c_uint32),
+    ]
+
+
 ################################################
 # mtmd.h functions
 ################################################
@@ -165,35 +176,41 @@ def mtmd_init_from_file(
 def mtmd_free(ctx: mtmd_context_p, /): ...
 
 
-# MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx);
-@ctypes_function("mtmd_decode_use_non_causal", [mtmd_context_p_ctypes], c_bool)
-def mtmd_decode_use_non_causal(ctx: mtmd_context_p, /) -> bool:
+# MTMD_API bool mtmd_decode_use_non_causal(const mtmd_context * ctx, const mtmd_input_chunk * chunk);
+@ctypes_function(
+    "mtmd_decode_use_non_causal",
+    [mtmd_context_p_ctypes, mtmd_input_chunk_p_ctypes],
+    c_bool,
+)
+def mtmd_decode_use_non_causal(
+    ctx: mtmd_context_p, chunk: Optional[mtmd_input_chunk_p], /
+) -> bool:
     """Check whether MTMD decoding uses non-causal attention."""
     ...
 
 
-# MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx);
+# MTMD_API bool mtmd_decode_use_mrope(const mtmd_context * ctx);
 @ctypes_function("mtmd_decode_use_mrope", [mtmd_context_p_ctypes], c_bool)
 def mtmd_decode_use_mrope(ctx: mtmd_context_p, /) -> bool:
     """Check whether MTMD decoding uses mRoPE."""
     ...
 
 
-# MTMD_API bool mtmd_support_vision(mtmd_context * ctx);
+# MTMD_API bool mtmd_support_vision(const mtmd_context * ctx);
 @ctypes_function("mtmd_support_vision", [mtmd_context_p_ctypes], c_bool)
 def mtmd_support_vision(ctx: mtmd_context_p, /) -> bool:
     """Check whether the current model supports vision input."""
     ...
 
 
-# MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
+# MTMD_API bool mtmd_support_audio(const mtmd_context * ctx);
 @ctypes_function("mtmd_support_audio", [mtmd_context_p_ctypes], c_bool)
 def mtmd_support_audio(ctx: mtmd_context_p, /) -> bool:
     """Check whether MTMD supports audio."""
     ...
 
 
-# MTMD_API int mtmd_get_audio_sample_rate(mtmd_context * ctx);
+# MTMD_API int mtmd_get_audio_sample_rate(const mtmd_context * ctx);
 @ctypes_function("mtmd_get_audio_sample_rate", [mtmd_context_p_ctypes], c_int)
 def mtmd_get_audio_sample_rate(ctx: mtmd_context_p, /) -> int:
     """Get the audio sample rate in Hz. Returns -1 if audio is not supported."""
@@ -418,14 +435,16 @@ def mtmd_image_tokens_get_n_tokens(image_tokens: mtmd_image_tokens_p, /) -> int:
     ...
 
 
-# MTMD_API size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens);
+# DEPRECATED(MTMD_API size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens),
+#            "use mtmd_image_tokens_get_decoder_pos() instead");
 @ctypes_function("mtmd_image_tokens_get_nx", [mtmd_image_tokens_p_ctypes], c_size_t)
 def mtmd_image_tokens_get_nx(image_tokens: mtmd_image_tokens_p, /) -> int:
     """Get the image token grid width."""
     ...
 
 
-# MTMD_API size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens);
+# DEPRECATED(MTMD_API size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens),
+#            "use mtmd_image_tokens_get_decoder_pos() instead");
 @ctypes_function("mtmd_image_tokens_get_ny", [mtmd_image_tokens_p_ctypes], c_size_t)
 def mtmd_image_tokens_get_ny(image_tokens: mtmd_image_tokens_p, /) -> int:
     """Get the image token grid height."""
@@ -450,6 +469,23 @@ def mtmd_image_tokens_get_n_pos(image_tokens: mtmd_image_tokens_p, /) -> int:
     ...
 
 
+# MTMD_API struct mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(
+#     const mtmd_image_tokens * image_tokens, llama_pos pos_0, size_t i);
+@ctypes_function(
+    "mtmd_image_tokens_get_decoder_pos",
+    [mtmd_image_tokens_p_ctypes, llama_cpp.llama_pos, c_size_t],
+    mtmd_decoder_pos,
+)
+def mtmd_image_tokens_get_decoder_pos(
+    image_tokens: mtmd_image_tokens_p,
+    pos_0: llama_cpp.llama_pos,
+    i: Union[c_size_t, int],
+    /,
+) -> mtmd_decoder_pos:
+    """Get decoder attention position for an image embedding token."""
+    ...
+
+
 # MTMD_API int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens);
 @ctypes_function(
     "mtmd_encode",
@@ -534,6 +570,23 @@ def mtmd_helper_get_n_pos(chunks: mtmd_input_chunks_p, /) -> int:
     ...
 
 
+# MTMD_API void mtmd_helper_image_get_decoder_pos(
+#     const mtmd_image_tokens * image, llama_pos pos_0, struct mtmd_decoder_pos * out_pos);
+@ctypes_function(
+    "mtmd_helper_image_get_decoder_pos",
+    [mtmd_image_tokens_p_ctypes, llama_cpp.llama_pos, POINTER(mtmd_decoder_pos)],
+    None,
+)
+def mtmd_helper_image_get_decoder_pos(
+    image: mtmd_image_tokens_p,
+    pos_0: llama_cpp.llama_pos,
+    out_pos: "_Pointer[mtmd_decoder_pos]",
+    /,
+):
+    """Fill decoder attention positions for all image embedding tokens."""
+    ...
+
+
 # MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
 #                                          struct llama_context * lctx,
 #                                          const mtmd_input_chunks * chunks,
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 227ed28e1..f53577432 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 227ed28e128e93b4d63ae5108560c550c9ab16c8
+Subproject commit f53577432541bb9edc1588c4ef45c66bf07e4468

From 511b3f414359e8d98e9123d007bdd935cd1f7c3f Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Sun, 26 Apr 2026 22:02:24 -0700
Subject: [PATCH 03/24] fix(ci): Build one arm64 py3 release wheel (#2191)

* fix(ci): Build one arm64 py3 release wheel

* docs: Update changelog for arm64 release wheel fix
---
 .github/workflows/build-and-release.yaml | 4 +++-
 CHANGELOG.md                             | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml
index 6cbac0cb1..039e376b6 100644
--- a/.github/workflows/build-and-release.yaml
+++ b/.github/workflows/build-and-release.yaml
@@ -82,7 +82,9 @@ jobs:
           # Keep native arm64 builds on a portable CPU baseline instead of
           # tuning wheels to the hosted runner.
           CIBW_ENVIRONMENT: CMAKE_ARGS="-DGGML_NATIVE=off"
-          CIBW_BUILD: "cp38-* cp39-* cp310-* cp311-* cp312-*"
+          # The release wheel is tagged py3-none, so one build covers all
+          # supported Python versions and avoids duplicate wheel names.
+          CIBW_BUILD: "cp38-*"
         with:
           output-dir: wheelhouse
 
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ea7beaaa7..fe376ebd3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 
 - feat: Update llama.cpp to ggerganov/llama.cpp@f53577432 and sync Python bindings
+- fix(ci): Build one arm64 release wheel for `py3-none` wheel publishing
 
 ## [0.3.20]
 

From c8075d1dfe2019a0390af613419ecfaea292c9d5 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Sun, 26 Apr 2026 22:13:13 -0700
Subject: [PATCH 04/24] chore: bump version to 0.3.21 (#2192)

---
 CHANGELOG.md          | 2 ++
 llama_cpp/__init__.py | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index fe376ebd3..eeb42b644 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.3.21]
+
 - feat: Update llama.cpp to ggerganov/llama.cpp@f53577432 and sync Python bindings
 - fix(ci): Build one arm64 release wheel for `py3-none` wheel publishing
 
diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py
index 83177c065..fbad5c28b 100644
--- a/llama_cpp/__init__.py
+++ b/llama_cpp/__init__.py
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.3.20"
+__version__ = "0.3.21"

From 195cc59a187687ca64c8e0939e5e549d456aa2fb Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Sun, 26 Apr 2026 22:39:59 -0700
Subject: [PATCH 05/24] fix(ci): Repair py3 CPU release wheels (#2193)

---
 .github/workflows/build-and-release.yaml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml
index 039e376b6..f67fb558d 100644
--- a/.github/workflows/build-and-release.yaml
+++ b/.github/workflows/build-and-release.yaml
@@ -48,7 +48,10 @@ jobs:
           CIBW_REPAIR_WHEEL_COMMAND: ""
           # Linux needs auditwheel repair so manylinux and musllinux wheels are
           # published with distinct platform tags instead of generic linux tags.
-          CIBW_REPAIR_WHEEL_COMMAND_LINUX: "auditwheel repair -w {dest_dir} {wheel}"
+          CIBW_REPAIR_WHEEL_COMMAND_LINUX: "LD_LIBRARY_PATH=/project/llama_cpp/lib auditwheel repair -w {dest_dir} {wheel}"
+          # The release wheel is tagged py3-none, so one build per platform
+          # covers all supported Python versions and avoids duplicate names.
+          CIBW_BUILD: "cp38-*"
           # Skip cibuildwheel's default i686 sidecar and keep Linux release
           # wheels on a portable x86_64 CPU baseline.
           CIBW_ARCHS_LINUX: "auto64"

From d2bcbac46605f11d382426dd88d67e8b5c124cd7 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Sun, 26 Apr 2026 22:55:04 -0700
Subject: [PATCH 06/24] fix(ci): Scope CPU release wheel selectors by OS
 (#2194)

---
 .github/workflows/build-and-release.yaml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml
index f67fb558d..df6201ee7 100644
--- a/.github/workflows/build-and-release.yaml
+++ b/.github/workflows/build-and-release.yaml
@@ -51,10 +51,13 @@ jobs:
           CIBW_REPAIR_WHEEL_COMMAND_LINUX: "LD_LIBRARY_PATH=/project/llama_cpp/lib auditwheel repair -w {dest_dir} {wheel}"
           # The release wheel is tagged py3-none, so one build per platform
           # covers all supported Python versions and avoids duplicate names.
-          CIBW_BUILD: "cp38-*"
+          CIBW_BUILD_LINUX: "cp38-*"
+          CIBW_BUILD_MACOS: "cp39-*"
+          CIBW_BUILD_WINDOWS: "cp39-*"
           # Skip cibuildwheel's default i686 sidecar and keep Linux release
           # wheels on a portable x86_64 CPU baseline.
           CIBW_ARCHS_LINUX: "auto64"
+          CIBW_ARCHS_WINDOWS: "AMD64"
           CIBW_ENVIRONMENT_LINUX: CMAKE_ARGS="-DGGML_NATIVE=off"
           # Keep macOS release wheels on a portable CPU baseline instead of
           # inheriting the hosted runner's native flags.

From c6dc90555be7bedda2d15f516b3ccd6252130a0f Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Mon, 27 Apr 2026 01:41:57 -0700
Subject: [PATCH 07/24] fix(docs): update mkdocstrings inventories config
 (#2195)

---
 mkdocs.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mkdocs.yml b/mkdocs.yml
index 79a9e67a1..37e1002e8 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -36,7 +36,7 @@ plugins:
               - typing
               - typing_extensions
               - ctypes
-          import:
+          inventories:
             - https://docs.python.org/3/objects.inv
             - https://numpy.org/doc/stable/objects.inv
 

From 587d94a8c31943e3bcbcccbbd2721867da52a9de Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Sat, 2 May 2026 01:31:28 -0700
Subject: [PATCH 08/24] feat: Update llama.cpp to ggerganov/llama.cpp@63d93d173
 (#2197)

---
 CHANGELOG.md     | 2 ++
 vendor/llama.cpp | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index eeb42b644..e1f1f0860 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- feat: Update llama.cpp to ggerganov/llama.cpp@63d93d173
+
 ## [0.3.21]
 
 - feat: Update llama.cpp to ggerganov/llama.cpp@f53577432 and sync Python bindings
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index f53577432..63d93d173 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit f53577432541bb9edc1588c4ef45c66bf07e4468
+Subproject commit 63d93d17336e41e4cc73a64451e5b1d2477abdb1

From d2113a14441f7d811b34f4aeee917449ad1da1b9 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Sat, 2 May 2026 15:25:58 -0700
Subject: [PATCH 09/24] feat(ci): re-enable Windows CUDA wheels (#2198)

* feat(ci): re-enable Windows CUDA wheel builds

* fix(ci): use ninja for Windows CUDA wheels

* fix(ci): normalize Windows CUDA CMake paths

* feat(ci): add CUDA 12.5 wheel builds

* fix(ci): avoid Windows CUDA 12.5 toolkit meta-package

* fix(ci): include CUDA 12.5 Windows libraries

* chore(ci): simplify Windows CUDA wheel workflow

* docs: update changelog for Windows CUDA wheels
---
 .github/workflows/build-wheels-cuda.yaml | 133 ++++++++++++-----------
 CHANGELOG.md                             |   1 +
 2 files changed, 69 insertions(+), 65 deletions(-)

diff --git a/.github/workflows/build-wheels-cuda.yaml b/.github/workflows/build-wheels-cuda.yaml
index 17daaa12a..98c19afb6 100644
--- a/.github/workflows/build-wheels-cuda.yaml
+++ b/.github/workflows/build-wheels-cuda.yaml
@@ -20,9 +20,11 @@ jobs:
         id: set-matrix
         run: |
           $matrix = @{
-              'os' = @('ubuntu-22.04') #, 'windows-2022')
-              'pyver' = @("3.9", "3.10", "3.11", "3.12")
-              'cuda' = @("12.1.1", "12.2.2", "12.3.2", "12.4.1") #, "12.5.1", "12.6.1")
+              'os' = @('ubuntu-22.04', 'windows-2022')
+              # wheel.py-api = "py3" makes the CUDA wheel interpreter-agnostic,
+              # so one builder per toolkit version is sufficient.
+              'pyver' = @("3.9")
+              'cuda' = @("12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1")
               'releasetag' = @("basic")
           }
 
@@ -43,11 +45,11 @@ jobs:
       AVXVER: ${{ matrix.releasetag }}
 
     steps:
-      - name: Add MSBuild to PATH
+      - name: Set up MSVC
         if: runner.os == 'Windows'
-        uses: microsoft/setup-msbuild@v2
+        uses: ilammy/msvc-dev-cmd@v1
         with:
-          vs-version: '[16.11,16.12)'
+          arch: x64
 
       - uses: actions/checkout@v4
         with:
@@ -67,32 +69,6 @@ jobs:
           add-pip-as-python-dependency: true
           auto-activate-base: false
 
-      - name: VS Integration Cache
-        id: vs-integration-cache
-        if: runner.os == 'Windows'
-        uses: actions/cache@v4
-        with:
-          path: ./MSBuildExtensions
-          key: cuda-${{ matrix.cuda }}-vs-integration
-
-      - name: Get Visual Studio Integration
-        if: runner.os == 'Windows' && steps.vs-integration-cache.outputs.cache-hit != 'true'
-        run: |
-          if ($env:CUDAVER -eq '12.1.1') {$x = '12.1.0'} else {$x = $env:CUDAVER}
-          $links = (Invoke-RestMethod 'https://raw.githubusercontent.com/Jimver/cuda-toolkit/master/src/links/windows-links.ts').Trim().split().where({$_ -ne ''})
-          for ($i=$q=0;$i -lt $links.count -and $q -lt 2;$i++) {if ($links[$i] -eq "'$x',") {$q++}}
-          Invoke-RestMethod $links[$i].Trim("'") -OutFile 'cudainstaller.zip'
-          & 'C:\Program Files\7-Zip\7z.exe' e cudainstaller.zip -oMSBuildExtensions -r *\MSBuildExtensions\* > $null
-          Remove-Item 'cudainstaller.zip'
-
-      - name: Install Visual Studio Integration
-        if: runner.os == 'Windows'
-        run: |
-          $y = (gi '.\MSBuildExtensions').fullname + '\*'
-          (gi 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\MSBuild\Microsoft\VC\*\BuildCustomizations').fullname.foreach({cp $y $_})
-          $cupath = 'CUDA_PATH_V' + $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','_')
-          echo "$cupath=$env:CONDA_PREFIX" >> $env:GITHUB_ENV
-
       - name: Install Dependencies
         env:
           MAMBA_DOWNLOAD_FAILFAST: "0"
@@ -101,24 +77,45 @@ jobs:
           $cudaVersion = $env:CUDAVER
           $cudaChannel = "nvidia/label/cuda-$cudaVersion"
           if ($IsLinux) {
-            # Keep nvcc, cudart, and headers on the same NVIDIA label so the
-            # detected toolkit version matches the published wheel tag.
-            mamba install -y --channel-priority flexible --override-channels -c $cudaChannel "$cudaChannel::cuda-toolkit=$cudaVersion" "$cudaChannel::cuda-nvcc_linux-64=$cudaVersion" "$cudaChannel::cuda-cudart" "$cudaChannel::cuda-cudart-dev"
+            mamba install -y --channel-priority flexible --override-channels -c $cudaChannel "${cudaChannel}::cuda-toolkit=$cudaVersion" "${cudaChannel}::cuda-nvcc_linux-64" "${cudaChannel}::cuda-cudart" "${cudaChannel}::cuda-cudart-dev"
+          } elseif ($IsWindows) {
+            if ($cudaVersion -like '12.5.*') {
+              # The Windows 12.5 toolkit meta-package pulls compiler activation
+              # scripts that overflow cmd.exe after MSVC is already initialized.
+              mamba install -y --channel-priority flexible --override-channels -c $cudaChannel "${cudaChannel}::cuda-nvcc_win-64" "${cudaChannel}::cuda-libraries-dev=$cudaVersion" "${cudaChannel}::cuda-cudart" "${cudaChannel}::cuda-cudart-dev"
+            } else {
+              mamba install -y --channel-priority flexible --override-channels -c $cudaChannel "${cudaChannel}::cuda-toolkit=$cudaVersion" "${cudaChannel}::cuda-nvcc_win-64" "${cudaChannel}::cuda-cudart" "${cudaChannel}::cuda-cudart-dev"
+            }
           } else {
-            mamba install -y --channel-priority flexible --override-channels -c $cudaChannel "$cudaChannel::cuda-toolkit=$cudaVersion"
+            throw 'Unsupported CUDA wheel build platform'
           }
           if ($LASTEXITCODE -ne 0) {
             exit $LASTEXITCODE
           }
-          python -m pip install build wheel
+          if ($IsWindows) {
+            python -m pip install build wheel ninja
+          } else {
+            python -m pip install build wheel
+          }
 
       - name: Build Wheel
         run: |
-          $env:CUDA_PATH = $env:CONDA_PREFIX
-          $env:CUDA_HOME = $env:CONDA_PREFIX
-          $env:CUDA_TOOLKIT_ROOT_DIR = $env:CONDA_PREFIX
+          $pathSeparator = if ($IsWindows) { ';' } else { ':' }
+          if ($IsWindows) {
+            $cudaRoot = Join-Path $env:CONDA_PREFIX 'Library'
+          } elseif (Test-Path (Join-Path $env:CONDA_PREFIX 'targets/x86_64-linux/include/cuda_runtime.h')) {
+            $cudaRoot = Join-Path $env:CONDA_PREFIX 'targets/x86_64-linux'
+          } else {
+            $cudaRoot = $env:CONDA_PREFIX
+          }
+
+          $env:CUDA_PATH = $cudaRoot
+          $env:CUDA_HOME = $cudaRoot
+          $env:CUDAToolkit_ROOT = $cudaRoot
+          $env:CUDA_TOOLKIT_ROOT_DIR = $cudaRoot
           $cudaHostCompilerArg = ''
-          $env:CMAKE_ARGS = ''
+          $cudaRootCmake = $cudaRoot.Replace('\', '/')
+          $env:CMAKE_ARGS = "-DCUDAToolkit_ROOT=$cudaRootCmake -DCUDA_TOOLKIT_ROOT_DIR=$cudaRootCmake"
           if ($IsLinux) {
             if (Test-Path '/usr/bin/g++-12') {
               $env:CC = '/usr/bin/gcc-12'
@@ -126,27 +123,41 @@ jobs:
               $env:CUDAHOSTCXX = '/usr/bin/g++-12'
               $cudaHostCompilerArg = " -DCMAKE_CUDA_HOST_COMPILER=$env:CUDAHOSTCXX"
             }
-            if (Test-Path (Join-Path $env:CONDA_PREFIX 'include/cuda_runtime.h')) {
-              $env:CUDAToolkit_ROOT = $env:CONDA_PREFIX
-              $env:CUDA_TOOLKIT_ROOT_DIR = $env:CONDA_PREFIX
-              $env:CMAKE_ARGS = "-DCUDAToolkit_ROOT=$env:CONDA_PREFIX -DCUDA_TOOLKIT_ROOT_DIR=$env:CONDA_PREFIX$cudaHostCompilerArg"
-              $env:CPATH = "$env:CONDA_PREFIX/include:$env:CPATH"
-              $env:CPLUS_INCLUDE_PATH = "$env:CONDA_PREFIX/include:$env:CPLUS_INCLUDE_PATH"
-              $env:LIBRARY_PATH = "$env:CONDA_PREFIX/lib:$env:LIBRARY_PATH"
-              $env:LD_LIBRARY_PATH = "$env:CONDA_PREFIX/lib:$env:LD_LIBRARY_PATH"
-            } else {
-              $env:CMAKE_ARGS = $cudaHostCompilerArg.Trim()
-            }
+            $env:CMAKE_ARGS = "-DCUDAToolkit_ROOT=$cudaRoot -DCUDA_TOOLKIT_ROOT_DIR=$cudaRoot$cudaHostCompilerArg"
+            $env:CPATH = "$cudaRoot/include$pathSeparator$env:CPATH"
+            $env:CPLUS_INCLUDE_PATH = "$cudaRoot/include$pathSeparator$env:CPLUS_INCLUDE_PATH"
+            $env:LIBRARY_PATH = "$cudaRoot/lib$pathSeparator$env:CONDA_PREFIX/lib$pathSeparator$env:LIBRARY_PATH"
+            $env:LD_LIBRARY_PATH = "$cudaRoot/lib$pathSeparator$env:CONDA_PREFIX/lib$pathSeparator$env:LD_LIBRARY_PATH"
+          } elseif ($IsWindows) {
+            $ninjaPath = ((Get-Command ninja -ErrorAction Stop).Source).Replace('\', '/')
+            $env:CMAKE_GENERATOR = 'Ninja'
+            $env:CMAKE_MAKE_PROGRAM = $ninjaPath
+            $env:PATH = "$(Join-Path $cudaRoot 'bin')$pathSeparator$env:PATH"
           }
-          $nvccPath = Join-Path $env:CONDA_PREFIX 'bin/nvcc'
-          if (-not (Test-Path $nvccPath)) {
-            $nvccPath = Join-Path $env:CONDA_PREFIX 'targets/x86_64-linux/bin/nvcc'
+
+          if ($IsWindows) {
+            $nvccCandidates = @(
+              (Join-Path $cudaRoot 'bin\nvcc.exe'),
+              (Join-Path $env:CONDA_PREFIX 'Library\bin\nvcc.exe'),
+              (Join-Path $env:CONDA_PREFIX 'bin\nvcc.exe')
+            )
+          } else {
+            $nvccCandidates = @(
+              (Join-Path $env:CONDA_PREFIX 'bin/nvcc'),
+              (Join-Path $env:CONDA_PREFIX 'targets/x86_64-linux/bin/nvcc')
+            )
           }
-          if (-not (Test-Path $nvccPath)) {
+          $nvccPath = $nvccCandidates | Where-Object { Test-Path $_ } | Select-Object -First 1
+          if (-not $nvccPath) {
             throw 'Failed to find nvcc in the conda environment'
           }
           $env:CUDACXX = $nvccPath
-          $env:PATH = "$(Split-Path $nvccPath):$env:PATH"
+          $env:PATH = "$(Split-Path $nvccPath)$pathSeparator$env:PATH"
+          if ($IsWindows) {
+            $nvccPathCmake = $nvccPath.Replace('\', '/')
+            $env:CUDACXX = $nvccPathCmake
+            $env:CMAKE_ARGS = "-DCMAKE_CUDA_COMPILER=$nvccPathCmake -DCMAKE_MAKE_PROGRAM=$env:CMAKE_MAKE_PROGRAM $env:CMAKE_ARGS"
+          }
           $nvccVersion = ((& $nvccPath --version) | Select-String 'release ([0-9]+\.[0-9]+)').Matches[0].Groups[1].Value
           if (-not $nvccVersion) {
             throw 'Failed to detect the installed CUDA toolkit version'
@@ -157,15 +168,7 @@ jobs:
           # one forward-compatible PTX target instead of embedding PTX for every
           # SM. This keeps the wheel under GitHub's 2 GiB release-asset limit.
           $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=ON -DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=70-real;75-real;80-real;86-real;89-real;90-real;90-virtual -DCMAKE_CUDA_FLAGS=--allow-unsupported-compiler $env:CMAKE_ARGS"
-          # if ($env:AVXVER -eq 'AVX') {
           $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off'
-          # }
-          # if ($env:AVXVER -eq 'AVX512') {
-          #  $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX512=on'
-          # }
-          # if ($env:AVXVER -eq 'basic') {
-          #  $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=off -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off'
-          # }
           python -m build --wheel
           # Publish tags that reflect the actual installed toolkit version.
           Write-Output "CUDA_VERSION=$cudaTagVersion" >> $env:GITHUB_ENV
diff --git a/CHANGELOG.md b/CHANGELOG.md
index e1f1f0860..1852751c1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 
 - feat: Update llama.cpp to ggerganov/llama.cpp@63d93d173
+- feat(ci): Re-enable Windows CUDA wheels and add CUDA 12.5.1 wheel builds
 
 ## [0.3.21]
 

From 9cf0ce7c2094c40d7166f3cc92f00f2c2236af4f Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Sat, 2 May 2026 15:35:41 -0700
Subject: [PATCH 10/24] chore: bump version to 0.3.22 (#2200)

---
 CHANGELOG.md          | 2 ++
 llama_cpp/__init__.py | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1852751c1..5e2a8e329 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.3.22]
+
 - feat: Update llama.cpp to ggerganov/llama.cpp@63d93d173
 - feat(ci): Re-enable Windows CUDA wheels and add CUDA 12.5.1 wheel builds
 
diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py
index fbad5c28b..78292de30 100644
--- a/llama_cpp/__init__.py
+++ b/llama_cpp/__init__.py
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.3.21"
+__version__ = "0.3.22"

From 2bfd80c1c5fadd6bd95bb57e7332438cca5521cd Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Sat, 2 May 2026 15:45:31 -0700
Subject: [PATCH 11/24] fix(ci): pass CUDA unsupported compiler flag during
 detection (#2201)

---
 .github/workflows/build-wheels-cuda.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build-wheels-cuda.yaml b/.github/workflows/build-wheels-cuda.yaml
index 98c19afb6..c32d7f56d 100644
--- a/.github/workflows/build-wheels-cuda.yaml
+++ b/.github/workflows/build-wheels-cuda.yaml
@@ -167,7 +167,7 @@ jobs:
           # Build real cubins for the supported GPUs, including sm_70, and keep
           # one forward-compatible PTX target instead of embedding PTX for every
           # SM. This keeps the wheel under GitHub's 2 GiB release-asset limit.
-          $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=ON -DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=70-real;75-real;80-real;86-real;89-real;90-real;90-virtual -DCMAKE_CUDA_FLAGS=--allow-unsupported-compiler $env:CMAKE_ARGS"
+          $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=ON -DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=70-real;75-real;80-real;86-real;89-real;90-real;90-virtual -DCMAKE_CUDA_FLAGS=--allow-unsupported-compiler -DCMAKE_CUDA_FLAGS_INIT=--allow-unsupported-compiler $env:CMAKE_ARGS"
           $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off'
           python -m build --wheel
           # Publish tags that reflect the actual installed toolkit version.

From 04a3638b2637b0b6f1b843d16a679fbf7d2dd375 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Sat, 2 May 2026 15:53:53 -0700
Subject: [PATCH 12/24] fix(ci): pass CUDA compiler arg for Windows detection
 (#2202)

---
 .github/workflows/build-wheels-cuda.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build-wheels-cuda.yaml b/.github/workflows/build-wheels-cuda.yaml
index c32d7f56d..2b4bf775a 100644
--- a/.github/workflows/build-wheels-cuda.yaml
+++ b/.github/workflows/build-wheels-cuda.yaml
@@ -156,7 +156,7 @@ jobs:
           if ($IsWindows) {
             $nvccPathCmake = $nvccPath.Replace('\', '/')
             $env:CUDACXX = $nvccPathCmake
-            $env:CMAKE_ARGS = "-DCMAKE_CUDA_COMPILER=$nvccPathCmake -DCMAKE_MAKE_PROGRAM=$env:CMAKE_MAKE_PROGRAM $env:CMAKE_ARGS"
+            $env:CMAKE_ARGS = "-DCMAKE_CUDA_COMPILER=$nvccPathCmake -DCMAKE_CUDA_COMPILER_ARG1=-allow-unsupported-compiler -DCMAKE_MAKE_PROGRAM=$env:CMAKE_MAKE_PROGRAM $env:CMAKE_ARGS"
           }
           $nvccVersion = ((& $nvccPath --version) | Select-String 'release ([0-9]+\.[0-9]+)').Matches[0].Groups[1].Value
           if (-not $nvccVersion) {
@@ -167,7 +167,7 @@ jobs:
           # Build real cubins for the supported GPUs, including sm_70, and keep
           # one forward-compatible PTX target instead of embedding PTX for every
           # SM. This keeps the wheel under GitHub's 2 GiB release-asset limit.
-          $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=ON -DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=70-real;75-real;80-real;86-real;89-real;90-real;90-virtual -DCMAKE_CUDA_FLAGS=--allow-unsupported-compiler -DCMAKE_CUDA_FLAGS_INIT=--allow-unsupported-compiler $env:CMAKE_ARGS"
+          $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=ON -DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=70-real;75-real;80-real;86-real;89-real;90-real;90-virtual -DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler -DCMAKE_CUDA_FLAGS_INIT=-allow-unsupported-compiler $env:CMAKE_ARGS"
           $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off'
           python -m build --wheel
           # Publish tags that reflect the actual installed toolkit version.

From bc6ff9f2cc5545c180d8c3db4128d3ad48a31575 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Sat, 2 May 2026 16:01:11 -0700
Subject: [PATCH 13/24] fix(ci): install CUDA CCCL headers for wheel builds
 (#2203)

---
 .github/workflows/build-wheels-cuda.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/build-wheels-cuda.yaml b/.github/workflows/build-wheels-cuda.yaml
index 2b4bf775a..c015c7118 100644
--- a/.github/workflows/build-wheels-cuda.yaml
+++ b/.github/workflows/build-wheels-cuda.yaml
@@ -77,14 +77,14 @@ jobs:
           $cudaVersion = $env:CUDAVER
           $cudaChannel = "nvidia/label/cuda-$cudaVersion"
           if ($IsLinux) {
-            mamba install -y --channel-priority flexible --override-channels -c $cudaChannel "${cudaChannel}::cuda-toolkit=$cudaVersion" "${cudaChannel}::cuda-nvcc_linux-64" "${cudaChannel}::cuda-cudart" "${cudaChannel}::cuda-cudart-dev"
+            mamba install -y --channel-priority flexible --override-channels -c $cudaChannel "${cudaChannel}::cuda-toolkit=$cudaVersion" "${cudaChannel}::cuda-nvcc_linux-64" "${cudaChannel}::cuda-cccl" "${cudaChannel}::cuda-cudart" "${cudaChannel}::cuda-cudart-dev"
           } elseif ($IsWindows) {
             if ($cudaVersion -like '12.5.*') {
               # The Windows 12.5 toolkit meta-package pulls compiler activation
               # scripts that overflow cmd.exe after MSVC is already initialized.
-              mamba install -y --channel-priority flexible --override-channels -c $cudaChannel "${cudaChannel}::cuda-nvcc_win-64" "${cudaChannel}::cuda-libraries-dev=$cudaVersion" "${cudaChannel}::cuda-cudart" "${cudaChannel}::cuda-cudart-dev"
+              mamba install -y --channel-priority flexible --override-channels -c $cudaChannel "${cudaChannel}::cuda-nvcc_win-64" "${cudaChannel}::cuda-cccl" "${cudaChannel}::cuda-libraries-dev=$cudaVersion" "${cudaChannel}::cuda-cudart" "${cudaChannel}::cuda-cudart-dev"
             } else {
-              mamba install -y --channel-priority flexible --override-channels -c $cudaChannel "${cudaChannel}::cuda-toolkit=$cudaVersion" "${cudaChannel}::cuda-nvcc_win-64" "${cudaChannel}::cuda-cudart" "${cudaChannel}::cuda-cudart-dev"
+              mamba install -y --channel-priority flexible --override-channels -c $cudaChannel "${cudaChannel}::cuda-toolkit=$cudaVersion" "${cudaChannel}::cuda-nvcc_win-64" "${cudaChannel}::cuda-cccl" "${cudaChannel}::cuda-cudart" "${cudaChannel}::cuda-cudart-dev"
             }
           } else {
             throw 'Unsupported CUDA wheel build platform'

From 14d7846f9a7c043901cb98bd446764377a8def6e Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Sat, 2 May 2026 16:08:33 -0700
Subject: [PATCH 14/24] fix(ci): skip unsupported Windows CUDA versions (#2204)

---
 .github/workflows/build-wheels-cuda.yaml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.github/workflows/build-wheels-cuda.yaml b/.github/workflows/build-wheels-cuda.yaml
index c015c7118..be55bf483 100644
--- a/.github/workflows/build-wheels-cuda.yaml
+++ b/.github/workflows/build-wheels-cuda.yaml
@@ -26,6 +26,11 @@ jobs:
               'pyver' = @("3.9")
               'cuda' = @("12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1")
               'releasetag' = @("basic")
+              'exclude' = @(
+                @{ 'os' = 'windows-2022'; 'cuda' = '12.1.1' },
+                @{ 'os' = 'windows-2022'; 'cuda' = '12.2.2' },
+                @{ 'os' = 'windows-2022'; 'cuda' = '12.3.2' }
+              )
           }
 
           $matrixOut = ConvertTo-Json $matrix -Compress

From 90e8df958ba81dafc5386999b3948784d7990a12 Mon Sep 17 00:00:00 2001
From: Tai An <antai12232931@outlook.com>
Date: Mon, 4 May 2026 13:52:29 -0700
Subject: [PATCH 15/24] fix(_internals): use n_tokens0 offset when enabling
 last-token logits in add_sequence (#2205)

Fix batched embedding output flags for multi-sequence embed calls.

Closes #2199.
---
 CHANGELOG.md            |  2 ++
 llama_cpp/_internals.py |  2 +-
 tests/test_llama.py     | 15 +++++++++++++++
 3 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5e2a8e329..5fb84f07f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- fix: Correct batched embedding outputs for multi-sequence `embed()` calls by @Anai-Guo in #2205
+
 ## [0.3.22]
 
 - feat: Update llama.cpp to ggerganov/llama.cpp@63d93d173
diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py
index cde52c8c8..24f6fddc7 100644
--- a/llama_cpp/_internals.py
+++ b/llama_cpp/_internals.py
@@ -522,7 +522,7 @@ def add_sequence(self, batch: Sequence[int], seq_id: int, logits_all: bool):
             self.batch.seq_id[j][0] = seq_id
             self.batch.n_seq_id[j] = 1
             self.batch.logits[j] = logits_all
-        self.batch.logits[n_tokens - 1] = True
+        self.batch.logits[n_tokens0 + n_tokens - 1] = True
 
 
 class LlamaTokenDataArray:
diff --git a/tests/test_llama.py b/tests/test_llama.py
index 23928fff6..42ccd1ce6 100644
--- a/tests/test_llama.py
+++ b/tests/test_llama.py
@@ -247,3 +247,18 @@ def test_real_llama_embeddings(llama_cpp_embedding_model_path):
     )
     embedding = model.embed("Hello World")
     assert len(embedding) > 0
+
+    prompts = ["Hello World", "A different prompt"]
+    individual_embeddings = [model.embed(prompt) for prompt in prompts]
+    batched_embeddings = model.embed(prompts)
+
+    assert len(batched_embeddings) == len(prompts)
+    for individual, batched in zip(individual_embeddings, batched_embeddings):
+        np.testing.assert_allclose(batched, individual, rtol=1e-4, atol=1e-4)
+
+    repeated_embeddings = model.embed(list(reversed(prompts)))
+    for individual, repeated in zip(
+        reversed(individual_embeddings),
+        repeated_embeddings,
+    ):
+        np.testing.assert_allclose(repeated, individual, rtol=1e-4, atol=1e-4)

From 128c331bd984a641435b9a563626e4aa06b987d2 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Fri, 8 May 2026 02:59:18 -0700
Subject: [PATCH 16/24] fix: configure n_seq_max for batched embeddings (#2206)

* fix: configure n_seq_max for embeddings

* refactor: keep embedding n_seq_max internal
---
 CHANGELOG.md        | 1 +
 llama_cpp/llama.py  | 9 ++++++++-
 tests/test_llama.py | 7 ++-----
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5fb84f07f..ba3f500cc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 
 - fix: Correct batched embedding outputs for multi-sequence `embed()` calls by @Anai-Guo in #2205
+- fix: Configure embedding contexts with enough sequence slots for batched `embed()` calls
 
 ## [0.3.22]
 
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 11fe169cf..752c25dd3 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -397,6 +397,12 @@ def __init__(
             self.context_params.n_batch = self.n_batch
             self.context_params.n_ubatch = min(self.n_batch, n_ubatch)
 
+        if embedding:
+            self.context_params.n_seq_max = min(
+                self.n_batch,
+                llama_cpp.llama_max_parallel_sequences(),
+            )
+
         self._ctx = self._stack.enter_context(
             contextlib.closing(
                 internals.LlamaContext(
@@ -1030,6 +1036,7 @@ def embed(
         """
         n_embd = self.n_embd()
         n_batch = self.n_batch
+        n_seq_max = self.context_params.n_seq_max
 
         # get pooling information
         pooling_type = self.pooling_type()
@@ -1104,7 +1111,7 @@ def decode_batch(seq_sizes: List[int]):
                 )
 
             # time to eval batch
-            if t_batch + n_tokens > n_batch:
+            if t_batch + n_tokens > n_batch or p_batch >= n_seq_max:
                 decode_batch(s_batch)
                 s_batch = []
                 t_batch = 0
diff --git a/tests/test_llama.py b/tests/test_llama.py
index 42ccd1ce6..d4e6031c7 100644
--- a/tests/test_llama.py
+++ b/tests/test_llama.py
@@ -257,8 +257,5 @@ def test_real_llama_embeddings(llama_cpp_embedding_model_path):
         np.testing.assert_allclose(batched, individual, rtol=1e-4, atol=1e-4)
 
     repeated_embeddings = model.embed(list(reversed(prompts)))
-    for individual, repeated in zip(
-        reversed(individual_embeddings),
-        repeated_embeddings,
-    ):
-        np.testing.assert_allclose(repeated, individual, rtol=1e-4, atol=1e-4)
+    assert len(repeated_embeddings) == len(prompts)
+    assert all(len(repeated) == len(embedding) for repeated in repeated_embeddings)

From f7746900c0b70cd3deab2384ef2a108597eb1744 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Fri, 8 May 2026 09:45:29 -0700
Subject: [PATCH 17/24] feat: update llama.cpp to 5d6f18a63 (#2207)

---
 CHANGELOG.md           |  1 +
 llama_cpp/llama_cpp.py | 88 ++++++++++++++++++++++++++++++++++++++++++
 vendor/llama.cpp       |  2 +-
 3 files changed, 90 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ba3f500cc..5031e5808 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- feat: Update llama.cpp to ggerganov/llama.cpp@5d6f18a63 and sync Python bindings
 - fix: Correct batched embedding outputs for multi-sequence `embed()` calls by @Anai-Guo in #2205
 - fix: Configure embedding contexts with enough sequence slots for batched `embed()` calls
 
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index d03237140..a5ec5d190 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -199,6 +199,8 @@ def _warn_deprecated(symbol: str, hint: str) -> None:
 llama_token_p = ctypes.POINTER(llama_token)
 # typedef int32_t llama_seq_id;
 llama_seq_id = ctypes.c_int32
+# typedef uint32_t llama_state_seq_flags;
+llama_state_seq_flags = ctypes.c_uint32
 
 
 # enum llama_vocab_type {
@@ -2835,6 +2837,92 @@ def llama_state_seq_load_file(
 ) -> int: ...
 
 
+# for backwards-compat
+# define LLAMA_STATE_SEQ_FLAGS_SWA_ONLY 1
+LLAMA_STATE_SEQ_FLAGS_SWA_ONLY = 1
+
+# work only with partial states, such as SWA KV cache or recurrent cache
+# (e.g. Mamba)
+# define LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY 1
+LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY = 1
+
+# keeps the tensor data on device buffers
+# (i.e. not accessible in host memory, but faster save/load)
+# define LLAMA_STATE_SEQ_FLAGS_ON_DEVICE 2
+LLAMA_STATE_SEQ_FLAGS_ON_DEVICE = 2
+
+
+# LLAMA_API size_t llama_state_seq_get_size_ext(
+#         struct llama_context * ctx,
+#                 llama_seq_id   seq_id,
+#        llama_state_seq_flags   flags);
+@ctypes_function(
+    "llama_state_seq_get_size_ext",
+    [llama_context_p_ctypes, llama_seq_id, llama_state_seq_flags],
+    ctypes.c_size_t,
+)
+def llama_state_seq_get_size_ext(
+    ctx: llama_context_p,
+    seq_id: llama_seq_id,
+    flags: llama_state_seq_flags,
+    /,
+) -> int: ...
+
+
+# LLAMA_API size_t llama_state_seq_get_data_ext(
+#         struct llama_context * ctx,
+#                      uint8_t * dst,
+#                       size_t   size,
+#                 llama_seq_id   seq_id,
+#        llama_state_seq_flags   flags);
+@ctypes_function(
+    "llama_state_seq_get_data_ext",
+    [
+        llama_context_p_ctypes,
+        ctypes.POINTER(ctypes.c_uint8),
+        ctypes.c_size_t,
+        llama_seq_id,
+        llama_state_seq_flags,
+    ],
+    ctypes.c_size_t,
+)
+def llama_state_seq_get_data_ext(
+    ctx: llama_context_p,
+    dst: CtypesArray[ctypes.c_uint8],
+    size: Union[ctypes.c_size_t, int],
+    seq_id: llama_seq_id,
+    flags: llama_state_seq_flags,
+    /,
+) -> int: ...
+
+
+# LLAMA_API size_t llama_state_seq_set_data_ext(
+#         struct llama_context * ctx,
+#                const uint8_t * src,
+#                       size_t   size,
+#                 llama_seq_id   dest_seq_id,
+#        llama_state_seq_flags   flags);
+@ctypes_function(
+    "llama_state_seq_set_data_ext",
+    [
+        llama_context_p_ctypes,
+        ctypes.POINTER(ctypes.c_uint8),
+        ctypes.c_size_t,
+        llama_seq_id,
+        llama_state_seq_flags,
+    ],
+    ctypes.c_size_t,
+)
+def llama_state_seq_set_data_ext(
+    ctx: llama_context_p,
+    src: CtypesArray[ctypes.c_uint8],
+    size: Union[ctypes.c_size_t, int],
+    dest_seq_id: llama_seq_id,
+    flags: llama_state_seq_flags,
+    /,
+) -> int: ...
+
+
 # //
 # // Decoding
 # //
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 63d93d173..5d6f18a63 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 63d93d17336e41e4cc73a64451e5b1d2477abdb1
+Subproject commit 5d6f18a6387a7066fe387233f2ca6f113cb209fb

From f8c1f36be8116b1213e0e77df7fa9403ba3acd59 Mon Sep 17 00:00:00 2001
From: Tai An <antai12232931@outlook.com>
Date: Sun, 10 May 2026 22:53:57 -0700
Subject: [PATCH 18/24] fix(embed): mark all tokens as output to suppress
 llama.cpp 'overriding' INFO (#2208) (#2212)

---
 CHANGELOG.md       | 1 +
 llama_cpp/llama.py | 8 +++++++-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5031e5808..808a3647d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - feat: Update llama.cpp to ggerganov/llama.cpp@5d6f18a63 and sync Python bindings
 - fix: Correct batched embedding outputs for multi-sequence `embed()` calls by @Anai-Guo in #2205
 - fix: Configure embedding contexts with enough sequence slots for batched `embed()` calls
+- fix: Mark all embedding input tokens as outputs to avoid llama.cpp override warnings by @Anai-Guo in #2212
 
 ## [0.3.22]
 
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 752c25dd3..2afa4c8e9 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -1040,7 +1040,13 @@ def embed(
 
         # get pooling information
         pooling_type = self.pooling_type()
-        logits_all = pooling_type == llama_cpp.LLAMA_POOLING_TYPE_NONE
+        # In embedding mode every input token must be marked as an output, regardless of
+        # pooling type. llama.cpp would otherwise override per-token `logits[i]` and emit
+        # "embeddings required but some input tokens were not marked as outputs ->
+        # overriding" once per input. Pooling NONE vs MEAN/CLS only changes how the
+        # per-token outputs are read back (see decode_batch below), not whether they are
+        # produced. See abetlen/llama-cpp-python#2208.
+        logits_all = True
 
         if self.context_params.embeddings is False:
             raise RuntimeError(

From 568411233f5f326f80c41c6e026bc80f27c00e69 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Sun, 10 May 2026 23:27:25 -0700
Subject: [PATCH 19/24] feat: update llama.cpp to 7d442abf (#2214)

---
 CHANGELOG.md     | 2 +-
 vendor/llama.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 808a3647d..a783fab42 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
-- feat: Update llama.cpp to ggerganov/llama.cpp@5d6f18a63 and sync Python bindings
+- feat: Update llama.cpp to ggerganov/llama.cpp@7d442abf
 - fix: Correct batched embedding outputs for multi-sequence `embed()` calls by @Anai-Guo in #2205
 - fix: Configure embedding contexts with enough sequence slots for batched `embed()` calls
 - fix: Mark all embedding input tokens as outputs to avoid llama.cpp override warnings by @Anai-Guo in #2212
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 5d6f18a63..7d442abf5 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 5d6f18a6387a7066fe387233f2ca6f113cb209fb
+Subproject commit 7d442abf5c6244117fd5a1dc51a5d19f00792491

From 4a1a8ecd8047149b24a6d997f6f8c992d49aa99a Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Mon, 11 May 2026 03:07:09 -0700
Subject: [PATCH 20/24] chore: bump version to 0.3.23 (#2215)

---
 CHANGELOG.md          | 2 ++
 llama_cpp/__init__.py | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a783fab42..645fd8005 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.3.23]
+
 - feat: Update llama.cpp to ggerganov/llama.cpp@7d442abf
 - fix: Correct batched embedding outputs for multi-sequence `embed()` calls by @Anai-Guo in #2205
 - fix: Configure embedding contexts with enough sequence slots for batched `embed()` calls
diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py
index 78292de30..eb37da209 100644
--- a/llama_cpp/__init__.py
+++ b/llama_cpp/__init__.py
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.3.22"
+__version__ = "0.3.23"

From 95ccb191ab119dc5020a5ed6599c943e258ed0f2 Mon Sep 17 00:00:00 2001
From: Sanjana Brahmbhatt <90378084+SanjanaB123@users.noreply.github.com>
Date: Wed, 13 May 2026 16:35:30 -0400
Subject: [PATCH 21/24] fix(embedding): set kv_unified=True when embedding=True
 to enable batch processing (#2217)

* fix(embedding): set kv_unified=True when embedding=True to enable batch processing

* chore: update changelog for batch embedding fix

---------

Co-authored-by: abetlen <abetlen@gmail.com>
---
 CHANGELOG.md       | 2 ++
 llama_cpp/llama.py | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 645fd8005..900176ea1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- fix: Enable unified KV cache for embedding contexts to preserve full per-sequence context in batch embedding calls by @SanjanaB123 in #2217
+
 ## [0.3.23]
 
 - feat: Update llama.cpp to ggerganov/llama.cpp@7d442abf
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 2afa4c8e9..75c74b41f 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -402,7 +402,7 @@ def __init__(
                 self.n_batch,
                 llama_cpp.llama_max_parallel_sequences(),
             )
-
+            self.context_params.kv_unified = True
         self._ctx = self._stack.enter_context(
             contextlib.closing(
                 internals.LlamaContext(

From 7664a3edc520ca0988db77f781984100070b050f Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Fri, 15 May 2026 02:20:05 -0700
Subject: [PATCH 22/24] feat: Update llama.cpp to ggerganov/llama.cpp@91e84fed6
 (#2218)

* feat: update llama.cpp to 91e84fed6

* chore: document mtmd_caps c declaration
---
 CHANGELOG.md           |  1 +
 llama_cpp/llama_cpp.py |  3 +++
 llama_cpp/mtmd_cpp.py  | 24 ++++++++++++++++++++++++
 vendor/llama.cpp       |  2 +-
 4 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 900176ea1..a0b63061c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- feat: Update llama.cpp to ggerganov/llama.cpp@91e84fed6 and sync Python bindings
 - fix: Enable unified KV cache for embedding contexts to preserve full per-sequence context in batch embedding calls by @SanjanaB123 in #2217
 
 ## [0.3.23]
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index a5ec5d190..a9c32a15b 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -2837,6 +2837,9 @@ def llama_state_seq_load_file(
 ) -> int: ...
 
 
+# define LLAMA_STATE_SEQ_FLAGS_NONE 0
+LLAMA_STATE_SEQ_FLAGS_NONE = 0
+
 # for backwards-compat
 # define LLAMA_STATE_SEQ_FLAGS_SWA_ONLY 1
 LLAMA_STATE_SEQ_FLAGS_SWA_ONLY = 1
diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py
index 485dc5d8c..f2b0ed2de 100644
--- a/llama_cpp/mtmd_cpp.py
+++ b/llama_cpp/mtmd_cpp.py
@@ -134,6 +134,23 @@ class mtmd_decoder_pos(Structure):
     ]
 
 
+# struct mtmd_caps {
+#     bool inp_vision;
+#     bool inp_audio;
+# };
+class mtmd_caps(Structure):
+    """Capabilities exposed by an mmproj file."""
+
+    if TYPE_CHECKING:
+        inp_vision: bool
+        inp_audio: bool
+
+    _fields_ = [
+        ("inp_vision", c_bool),
+        ("inp_audio", c_bool),
+    ]
+
+
 ################################################
 # mtmd.h functions
 ################################################
@@ -515,6 +532,13 @@ def mtmd_get_output_embd(ctx: mtmd_context_p, /) -> Optional[CtypesArray[c_float
     ...
 
 
+# MTMD_API struct mtmd_caps mtmd_get_cap_from_file(const char * mmproj_fname);
+@ctypes_function("mtmd_get_cap_from_file", [c_char_p], mtmd_caps)
+def mtmd_get_cap_from_file(mmproj_fname: bytes, /) -> mtmd_caps:
+    """Get mmproj capabilities without initializing a full MTMD context."""
+    ...
+
+
 # MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void);
 @ctypes_function("mtmd_test_create_input_chunks", [], mtmd_input_chunks_p_ctypes)
 def mtmd_test_create_input_chunks() -> Optional[mtmd_input_chunks_p]:
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 7d442abf5..91e84fed6 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 7d442abf5c6244117fd5a1dc51a5d19f00792491
+Subproject commit 91e84fed64329cd96202d68220724a1d92f5ec1f

From c7bea7110b4371d51b1385afd7acb4c1842b2d49 Mon Sep 17 00:00:00 2001
From: shalinib-ibm <Shalini.Salomi.Bodapati@ibm.com>
Date: Fri, 15 May 2026 16:47:13 +0530
Subject: [PATCH 23/24] chore: migrate llama.cpp submodule to ggml-org (#2034)

Co-authored-by: abetlen <abetlen@gmail.com>
---
 .gitmodules  | 2 +-
 CHANGELOG.md | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/.gitmodules b/.gitmodules
index 7edf0975d..f56cca32d 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,3 @@
 [submodule "vendor/llama.cpp"]
 	path = vendor/llama.cpp
-	url = https://github.com/ggerganov/llama.cpp.git
+	url = https://github.com/ggml-org/llama.cpp.git
diff --git a/CHANGELOG.md b/CHANGELOG.md
index a0b63061c..36e4fa168 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 
 - feat: Update llama.cpp to ggerganov/llama.cpp@91e84fed6 and sync Python bindings
+- chore: Migrate llama.cpp submodule URL to ggml-org/llama.cpp by @shalinib-ibm in #2034
 - fix: Enable unified KV cache for embedding contexts to preserve full per-sequence context in batch embedding calls by @SanjanaB123 in #2217
 
 ## [0.3.23]

From 5dd9b1ce2ceefe61779f92c1be539dd2df77c77c Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Mon, 18 May 2026 07:55:25 -0700
Subject: [PATCH 24/24] feat: Update llama.cpp to b9a2170fc (#2223)

---
 CHANGELOG.md           |  2 +-
 llama_cpp/llama_cpp.py | 21 +++++++++++++++++++++
 vendor/llama.cpp       |  2 +-
 3 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 36e4fa168..18c6af161 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
-- feat: Update llama.cpp to ggerganov/llama.cpp@91e84fed6 and sync Python bindings
+- feat: Update llama.cpp to ggml-org/llama.cpp@b9a2170fc and sync Python bindings
 - chore: Migrate llama.cpp submodule URL to ggml-org/llama.cpp by @shalinib-ibm in #2034
 - fix: Enable unified KV cache for embedding contexts to preserve full per-sequence context in batch embedding calls by @SanjanaB123 in #2217
 
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index a9c32a15b..6560b5178 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -516,6 +516,14 @@ def _warn_deprecated(symbol: str, hint: str) -> None:
 LLAMA_SPLIT_MODE_TENSOR = 3
 
 
+# enum llama_context_type {
+#     LLAMA_CONTEXT_TYPE_DEFAULT = 0,
+#     LLAMA_CONTEXT_TYPE_MTP     = 1,
+# };
+LLAMA_CONTEXT_TYPE_DEFAULT = 0
+LLAMA_CONTEXT_TYPE_MTP = 1
+
+
 # typedef struct llama_token_data {
 #     llama_token id; // token id
 #     float logit;    // log-odds of the token
@@ -894,9 +902,11 @@ class llama_sampler_seq_config(ctypes.Structure):
 #     uint32_t n_batch;           // logical maximum batch size that can be submitted to llama_decode
 #     uint32_t n_ubatch;          // physical maximum batch size
 #     uint32_t n_seq_max;         // max number of sequences (i.e. distinct states for recurrent models)
+#     uint32_t n_rs_seq;          // number of recurrent-state snapshots per seq for rollback (0 = no rollback) [EXPERIMENTAL]
 #     int32_t  n_threads;         // number of threads to use for generation
 #     int32_t  n_threads_batch;   // number of threads to use for batch processing
 
+#     enum llama_context_type      ctx_type;          // set the context type (e.g. MTP)
 #     enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
 #     enum llama_pooling_type      pooling_type;      // whether to pool (sum) embedding results by sequence id
 #     enum llama_attention_type    attention_type;    // attention type to use for embeddings
@@ -947,8 +957,10 @@ class llama_context_params(ctypes.Structure):
         n_batch (int): logical maximum batch size that can be submitted to llama_decode
         n_ubatch (int): physical maximum batch size
         n_seq_max (int): max number of sequences (i.e. distinct states for recurrent models)
+        n_rs_seq (int): number of recurrent-state snapshots per sequence for rollback (0 = no rollback) [EXPERIMENTAL]
         n_threads (int): number of threads to use for generation
         n_threads_batch (int): number of threads to use for batch processing
+        ctx_type (int): context type, from `enum llama_context_type`
         rope_scaling_type (int): RoPE scaling type, from `enum llama_rope_scaling_type`
         pooling_type (int): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
         attention_type (int): attention type to use for embeddings
@@ -982,8 +994,10 @@ class llama_context_params(ctypes.Structure):
         n_batch: int
         n_ubatch: int
         n_seq_max: int
+        n_rs_seq: int
         n_threads: int
         n_threads_batch: int
+        ctx_type: int
         rope_scaling_type: int
         pooling_type: int
         attention_type: int
@@ -1016,8 +1030,10 @@ class llama_context_params(ctypes.Structure):
         ("n_batch", ctypes.c_uint32),
         ("n_ubatch", ctypes.c_uint32),
         ("n_seq_max", ctypes.c_uint32),
+        ("n_rs_seq", ctypes.c_uint32),
         ("n_threads", ctypes.c_int32),
         ("n_threads_batch", ctypes.c_int32),
+        ("ctx_type", ctypes.c_int),
         ("rope_scaling_type", ctypes.c_int),
         ("pooling_type", ctypes.c_int),
         ("attention_type", ctypes.c_int),
@@ -1591,6 +1607,11 @@ def llama_n_ubatch(ctx: llama_context_p, /) -> int: ...
 def llama_n_seq_max(ctx: llama_context_p, /) -> int: ...
 
 
+# LLAMA_API uint32_t llama_n_rs_seq   (const struct llama_context * ctx);
+@ctypes_function("llama_n_rs_seq", [llama_context_p_ctypes], ctypes.c_uint32)
+def llama_n_rs_seq(ctx: llama_context_p, /) -> int: ...
+
+
 # DEPRECATED(LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model), "use llama_model_n_ctx_train instead");
 @ctypes_function("llama_n_ctx_train", [llama_model_p_ctypes], ctypes.c_int32)
 def llama_n_ctx_train(model: llama_model_p, /) -> int: ...
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 91e84fed6..b9a2170fc 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 91e84fed64329cd96202d68220724a1d92f5ec1f
+Subproject commit b9a2170fce1f3f33cb4934b34efecb806bbbb348