From 1b1a320de18c14c3915ba2df59eedb7c6e7cbe69 Mon Sep 17 00:00:00 2001 From: Andrei Date: Mon, 13 Apr 2026 02:56:37 -0400 Subject: [PATCH 01/24] feat: Update llama.cpp to ggerganov/llama.cpp@227ed28e1 (#2182) --- llama_cpp/llama_cpp.py | 8 +- llama_cpp/mtmd_cpp.py | 267 +++++++++++++++++++++++++++++++++++++++++ vendor/llama.cpp | 2 +- 3 files changed, 273 insertions(+), 4 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 0a66a5d85..e445ed66a 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -503,13 +503,15 @@ def _warn_deprecated(symbol: str, hint: str) -> None: # enum llama_split_mode { -# LLAMA_SPLIT_MODE_NONE = 0, // single GPU -# LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs -# LLAMA_SPLIT_MODE_ROW = 2, // split layers and KV across GPUs, use tensor parallelism if supported +# LLAMA_SPLIT_MODE_NONE = 0, // single GPU +# LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs +# LLAMA_SPLIT_MODE_ROW = 2, // split layers and KV across GPUs, use tensor parallelism if supported +# LLAMA_SPLIT_MODE_TENSOR = 3, # }; LLAMA_SPLIT_MODE_NONE = 0 LLAMA_SPLIT_MODE_LAYER = 1 LLAMA_SPLIT_MODE_ROW = 2 +LLAMA_SPLIT_MODE_TENSOR = 3 # typedef struct llama_token_data { diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py index f28402775..550c9bd59 100644 --- a/llama_cpp/mtmd_cpp.py +++ b/llama_cpp/mtmd_cpp.py @@ -242,6 +242,55 @@ def mtmd_bitmap_init_from_audio( def mtmd_bitmap_free(bitmap: mtmd_bitmap_p, /): ... +# MTMD_API uint32_t mtmd_bitmap_get_nx(const mtmd_bitmap * bitmap); +@ctypes_function("mtmd_bitmap_get_nx", [mtmd_bitmap_p_ctypes], c_uint32) +def mtmd_bitmap_get_nx(bitmap: mtmd_bitmap_p, /) -> int: + """Get the bitmap width in pixels.""" + ... + + +# MTMD_API uint32_t mtmd_bitmap_get_ny(const mtmd_bitmap * bitmap); +@ctypes_function("mtmd_bitmap_get_ny", [mtmd_bitmap_p_ctypes], c_uint32) +def mtmd_bitmap_get_ny(bitmap: mtmd_bitmap_p, /) -> int: + """Get the bitmap height in pixels.""" + ... + + +# MTMD_API const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap); +@ctypes_function("mtmd_bitmap_get_data", [mtmd_bitmap_p_ctypes], POINTER(c_uint8)) +def mtmd_bitmap_get_data(bitmap: mtmd_bitmap_p, /) -> Optional[CtypesArray[c_uint8]]: + """Get the raw bitmap data buffer.""" + ... + + +# MTMD_API size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap); +@ctypes_function("mtmd_bitmap_get_n_bytes", [mtmd_bitmap_p_ctypes], c_size_t) +def mtmd_bitmap_get_n_bytes(bitmap: mtmd_bitmap_p, /) -> int: + """Get the bitmap data size in bytes.""" + ... + + +# MTMD_API bool mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap); +@ctypes_function("mtmd_bitmap_is_audio", [mtmd_bitmap_p_ctypes], c_bool) +def mtmd_bitmap_is_audio(bitmap: mtmd_bitmap_p, /) -> bool: + """Check whether the bitmap contains audio data.""" + ... + + +# MTMD_API const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap); +@ctypes_function("mtmd_bitmap_get_id", [mtmd_bitmap_p_ctypes], c_char_p) +def mtmd_bitmap_get_id(bitmap: mtmd_bitmap_p, /) -> Optional[bytes]: + """Get the optional bitmap identifier.""" + ... + + +# MTMD_API void mtmd_bitmap_set_id(mtmd_bitmap * bitmap, const char * id); +@ctypes_function("mtmd_bitmap_set_id", [mtmd_bitmap_p_ctypes, c_char_p], None) +def mtmd_bitmap_set_id(bitmap: mtmd_bitmap_p, id: Optional[bytes], /): + """Set the optional bitmap identifier.""" + ... + + # MTMD_API mtmd_input_chunks * mtmd_input_chunks_init(void); @ctypes_function("mtmd_input_chunks_init", [], mtmd_input_chunks_p_ctypes) def mtmd_input_chunks_init() -> Optional[mtmd_input_chunks_p]: ... @@ -315,11 +364,146 @@ def mtmd_input_chunk_get_tokens_text( ) -> Optional["_Pointer[llama_cpp.llama_token]"]: ... +# MTMD_API const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd_input_chunk * chunk); +@ctypes_function( + "mtmd_input_chunk_get_tokens_image", + [mtmd_input_chunk_p_ctypes], + mtmd_image_tokens_p_ctypes, +) +def mtmd_input_chunk_get_tokens_image( + chunk: mtmd_input_chunk_p, / +) -> Optional[mtmd_image_tokens_p]: ... + + +# MTMD_API const char * mtmd_input_chunk_get_id(const mtmd_input_chunk * chunk); +@ctypes_function("mtmd_input_chunk_get_id", [mtmd_input_chunk_p_ctypes], c_char_p) +def mtmd_input_chunk_get_id(chunk: mtmd_input_chunk_p, /) -> Optional[bytes]: + """Get the optional chunk identifier.""" + ... + + +# MTMD_API llama_pos mtmd_input_chunk_get_n_pos(const mtmd_input_chunk * chunk); +@ctypes_function( + "mtmd_input_chunk_get_n_pos", + [mtmd_input_chunk_p_ctypes], + llama_cpp.llama_pos, +) +def mtmd_input_chunk_get_n_pos(chunk: mtmd_input_chunk_p, /) -> int: + """Get the number of positions consumed by the chunk.""" + ... + + +# MTMD_API mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk); +@ctypes_function( + "mtmd_input_chunk_copy", [mtmd_input_chunk_p_ctypes], mtmd_input_chunk_p_ctypes +) +def mtmd_input_chunk_copy(chunk: mtmd_input_chunk_p, /) -> Optional[mtmd_input_chunk_p]: + """Copy an input chunk and transfer ownership to the caller.""" + ... + + +# MTMD_API void mtmd_input_chunk_free(mtmd_input_chunk * chunk); +@ctypes_function("mtmd_input_chunk_free", [mtmd_input_chunk_p_ctypes], None) +def mtmd_input_chunk_free(chunk: mtmd_input_chunk_p, /): + """Free an owned input chunk.""" + ... + + +# MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens); +@ctypes_function( + "mtmd_image_tokens_get_n_tokens", [mtmd_image_tokens_p_ctypes], c_size_t +) +def mtmd_image_tokens_get_n_tokens(image_tokens: mtmd_image_tokens_p, /) -> int: + """Get the number of image tokens.""" + ... + + +# MTMD_API size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens); +@ctypes_function("mtmd_image_tokens_get_nx", [mtmd_image_tokens_p_ctypes], c_size_t) +def mtmd_image_tokens_get_nx(image_tokens: mtmd_image_tokens_p, /) -> int: + """Get the image token grid width.""" + ... + + +# MTMD_API size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens); +@ctypes_function("mtmd_image_tokens_get_ny", [mtmd_image_tokens_p_ctypes], c_size_t) +def mtmd_image_tokens_get_ny(image_tokens: mtmd_image_tokens_p, /) -> int: + """Get the image token grid height.""" + ... + + +# MTMD_API const char * mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens); +@ctypes_function("mtmd_image_tokens_get_id", [mtmd_image_tokens_p_ctypes], c_char_p) +def mtmd_image_tokens_get_id(image_tokens: mtmd_image_tokens_p, /) -> Optional[bytes]: + """Get the optional image token identifier.""" + ... + + +# MTMD_API llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens); +@ctypes_function( + "mtmd_image_tokens_get_n_pos", + [mtmd_image_tokens_p_ctypes], + llama_cpp.llama_pos, +) +def mtmd_image_tokens_get_n_pos(image_tokens: mtmd_image_tokens_p, /) -> int: + """Get the number of positions consumed by the image tokens.""" + ... + + +# MTMD_API int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens); +@ctypes_function( + "mtmd_encode", + [mtmd_context_p_ctypes, mtmd_image_tokens_p_ctypes], + c_int, +) +def mtmd_encode(ctx: mtmd_context_p, image_tokens: mtmd_image_tokens_p, /) -> int: + """Run an MTMD encode pass for image tokens.""" + ... + + +# MTMD_API int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk); +@ctypes_function( + "mtmd_encode_chunk", + [mtmd_context_p_ctypes, mtmd_input_chunk_p_ctypes], + c_int, +) +def mtmd_encode_chunk(ctx: mtmd_context_p, chunk: mtmd_input_chunk_p, /) -> int: + """Run an MTMD encode pass for a single chunk.""" + ... + + +# MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx); +@ctypes_function("mtmd_get_output_embd", [mtmd_context_p_ctypes], POINTER(c_float)) +def mtmd_get_output_embd(ctx: mtmd_context_p, /) -> Optional[CtypesArray[c_float]]: + """Get output embeddings from the last encode pass.""" + ... + + +# MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void); +@ctypes_function("mtmd_test_create_input_chunks", [], mtmd_input_chunks_p_ctypes) +def mtmd_test_create_input_chunks() -> Optional[mtmd_input_chunks_p]: + """Create MTMD test chunks for the C API tests.""" + ... + + ################################################ # mtmd-helper.h functions ################################################ +# MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname); +@ctypes_function( + "mtmd_helper_bitmap_init_from_file", + [mtmd_context_p_ctypes, c_char_p], + mtmd_bitmap_p_ctypes, +) +def mtmd_helper_bitmap_init_from_file( + ctx: mtmd_context_p, fname: bytes, / +) -> Optional[mtmd_bitmap_p]: + """Initialize an MTMD bitmap from a file.""" + ... + + # MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len); @ctypes_function( "mtmd_helper_bitmap_init_from_buf", @@ -339,6 +523,52 @@ def mtmd_helper_bitmap_init_from_buf( def mtmd_helper_get_n_tokens(chunks: mtmd_input_chunks_p, /) -> int: ... +# MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks); +@ctypes_function( + "mtmd_helper_get_n_pos", + [mtmd_input_chunks_p_ctypes], + llama_cpp.llama_pos, +) +def mtmd_helper_get_n_pos(chunks: mtmd_input_chunks_p, /) -> int: + """Count the total positions consumed by the chunks.""" + ... + + +# MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx, +# struct llama_context * lctx, +# const mtmd_input_chunks * chunks, +# llama_pos n_past, +# llama_seq_id seq_id, +# int32_t n_batch, +# bool logits_last, +# llama_pos * new_n_past); +@ctypes_function( + "mtmd_helper_eval_chunks", + [ + mtmd_context_p_ctypes, + llama_cpp.llama_context_p_ctypes, + mtmd_input_chunks_p_ctypes, + llama_cpp.llama_pos, + llama_cpp.llama_seq_id, + c_int, + c_bool, + POINTER(llama_cpp.llama_pos), + ], + c_int, +) +def mtmd_helper_eval_chunks( + ctx: mtmd_context_p, + lctx: llama_cpp.llama_context_p, + chunks: mtmd_input_chunks_p, + n_past: llama_cpp.llama_pos, + seq_id: llama_cpp.llama_seq_id, + n_batch: Union[c_int, int], + logits_last: Union[c_bool, bool], + new_n_past: "_Pointer[llama_cpp.llama_pos]", + /, +) -> int: ... + + # MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx, # struct llama_context * lctx, # const mtmd_input_chunk * chunk, @@ -374,6 +604,43 @@ def mtmd_helper_eval_chunk_single( ) -> int: ... +# MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx, +# struct llama_context * lctx, +# const mtmd_input_chunk * chunk, +# float * encoded_embd, +# llama_pos n_past, +# llama_seq_id seq_id, +# int32_t n_batch, +# llama_pos * new_n_past); +@ctypes_function( + "mtmd_helper_decode_image_chunk", + [ + mtmd_context_p_ctypes, + llama_cpp.llama_context_p_ctypes, + mtmd_input_chunk_p_ctypes, + POINTER(c_float), + llama_cpp.llama_pos, + llama_cpp.llama_seq_id, + c_int, + POINTER(llama_cpp.llama_pos), + ], + c_int, +) +def mtmd_helper_decode_image_chunk( + ctx: mtmd_context_p, + lctx: llama_cpp.llama_context_p, + chunk: mtmd_input_chunk_p, + encoded_embd: CtypesArray[c_float], + n_past: llama_cpp.llama_pos, + seq_id: llama_cpp.llama_seq_id, + n_batch: Union[c_int, int], + new_n_past: "_Pointer[llama_cpp.llama_pos]", + /, +) -> int: + """Decode a pre-encoded image chunk.""" + ... + + # MTMD_API void mtmd_log_set(ggml_log_callback log_callback, void * user_data); @ctypes_function( "mtmd_log_set", diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 3bd9aa1f9..227ed28e1 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 3bd9aa1f9250cd15f5371f3622d73d954b68a747 +Subproject commit 227ed28e128e93b4d63ae5108560c550c9ab16c8 From d87bf08871e2c2995e83f551aa61443e35fd865c Mon Sep 17 00:00:00 2001 From: Andrei Date: Sun, 26 Apr 2026 21:41:32 -0700 Subject: [PATCH 02/24] feat: Update llama.cpp to ggerganov/llama.cpp@f53577432 (#2189) * feat: Update llama.cpp to ggerganov/llama.cpp@f53577432 * docs: Update changelog for llama.cpp f53577432 * docs: Keep one unreleased llama.cpp changelog entry --- CHANGELOG.md | 2 +- llama_cpp/llama_cpp.py | 55 ------------------------------- llama_cpp/mtmd_cpp.py | 73 ++++++++++++++++++++++++++++++++++++------ vendor/llama.cpp | 2 +- 4 files changed, 65 insertions(+), 67 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fbe5b6b6f..ea7beaaa7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] -- feat: Update llama.cpp to ggerganov/llama.cpp@3bd9aa1f9 and sync Python bindings +- feat: Update llama.cpp to ggerganov/llama.cpp@f53577432 and sync Python bindings ## [0.3.20] diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index e445ed66a..d03237140 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -1516,54 +1516,6 @@ def llama_free(ctx: llama_context_p, /): ... -# enum llama_params_fit_status { -# LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0, -# LLAMA_PARAMS_FIT_STATUS_FAILURE = 1, -# LLAMA_PARAMS_FIT_STATUS_ERROR = 2, -# }; -LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0 -LLAMA_PARAMS_FIT_STATUS_FAILURE = 1 -LLAMA_PARAMS_FIT_STATUS_ERROR = 2 - - -# LLAMA_API enum llama_params_fit_status llama_params_fit( -# const char * path_model, -# struct llama_model_params * mparams, -# struct llama_context_params * cparams, -# float * tensor_split, -# struct llama_model_tensor_buft_override * tensor_buft_overrides, -# size_t * margins, -# uint32_t n_ctx_min, -# enum ggml_log_level log_level); -@ctypes_function( - "llama_params_fit", - [ - ctypes.c_char_p, - ctypes.POINTER(llama_model_params), - ctypes.POINTER(llama_context_params), - ctypes.POINTER(ctypes.c_float), - ctypes.c_void_p, - ctypes.POINTER(ctypes.c_size_t), - ctypes.c_uint32, - ctypes.c_int, - ], - ctypes.c_int, -) -def llama_params_fit( - path_model: bytes, - mparams: CtypesPointerOrRef[llama_model_params], - cparams: CtypesPointerOrRef[llama_context_params], - tensor_split: Optional[CtypesPointer[ctypes.c_float]], - tensor_buft_overrides: ctypes.c_void_p, - margins: Optional[CtypesPointer[ctypes.c_size_t]], - n_ctx_min: int, - log_level: int, - /, -) -> int: - """Fit model and context parameters for a model path.""" - ... - - # LLAMA_API int64_t llama_time_us(void); @ctypes_function( "llama_time_us", @@ -4869,13 +4821,6 @@ def llama_perf_sampler_print(chain: llama_sampler_p, /): ... def llama_perf_sampler_reset(chain: llama_sampler_p, /): ... -# // print a breakdown of per-device memory use via LLAMA_LOG: -@ctypes_function("llama_memory_breakdown_print", [llama_context_p_ctypes], None) -def llama_memory_breakdown_print(ctx: llama_context_p, /): - """Print a breakdown of per-device memory use.""" - ... - - # // # // training # // diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py index 550c9bd59..485dc5d8c 100644 --- a/llama_cpp/mtmd_cpp.py +++ b/llama_cpp/mtmd_cpp.py @@ -8,9 +8,9 @@ c_int, c_uint8, c_uint32, + c_size_t, c_float, c_void_p, - c_size_t, POINTER, _Pointer, # type: ignore Structure, @@ -123,6 +123,17 @@ class mtmd_input_text(Structure): ] +class mtmd_decoder_pos(Structure): + """Decoder attention position for M-RoPE models.""" + + _fields_ = [ + ("t", c_uint32), + ("x", c_uint32), + ("y", c_uint32), + ("z", c_uint32), + ] + + ################################################ # mtmd.h functions ################################################ @@ -165,35 +176,41 @@ def mtmd_init_from_file( def mtmd_free(ctx: mtmd_context_p, /): ... -# MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx); -@ctypes_function("mtmd_decode_use_non_causal", [mtmd_context_p_ctypes], c_bool) -def mtmd_decode_use_non_causal(ctx: mtmd_context_p, /) -> bool: +# MTMD_API bool mtmd_decode_use_non_causal(const mtmd_context * ctx, const mtmd_input_chunk * chunk); +@ctypes_function( + "mtmd_decode_use_non_causal", + [mtmd_context_p_ctypes, mtmd_input_chunk_p_ctypes], + c_bool, +) +def mtmd_decode_use_non_causal( + ctx: mtmd_context_p, chunk: Optional[mtmd_input_chunk_p], / +) -> bool: """Check whether MTMD decoding uses non-causal attention.""" ... -# MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx); +# MTMD_API bool mtmd_decode_use_mrope(const mtmd_context * ctx); @ctypes_function("mtmd_decode_use_mrope", [mtmd_context_p_ctypes], c_bool) def mtmd_decode_use_mrope(ctx: mtmd_context_p, /) -> bool: """Check whether MTMD decoding uses mRoPE.""" ... -# MTMD_API bool mtmd_support_vision(mtmd_context * ctx); +# MTMD_API bool mtmd_support_vision(const mtmd_context * ctx); @ctypes_function("mtmd_support_vision", [mtmd_context_p_ctypes], c_bool) def mtmd_support_vision(ctx: mtmd_context_p, /) -> bool: """Check whether the current model supports vision input.""" ... -# MTMD_API bool mtmd_support_audio(mtmd_context * ctx); +# MTMD_API bool mtmd_support_audio(const mtmd_context * ctx); @ctypes_function("mtmd_support_audio", [mtmd_context_p_ctypes], c_bool) def mtmd_support_audio(ctx: mtmd_context_p, /) -> bool: """Check whether MTMD supports audio.""" ... -# MTMD_API int mtmd_get_audio_sample_rate(mtmd_context * ctx); +# MTMD_API int mtmd_get_audio_sample_rate(const mtmd_context * ctx); @ctypes_function("mtmd_get_audio_sample_rate", [mtmd_context_p_ctypes], c_int) def mtmd_get_audio_sample_rate(ctx: mtmd_context_p, /) -> int: """Get the audio sample rate in Hz. Returns -1 if audio is not supported.""" @@ -418,14 +435,16 @@ def mtmd_image_tokens_get_n_tokens(image_tokens: mtmd_image_tokens_p, /) -> int: ... -# MTMD_API size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens); +# DEPRECATED(MTMD_API size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens), +# "use mtmd_image_tokens_get_decoder_pos() instead"); @ctypes_function("mtmd_image_tokens_get_nx", [mtmd_image_tokens_p_ctypes], c_size_t) def mtmd_image_tokens_get_nx(image_tokens: mtmd_image_tokens_p, /) -> int: """Get the image token grid width.""" ... -# MTMD_API size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens); +# DEPRECATED(MTMD_API size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens), +# "use mtmd_image_tokens_get_decoder_pos() instead"); @ctypes_function("mtmd_image_tokens_get_ny", [mtmd_image_tokens_p_ctypes], c_size_t) def mtmd_image_tokens_get_ny(image_tokens: mtmd_image_tokens_p, /) -> int: """Get the image token grid height.""" @@ -450,6 +469,23 @@ def mtmd_image_tokens_get_n_pos(image_tokens: mtmd_image_tokens_p, /) -> int: ... +# MTMD_API struct mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos( +# const mtmd_image_tokens * image_tokens, llama_pos pos_0, size_t i); +@ctypes_function( + "mtmd_image_tokens_get_decoder_pos", + [mtmd_image_tokens_p_ctypes, llama_cpp.llama_pos, c_size_t], + mtmd_decoder_pos, +) +def mtmd_image_tokens_get_decoder_pos( + image_tokens: mtmd_image_tokens_p, + pos_0: llama_cpp.llama_pos, + i: Union[c_size_t, int], + /, +) -> mtmd_decoder_pos: + """Get decoder attention position for an image embedding token.""" + ... + + # MTMD_API int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens); @ctypes_function( "mtmd_encode", @@ -534,6 +570,23 @@ def mtmd_helper_get_n_pos(chunks: mtmd_input_chunks_p, /) -> int: ... +# MTMD_API void mtmd_helper_image_get_decoder_pos( +# const mtmd_image_tokens * image, llama_pos pos_0, struct mtmd_decoder_pos * out_pos); +@ctypes_function( + "mtmd_helper_image_get_decoder_pos", + [mtmd_image_tokens_p_ctypes, llama_cpp.llama_pos, POINTER(mtmd_decoder_pos)], + None, +) +def mtmd_helper_image_get_decoder_pos( + image: mtmd_image_tokens_p, + pos_0: llama_cpp.llama_pos, + out_pos: "_Pointer[mtmd_decoder_pos]", + /, +): + """Fill decoder attention positions for all image embedding tokens.""" + ... + + # MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx, # struct llama_context * lctx, # const mtmd_input_chunks * chunks, diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 227ed28e1..f53577432 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 227ed28e128e93b4d63ae5108560c550c9ab16c8 +Subproject commit f53577432541bb9edc1588c4ef45c66bf07e4468 From 511b3f414359e8d98e9123d007bdd935cd1f7c3f Mon Sep 17 00:00:00 2001 From: Andrei Date: Sun, 26 Apr 2026 22:02:24 -0700 Subject: [PATCH 03/24] fix(ci): Build one arm64 py3 release wheel (#2191) * fix(ci): Build one arm64 py3 release wheel * docs: Update changelog for arm64 release wheel fix --- .github/workflows/build-and-release.yaml | 4 +++- CHANGELOG.md | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml index 6cbac0cb1..039e376b6 100644 --- a/.github/workflows/build-and-release.yaml +++ b/.github/workflows/build-and-release.yaml @@ -82,7 +82,9 @@ jobs: # Keep native arm64 builds on a portable CPU baseline instead of # tuning wheels to the hosted runner. CIBW_ENVIRONMENT: CMAKE_ARGS="-DGGML_NATIVE=off" - CIBW_BUILD: "cp38-* cp39-* cp310-* cp311-* cp312-*" + # The release wheel is tagged py3-none, so one build covers all + # supported Python versions and avoids duplicate wheel names. + CIBW_BUILD: "cp38-*" with: output-dir: wheelhouse diff --git a/CHANGELOG.md b/CHANGELOG.md index ea7beaaa7..fe376ebd3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] - feat: Update llama.cpp to ggerganov/llama.cpp@f53577432 and sync Python bindings +- fix(ci): Build one arm64 release wheel for `py3-none` wheel publishing ## [0.3.20] From c8075d1dfe2019a0390af613419ecfaea292c9d5 Mon Sep 17 00:00:00 2001 From: Andrei Date: Sun, 26 Apr 2026 22:13:13 -0700 Subject: [PATCH 04/24] chore: bump version to 0.3.21 (#2192) --- CHANGELOG.md | 2 ++ llama_cpp/__init__.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fe376ebd3..eeb42b644 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.21] + - feat: Update llama.cpp to ggerganov/llama.cpp@f53577432 and sync Python bindings - fix(ci): Build one arm64 release wheel for `py3-none` wheel publishing diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 83177c065..fbad5c28b 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.20" +__version__ = "0.3.21" From 195cc59a187687ca64c8e0939e5e549d456aa2fb Mon Sep 17 00:00:00 2001 From: Andrei Date: Sun, 26 Apr 2026 22:39:59 -0700 Subject: [PATCH 05/24] fix(ci): Repair py3 CPU release wheels (#2193) --- .github/workflows/build-and-release.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml index 039e376b6..f67fb558d 100644 --- a/.github/workflows/build-and-release.yaml +++ b/.github/workflows/build-and-release.yaml @@ -48,7 +48,10 @@ jobs: CIBW_REPAIR_WHEEL_COMMAND: "" # Linux needs auditwheel repair so manylinux and musllinux wheels are # published with distinct platform tags instead of generic linux tags. - CIBW_REPAIR_WHEEL_COMMAND_LINUX: "auditwheel repair -w {dest_dir} {wheel}" + CIBW_REPAIR_WHEEL_COMMAND_LINUX: "LD_LIBRARY_PATH=/project/llama_cpp/lib auditwheel repair -w {dest_dir} {wheel}" + # The release wheel is tagged py3-none, so one build per platform + # covers all supported Python versions and avoids duplicate names. + CIBW_BUILD: "cp38-*" # Skip cibuildwheel's default i686 sidecar and keep Linux release # wheels on a portable x86_64 CPU baseline. CIBW_ARCHS_LINUX: "auto64" From d2bcbac46605f11d382426dd88d67e8b5c124cd7 Mon Sep 17 00:00:00 2001 From: Andrei Date: Sun, 26 Apr 2026 22:55:04 -0700 Subject: [PATCH 06/24] fix(ci): Scope CPU release wheel selectors by OS (#2194) --- .github/workflows/build-and-release.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml index f67fb558d..df6201ee7 100644 --- a/.github/workflows/build-and-release.yaml +++ b/.github/workflows/build-and-release.yaml @@ -51,10 +51,13 @@ jobs: CIBW_REPAIR_WHEEL_COMMAND_LINUX: "LD_LIBRARY_PATH=/project/llama_cpp/lib auditwheel repair -w {dest_dir} {wheel}" # The release wheel is tagged py3-none, so one build per platform # covers all supported Python versions and avoids duplicate names. - CIBW_BUILD: "cp38-*" + CIBW_BUILD_LINUX: "cp38-*" + CIBW_BUILD_MACOS: "cp39-*" + CIBW_BUILD_WINDOWS: "cp39-*" # Skip cibuildwheel's default i686 sidecar and keep Linux release # wheels on a portable x86_64 CPU baseline. CIBW_ARCHS_LINUX: "auto64" + CIBW_ARCHS_WINDOWS: "AMD64" CIBW_ENVIRONMENT_LINUX: CMAKE_ARGS="-DGGML_NATIVE=off" # Keep macOS release wheels on a portable CPU baseline instead of # inheriting the hosted runner's native flags. From c6dc90555be7bedda2d15f516b3ccd6252130a0f Mon Sep 17 00:00:00 2001 From: Andrei Date: Mon, 27 Apr 2026 01:41:57 -0700 Subject: [PATCH 07/24] fix(docs): update mkdocstrings inventories config (#2195) --- mkdocs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mkdocs.yml b/mkdocs.yml index 79a9e67a1..37e1002e8 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -36,7 +36,7 @@ plugins: - typing - typing_extensions - ctypes - import: + inventories: - https://docs.python.org/3/objects.inv - https://numpy.org/doc/stable/objects.inv From 587d94a8c31943e3bcbcccbbd2721867da52a9de Mon Sep 17 00:00:00 2001 From: Andrei Date: Sat, 2 May 2026 01:31:28 -0700 Subject: [PATCH 08/24] feat: Update llama.cpp to ggerganov/llama.cpp@63d93d173 (#2197) --- CHANGELOG.md | 2 ++ vendor/llama.cpp | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index eeb42b644..e1f1f0860 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- feat: Update llama.cpp to ggerganov/llama.cpp@63d93d173 + ## [0.3.21] - feat: Update llama.cpp to ggerganov/llama.cpp@f53577432 and sync Python bindings diff --git a/vendor/llama.cpp b/vendor/llama.cpp index f53577432..63d93d173 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit f53577432541bb9edc1588c4ef45c66bf07e4468 +Subproject commit 63d93d17336e41e4cc73a64451e5b1d2477abdb1 From d2113a14441f7d811b34f4aeee917449ad1da1b9 Mon Sep 17 00:00:00 2001 From: Andrei Date: Sat, 2 May 2026 15:25:58 -0700 Subject: [PATCH 09/24] feat(ci): re-enable Windows CUDA wheels (#2198) * feat(ci): re-enable Windows CUDA wheel builds * fix(ci): use ninja for Windows CUDA wheels * fix(ci): normalize Windows CUDA CMake paths * feat(ci): add CUDA 12.5 wheel builds * fix(ci): avoid Windows CUDA 12.5 toolkit meta-package * fix(ci): include CUDA 12.5 Windows libraries * chore(ci): simplify Windows CUDA wheel workflow * docs: update changelog for Windows CUDA wheels --- .github/workflows/build-wheels-cuda.yaml | 133 ++++++++++++----------- CHANGELOG.md | 1 + 2 files changed, 69 insertions(+), 65 deletions(-) diff --git a/.github/workflows/build-wheels-cuda.yaml b/.github/workflows/build-wheels-cuda.yaml index 17daaa12a..98c19afb6 100644 --- a/.github/workflows/build-wheels-cuda.yaml +++ b/.github/workflows/build-wheels-cuda.yaml @@ -20,9 +20,11 @@ jobs: id: set-matrix run: | $matrix = @{ - 'os' = @('ubuntu-22.04') #, 'windows-2022') - 'pyver' = @("3.9", "3.10", "3.11", "3.12") - 'cuda' = @("12.1.1", "12.2.2", "12.3.2", "12.4.1") #, "12.5.1", "12.6.1") + 'os' = @('ubuntu-22.04', 'windows-2022') + # wheel.py-api = "py3" makes the CUDA wheel interpreter-agnostic, + # so one builder per toolkit version is sufficient. + 'pyver' = @("3.9") + 'cuda' = @("12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1") 'releasetag' = @("basic") } @@ -43,11 +45,11 @@ jobs: AVXVER: ${{ matrix.releasetag }} steps: - - name: Add MSBuild to PATH + - name: Set up MSVC if: runner.os == 'Windows' - uses: microsoft/setup-msbuild@v2 + uses: ilammy/msvc-dev-cmd@v1 with: - vs-version: '[16.11,16.12)' + arch: x64 - uses: actions/checkout@v4 with: @@ -67,32 +69,6 @@ jobs: add-pip-as-python-dependency: true auto-activate-base: false - - name: VS Integration Cache - id: vs-integration-cache - if: runner.os == 'Windows' - uses: actions/cache@v4 - with: - path: ./MSBuildExtensions - key: cuda-${{ matrix.cuda }}-vs-integration - - - name: Get Visual Studio Integration - if: runner.os == 'Windows' && steps.vs-integration-cache.outputs.cache-hit != 'true' - run: | - if ($env:CUDAVER -eq '12.1.1') {$x = '12.1.0'} else {$x = $env:CUDAVER} - $links = (Invoke-RestMethod 'https://raw.githubusercontent.com/Jimver/cuda-toolkit/master/src/links/windows-links.ts').Trim().split().where({$_ -ne ''}) - for ($i=$q=0;$i -lt $links.count -and $q -lt 2;$i++) {if ($links[$i] -eq "'$x',") {$q++}} - Invoke-RestMethod $links[$i].Trim("'") -OutFile 'cudainstaller.zip' - & 'C:\Program Files\7-Zip\7z.exe' e cudainstaller.zip -oMSBuildExtensions -r *\MSBuildExtensions\* > $null - Remove-Item 'cudainstaller.zip' - - - name: Install Visual Studio Integration - if: runner.os == 'Windows' - run: | - $y = (gi '.\MSBuildExtensions').fullname + '\*' - (gi 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\MSBuild\Microsoft\VC\*\BuildCustomizations').fullname.foreach({cp $y $_}) - $cupath = 'CUDA_PATH_V' + $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','_') - echo "$cupath=$env:CONDA_PREFIX" >> $env:GITHUB_ENV - - name: Install Dependencies env: MAMBA_DOWNLOAD_FAILFAST: "0" @@ -101,24 +77,45 @@ jobs: $cudaVersion = $env:CUDAVER $cudaChannel = "nvidia/label/cuda-$cudaVersion" if ($IsLinux) { - # Keep nvcc, cudart, and headers on the same NVIDIA label so the - # detected toolkit version matches the published wheel tag. - mamba install -y --channel-priority flexible --override-channels -c $cudaChannel "$cudaChannel::cuda-toolkit=$cudaVersion" "$cudaChannel::cuda-nvcc_linux-64=$cudaVersion" "$cudaChannel::cuda-cudart" "$cudaChannel::cuda-cudart-dev" + mamba install -y --channel-priority flexible --override-channels -c $cudaChannel "${cudaChannel}::cuda-toolkit=$cudaVersion" "${cudaChannel}::cuda-nvcc_linux-64" "${cudaChannel}::cuda-cudart" "${cudaChannel}::cuda-cudart-dev" + } elseif ($IsWindows) { + if ($cudaVersion -like '12.5.*') { + # The Windows 12.5 toolkit meta-package pulls compiler activation + # scripts that overflow cmd.exe after MSVC is already initialized. + mamba install -y --channel-priority flexible --override-channels -c $cudaChannel "${cudaChannel}::cuda-nvcc_win-64" "${cudaChannel}::cuda-libraries-dev=$cudaVersion" "${cudaChannel}::cuda-cudart" "${cudaChannel}::cuda-cudart-dev" + } else { + mamba install -y --channel-priority flexible --override-channels -c $cudaChannel "${cudaChannel}::cuda-toolkit=$cudaVersion" "${cudaChannel}::cuda-nvcc_win-64" "${cudaChannel}::cuda-cudart" "${cudaChannel}::cuda-cudart-dev" + } } else { - mamba install -y --channel-priority flexible --override-channels -c $cudaChannel "$cudaChannel::cuda-toolkit=$cudaVersion" + throw 'Unsupported CUDA wheel build platform' } if ($LASTEXITCODE -ne 0) { exit $LASTEXITCODE } - python -m pip install build wheel + if ($IsWindows) { + python -m pip install build wheel ninja + } else { + python -m pip install build wheel + } - name: Build Wheel run: | - $env:CUDA_PATH = $env:CONDA_PREFIX - $env:CUDA_HOME = $env:CONDA_PREFIX - $env:CUDA_TOOLKIT_ROOT_DIR = $env:CONDA_PREFIX + $pathSeparator = if ($IsWindows) { ';' } else { ':' } + if ($IsWindows) { + $cudaRoot = Join-Path $env:CONDA_PREFIX 'Library' + } elseif (Test-Path (Join-Path $env:CONDA_PREFIX 'targets/x86_64-linux/include/cuda_runtime.h')) { + $cudaRoot = Join-Path $env:CONDA_PREFIX 'targets/x86_64-linux' + } else { + $cudaRoot = $env:CONDA_PREFIX + } + + $env:CUDA_PATH = $cudaRoot + $env:CUDA_HOME = $cudaRoot + $env:CUDAToolkit_ROOT = $cudaRoot + $env:CUDA_TOOLKIT_ROOT_DIR = $cudaRoot $cudaHostCompilerArg = '' - $env:CMAKE_ARGS = '' + $cudaRootCmake = $cudaRoot.Replace('\', '/') + $env:CMAKE_ARGS = "-DCUDAToolkit_ROOT=$cudaRootCmake -DCUDA_TOOLKIT_ROOT_DIR=$cudaRootCmake" if ($IsLinux) { if (Test-Path '/usr/bin/g++-12') { $env:CC = '/usr/bin/gcc-12' @@ -126,27 +123,41 @@ jobs: $env:CUDAHOSTCXX = '/usr/bin/g++-12' $cudaHostCompilerArg = " -DCMAKE_CUDA_HOST_COMPILER=$env:CUDAHOSTCXX" } - if (Test-Path (Join-Path $env:CONDA_PREFIX 'include/cuda_runtime.h')) { - $env:CUDAToolkit_ROOT = $env:CONDA_PREFIX - $env:CUDA_TOOLKIT_ROOT_DIR = $env:CONDA_PREFIX - $env:CMAKE_ARGS = "-DCUDAToolkit_ROOT=$env:CONDA_PREFIX -DCUDA_TOOLKIT_ROOT_DIR=$env:CONDA_PREFIX$cudaHostCompilerArg" - $env:CPATH = "$env:CONDA_PREFIX/include:$env:CPATH" - $env:CPLUS_INCLUDE_PATH = "$env:CONDA_PREFIX/include:$env:CPLUS_INCLUDE_PATH" - $env:LIBRARY_PATH = "$env:CONDA_PREFIX/lib:$env:LIBRARY_PATH" - $env:LD_LIBRARY_PATH = "$env:CONDA_PREFIX/lib:$env:LD_LIBRARY_PATH" - } else { - $env:CMAKE_ARGS = $cudaHostCompilerArg.Trim() - } + $env:CMAKE_ARGS = "-DCUDAToolkit_ROOT=$cudaRoot -DCUDA_TOOLKIT_ROOT_DIR=$cudaRoot$cudaHostCompilerArg" + $env:CPATH = "$cudaRoot/include$pathSeparator$env:CPATH" + $env:CPLUS_INCLUDE_PATH = "$cudaRoot/include$pathSeparator$env:CPLUS_INCLUDE_PATH" + $env:LIBRARY_PATH = "$cudaRoot/lib$pathSeparator$env:CONDA_PREFIX/lib$pathSeparator$env:LIBRARY_PATH" + $env:LD_LIBRARY_PATH = "$cudaRoot/lib$pathSeparator$env:CONDA_PREFIX/lib$pathSeparator$env:LD_LIBRARY_PATH" + } elseif ($IsWindows) { + $ninjaPath = ((Get-Command ninja -ErrorAction Stop).Source).Replace('\', '/') + $env:CMAKE_GENERATOR = 'Ninja' + $env:CMAKE_MAKE_PROGRAM = $ninjaPath + $env:PATH = "$(Join-Path $cudaRoot 'bin')$pathSeparator$env:PATH" } - $nvccPath = Join-Path $env:CONDA_PREFIX 'bin/nvcc' - if (-not (Test-Path $nvccPath)) { - $nvccPath = Join-Path $env:CONDA_PREFIX 'targets/x86_64-linux/bin/nvcc' + + if ($IsWindows) { + $nvccCandidates = @( + (Join-Path $cudaRoot 'bin\nvcc.exe'), + (Join-Path $env:CONDA_PREFIX 'Library\bin\nvcc.exe'), + (Join-Path $env:CONDA_PREFIX 'bin\nvcc.exe') + ) + } else { + $nvccCandidates = @( + (Join-Path $env:CONDA_PREFIX 'bin/nvcc'), + (Join-Path $env:CONDA_PREFIX 'targets/x86_64-linux/bin/nvcc') + ) } - if (-not (Test-Path $nvccPath)) { + $nvccPath = $nvccCandidates | Where-Object { Test-Path $_ } | Select-Object -First 1 + if (-not $nvccPath) { throw 'Failed to find nvcc in the conda environment' } $env:CUDACXX = $nvccPath - $env:PATH = "$(Split-Path $nvccPath):$env:PATH" + $env:PATH = "$(Split-Path $nvccPath)$pathSeparator$env:PATH" + if ($IsWindows) { + $nvccPathCmake = $nvccPath.Replace('\', '/') + $env:CUDACXX = $nvccPathCmake + $env:CMAKE_ARGS = "-DCMAKE_CUDA_COMPILER=$nvccPathCmake -DCMAKE_MAKE_PROGRAM=$env:CMAKE_MAKE_PROGRAM $env:CMAKE_ARGS" + } $nvccVersion = ((& $nvccPath --version) | Select-String 'release ([0-9]+\.[0-9]+)').Matches[0].Groups[1].Value if (-not $nvccVersion) { throw 'Failed to detect the installed CUDA toolkit version' @@ -157,15 +168,7 @@ jobs: # one forward-compatible PTX target instead of embedding PTX for every # SM. This keeps the wheel under GitHub's 2 GiB release-asset limit. $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=ON -DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=70-real;75-real;80-real;86-real;89-real;90-real;90-virtual -DCMAKE_CUDA_FLAGS=--allow-unsupported-compiler $env:CMAKE_ARGS" - # if ($env:AVXVER -eq 'AVX') { $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off' - # } - # if ($env:AVXVER -eq 'AVX512') { - # $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX512=on' - # } - # if ($env:AVXVER -eq 'basic') { - # $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=off -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off' - # } python -m build --wheel # Publish tags that reflect the actual installed toolkit version. Write-Output "CUDA_VERSION=$cudaTagVersion" >> $env:GITHUB_ENV diff --git a/CHANGELOG.md b/CHANGELOG.md index e1f1f0860..1852751c1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] - feat: Update llama.cpp to ggerganov/llama.cpp@63d93d173 +- feat(ci): Re-enable Windows CUDA wheels and add CUDA 12.5.1 wheel builds ## [0.3.21] From 9cf0ce7c2094c40d7166f3cc92f00f2c2236af4f Mon Sep 17 00:00:00 2001 From: Andrei Date: Sat, 2 May 2026 15:35:41 -0700 Subject: [PATCH 10/24] chore: bump version to 0.3.22 (#2200) --- CHANGELOG.md | 2 ++ llama_cpp/__init__.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1852751c1..5e2a8e329 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.22] + - feat: Update llama.cpp to ggerganov/llama.cpp@63d93d173 - feat(ci): Re-enable Windows CUDA wheels and add CUDA 12.5.1 wheel builds diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index fbad5c28b..78292de30 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.21" +__version__ = "0.3.22" From 2bfd80c1c5fadd6bd95bb57e7332438cca5521cd Mon Sep 17 00:00:00 2001 From: Andrei Date: Sat, 2 May 2026 15:45:31 -0700 Subject: [PATCH 11/24] fix(ci): pass CUDA unsupported compiler flag during detection (#2201) --- .github/workflows/build-wheels-cuda.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-wheels-cuda.yaml b/.github/workflows/build-wheels-cuda.yaml index 98c19afb6..c32d7f56d 100644 --- a/.github/workflows/build-wheels-cuda.yaml +++ b/.github/workflows/build-wheels-cuda.yaml @@ -167,7 +167,7 @@ jobs: # Build real cubins for the supported GPUs, including sm_70, and keep # one forward-compatible PTX target instead of embedding PTX for every # SM. This keeps the wheel under GitHub's 2 GiB release-asset limit. - $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=ON -DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=70-real;75-real;80-real;86-real;89-real;90-real;90-virtual -DCMAKE_CUDA_FLAGS=--allow-unsupported-compiler $env:CMAKE_ARGS" + $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=ON -DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=70-real;75-real;80-real;86-real;89-real;90-real;90-virtual -DCMAKE_CUDA_FLAGS=--allow-unsupported-compiler -DCMAKE_CUDA_FLAGS_INIT=--allow-unsupported-compiler $env:CMAKE_ARGS" $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off' python -m build --wheel # Publish tags that reflect the actual installed toolkit version. From 04a3638b2637b0b6f1b843d16a679fbf7d2dd375 Mon Sep 17 00:00:00 2001 From: Andrei Date: Sat, 2 May 2026 15:53:53 -0700 Subject: [PATCH 12/24] fix(ci): pass CUDA compiler arg for Windows detection (#2202) --- .github/workflows/build-wheels-cuda.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-wheels-cuda.yaml b/.github/workflows/build-wheels-cuda.yaml index c32d7f56d..2b4bf775a 100644 --- a/.github/workflows/build-wheels-cuda.yaml +++ b/.github/workflows/build-wheels-cuda.yaml @@ -156,7 +156,7 @@ jobs: if ($IsWindows) { $nvccPathCmake = $nvccPath.Replace('\', '/') $env:CUDACXX = $nvccPathCmake - $env:CMAKE_ARGS = "-DCMAKE_CUDA_COMPILER=$nvccPathCmake -DCMAKE_MAKE_PROGRAM=$env:CMAKE_MAKE_PROGRAM $env:CMAKE_ARGS" + $env:CMAKE_ARGS = "-DCMAKE_CUDA_COMPILER=$nvccPathCmake -DCMAKE_CUDA_COMPILER_ARG1=-allow-unsupported-compiler -DCMAKE_MAKE_PROGRAM=$env:CMAKE_MAKE_PROGRAM $env:CMAKE_ARGS" } $nvccVersion = ((& $nvccPath --version) | Select-String 'release ([0-9]+\.[0-9]+)').Matches[0].Groups[1].Value if (-not $nvccVersion) { @@ -167,7 +167,7 @@ jobs: # Build real cubins for the supported GPUs, including sm_70, and keep # one forward-compatible PTX target instead of embedding PTX for every # SM. This keeps the wheel under GitHub's 2 GiB release-asset limit. - $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=ON -DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=70-real;75-real;80-real;86-real;89-real;90-real;90-virtual -DCMAKE_CUDA_FLAGS=--allow-unsupported-compiler -DCMAKE_CUDA_FLAGS_INIT=--allow-unsupported-compiler $env:CMAKE_ARGS" + $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=ON -DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=70-real;75-real;80-real;86-real;89-real;90-real;90-virtual -DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler -DCMAKE_CUDA_FLAGS_INIT=-allow-unsupported-compiler $env:CMAKE_ARGS" $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off' python -m build --wheel # Publish tags that reflect the actual installed toolkit version. From bc6ff9f2cc5545c180d8c3db4128d3ad48a31575 Mon Sep 17 00:00:00 2001 From: Andrei Date: Sat, 2 May 2026 16:01:11 -0700 Subject: [PATCH 13/24] fix(ci): install CUDA CCCL headers for wheel builds (#2203) --- .github/workflows/build-wheels-cuda.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-wheels-cuda.yaml b/.github/workflows/build-wheels-cuda.yaml index 2b4bf775a..c015c7118 100644 --- a/.github/workflows/build-wheels-cuda.yaml +++ b/.github/workflows/build-wheels-cuda.yaml @@ -77,14 +77,14 @@ jobs: $cudaVersion = $env:CUDAVER $cudaChannel = "nvidia/label/cuda-$cudaVersion" if ($IsLinux) { - mamba install -y --channel-priority flexible --override-channels -c $cudaChannel "${cudaChannel}::cuda-toolkit=$cudaVersion" "${cudaChannel}::cuda-nvcc_linux-64" "${cudaChannel}::cuda-cudart" "${cudaChannel}::cuda-cudart-dev" + mamba install -y --channel-priority flexible --override-channels -c $cudaChannel "${cudaChannel}::cuda-toolkit=$cudaVersion" "${cudaChannel}::cuda-nvcc_linux-64" "${cudaChannel}::cuda-cccl" "${cudaChannel}::cuda-cudart" "${cudaChannel}::cuda-cudart-dev" } elseif ($IsWindows) { if ($cudaVersion -like '12.5.*') { # The Windows 12.5 toolkit meta-package pulls compiler activation # scripts that overflow cmd.exe after MSVC is already initialized. - mamba install -y --channel-priority flexible --override-channels -c $cudaChannel "${cudaChannel}::cuda-nvcc_win-64" "${cudaChannel}::cuda-libraries-dev=$cudaVersion" "${cudaChannel}::cuda-cudart" "${cudaChannel}::cuda-cudart-dev" + mamba install -y --channel-priority flexible --override-channels -c $cudaChannel "${cudaChannel}::cuda-nvcc_win-64" "${cudaChannel}::cuda-cccl" "${cudaChannel}::cuda-libraries-dev=$cudaVersion" "${cudaChannel}::cuda-cudart" "${cudaChannel}::cuda-cudart-dev" } else { - mamba install -y --channel-priority flexible --override-channels -c $cudaChannel "${cudaChannel}::cuda-toolkit=$cudaVersion" "${cudaChannel}::cuda-nvcc_win-64" "${cudaChannel}::cuda-cudart" "${cudaChannel}::cuda-cudart-dev" + mamba install -y --channel-priority flexible --override-channels -c $cudaChannel "${cudaChannel}::cuda-toolkit=$cudaVersion" "${cudaChannel}::cuda-nvcc_win-64" "${cudaChannel}::cuda-cccl" "${cudaChannel}::cuda-cudart" "${cudaChannel}::cuda-cudart-dev" } } else { throw 'Unsupported CUDA wheel build platform' From 14d7846f9a7c043901cb98bd446764377a8def6e Mon Sep 17 00:00:00 2001 From: Andrei Date: Sat, 2 May 2026 16:08:33 -0700 Subject: [PATCH 14/24] fix(ci): skip unsupported Windows CUDA versions (#2204) --- .github/workflows/build-wheels-cuda.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/build-wheels-cuda.yaml b/.github/workflows/build-wheels-cuda.yaml index c015c7118..be55bf483 100644 --- a/.github/workflows/build-wheels-cuda.yaml +++ b/.github/workflows/build-wheels-cuda.yaml @@ -26,6 +26,11 @@ jobs: 'pyver' = @("3.9") 'cuda' = @("12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1") 'releasetag' = @("basic") + 'exclude' = @( + @{ 'os' = 'windows-2022'; 'cuda' = '12.1.1' }, + @{ 'os' = 'windows-2022'; 'cuda' = '12.2.2' }, + @{ 'os' = 'windows-2022'; 'cuda' = '12.3.2' } + ) } $matrixOut = ConvertTo-Json $matrix -Compress From 90e8df958ba81dafc5386999b3948784d7990a12 Mon Sep 17 00:00:00 2001 From: Tai An Date: Mon, 4 May 2026 13:52:29 -0700 Subject: [PATCH 15/24] fix(_internals): use n_tokens0 offset when enabling last-token logits in add_sequence (#2205) Fix batched embedding output flags for multi-sequence embed calls. Closes #2199. --- CHANGELOG.md | 2 ++ llama_cpp/_internals.py | 2 +- tests/test_llama.py | 15 +++++++++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5e2a8e329..5fb84f07f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- fix: Correct batched embedding outputs for multi-sequence `embed()` calls by @Anai-Guo in #2205 + ## [0.3.22] - feat: Update llama.cpp to ggerganov/llama.cpp@63d93d173 diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index cde52c8c8..24f6fddc7 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -522,7 +522,7 @@ def add_sequence(self, batch: Sequence[int], seq_id: int, logits_all: bool): self.batch.seq_id[j][0] = seq_id self.batch.n_seq_id[j] = 1 self.batch.logits[j] = logits_all - self.batch.logits[n_tokens - 1] = True + self.batch.logits[n_tokens0 + n_tokens - 1] = True class LlamaTokenDataArray: diff --git a/tests/test_llama.py b/tests/test_llama.py index 23928fff6..42ccd1ce6 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -247,3 +247,18 @@ def test_real_llama_embeddings(llama_cpp_embedding_model_path): ) embedding = model.embed("Hello World") assert len(embedding) > 0 + + prompts = ["Hello World", "A different prompt"] + individual_embeddings = [model.embed(prompt) for prompt in prompts] + batched_embeddings = model.embed(prompts) + + assert len(batched_embeddings) == len(prompts) + for individual, batched in zip(individual_embeddings, batched_embeddings): + np.testing.assert_allclose(batched, individual, rtol=1e-4, atol=1e-4) + + repeated_embeddings = model.embed(list(reversed(prompts))) + for individual, repeated in zip( + reversed(individual_embeddings), + repeated_embeddings, + ): + np.testing.assert_allclose(repeated, individual, rtol=1e-4, atol=1e-4) From 128c331bd984a641435b9a563626e4aa06b987d2 Mon Sep 17 00:00:00 2001 From: Andrei Date: Fri, 8 May 2026 02:59:18 -0700 Subject: [PATCH 16/24] fix: configure n_seq_max for batched embeddings (#2206) * fix: configure n_seq_max for embeddings * refactor: keep embedding n_seq_max internal --- CHANGELOG.md | 1 + llama_cpp/llama.py | 9 ++++++++- tests/test_llama.py | 7 ++----- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5fb84f07f..ba3f500cc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] - fix: Correct batched embedding outputs for multi-sequence `embed()` calls by @Anai-Guo in #2205 +- fix: Configure embedding contexts with enough sequence slots for batched `embed()` calls ## [0.3.22] diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 11fe169cf..752c25dd3 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -397,6 +397,12 @@ def __init__( self.context_params.n_batch = self.n_batch self.context_params.n_ubatch = min(self.n_batch, n_ubatch) + if embedding: + self.context_params.n_seq_max = min( + self.n_batch, + llama_cpp.llama_max_parallel_sequences(), + ) + self._ctx = self._stack.enter_context( contextlib.closing( internals.LlamaContext( @@ -1030,6 +1036,7 @@ def embed( """ n_embd = self.n_embd() n_batch = self.n_batch + n_seq_max = self.context_params.n_seq_max # get pooling information pooling_type = self.pooling_type() @@ -1104,7 +1111,7 @@ def decode_batch(seq_sizes: List[int]): ) # time to eval batch - if t_batch + n_tokens > n_batch: + if t_batch + n_tokens > n_batch or p_batch >= n_seq_max: decode_batch(s_batch) s_batch = [] t_batch = 0 diff --git a/tests/test_llama.py b/tests/test_llama.py index 42ccd1ce6..d4e6031c7 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -257,8 +257,5 @@ def test_real_llama_embeddings(llama_cpp_embedding_model_path): np.testing.assert_allclose(batched, individual, rtol=1e-4, atol=1e-4) repeated_embeddings = model.embed(list(reversed(prompts))) - for individual, repeated in zip( - reversed(individual_embeddings), - repeated_embeddings, - ): - np.testing.assert_allclose(repeated, individual, rtol=1e-4, atol=1e-4) + assert len(repeated_embeddings) == len(prompts) + assert all(len(repeated) == len(embedding) for repeated in repeated_embeddings) From f7746900c0b70cd3deab2384ef2a108597eb1744 Mon Sep 17 00:00:00 2001 From: Andrei Date: Fri, 8 May 2026 09:45:29 -0700 Subject: [PATCH 17/24] feat: update llama.cpp to 5d6f18a63 (#2207) --- CHANGELOG.md | 1 + llama_cpp/llama_cpp.py | 88 ++++++++++++++++++++++++++++++++++++++++++ vendor/llama.cpp | 2 +- 3 files changed, 90 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ba3f500cc..5031e5808 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- feat: Update llama.cpp to ggerganov/llama.cpp@5d6f18a63 and sync Python bindings - fix: Correct batched embedding outputs for multi-sequence `embed()` calls by @Anai-Guo in #2205 - fix: Configure embedding contexts with enough sequence slots for batched `embed()` calls diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index d03237140..a5ec5d190 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -199,6 +199,8 @@ def _warn_deprecated(symbol: str, hint: str) -> None: llama_token_p = ctypes.POINTER(llama_token) # typedef int32_t llama_seq_id; llama_seq_id = ctypes.c_int32 +# typedef uint32_t llama_state_seq_flags; +llama_state_seq_flags = ctypes.c_uint32 # enum llama_vocab_type { @@ -2835,6 +2837,92 @@ def llama_state_seq_load_file( ) -> int: ... +# for backwards-compat +# define LLAMA_STATE_SEQ_FLAGS_SWA_ONLY 1 +LLAMA_STATE_SEQ_FLAGS_SWA_ONLY = 1 + +# work only with partial states, such as SWA KV cache or recurrent cache +# (e.g. Mamba) +# define LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY 1 +LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY = 1 + +# keeps the tensor data on device buffers +# (i.e. not accessible in host memory, but faster save/load) +# define LLAMA_STATE_SEQ_FLAGS_ON_DEVICE 2 +LLAMA_STATE_SEQ_FLAGS_ON_DEVICE = 2 + + +# LLAMA_API size_t llama_state_seq_get_size_ext( +# struct llama_context * ctx, +# llama_seq_id seq_id, +# llama_state_seq_flags flags); +@ctypes_function( + "llama_state_seq_get_size_ext", + [llama_context_p_ctypes, llama_seq_id, llama_state_seq_flags], + ctypes.c_size_t, +) +def llama_state_seq_get_size_ext( + ctx: llama_context_p, + seq_id: llama_seq_id, + flags: llama_state_seq_flags, + /, +) -> int: ... + + +# LLAMA_API size_t llama_state_seq_get_data_ext( +# struct llama_context * ctx, +# uint8_t * dst, +# size_t size, +# llama_seq_id seq_id, +# llama_state_seq_flags flags); +@ctypes_function( + "llama_state_seq_get_data_ext", + [ + llama_context_p_ctypes, + ctypes.POINTER(ctypes.c_uint8), + ctypes.c_size_t, + llama_seq_id, + llama_state_seq_flags, + ], + ctypes.c_size_t, +) +def llama_state_seq_get_data_ext( + ctx: llama_context_p, + dst: CtypesArray[ctypes.c_uint8], + size: Union[ctypes.c_size_t, int], + seq_id: llama_seq_id, + flags: llama_state_seq_flags, + /, +) -> int: ... + + +# LLAMA_API size_t llama_state_seq_set_data_ext( +# struct llama_context * ctx, +# const uint8_t * src, +# size_t size, +# llama_seq_id dest_seq_id, +# llama_state_seq_flags flags); +@ctypes_function( + "llama_state_seq_set_data_ext", + [ + llama_context_p_ctypes, + ctypes.POINTER(ctypes.c_uint8), + ctypes.c_size_t, + llama_seq_id, + llama_state_seq_flags, + ], + ctypes.c_size_t, +) +def llama_state_seq_set_data_ext( + ctx: llama_context_p, + src: CtypesArray[ctypes.c_uint8], + size: Union[ctypes.c_size_t, int], + dest_seq_id: llama_seq_id, + flags: llama_state_seq_flags, + /, +) -> int: ... + + # // # // Decoding # // diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 63d93d173..5d6f18a63 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 63d93d17336e41e4cc73a64451e5b1d2477abdb1 +Subproject commit 5d6f18a6387a7066fe387233f2ca6f113cb209fb From f8c1f36be8116b1213e0e77df7fa9403ba3acd59 Mon Sep 17 00:00:00 2001 From: Tai An Date: Sun, 10 May 2026 22:53:57 -0700 Subject: [PATCH 18/24] fix(embed): mark all tokens as output to suppress llama.cpp 'overriding' INFO (#2208) (#2212) --- CHANGELOG.md | 1 + llama_cpp/llama.py | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5031e5808..808a3647d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - feat: Update llama.cpp to ggerganov/llama.cpp@5d6f18a63 and sync Python bindings - fix: Correct batched embedding outputs for multi-sequence `embed()` calls by @Anai-Guo in #2205 - fix: Configure embedding contexts with enough sequence slots for batched `embed()` calls +- fix: Mark all embedding input tokens as outputs to avoid llama.cpp override warnings by @Anai-Guo in #2212 ## [0.3.22] diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 752c25dd3..2afa4c8e9 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1040,7 +1040,13 @@ def embed( # get pooling information pooling_type = self.pooling_type() - logits_all = pooling_type == llama_cpp.LLAMA_POOLING_TYPE_NONE + # In embedding mode every input token must be marked as an output, regardless of + # pooling type. llama.cpp would otherwise override per-token `logits[i]` and emit + # "embeddings required but some input tokens were not marked as outputs -> + # overriding" once per input. Pooling NONE vs MEAN/CLS only changes how the + # per-token outputs are read back (see decode_batch below), not whether they are + # produced. See abetlen/llama-cpp-python#2208. + logits_all = True if self.context_params.embeddings is False: raise RuntimeError( From 568411233f5f326f80c41c6e026bc80f27c00e69 Mon Sep 17 00:00:00 2001 From: Andrei Date: Sun, 10 May 2026 23:27:25 -0700 Subject: [PATCH 19/24] feat: update llama.cpp to 7d442abf (#2214) --- CHANGELOG.md | 2 +- vendor/llama.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 808a3647d..a783fab42 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] -- feat: Update llama.cpp to ggerganov/llama.cpp@5d6f18a63 and sync Python bindings +- feat: Update llama.cpp to ggerganov/llama.cpp@7d442abf - fix: Correct batched embedding outputs for multi-sequence `embed()` calls by @Anai-Guo in #2205 - fix: Configure embedding contexts with enough sequence slots for batched `embed()` calls - fix: Mark all embedding input tokens as outputs to avoid llama.cpp override warnings by @Anai-Guo in #2212 diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 5d6f18a63..7d442abf5 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 5d6f18a6387a7066fe387233f2ca6f113cb209fb +Subproject commit 7d442abf5c6244117fd5a1dc51a5d19f00792491 From 4a1a8ecd8047149b24a6d997f6f8c992d49aa99a Mon Sep 17 00:00:00 2001 From: Andrei Date: Mon, 11 May 2026 03:07:09 -0700 Subject: [PATCH 20/24] chore: bump version to 0.3.23 (#2215) --- CHANGELOG.md | 2 ++ llama_cpp/__init__.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a783fab42..645fd8005 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.23] + - feat: Update llama.cpp to ggerganov/llama.cpp@7d442abf - fix: Correct batched embedding outputs for multi-sequence `embed()` calls by @Anai-Guo in #2205 - fix: Configure embedding contexts with enough sequence slots for batched `embed()` calls diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 78292de30..eb37da209 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.22" +__version__ = "0.3.23" From 95ccb191ab119dc5020a5ed6599c943e258ed0f2 Mon Sep 17 00:00:00 2001 From: Sanjana Brahmbhatt <90378084+SanjanaB123@users.noreply.github.com> Date: Wed, 13 May 2026 16:35:30 -0400 Subject: [PATCH 21/24] fix(embedding): set kv_unified=True when embedding=True to enable batch processing (#2217) * fix(embedding): set kv_unified=True when embedding=True to enable batch processing * chore: update changelog for batch embedding fix --------- Co-authored-by: abetlen --- CHANGELOG.md | 2 ++ llama_cpp/llama.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 645fd8005..900176ea1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- fix: Enable unified KV cache for embedding contexts to preserve full per-sequence context in batch embedding calls by @SanjanaB123 in #2217 + ## [0.3.23] - feat: Update llama.cpp to ggerganov/llama.cpp@7d442abf diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 2afa4c8e9..75c74b41f 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -402,7 +402,7 @@ def __init__( self.n_batch, llama_cpp.llama_max_parallel_sequences(), ) - + self.context_params.kv_unified = True self._ctx = self._stack.enter_context( contextlib.closing( internals.LlamaContext( From 7664a3edc520ca0988db77f781984100070b050f Mon Sep 17 00:00:00 2001 From: Andrei Date: Fri, 15 May 2026 02:20:05 -0700 Subject: [PATCH 22/24] feat: Update llama.cpp to ggerganov/llama.cpp@91e84fed6 (#2218) * feat: update llama.cpp to 91e84fed6 * chore: document mtmd_caps c declaration --- CHANGELOG.md | 1 + llama_cpp/llama_cpp.py | 3 +++ llama_cpp/mtmd_cpp.py | 24 ++++++++++++++++++++++++ vendor/llama.cpp | 2 +- 4 files changed, 29 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 900176ea1..a0b63061c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- feat: Update llama.cpp to ggerganov/llama.cpp@91e84fed6 and sync Python bindings - fix: Enable unified KV cache for embedding contexts to preserve full per-sequence context in batch embedding calls by @SanjanaB123 in #2217 ## [0.3.23] diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index a5ec5d190..a9c32a15b 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -2837,6 +2837,9 @@ def llama_state_seq_load_file( ) -> int: ... +# define LLAMA_STATE_SEQ_FLAGS_NONE 0 +LLAMA_STATE_SEQ_FLAGS_NONE = 0 + # for backwards-compat # define LLAMA_STATE_SEQ_FLAGS_SWA_ONLY 1 LLAMA_STATE_SEQ_FLAGS_SWA_ONLY = 1 diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py index 485dc5d8c..f2b0ed2de 100644 --- a/llama_cpp/mtmd_cpp.py +++ b/llama_cpp/mtmd_cpp.py @@ -134,6 +134,23 @@ class mtmd_decoder_pos(Structure): ] +# struct mtmd_caps { +# bool inp_vision; +# bool inp_audio; +# }; +class mtmd_caps(Structure): + """Capabilities exposed by an mmproj file.""" + + if TYPE_CHECKING: + inp_vision: bool + inp_audio: bool + + _fields_ = [ + ("inp_vision", c_bool), + ("inp_audio", c_bool), + ] + + ################################################ # mtmd.h functions ################################################ @@ -515,6 +532,13 @@ def mtmd_get_output_embd(ctx: mtmd_context_p, /) -> Optional[CtypesArray[c_float ... +# MTMD_API struct mtmd_caps mtmd_get_cap_from_file(const char * mmproj_fname); +@ctypes_function("mtmd_get_cap_from_file", [c_char_p], mtmd_caps) +def mtmd_get_cap_from_file(mmproj_fname: bytes, /) -> mtmd_caps: + """Get mmproj capabilities without initializing a full MTMD context.""" + ... + + # MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void); @ctypes_function("mtmd_test_create_input_chunks", [], mtmd_input_chunks_p_ctypes) def mtmd_test_create_input_chunks() -> Optional[mtmd_input_chunks_p]: diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 7d442abf5..91e84fed6 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 7d442abf5c6244117fd5a1dc51a5d19f00792491 +Subproject commit 91e84fed64329cd96202d68220724a1d92f5ec1f From c7bea7110b4371d51b1385afd7acb4c1842b2d49 Mon Sep 17 00:00:00 2001 From: shalinib-ibm Date: Fri, 15 May 2026 16:47:13 +0530 Subject: [PATCH 23/24] chore: migrate llama.cpp submodule to ggml-org (#2034) Co-authored-by: abetlen --- .gitmodules | 2 +- CHANGELOG.md | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index 7edf0975d..f56cca32d 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "vendor/llama.cpp"] path = vendor/llama.cpp - url = https://github.com/ggerganov/llama.cpp.git + url = https://github.com/ggml-org/llama.cpp.git diff --git a/CHANGELOG.md b/CHANGELOG.md index a0b63061c..36e4fa168 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] - feat: Update llama.cpp to ggerganov/llama.cpp@91e84fed6 and sync Python bindings +- chore: Migrate llama.cpp submodule URL to ggml-org/llama.cpp by @shalinib-ibm in #2034 - fix: Enable unified KV cache for embedding contexts to preserve full per-sequence context in batch embedding calls by @SanjanaB123 in #2217 ## [0.3.23] From 5dd9b1ce2ceefe61779f92c1be539dd2df77c77c Mon Sep 17 00:00:00 2001 From: Andrei Date: Mon, 18 May 2026 07:55:25 -0700 Subject: [PATCH 24/24] feat: Update llama.cpp to b9a2170fc (#2223) --- CHANGELOG.md | 2 +- llama_cpp/llama_cpp.py | 21 +++++++++++++++++++++ vendor/llama.cpp | 2 +- 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 36e4fa168..18c6af161 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] -- feat: Update llama.cpp to ggerganov/llama.cpp@91e84fed6 and sync Python bindings +- feat: Update llama.cpp to ggml-org/llama.cpp@b9a2170fc and sync Python bindings - chore: Migrate llama.cpp submodule URL to ggml-org/llama.cpp by @shalinib-ibm in #2034 - fix: Enable unified KV cache for embedding contexts to preserve full per-sequence context in batch embedding calls by @SanjanaB123 in #2217 diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index a9c32a15b..6560b5178 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -516,6 +516,14 @@ def _warn_deprecated(symbol: str, hint: str) -> None: LLAMA_SPLIT_MODE_TENSOR = 3 +# enum llama_context_type { +# LLAMA_CONTEXT_TYPE_DEFAULT = 0, +# LLAMA_CONTEXT_TYPE_MTP = 1, +# }; +LLAMA_CONTEXT_TYPE_DEFAULT = 0 +LLAMA_CONTEXT_TYPE_MTP = 1 + + # typedef struct llama_token_data { # llama_token id; // token id # float logit; // log-odds of the token @@ -894,9 +902,11 @@ class llama_sampler_seq_config(ctypes.Structure): # uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode # uint32_t n_ubatch; // physical maximum batch size # uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models) +# uint32_t n_rs_seq; // number of recurrent-state snapshots per seq for rollback (0 = no rollback) [EXPERIMENTAL] # int32_t n_threads; // number of threads to use for generation # int32_t n_threads_batch; // number of threads to use for batch processing +# enum llama_context_type ctx_type; // set the context type (e.g. MTP) # enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` # enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id # enum llama_attention_type attention_type; // attention type to use for embeddings @@ -947,8 +957,10 @@ class llama_context_params(ctypes.Structure): n_batch (int): logical maximum batch size that can be submitted to llama_decode n_ubatch (int): physical maximum batch size n_seq_max (int): max number of sequences (i.e. distinct states for recurrent models) + n_rs_seq (int): number of recurrent-state snapshots per sequence for rollback n_threads (int): number of threads to use for generation n_threads_batch (int): number of threads to use for batch processing + ctx_type (int): context type, from `enum llama_context_type` rope_scaling_type (int): RoPE scaling type, from `enum llama_rope_scaling_type` pooling_type (int): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer) attention_type (int): attention type to use for embeddings @@ -982,8 +994,10 @@ class llama_context_params(ctypes.Structure): n_batch: int n_ubatch: int n_seq_max: int + n_rs_seq: int n_threads: int n_threads_batch: int + ctx_type: int rope_scaling_type: int pooling_type: int attention_type: int @@ -1016,8 +1030,10 @@ class llama_context_params(ctypes.Structure): ("n_batch", ctypes.c_uint32), ("n_ubatch", ctypes.c_uint32), ("n_seq_max", ctypes.c_uint32), + ("n_rs_seq", ctypes.c_uint32), ("n_threads", ctypes.c_int32), ("n_threads_batch", ctypes.c_int32), + ("ctx_type", ctypes.c_int), ("rope_scaling_type", ctypes.c_int), ("pooling_type", ctypes.c_int), ("attention_type", ctypes.c_int), @@ -1591,6 +1607,11 @@ def llama_n_ubatch(ctx: llama_context_p, /) -> int: ... def llama_n_seq_max(ctx: llama_context_p, /) -> int: ... +# LLAMA_API uint32_t llama_n_rs_seq (const struct llama_context * ctx); +@ctypes_function("llama_n_rs_seq", [llama_context_p_ctypes], ctypes.c_uint32) +def llama_n_rs_seq(ctx: llama_context_p, /) -> int: ... + + # DEPRECATED(LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model), "use llama_model_n_ctx_train instead"); @ctypes_function("llama_n_ctx_train", [llama_model_p_ctypes], ctypes.c_int32) def llama_n_ctx_train(model: llama_model_p, /) -> int: ... diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 91e84fed6..b9a2170fc 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 91e84fed64329cd96202d68220724a1d92f5ec1f +Subproject commit b9a2170fce1f3f33cb4934b34efecb806bbbb348