diff --git a/CHANGELOG.md b/CHANGELOG.md index 5c4c744d9e..56c5ffb557 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,8 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.29] + +- feat(example): use MTMD batch encoding by @abetlen in #2301 - feat(example): support server video inputs and Gemma text tool calls by @abetlen in #2291 -- feat: update llama.cpp to ggml-org/llama.cpp@ac4cddeb0 +- feat: update llama.cpp to ggml-org/llama.cpp@f05cf4676 - fix(example): support multi-step Responses tool streaming by @abetlen in #2288 - fix(ci): Repair Linux accelerator wheels for manylinux publishing diff --git a/CMakeLists.txt b/CMakeLists.txt index 9b2744cdce..0474863a48 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -176,6 +176,15 @@ if (LLAMA_BUILD) # Building llava add_subdirectory(vendor/llama.cpp/tools/mtmd) + # The Python package only ships mtmd as a shared library. + # Upstream mtmd also defines CLI compatibility wrappers, but those are + # not installed here and can fail to link in minimal Docker toolchains. + foreach(target llama-llava-cli llama-gemma3-cli llama-minicpmv-cli llama-qwen2vl-cli llama-mtmd-cli llama-mtmd-debug) + if (TARGET ${target}) + set_target_properties(${target} PROPERTIES EXCLUDE_FROM_ALL TRUE) + endif() + endforeach() + if (WIN32) set_target_properties(mtmd PROPERTIES CUDA_ARCHITECTURES OFF) endif() diff --git a/docker/simple/Dockerfile b/docker/simple/Dockerfile index bad4f456ff..22b2335a3e 100644 --- a/docker/simple/Dockerfile +++ b/docker/simple/Dockerfile @@ -27,7 +27,7 @@ RUN python3 -m pip install --upgrade pip RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context -RUN CMAKE_ARGS="${CMAKE_ARGS}" pip install . --verbose +RUN CC=gcc CXX=g++ CMAKE_ARGS="${CMAKE_ARGS}" pip install . --verbose # Set environment variable for the host ENV HOST=0.0.0.0 diff --git a/examples/server/README.md b/examples/server/README.md index ff04374fc5..b2819a244f 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -291,7 +291,7 @@ See [Hugging Face response parsing](https://huggingface.co/docs/transformers/cha ## Multimodal `model.mtmd` -`model.mtmd` loads a llama.cpp multimodal projector and enables OpenAI-style image and audio content parts. +`model.mtmd` loads a llama.cpp multimodal projector and enables OpenAI-style image, audio, and video content parts. ```json { @@ -305,8 +305,10 @@ See [Hugging Face response parsing](https://huggingface.co/docs/transformers/cha "path": ".cache/mtmd-embeddings", "max_bytes": 1073741824 }, + "batch_max_tokens": 1024, "image_max_bytes": 20971520, "audio_max_bytes": 104857600, + "video_max_bytes": 536870912, "image_timeout_seconds": 10.0 } } @@ -317,11 +319,13 @@ See [Hugging Face response parsing](https://huggingface.co/docs/transformers/cha | --- | --- | | `mmproj_path` | Local multimodal projector path. | | `mmproj_from_pretrained` | Hugging Face projector source. | -| `embedding_cache.path` | Directory for cached image and audio embeddings. | +| `embedding_cache.path` | Directory for cached image, audio, and video embeddings. | | `embedding_cache.max_bytes` | Maximum embedding cache size. | +| `batch_max_tokens` | Maximum number of media output tokens per MTMD projector-side encode batch. | | `image_max_bytes` | Maximum image payload size. | | `audio_max_bytes` | Maximum audio payload size. | -| `image_timeout_seconds` | Timeout for remote image and audio URL fetches. | +| `video_max_bytes` | Maximum video payload size. | +| `image_timeout_seconds` | Timeout for remote image, audio, and video URL fetches. | Send image inputs with OpenAI chat content parts. diff --git a/examples/server/server.py b/examples/server/server.py index e8034a2146..16f8c9f7e5 100644 --- a/examples/server/server.py +++ b/examples/server/server.py @@ -3223,6 +3223,7 @@ class MTMDOptions(BaseModel): embedding_cache: Optional["ConfigFile.MTMDEmbeddingCacheOptions"] = None allowed_media_domains: Optional[List[str]] = None allowed_local_media_path: Optional[str] = None + batch_max_tokens: int = Field(default=1024, ge=1) image_max_bytes: int = Field(default=20 * 1024 * 1024, ge=1) audio_max_bytes: int = Field(default=100 * 1024 * 1024, ge=1) video_max_bytes: int = Field(default=512 * 1024 * 1024, ge=1) @@ -10410,6 +10411,21 @@ class MTMDLoadedMedia: class MTMDProcessor: + @dataclass + class MediaChunk: + kind: Literal["image", "audio", "video"] + key: str + chunk: Any + n_tokens: int + decode_n_pos: int + non_causal: bool + embeddings: Optional[np.ndarray] = None + + @dataclass + class ParsedChunk: + text_tokens: Optional[List[int]] = None + media: Optional["MTMDProcessor.MediaChunk"] = None + def __init__( self, *, @@ -10422,6 +10438,7 @@ def __init__( n_ubatch: int, n_threads_batch: int, mmproj_path: str, + batch_max_tokens: int, embedding_cache: Optional[MTMDEmbeddingCache], allowed_media_domains: Optional[List[str]], allowed_local_media_path: Optional[str], @@ -10437,6 +10454,7 @@ def __init__( self.n_ubatch = n_ubatch self.mmproj_path = mmproj_path self.embedding_cache = embedding_cache + self.batch_max_tokens = batch_max_tokens self.model_fingerprint = MTMDEmbeddingCache.fingerprint_file(model_path) self.mmproj_fingerprint = MTMDEmbeddingCache.fingerprint_file(mmproj_path) self.allowed_media_domains = ( @@ -10456,6 +10474,7 @@ def __init__( self.lock = threading.Lock() params = mtmd_cpp.mtmd_context_params_default() params.n_threads = max(1, n_threads_batch) + params.batch_max_tokens = batch_max_tokens self.ctx = mtmd_cpp.mtmd_init_from_file( mmproj_path.encode("utf-8"), llama_model, @@ -10705,37 +10724,91 @@ def _media_identity_tokens( tokens.append(-1 - (int.from_bytes(digest[:4], "little") & 0x3FFFFFFF)) return tokens - def _encode_media_chunk( - self, - *, - kind: Literal["image", "audio", "video"], - key: str, - chunk: Any, - ) -> np.ndarray: - n_tokens = int(mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk)) - if self.embedding_cache is not None: - cached = self.embedding_cache.load(key) - if ( - cached is not None - and cached.embeddings.shape == (n_tokens, self.n_embd_inp) - ): - return cached.embeddings - result = int(mtmd_cpp.mtmd_encode_chunk(self.ctx, chunk)) - if result != 0: - raise CompletionRequestValidationError( - f"failed to encode {kind} chunk: error code {result}" - ) - output = mtmd_cpp.mtmd_get_output_embd(self.ctx) - if output is None: - raise CompletionRequestValidationError(f"MTMD {kind} encoder returned no embeddings") + def _embeddings_from_pointer(self, output: Any, n_tokens: int) -> np.ndarray: flat = np.ctypeslib.as_array(output, shape=(n_tokens * self.n_embd_inp,)) - embeddings = np.array(flat, dtype=np.float32, copy=True).reshape( + return np.array(flat, dtype=np.float32, copy=True).reshape( n_tokens, self.n_embd_inp, ) - if self.embedding_cache is not None: - self.embedding_cache.save(key, embeddings) - return embeddings + + def _load_cached_media_chunk(self, media_chunk: "MTMDProcessor.MediaChunk") -> bool: + if self.embedding_cache is None: + return False + cached = self.embedding_cache.load(media_chunk.key) + if cached is None or cached.embeddings.shape != ( + media_chunk.n_tokens, + self.n_embd_inp, + ): + return False + media_chunk.embeddings = cached.embeddings + return True + + def _save_media_chunk(self, media_chunk: "MTMDProcessor.MediaChunk") -> None: + if self.embedding_cache is None or media_chunk.embeddings is None: + return + self.embedding_cache.save(media_chunk.key, media_chunk.embeddings) + + def _encode_media_batch( + self, + media_chunks: Sequence["MTMDProcessor.MediaChunk"], + start_index: int, + ) -> int: + batch = mtmd_cpp.mtmd_batch_init(self.ctx) + if batch is None: + raise CompletionRequestValidationError("failed to create MTMD media batch") + try: + first = media_chunks[start_index] + result = int(mtmd_cpp.mtmd_batch_add_chunk(batch, first.chunk)) + if result != 0: + raise CompletionRequestValidationError( + f"failed to add {first.kind} chunk to MTMD batch: error code {result}" + ) + group = [first] + next_index = start_index + 1 + while next_index < len(media_chunks): + candidate = media_chunks[next_index] + result = int(mtmd_cpp.mtmd_batch_add_chunk(batch, candidate.chunk)) + if result == 0: + group.append(candidate) + next_index += 1 + continue + if result in {2, 3}: + break + raise CompletionRequestValidationError( + f"failed to add {candidate.kind} chunk to MTMD batch: error code {result}" + ) + result = int(mtmd_cpp.mtmd_batch_encode(batch)) + if result != 0: + raise CompletionRequestValidationError( + f"failed to encode MTMD media batch: error code {result}" + ) + for media_chunk in group: + output = mtmd_cpp.mtmd_batch_get_output_embd(batch, media_chunk.chunk) + if output is None: + raise CompletionRequestValidationError( + f"MTMD {media_chunk.kind} encoder returned no embeddings" + ) + media_chunk.embeddings = self._embeddings_from_pointer( + output, + media_chunk.n_tokens, + ) + self._save_media_chunk(media_chunk) + return len(group) + finally: + mtmd_cpp.mtmd_batch_free(batch) + + def _encode_media_chunks( + self, + media_chunks: Sequence["MTMDProcessor.MediaChunk"], + ) -> None: + uncached = [ + media_chunk + for media_chunk in media_chunks + if not self._load_cached_media_chunk(media_chunk) + ] + index = 0 + while index < len(uncached): + index += self._encode_media_batch(uncached, index) def _positions_for_chunk(self, chunk: Any, start_pos: int) -> np.ndarray: n_tokens = int(mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk)) @@ -10858,12 +10931,8 @@ def _build_prompt_plan_locked( raise CompletionRequestValidationError( f"failed to tokenize MTMD prompt: error code {result}" ) - segments: List[PromptSegment] = [] - identity_tokens: List[int] = [] - text_tokens: List[int] = [] - text_token_index_by_pos: Dict[int, int] = {} - identity_pos = 0 - decode_pos = 0 + parsed_chunks: List[MTMDProcessor.ParsedChunk] = [] + media_chunks: List[MTMDProcessor.MediaChunk] = [] video_index = 0 used_media_keys = set() n_chunks = int(mtmd_cpp.mtmd_input_chunks_size(chunks)) @@ -10884,24 +10953,9 @@ def _build_prompt_plan_locked( else [] ) if tokens: - start_pos = identity_pos - segments.append( - PromptSegment( - kind="text", - start_pos=start_pos, - n_pos=len(tokens), - identity_tokens=list(tokens), - decode_start_pos=decode_pos, - decode_n_pos=len(tokens), - text_tokens=list(tokens), - ) + parsed_chunks.append( + MTMDProcessor.ParsedChunk(text_tokens=tokens) ) - for offset, token in enumerate(tokens): - text_token_index_by_pos[start_pos + offset] = len(text_tokens) - text_tokens.append(token) - identity_tokens.extend(tokens) - identity_pos += len(tokens) - decode_pos += len(tokens) continue if chunk_type == mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_IMAGE: chunk_kind: Literal["image", "audio"] = "image" @@ -10951,37 +11005,84 @@ def _build_prompt_plan_locked( decode_n_pos = int(mtmd_cpp.mtmd_input_chunk_get_n_pos(chunk)) if decode_n_pos <= 0: raise CompletionRequestValidationError("MTMD media chunk has no decoder positions") - embeddings = self._encode_media_chunk(kind=kind, key=key, chunk=chunk) - n_tokens = int(embeddings.shape[0]) + n_tokens = int(mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk)) if n_tokens <= 0: - raise CompletionRequestValidationError("MTMD media chunk has no embeddings") + raise CompletionRequestValidationError("MTMD media chunk has no embedding tokens") non_causal = bool(mtmd_cpp.mtmd_decode_use_non_causal(self.ctx, chunk)) - segment_identity = self._media_identity_tokens(kind, key, n_tokens) - positions = self._positions_for_chunk(chunk, decode_pos) - segment = PromptSegment( + media_chunk = MTMDProcessor.MediaChunk( kind=kind, - start_pos=identity_pos, - n_pos=n_tokens, - identity_tokens=segment_identity, - decode_start_pos=decode_pos, + key=key, + chunk=chunk, + n_tokens=n_tokens, decode_n_pos=decode_n_pos, - media=PromptSegment.Media( - embeddings=embeddings, - positions=positions, - non_causal=non_causal, - ), + non_causal=non_causal, ) - if non_causal and embeddings.shape[0] > min(self.n_batch, self.n_ubatch): + parsed_chunks.append(MTMDProcessor.ParsedChunk(media=media_chunk)) + media_chunks.append(media_chunk) + if used_media_keys != {media.key for media in loaded_media}: + raise CompletionRequestValidationError("not all media inputs were consumed by MTMD") + self._encode_media_chunks(media_chunks) + segments: List[PromptSegment] = [] + identity_tokens: List[int] = [] + text_tokens: List[int] = [] + text_token_index_by_pos: Dict[int, int] = {} + identity_pos = 0 + decode_pos = 0 + for parsed_chunk in parsed_chunks: + if parsed_chunk.text_tokens is not None: + tokens = parsed_chunk.text_tokens + start_pos = identity_pos + segments.append( + PromptSegment( + kind="text", + start_pos=start_pos, + n_pos=len(tokens), + identity_tokens=list(tokens), + decode_start_pos=decode_pos, + decode_n_pos=len(tokens), + text_tokens=list(tokens), + ) + ) + for offset, token in enumerate(tokens): + text_token_index_by_pos[start_pos + offset] = len(text_tokens) + text_tokens.append(token) + identity_tokens.extend(tokens) + identity_pos += len(tokens) + decode_pos += len(tokens) + continue + media_chunk = parsed_chunk.media + if media_chunk is None or media_chunk.embeddings is None: + raise CompletionRequestValidationError("MTMD media chunk has no embeddings") + embeddings = media_chunk.embeddings + if media_chunk.non_causal and embeddings.shape[0] > min(self.n_batch, self.n_ubatch): raise CompletionRequestValidationError( - f"non-causal {kind} embedding chunk exceeds model batch limits; " + f"non-causal {media_chunk.kind} embedding chunk exceeds model batch limits; " "increase n_batch and n_ubatch" ) - segments.append(segment) + segment_identity = self._media_identity_tokens( + media_chunk.kind, + media_chunk.key, + media_chunk.n_tokens, + ) + positions = self._positions_for_chunk(media_chunk.chunk, decode_pos) + segments.append( + PromptSegment( + kind=media_chunk.kind, + start_pos=identity_pos, + n_pos=media_chunk.n_tokens, + identity_tokens=segment_identity, + decode_start_pos=decode_pos, + decode_n_pos=media_chunk.decode_n_pos, + media=PromptSegment.Media( + embeddings=embeddings, + positions=positions, + non_causal=media_chunk.non_causal, + ), + ) + ) identity_tokens.extend(segment_identity) - identity_pos += n_tokens - decode_pos += decode_n_pos - if used_media_keys != {media.key for media in loaded_media}: - raise CompletionRequestValidationError("not all media inputs were consumed by MTMD") + identity_pos += media_chunk.n_tokens + decode_pos += media_chunk.decode_n_pos return PromptPlan( text=prompt, generation_prompt=generation_prompt, @@ -16211,6 +16312,7 @@ def main() -> None: n_ubatch=model.n_ubatch, n_threads_batch=model.n_threads_batch, mmproj_path=mmproj_path, + batch_max_tokens=config.model.mtmd.batch_max_tokens, embedding_cache=embedding_cache, allowed_media_domains=config.model.mtmd.allowed_media_domains, allowed_local_media_path=config.model.mtmd.allowed_local_media_path, diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 13668893fe..42f807ef61 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.28" +__version__ = "0.3.29" diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py index 919cefb357..46eb2c879b 100644 --- a/llama_cpp/mtmd_cpp.py +++ b/llama_cpp/mtmd_cpp.py @@ -76,6 +76,9 @@ mtmd_input_chunks_p = NewType("mtmd_input_chunks_p", int) mtmd_input_chunks_p_ctypes = c_void_p +mtmd_batch_p = NewType("mtmd_batch_p", int) +mtmd_batch_p_ctypes = c_void_p + # Enums MTMD_INPUT_CHUNK_TYPE_TEXT = 0 MTMD_INPUT_CHUNK_TYPE_IMAGE = 1 @@ -102,6 +105,7 @@ class mtmd_context_params(Structure): image_max_tokens: int cb_eval: llama_cpp.ggml_backend_sched_eval_callback cb_eval_user_data: c_void_p + batch_max_tokens: int _fields_ = [ ("use_gpu", c_bool), @@ -115,6 +119,7 @@ class mtmd_context_params(Structure): ("image_max_tokens", c_int), ("cb_eval", llama_cpp.ggml_backend_sched_eval_callback), ("cb_eval_user_data", c_void_p), + ("batch_max_tokens", c_int), ] @@ -596,7 +601,7 @@ def mtmd_image_tokens_get_decoder_pos( c_int, ) def mtmd_encode(ctx: mtmd_context_p, image_tokens: mtmd_image_tokens_p, /) -> int: - """Run an MTMD encode pass for image tokens.""" + """Run a deprecated MTMD encode pass for image tokens.""" ... @@ -618,6 +623,55 @@ def mtmd_get_output_embd(ctx: mtmd_context_p, /) -> Optional[CtypesArray[c_float ... +# MTMD_API mtmd_batch * mtmd_batch_init(mtmd_context * ctx); +@ctypes_function("mtmd_batch_init", [mtmd_context_p_ctypes], mtmd_batch_p_ctypes) +def mtmd_batch_init(ctx: mtmd_context_p, /) -> Optional[mtmd_batch_p]: + """Initialize an MTMD media chunk batch for a context.""" + ... + + +# MTMD_API void mtmd_batch_free(mtmd_batch * batch); +@ctypes_function("mtmd_batch_free", [mtmd_batch_p_ctypes], None) +def mtmd_batch_free(batch: mtmd_batch_p, /): ... + + +# MTMD_API int32_t mtmd_batch_add_chunk(mtmd_batch * batch, const mtmd_input_chunk * chunk); +@ctypes_function( + "mtmd_batch_add_chunk", + [mtmd_batch_p_ctypes, mtmd_input_chunk_p_ctypes], + c_int, +) +def mtmd_batch_add_chunk( + batch: mtmd_batch_p, + chunk: mtmd_input_chunk_p, + /, +) -> int: + """Add a media chunk to an MTMD batch.""" + ... + + +# MTMD_API int32_t mtmd_batch_encode(mtmd_batch * batch); +@ctypes_function("mtmd_batch_encode", [mtmd_batch_p_ctypes], c_int) +def mtmd_batch_encode(batch: mtmd_batch_p, /) -> int: + """Run an MTMD encode pass for all chunks in a batch.""" + ... + + +# MTMD_API float * mtmd_batch_get_output_embd(mtmd_batch * batch, const mtmd_input_chunk * chunk); +@ctypes_function( + "mtmd_batch_get_output_embd", + [mtmd_batch_p_ctypes, mtmd_input_chunk_p_ctypes], + POINTER(c_float), +) +def mtmd_batch_get_output_embd( + batch: mtmd_batch_p, + chunk: mtmd_input_chunk_p, + /, +) -> Optional[CtypesArray[c_float]]: + """Get output embeddings for a chunk from the last batch encode pass.""" + ... + + # MTMD_API struct mtmd_caps mtmd_get_cap_from_file(const char * mmproj_fname); @ctypes_function("mtmd_get_cap_from_file", [c_char_p], mtmd_caps) def mtmd_get_cap_from_file(mmproj_fname: bytes, /) -> mtmd_caps: diff --git a/vendor/llama.cpp b/vendor/llama.cpp index ac4cddeb0d..f05cf4676a 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit ac4cddeb0dbd778f650bf568f6f08344a06abe3a +Subproject commit f05cf4676af46c2f017c0e6ba25b6e20204f700e