diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5c4c744d9e..56c5ffb557 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,8 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.3.29]
+
+- feat(example): use MTMD batch encoding by @abetlen in #2301
 - feat(example): support server video inputs and Gemma text tool calls by @abetlen in #2291
-- feat: update llama.cpp to ggml-org/llama.cpp@ac4cddeb0
+- feat: update llama.cpp to ggml-org/llama.cpp@f05cf4676
 - fix(example): support multi-step Responses tool streaming by @abetlen in #2288
 - fix(ci): Repair Linux accelerator wheels for manylinux publishing
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9b2744cdce..0474863a48 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -176,6 +176,15 @@ if (LLAMA_BUILD)
         # Building llava
         add_subdirectory(vendor/llama.cpp/tools/mtmd)
 
+        # Only the mtmd shared library is shipped with the Python package.
+        # Upstream mtmd also declares CLI compatibility wrappers; they are not
+        # installed here and may fail to link on minimal Docker toolchains.
+        foreach(mtmd_tool IN ITEMS llama-llava-cli llama-gemma3-cli llama-minicpmv-cli llama-qwen2vl-cli llama-mtmd-cli llama-mtmd-debug)
+            if (TARGET ${mtmd_tool})
+                set_property(TARGET ${mtmd_tool} PROPERTY EXCLUDE_FROM_ALL TRUE)
+            endif()
+        endforeach()
+
         if (WIN32)
             set_target_properties(mtmd PROPERTIES CUDA_ARCHITECTURES OFF)
         endif()
diff --git a/docker/simple/Dockerfile b/docker/simple/Dockerfile
index bad4f456ff..22b2335a3e 100644
--- a/docker/simple/Dockerfile
+++ b/docker/simple/Dockerfile
@@ -27,7 +27,7 @@ RUN python3 -m pip install --upgrade pip
 
 RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context
 
-RUN CMAKE_ARGS="${CMAKE_ARGS}" pip install . --verbose
+RUN CC=gcc CXX=g++ CMAKE_ARGS="${CMAKE_ARGS}" pip install . --verbose
 
 # Set environment variable for the host
 ENV HOST=0.0.0.0
diff --git a/examples/server/README.md b/examples/server/README.md
index ff04374fc5..b2819a244f 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -291,7 +291,7 @@ See [Hugging Face response parsing](https://huggingface.co/docs/transformers/cha
 
 ## Multimodal `model.mtmd`
 
-`model.mtmd` loads a llama.cpp multimodal projector and enables OpenAI-style image and audio content parts.
+`model.mtmd` loads a llama.cpp multimodal projector and enables OpenAI-style image, audio, and video content parts.
 
 ```json
 {
@@ -305,8 +305,10 @@ See [Hugging Face response parsing](https://huggingface.co/docs/transformers/cha
         "path": ".cache/mtmd-embeddings",
         "max_bytes": 1073741824
       },
+      "batch_max_tokens": 1024,
       "image_max_bytes": 20971520,
       "audio_max_bytes": 104857600,
+      "video_max_bytes": 536870912,
       "image_timeout_seconds": 10.0
     }
   }
@@ -317,11 +319,13 @@ See [Hugging Face response parsing](https://huggingface.co/docs/transformers/cha
 | --- | --- |
 | `mmproj_path` | Local multimodal projector path. |
 | `mmproj_from_pretrained` | Hugging Face projector source. |
-| `embedding_cache.path` | Directory for cached image and audio embeddings. |
+| `embedding_cache.path` | Directory for cached image, audio, and video embeddings. |
 | `embedding_cache.max_bytes` | Maximum embedding cache size. |
+| `batch_max_tokens` | Maximum number of media output tokens per MTMD projector-side encode batch. |
 | `image_max_bytes` | Maximum image payload size. |
 | `audio_max_bytes` | Maximum audio payload size. |
-| `image_timeout_seconds` | Timeout for remote image and audio URL fetches. |
+| `video_max_bytes` | Maximum video payload size. |
+| `image_timeout_seconds` | Timeout for remote image, audio, and video URL fetches. |
 
 Send image inputs with OpenAI chat content parts.
 
diff --git a/examples/server/server.py b/examples/server/server.py
index e8034a2146..16f8c9f7e5 100644
--- a/examples/server/server.py
+++ b/examples/server/server.py
@@ -3223,6 +3223,7 @@ class MTMDOptions(BaseModel):
         embedding_cache: Optional["ConfigFile.MTMDEmbeddingCacheOptions"] = None
         allowed_media_domains: Optional[List[str]] = None
         allowed_local_media_path: Optional[str] = None
+        batch_max_tokens: int = Field(default=1024, ge=1)
         image_max_bytes: int = Field(default=20 * 1024 * 1024, ge=1)
         audio_max_bytes: int = Field(default=100 * 1024 * 1024, ge=1)
         video_max_bytes: int = Field(default=512 * 1024 * 1024, ge=1)
@@ -10410,6 +10411,21 @@ class MTMDLoadedMedia:
 
 
 class MTMDProcessor:
+    @dataclass
+    class MediaChunk:  # one MTMD media chunk plus the metadata needed to encode and place it
+        kind: Literal["image", "audio", "video"]
+        key: str  # embedding-cache key; also passed to _media_identity_tokens
+        chunk: Any  # chunk handle handed back to mtmd_cpp batch/position calls
+        n_tokens: int  # from mtmd_input_chunk_get_n_tokens; expected row count of `embeddings`
+        decode_n_pos: int  # from mtmd_input_chunk_get_n_pos; how far the decode position advances
+        non_causal: bool  # result of mtmd_decode_use_non_causal for this chunk
+        embeddings: Optional[np.ndarray] = None  # set on a cache hit or after a batch encode
+
+    @dataclass
+    class ParsedChunk:  # one parsed prompt chunk; call sites set either text_tokens or media
+        text_tokens: Optional[List[int]] = None  # token ids of a text chunk
+        media: Optional["MTMDProcessor.MediaChunk"] = None  # image/audio/video chunk record
+
     def __init__(
         self,
         *,
@@ -10422,6 +10438,7 @@ def __init__(
         n_ubatch: int,
         n_threads_batch: int,
         mmproj_path: str,
+        batch_max_tokens: int,
         embedding_cache: Optional[MTMDEmbeddingCache],
         allowed_media_domains: Optional[List[str]],
         allowed_local_media_path: Optional[str],
@@ -10437,6 +10454,7 @@ def __init__(
         self.n_ubatch = n_ubatch
         self.mmproj_path = mmproj_path
         self.embedding_cache = embedding_cache
+        self.batch_max_tokens = batch_max_tokens
         self.model_fingerprint = MTMDEmbeddingCache.fingerprint_file(model_path)
         self.mmproj_fingerprint = MTMDEmbeddingCache.fingerprint_file(mmproj_path)
         self.allowed_media_domains = (
@@ -10456,6 +10474,7 @@ def __init__(
         self.lock = threading.Lock()
         params = mtmd_cpp.mtmd_context_params_default()
         params.n_threads = max(1, n_threads_batch)
+        params.batch_max_tokens = batch_max_tokens
         self.ctx = mtmd_cpp.mtmd_init_from_file(
             mmproj_path.encode("utf-8"),
             llama_model,
@@ -10705,37 +10724,91 @@ def _media_identity_tokens(
             tokens.append(-1 - (int.from_bytes(digest[:4], "little") & 0x3FFFFFFF))
         return tokens
 
-    def _encode_media_chunk(
-        self,
-        *,
-        kind: Literal["image", "audio", "video"],
-        key: str,
-        chunk: Any,
-    ) -> np.ndarray:
-        n_tokens = int(mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk))
-        if self.embedding_cache is not None:
-            cached = self.embedding_cache.load(key)
-            if (
-                cached is not None
-                and cached.embeddings.shape == (n_tokens, self.n_embd_inp)
-            ):
-                return cached.embeddings
-        result = int(mtmd_cpp.mtmd_encode_chunk(self.ctx, chunk))
-        if result != 0:
-            raise CompletionRequestValidationError(
-                f"failed to encode {kind} chunk: error code {result}"
-            )
-        output = mtmd_cpp.mtmd_get_output_embd(self.ctx)
-        if output is None:
-            raise CompletionRequestValidationError(f"MTMD {kind} encoder returned no embeddings")
+    def _embeddings_from_pointer(self, output: Any, n_tokens: int) -> np.ndarray:  # copy the pointed-to floats into an owned (n_tokens, n_embd_inp) float32 array
         flat = np.ctypeslib.as_array(output, shape=(n_tokens * self.n_embd_inp,))
-        embeddings = np.array(flat, dtype=np.float32, copy=True).reshape(
+        return np.array(flat, dtype=np.float32, copy=True).reshape(
             n_tokens,
             self.n_embd_inp,
         )
-        if self.embedding_cache is not None:
-            self.embedding_cache.save(key, embeddings)
-        return embeddings
+
+    def _load_cached_media_chunk(self, media_chunk: "MTMDProcessor.MediaChunk") -> bool:
+        """Fill ``media_chunk.embeddings`` from the cache; return True only on a shape-matching hit."""
+        cache = self.embedding_cache
+        if cache is None:
+            return False
+        entry = cache.load(media_chunk.key)
+        expected_shape = (media_chunk.n_tokens, self.n_embd_inp)
+        if entry is None or entry.embeddings.shape != expected_shape:
+            return False
+        media_chunk.embeddings = entry.embeddings
+        return True
+
+    def _save_media_chunk(self, media_chunk: "MTMDProcessor.MediaChunk") -> None:
+        """Store the chunk's embeddings under its key when a cache exists and embeddings are set."""
+        if self.embedding_cache is not None and media_chunk.embeddings is not None:
+            self.embedding_cache.save(media_chunk.key, media_chunk.embeddings)
+
+    def _encode_media_batch(
+        self,
+        media_chunks: Sequence["MTMDProcessor.MediaChunk"],
+        start_index: int,
+    ) -> int:
+        """Encode one MTMD batch that begins at ``media_chunks[start_index]``.
+
+        Returns the number of consecutive chunks encoded (at least one on success)."""
+        handle = mtmd_cpp.mtmd_batch_init(self.ctx)
+        if handle is None:
+            raise CompletionRequestValidationError("failed to create MTMD media batch")
+        try:
+            head = media_chunks[start_index]
+            status = int(mtmd_cpp.mtmd_batch_add_chunk(handle, head.chunk))
+            if status != 0:
+                # Any rejection of the leading chunk is fatal, including codes 2 and 3.
+                raise CompletionRequestValidationError(
+                    f"failed to add {head.kind} chunk to MTMD batch: error code {status}"
+                )
+            accepted = [head]
+            for position in range(start_index + 1, len(media_chunks)):
+                candidate = media_chunks[position]
+                status = int(mtmd_cpp.mtmd_batch_add_chunk(handle, candidate.chunk))
+                if status in {2, 3}:
+                    # Codes 2 and 3 close this group; the candidate is left for the next call.
+                    break
+                if status != 0:
+                    raise CompletionRequestValidationError(
+                        f"failed to add {candidate.kind} chunk to MTMD batch: error code {status}"
+                    )
+                accepted.append(candidate)
+            status = int(mtmd_cpp.mtmd_batch_encode(handle))
+            if status != 0:
+                raise CompletionRequestValidationError(
+                    f"failed to encode MTMD media batch: error code {status}"
+                )
+            for member in accepted:
+                pointer = mtmd_cpp.mtmd_batch_get_output_embd(handle, member.chunk)
+                if pointer is None:
+                    raise CompletionRequestValidationError(
+                        f"MTMD {member.kind} encoder returned no embeddings"
+                    )
+                # The helper returns a copy, taken before the batch is freed in ``finally``.
+                member.embeddings = self._embeddings_from_pointer(pointer, member.n_tokens)
+                self._save_media_chunk(member)
+            return len(accepted)
+        finally:
+            mtmd_cpp.mtmd_batch_free(handle)
+
+    def _encode_media_chunks(
+        self,
+        media_chunks: Sequence["MTMDProcessor.MediaChunk"],
+    ) -> None:
+        """Give every chunk embeddings: try the cache first, then batch-encode the misses."""
+        pending: List["MTMDProcessor.MediaChunk"] = []
+        for media_chunk in media_chunks:
+            if not self._load_cached_media_chunk(media_chunk):
+                pending.append(media_chunk)
+        cursor = 0
+        while cursor < len(pending):
+            cursor += self._encode_media_batch(pending, cursor)
 
     def _positions_for_chunk(self, chunk: Any, start_pos: int) -> np.ndarray:
         n_tokens = int(mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk))
@@ -10858,12 +10931,8 @@ def _build_prompt_plan_locked(
                 raise CompletionRequestValidationError(
                     f"failed to tokenize MTMD prompt: error code {result}"
                 )
-            segments: List[PromptSegment] = []
-            identity_tokens: List[int] = []
-            text_tokens: List[int] = []
-            text_token_index_by_pos: Dict[int, int] = {}
-            identity_pos = 0
-            decode_pos = 0
+            parsed_chunks: List[MTMDProcessor.ParsedChunk] = []
+            media_chunks: List[MTMDProcessor.MediaChunk] = []
             video_index = 0
             used_media_keys = set()
             n_chunks = int(mtmd_cpp.mtmd_input_chunks_size(chunks))
@@ -10884,24 +10953,9 @@ def _build_prompt_plan_locked(
                         else []
                     )
                     if tokens:
-                        start_pos = identity_pos
-                        segments.append(
-                            PromptSegment(
-                                kind="text",
-                                start_pos=start_pos,
-                                n_pos=len(tokens),
-                                identity_tokens=list(tokens),
-                                decode_start_pos=decode_pos,
-                                decode_n_pos=len(tokens),
-                                text_tokens=list(tokens),
-                            )
+                        parsed_chunks.append(
+                            MTMDProcessor.ParsedChunk(text_tokens=tokens)
                         )
-                        for offset, token in enumerate(tokens):
-                            text_token_index_by_pos[start_pos + offset] = len(text_tokens)
-                            text_tokens.append(token)
-                        identity_tokens.extend(tokens)
-                        identity_pos += len(tokens)
-                        decode_pos += len(tokens)
                     continue
                 if chunk_type == mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_IMAGE:
                     chunk_kind: Literal["image", "audio"] = "image"
@@ -10951,37 +11005,84 @@ def _build_prompt_plan_locked(
                 decode_n_pos = int(mtmd_cpp.mtmd_input_chunk_get_n_pos(chunk))
                 if decode_n_pos <= 0:
                     raise CompletionRequestValidationError("MTMD media chunk has no decoder positions")
-                embeddings = self._encode_media_chunk(kind=kind, key=key, chunk=chunk)
-                n_tokens = int(embeddings.shape[0])
+                n_tokens = int(mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk))
                 if n_tokens <= 0:
-                    raise CompletionRequestValidationError("MTMD media chunk has no embeddings")
+                    raise CompletionRequestValidationError("MTMD media chunk has no embedding tokens")
                 non_causal = bool(mtmd_cpp.mtmd_decode_use_non_causal(self.ctx, chunk))
-                segment_identity = self._media_identity_tokens(kind, key, n_tokens)
-                positions = self._positions_for_chunk(chunk, decode_pos)
-                segment = PromptSegment(
+                media_chunk = MTMDProcessor.MediaChunk(
                     kind=kind,
-                    start_pos=identity_pos,
-                    n_pos=n_tokens,
-                    identity_tokens=segment_identity,
-                    decode_start_pos=decode_pos,
+                    key=key,
+                    chunk=chunk,
+                    n_tokens=n_tokens,
                     decode_n_pos=decode_n_pos,
-                    media=PromptSegment.Media(
-                        embeddings=embeddings,
-                        positions=positions,
-                        non_causal=non_causal,
-                    ),
+                    non_causal=non_causal,
                 )
-                if non_causal and embeddings.shape[0] > min(self.n_batch, self.n_ubatch):
+                parsed_chunks.append(MTMDProcessor.ParsedChunk(media=media_chunk))
+                media_chunks.append(media_chunk)
+            if used_media_keys != {media.key for media in loaded_media}:
+                raise CompletionRequestValidationError("not all media inputs were consumed by MTMD")
+            self._encode_media_chunks(media_chunks)
+            segments: List[PromptSegment] = []
+            identity_tokens: List[int] = []
+            text_tokens: List[int] = []
+            text_token_index_by_pos: Dict[int, int] = {}
+            identity_pos = 0
+            decode_pos = 0
+            for parsed_chunk in parsed_chunks:
+                if parsed_chunk.text_tokens is not None:
+                    tokens = parsed_chunk.text_tokens
+                    start_pos = identity_pos
+                    segments.append(
+                        PromptSegment(
+                            kind="text",
+                            start_pos=start_pos,
+                            n_pos=len(tokens),
+                            identity_tokens=list(tokens),
+                            decode_start_pos=decode_pos,
+                            decode_n_pos=len(tokens),
+                            text_tokens=list(tokens),
+                        )
+                    )
+                    for offset, token in enumerate(tokens):
+                        text_token_index_by_pos[start_pos + offset] = len(text_tokens)
+                        text_tokens.append(token)
+                    identity_tokens.extend(tokens)
+                    identity_pos += len(tokens)
+                    decode_pos += len(tokens)
+                    continue
+                media_chunk = parsed_chunk.media
+                if media_chunk is None or media_chunk.embeddings is None:
+                    raise CompletionRequestValidationError("MTMD media chunk has no embeddings")
+                embeddings = media_chunk.embeddings
+                if media_chunk.non_causal and embeddings.shape[0] > min(self.n_batch, self.n_ubatch):
                     raise CompletionRequestValidationError(
-                        f"non-causal {kind} embedding chunk exceeds model batch limits; "
+                        f"non-causal {media_chunk.kind} embedding chunk exceeds model batch limits; "
                         "increase n_batch and n_ubatch"
                     )
-                segments.append(segment)
+                segment_identity = self._media_identity_tokens(
+                    media_chunk.kind,
+                    media_chunk.key,
+                    media_chunk.n_tokens,
+                )
+                positions = self._positions_for_chunk(media_chunk.chunk, decode_pos)
+                segments.append(
+                    PromptSegment(
+                        kind=media_chunk.kind,
+                        start_pos=identity_pos,
+                        n_pos=media_chunk.n_tokens,
+                        identity_tokens=segment_identity,
+                        decode_start_pos=decode_pos,
+                        decode_n_pos=media_chunk.decode_n_pos,
+                        media=PromptSegment.Media(
+                            embeddings=embeddings,
+                            positions=positions,
+                            non_causal=media_chunk.non_causal,
+                        ),
+                    )
+                )
                 identity_tokens.extend(segment_identity)
-                identity_pos += n_tokens
-                decode_pos += decode_n_pos
-            if used_media_keys != {media.key for media in loaded_media}:
-                raise CompletionRequestValidationError("not all media inputs were consumed by MTMD")
+                identity_pos += media_chunk.n_tokens
+                decode_pos += media_chunk.decode_n_pos
             return PromptPlan(
                 text=prompt,
                 generation_prompt=generation_prompt,
@@ -16211,6 +16312,7 @@ def main() -> None:
             n_ubatch=model.n_ubatch,
             n_threads_batch=model.n_threads_batch,
             mmproj_path=mmproj_path,
+            batch_max_tokens=config.model.mtmd.batch_max_tokens,
             embedding_cache=embedding_cache,
             allowed_media_domains=config.model.mtmd.allowed_media_domains,
             allowed_local_media_path=config.model.mtmd.allowed_local_media_path,
diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py
index 13668893fe..42f807ef61 100644
--- a/llama_cpp/__init__.py
+++ b/llama_cpp/__init__.py
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.3.28"
+__version__ = "0.3.29"
diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py
index 919cefb357..46eb2c879b 100644
--- a/llama_cpp/mtmd_cpp.py
+++ b/llama_cpp/mtmd_cpp.py
@@ -76,6 +76,9 @@
 mtmd_input_chunks_p = NewType("mtmd_input_chunks_p", int)
 mtmd_input_chunks_p_ctypes = c_void_p
 
+mtmd_batch_p = NewType("mtmd_batch_p", int)
+mtmd_batch_p_ctypes = c_void_p
+
 # Enums
 MTMD_INPUT_CHUNK_TYPE_TEXT = 0
 MTMD_INPUT_CHUNK_TYPE_IMAGE = 1
@@ -102,6 +105,7 @@ class mtmd_context_params(Structure):
         image_max_tokens: int
         cb_eval: llama_cpp.ggml_backend_sched_eval_callback
         cb_eval_user_data: c_void_p
+        batch_max_tokens: int
 
     _fields_ = [
         ("use_gpu", c_bool),
@@ -115,6 +119,7 @@ class mtmd_context_params(Structure):
         ("image_max_tokens", c_int),
         ("cb_eval", llama_cpp.ggml_backend_sched_eval_callback),
         ("cb_eval_user_data", c_void_p),
+        ("batch_max_tokens", c_int),
     ]
 
 
@@ -596,7 +601,7 @@ def mtmd_image_tokens_get_decoder_pos(
     c_int,
 )
 def mtmd_encode(ctx: mtmd_context_p, image_tokens: mtmd_image_tokens_p, /) -> int:
-    """Run an MTMD encode pass for image tokens."""
+    """Run a deprecated MTMD encode pass for image tokens."""
     ...
 
 
@@ -618,6 +623,55 @@ def mtmd_get_output_embd(ctx: mtmd_context_p, /) -> Optional[CtypesArray[c_float
     ...
 
 
+# MTMD_API mtmd_batch * mtmd_batch_init(mtmd_context * ctx);
+@ctypes_function("mtmd_batch_init", [mtmd_context_p_ctypes], mtmd_batch_p_ctypes)
+def mtmd_batch_init(ctx: mtmd_context_p, /) -> Optional[mtmd_batch_p]:
+    """Create an MTMD media chunk batch for a context; release it with mtmd_batch_free."""
+    ...
+
+
+# MTMD_API void mtmd_batch_free(mtmd_batch * batch);
+@ctypes_function("mtmd_batch_free", [mtmd_batch_p_ctypes], None)
+def mtmd_batch_free(batch: mtmd_batch_p, /) -> None: ...  # Free a batch created by mtmd_batch_init.
+
+
+# MTMD_API int32_t mtmd_batch_add_chunk(mtmd_batch * batch, const mtmd_input_chunk * chunk);
+@ctypes_function(
+    "mtmd_batch_add_chunk",
+    [mtmd_batch_p_ctypes, mtmd_input_chunk_p_ctypes],
+    c_int,
+)
+def mtmd_batch_add_chunk(
+    batch: mtmd_batch_p,
+    chunk: mtmd_input_chunk_p,
+    /,
+) -> int:
+    """Add a media chunk to an MTMD batch; returns the int32 status code from the C API."""
+    ...
+
+
+# MTMD_API int32_t mtmd_batch_encode(mtmd_batch * batch);
+@ctypes_function("mtmd_batch_encode", [mtmd_batch_p_ctypes], c_int)
+def mtmd_batch_encode(batch: mtmd_batch_p, /) -> int:
+    """Run an MTMD encode pass for all chunks in a batch; returns the int32 status code."""
+    ...
+
+
+# MTMD_API float * mtmd_batch_get_output_embd(mtmd_batch * batch, const mtmd_input_chunk * chunk);
+@ctypes_function(
+    "mtmd_batch_get_output_embd",
+    [mtmd_batch_p_ctypes, mtmd_input_chunk_p_ctypes],
+    POINTER(c_float),
+)
+def mtmd_batch_get_output_embd(
+    batch: mtmd_batch_p,
+    chunk: mtmd_input_chunk_p,
+    /,
+) -> Optional[CtypesArray[c_float]]:
+    """Get a chunk's output embeddings (float pointer) from the last batch encode pass."""
+    ...
+
+
 # MTMD_API struct mtmd_caps mtmd_get_cap_from_file(const char * mmproj_fname);
 @ctypes_function("mtmd_get_cap_from_file", [c_char_p], mtmd_caps)
 def mtmd_get_cap_from_file(mmproj_fname: bytes, /) -> mtmd_caps:
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index ac4cddeb0d..f05cf4676a 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit ac4cddeb0dbd778f650bf568f6f08344a06abe3a
+Subproject commit f05cf4676af46c2f017c0e6ba25b6e20204f700e