wundervaflja
# llama_cpp/llama_cpp.py: 27 additions & 5 deletions
# vendor/llama.cpp: also listed as changed; no hunks for it appear in this excerpt
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -290,13 +290,14 @@ def llama_mlock_supported() -> bool:
 
 # // TODO: not great API - very likely to change
 # // Initialize the llama + ggml backend
+# // If numa is true, use NUMA optimizations
 # // Call once at the start of the program
-# LLAMA_API void llama_init_backend();
-def llama_init_backend():
-    return _lib.llama_init_backend()
+# LLAMA_API void llama_init_backend(bool numa);
+def llama_init_backend(numa: c_bool):
+    return _lib.llama_init_backend(numa)
 
 
-_lib.llama_init_backend.argtypes = []
+_lib.llama_init_backend.argtypes = [c_bool]
 _lib.llama_init_backend.restype = None
 
 
@@ -565,6 +566,27 @@ def llama_eval(
 _lib.llama_eval.restype = c_int
 
 
+# // Same as llama_eval, but use float matrix input directly.
+# LLAMA_API int llama_eval_embd(
+#         struct llama_context * ctx,
+#                     const float * embd,
+#                             int   n_tokens,
+#                             int   n_past,
+#                             int   n_threads);
+def llama_eval_embd(
+    ctx: llama_context_p,
+    embd,  # type: Array[c_float]
+    n_tokens: c_int,
+    n_past: c_int,
+    n_threads: c_int,
+) -> int:
+    return _lib.llama_eval_embd(ctx, embd, n_tokens, n_past, n_threads)
+
+
+_lib.llama_eval_embd.argtypes = [llama_context_p, c_float_p, c_int, c_int, c_int]
+_lib.llama_eval_embd.restype = c_int
+
+
 # Convert the provided text into tokens.
 # The tokens pointer must be large enough to hold the resulting tokens.
 # Returns the number of tokens on success, no more than n_max_tokens
@@ -998,5 +1020,5 @@ def llama_print_system_info() -> bytes:
 _llama_initialized = False
 
 if not _llama_initialized:
-    llama_init_backend()
+    llama_init_backend(c_bool(False))
     _llama_initialized = True