Commit d4f8f52

llama : move sampling code into llama-sampling
ggml-ci
1 parent 1666f92

File tree

7 files changed: +757 −699 lines changed

Makefile

+9 −0 (9 additions & 0 deletions)
@@ -868,6 +868,7 @@ OBJ_GGML += \
 
 OBJ_LLAMA = \
 	src/llama.o \
+	src/llama-sampling.o \
 	src/unicode.o \
 	src/unicode-data.o
 
@@ -1047,6 +1048,7 @@ src/unicode-data.o: \
 
 src/llama.o: \
 	src/llama.cpp \
+	src/llama-impl.h \
 	src/unicode.h \
 	include/llama.h \
 	ggml/include/ggml-cuda.h \
@@ -1056,6 +1058,13 @@ src/llama.o: \
 	ggml/include/ggml-backend.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
+src/llama-sampling.o: \
+	src/llama-sampling.cpp \
+	src/llama-sampling.h \
+	src/llama-impl.h \
+	include/llama.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
 $(LIB_LLAMA): \
 	$(OBJ_LLAMA) \
 	$(LIB_GGML)

include/llama.h

+6 −6 (6 additions & 6 deletions)
@@ -1081,12 +1081,6 @@ extern "C" {
           llama_token_data_array * candidates,
                            float   temp);
 
-    /// @details Apply constraints from grammar
-    LLAMA_API void llama_sample_grammar(
-            struct llama_context * ctx,
-          llama_token_data_array * candidates,
-      const struct llama_grammar * grammar);
-
     /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
     /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@@ -1124,6 +1118,12 @@ extern "C" {
             struct llama_context * ctx,
           llama_token_data_array * candidates);
 
+    /// @details Apply constraints from grammar
+    LLAMA_API void llama_sample_grammar(
+            struct llama_context * ctx,
+          llama_token_data_array * candidates,
+      const struct llama_grammar * grammar);
+
     /// @details Accepts the sampled token into the grammar
     LLAMA_API void llama_grammar_accept_token(
             struct llama_context * ctx,
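
For orientation, a minimal usage sketch of the grammar-sampling API being moved here; it is not part of the commit. The ctx/model/grammar setup is assumed, and the trailing parameters of llama_grammar_accept_token (a grammar pointer and the sampled token), which are truncated in the hunk above, are an assumption based on the public header at this revision.

// Hypothetical helper, not from this commit: builds the candidate array,
// applies the grammar constraint, samples greedily, and advances the
// grammar state. Assumes ctx, model, and grammar were created elsewhere
// and that the logits for the current position live at index 0.
#include "llama.h"

#include <vector>

static llama_token sample_with_grammar(
        struct llama_context * ctx,
        const struct llama_model * model,
        struct llama_grammar * grammar) {
    const int n_vocab = llama_n_vocab(model);
    const float * logits = llama_get_logits_ith(ctx, 0);

    // every llama_sample_* call mutates this candidate array in place
    std::vector<llama_token_data> cur;
    cur.reserve(n_vocab);
    for (llama_token id = 0; id < n_vocab; ++id) {
        cur.push_back({ id, logits[id], 0.0f });
    }
    llama_token_data_array candidates = { cur.data(), cur.size(), false };

    // mask (set to -INFINITY) the logits of tokens the grammar forbids
    llama_sample_grammar(ctx, &candidates, grammar);

    // pick the best remaining token, then advance the grammar state so
    // the next sampling step is constrained correctly
    const llama_token tok = llama_sample_token_greedy(ctx, &candidates);
    llama_grammar_accept_token(ctx, grammar, tok);
    return tok;
}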

src/CMakeLists.txt

+1 −0 (1 addition & 0 deletions)
@@ -14,6 +14,7 @@ endif()
 add_library(llama
             ../include/llama.h
             llama.cpp
+            llama-sampling.cpp
             unicode.h
             unicode.cpp
             unicode-data.cpp

src/llama-impl.h

+50 −0 (50 additions & 0 deletions)
@@ -0,0 +1,50 @@
+#pragma once
+
+#define LLAMA_API_INTERNAL
+#include "llama.h"
+
+#include <array>
+#include <set>
+#include <map>
+#include <cstdint>
+#include <random>
+
+#ifdef __has_include
+#if __has_include(<unistd.h>)
+#include <unistd.h>
+#if defined(_POSIX_MAPPED_FILES)
+#include <sys/mman.h>
+#include <fcntl.h>
+#endif
+#if defined(_POSIX_MEMLOCK_RANGE)
+#include <sys/resource.h>
+#endif
+#endif
+#endif
+
+// bump if necessary
+#define LLAMA_MAX_NODES   8192
+#define LLAMA_MAX_LAYERS  256
+#define LLAMA_MAX_EXPERTS 160 // DeepSeekV2
+
+#ifdef __GNUC__
+#ifdef __MINGW32__
+#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#else
+#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+#endif
+#else
+#define LLAMA_ATTRIBUTE_FORMAT(...)
+#endif
+
+//
+// logging
+//
+
+LLAMA_ATTRIBUTE_FORMAT(2, 3)
+void llama_log_internal        (ggml_log_level level, const char * format, ...);
+void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
+
+#define LLAMA_LOG_INFO(...)  llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
+#define LLAMA_LOG_WARN(...)  llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
+#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
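
As a hypothetical illustration (not part of the commit), a translation unit such as the new src/llama-sampling.cpp can include llama-impl.h to reach the internal logging helpers without living inside llama.cpp; the check_candidates helper below is invented for the example.

// llama-impl.h defines LLAMA_API_INTERNAL before including llama.h and
// exposes the LLAMA_LOG_* macros declared above.
#include "llama-impl.h"

static void check_candidates(size_t n_candidates) {
    if (n_candidates == 0) {
        // expands to llama_log_internal(GGML_LOG_LEVEL_WARN, ...);
        // LLAMA_ATTRIBUTE_FORMAT(2, 3) lets GCC/Clang type-check the
        // printf-style arguments of llama_log_internal at compile time
        LLAMA_LOG_WARN("%s: no candidates to sample from\n", __func__);
        return;
    }
    LLAMA_LOG_INFO("%s: sampling from %zu candidates\n", __func__, n_candidates);
}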

0 commit comments