Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit e121edc

Browse files
ochafik and ngxson
authored
server: add --reasoning-budget 0 to disable thinking (incl. qwen3 w/ enable_thinking:false) (#13771)
--------- Co-authored-by: ochafik <ochafik@google.com> Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>
1 parent 2f099b5 commit e121edc
Copy full SHA for e121edc

File tree

Expand file tree / Collapse file tree

12 files changed

+277
-107
lines changed
Filter options
Expand file tree / Collapse file tree

12 files changed

+277
-107
lines changed

‎common/arg.cpp

Copy file name to clipboard | Expand all lines: common/arg.cpp
+14 -5 | Lines changed: 14 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -2848,15 +2848,24 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
28482848
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
28492849
add_opt(common_arg(
28502850
{"--reasoning-format"}, "FORMAT",
2851-
"reasoning format (default: deepseek; allowed values: deepseek, none)\n"
2852-
"controls whether thought tags are extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only).\n"
2853-
"only supported for non-streamed responses",
2851+
"controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
2852+
"- none: leaves thoughts unparsed in `message.content`\n"
2853+
"- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
2854+
"(default: deepseek)",
28542855
[](common_params & params, const std::string & value) {
28552856
/**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
28562857
else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
2857-
else { std::invalid_argument("invalid value"); }
2858+
else { throw std::invalid_argument("invalid value"); }
28582859
}
28592860
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
2861+
add_opt(common_arg(
2862+
{"--reasoning-budget"}, "N",
2863+
"controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
2864+
[](common_params & params, int value) {
2865+
if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
2866+
params.reasoning_budget = value;
2867+
}
2868+
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK_BUDGET"));
28602869
add_opt(common_arg(
28612870
{"--chat-template"}, "JINJA_TEMPLATE",
28622871
string_format(
@@ -2955,7 +2964,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
29552964
[](common_params & params, const std::string & value) {
29562965
/**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; }
29572966
else if (value == "md") { params.batched_bench_output_jsonl = false; }
2958-
else { std::invalid_argument("invalid value"); }
2967+
else { throw std::invalid_argument("invalid value"); }
29592968
}
29602969
).set_examples({LLAMA_EXAMPLE_BENCH}));
29612970
add_opt(common_arg(

‎common/chat.cpp

Copy file name to clipboard | Expand all lines: common/chat.cpp
+126 -95 | Lines changed: 126 additions & 95 deletions
Original file line number | Diff line number | Diff line change
@@ -133,6 +133,7 @@ struct templates_params {
133133
bool stream;
134134
std::string grammar;
135135
bool add_generation_prompt = true;
136+
bool enable_thinking = true;
136137
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
137138
};
138139

@@ -573,7 +574,7 @@ common_chat_templates_ptr common_chat_templates_init(
573574
return tmpls;
574575
}
575576

576-
std::string common_chat_format_name(common_chat_format format) {
577+
const char * common_chat_format_name(common_chat_format format) {
577578
switch (format) {
578579
case COMMON_CHAT_FORMAT_CONTENT_ONLY: return "Content-only";
579580
case COMMON_CHAT_FORMAT_GENERIC: return "Generic";
@@ -591,6 +592,15 @@ std::string common_chat_format_name(common_chat_format format) {
591592
}
592593
}
593594

595+
const char * common_reasoning_format_name(common_reasoning_format format) {
596+
switch (format) {
597+
case COMMON_REASONING_FORMAT_NONE: return "none";
598+
case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
599+
default:
600+
throw std::runtime_error("Unknown reasoning format");
601+
}
602+
}
603+
594604
static std::string wrap_code_as_arguments(common_chat_msg_parser & builder, const std::string & code) {
595605
std::string arguments;
596606
if (builder.is_partial()) {
@@ -918,7 +928,13 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
918928
data.prompt = apply(tmpl, adjusted_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {});
919929
data.format = COMMON_CHAT_FORMAT_COMMAND_R7B;
920930
if (string_ends_with(data.prompt, "<|START_THINKING|>")) {
921-
data.thinking_forced_open = true;
931+
if (!inputs.enable_thinking) {
932+
data.prompt += "<|END_THINKING|>";
933+
} else {
934+
data.thinking_forced_open = true;
935+
}
936+
} else if (!inputs.enable_thinking && string_ends_with(data.prompt, "<|CHATBOT_TOKEN|>")) {
937+
data.prompt += "<|START_THINKING|><|END_THINKING|>";
922938
}
923939

924940
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
@@ -1186,7 +1202,11 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_
11861202
data.prompt = prompt;
11871203
data.format = COMMON_CHAT_FORMAT_DEEPSEEK_R1;
11881204
if (string_ends_with(data.prompt, "<think>\n")) {
1189-
data.thinking_forced_open = true;
1205+
if (!inputs.enable_thinking) {
1206+
data.prompt += "</think>";
1207+
} else {
1208+
data.thinking_forced_open = true;
1209+
}
11901210
}
11911211

11921212
if (inputs.tools.is_array() && !inputs.tools.empty()) {
@@ -1460,104 +1480,114 @@ static void common_chat_parse_functionary_v3_1_llama_3_1(common_chat_msg_parser
14601480
static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) {
14611481
common_chat_params data;
14621482

1463-
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
1483+
json additional_context = {
1484+
{"enable_thinking", inputs.enable_thinking},
1485+
};
1486+
1487+
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, additional_context);
14641488
data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO;
14651489
if (string_ends_with(data.prompt, "<think>\n")) {
1466-
data.thinking_forced_open = true;
1490+
if (!inputs.enable_thinking) {
1491+
data.prompt += "</think>";
1492+
} else {
1493+
data.thinking_forced_open = true;
1494+
}
14671495
}
14681496

1469-
// (content)?(<tool_call>{"name": "foo", "arguments": {"a": 1}}</tool_call>)*
1470-
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
1471-
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
1472-
std::vector<std::string> tool_rules;
1473-
std::vector<std::string> tool_call_alts;
1474-
std::vector<std::string> escaped_names;
1475-
foreach_function(inputs.tools, [&](const json & tool) {
1476-
const auto & function = tool.at("function");
1477-
std::string name = function.at("name");
1478-
auto parameters = function.at("parameters");
1479-
builder.resolve_refs(parameters);
1480-
tool_rules.push_back(builder.add_schema(name + "-call", {
1481-
{"type", "object"},
1482-
{"properties", json {
1483-
{"name", json {{"const", name}}},
1484-
{"arguments", parameters},
1485-
}},
1486-
{"required", json::array({"name", "arguments"})},
1487-
}));
1488-
tool_call_alts.push_back(builder.add_rule(
1489-
name + "-function-tag",
1490-
"\"<function\" ( \"=" + name + "\" | \" name=\\\"" + name + "\\\"\" ) \">\" space " +
1491-
builder.add_schema(name + "-args", parameters) + " "
1492-
"\"</function>\" space"));
1497+
if (!inputs.tools.is_null()) {
1498+
// (content)?(<tool_call>{"name": "foo", "arguments": {"a": 1}}</tool_call>)*
1499+
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
1500+
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
1501+
std::vector<std::string> tool_rules;
1502+
std::vector<std::string> tool_call_alts;
1503+
std::vector<std::string> escaped_names;
1504+
foreach_function(inputs.tools, [&](const json & tool) {
1505+
const auto & function = tool.at("function");
1506+
std::string name = function.at("name");
1507+
auto parameters = function.at("parameters");
1508+
builder.resolve_refs(parameters);
1509+
tool_rules.push_back(builder.add_schema(name + "-call", {
1510+
{"type", "object"},
1511+
{"properties", json {
1512+
{"name", json {{"const", name}}},
1513+
{"arguments", parameters},
1514+
}},
1515+
{"required", json::array({"name", "arguments"})},
1516+
}));
1517+
tool_call_alts.push_back(builder.add_rule(
1518+
name + "-function-tag",
1519+
"\"<function\" ( \"=" + name + "\" | \" name=\\\"" + name + "\\\"\" ) \">\" space " +
1520+
builder.add_schema(name + "-args", parameters) + " "
1521+
"\"</function>\" space"));
14931522

1494-
data.grammar_triggers.push_back({
1495-
COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
1496-
"<function=" + name + ">",
1523+
data.grammar_triggers.push_back({
1524+
COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
1525+
"<function=" + name + ">",
1526+
});
1527+
auto escaped_name = regex_escape(name);
1528+
data.grammar_triggers.push_back({
1529+
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
1530+
"<function\\s+name\\s*=\\s*\"" + escaped_name + "\"",
1531+
});
1532+
escaped_names.push_back(escaped_name);
14971533
});
1498-
auto escaped_name = regex_escape(name);
1534+
auto any_tool_call = builder.add_rule("any_tool_call", "( " + string_join(tool_rules, " | ") + " ) space");
1535+
std::vector<std::string> alt_tags {
1536+
any_tool_call,
1537+
"\"<tool_call>\" space " + any_tool_call + " \"</tool_call>\"",
1538+
// The rest is just to accommodate common "good bad" outputs.
1539+
"\"<function_call>\" space " + any_tool_call + " \"</function_call>\"",
1540+
"\"<response>\" space " + any_tool_call + " \"</response>\"",
1541+
"\"<tools>\" space " + any_tool_call + " \"</tools>\"",
1542+
"\"<json>\" space " + any_tool_call + " \"</json>\"",
1543+
"\"<xml>\" space " + any_tool_call + " \"</xml>\"",
1544+
"\"<JSON>\" space " + any_tool_call + " \"</JSON>\"",
1545+
};
1546+
auto wrappable_tool_call = builder.add_rule("wrappable_tool_call", "( " + string_join(alt_tags, " | ") + " ) space");
1547+
tool_call_alts.push_back(wrappable_tool_call);
1548+
tool_call_alts.push_back(
1549+
"( \"```\\n\" | \"```json\\n\" | \"```xml\\n\" ) space " + wrappable_tool_call + " space \"```\" space ");
1550+
auto tool_call = builder.add_rule("tool_call", string_join(tool_call_alts, " | "));
1551+
builder.add_rule("root",
1552+
std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
1553+
(inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call));
1554+
// Trigger on some common known "good bad" outputs (only from the start and with a json that's about a specific argument name to avoid false positives)
14991555
data.grammar_triggers.push_back({
1500-
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
1501-
"<function\\s+name\\s*=\\s*\"" + escaped_name + "\"",
1556+
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
1557+
// If thinking_forced_open, then we capture the </think> tag in the grammar,
1558+
// (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
1559+
std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") + (
1560+
"(\\s*"
1561+
"(?:<tool_call>"
1562+
"|<function"
1563+
"|(?:```(?:json|xml)?\n\\s*)?(?:<function_call>|<tools>|<xml><json>|<response>)?"
1564+
"\\s*\\{\\s*\"name\"\\s*:\\s*\"(?:" + string_join(escaped_names, "|") + ")\""
1565+
")"
1566+
")[\\s\\S]*"
1567+
),
15021568
});
1503-
escaped_names.push_back(escaped_name);
1504-
});
1505-
auto any_tool_call = builder.add_rule("any_tool_call", "( " + string_join(tool_rules, " | ") + " ) space");
1506-
std::vector<std::string> alt_tags {
1507-
any_tool_call,
1508-
"\"<tool_call>\" space " + any_tool_call + " \"</tool_call>\"",
1509-
// The rest is just to accommodate common "good bad" outputs.
1510-
"\"<function_call>\" space " + any_tool_call + " \"</function_call>\"",
1511-
"\"<response>\" space " + any_tool_call + " \"</response>\"",
1512-
"\"<tools>\" space " + any_tool_call + " \"</tools>\"",
1513-
"\"<json>\" space " + any_tool_call + " \"</json>\"",
1514-
"\"<xml>\" space " + any_tool_call + " \"</xml>\"",
1515-
"\"<JSON>\" space " + any_tool_call + " \"</JSON>\"",
1516-
};
1517-
auto wrappable_tool_call = builder.add_rule("wrappable_tool_call", "( " + string_join(alt_tags, " | ") + " ) space");
1518-
tool_call_alts.push_back(wrappable_tool_call);
1519-
tool_call_alts.push_back(
1520-
"( \"```\\n\" | \"```json\\n\" | \"```xml\\n\" ) space " + wrappable_tool_call + " space \"```\" space ");
1521-
auto tool_call = builder.add_rule("tool_call", string_join(tool_call_alts, " | "));
1522-
builder.add_rule("root",
1523-
std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
1524-
(inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call));
1525-
// Trigger on some common known "good bad" outputs (only from the start and with a json that's about a specific argument name to avoid false positives)
1526-
data.grammar_triggers.push_back({
1527-
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
1528-
// If thinking_forced_open, then we capture the </think> tag in the grammar,
1529-
// (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
1530-
std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") + (
1531-
"(\\s*"
1532-
"(?:<tool_call>"
1533-
"|<function"
1534-
"|(?:```(?:json|xml)?\n\\s*)?(?:<function_call>|<tools>|<xml><json>|<response>)?"
1535-
"\\s*\\{\\s*\"name\"\\s*:\\s*\"(?:" + string_join(escaped_names, "|") + ")\""
1536-
")"
1537-
")[\\s\\S]*"
1538-
),
1569+
data.preserved_tokens = {
1570+
"<think>",
1571+
"</think>",
1572+
"<tool_call>",
1573+
"</tool_call>",
1574+
"<function",
1575+
"<tools>",
1576+
"</tools>",
1577+
"<response>",
1578+
"</response>",
1579+
"<function_call>",
1580+
"</function_call>",
1581+
"<json>",
1582+
"</json>",
1583+
"<JSON>",
1584+
"</JSON>",
1585+
"```",
1586+
"```json",
1587+
"```xml",
1588+
};
15391589
});
1540-
data.preserved_tokens = {
1541-
"<think>",
1542-
"</think>",
1543-
"<tool_call>",
1544-
"</tool_call>",
1545-
"<function",
1546-
"<tools>",
1547-
"</tools>",
1548-
"<response>",
1549-
"</response>",
1550-
"<function_call>",
1551-
"</function_call>",
1552-
"<json>",
1553-
"</json>",
1554-
"<JSON>",
1555-
"</JSON>",
1556-
"```",
1557-
"```json",
1558-
"```xml",
1559-
};
1560-
});
1590+
}
15611591

15621592
return data;
15631593
}
@@ -1669,6 +1699,7 @@ static common_chat_params common_chat_templates_apply_jinja(
16691699
params.messages = common_chat_msgs_to_json_oaicompat<json>(inputs.messages, /* concat_text= */ !tmpl.original_caps().requires_typed_content);
16701700
params.add_generation_prompt = inputs.add_generation_prompt;
16711701
params.tool_choice = inputs.tool_choice;
1702+
params.enable_thinking = inputs.enable_thinking;
16721703
params.grammar = inputs.grammar;
16731704
params.now = inputs.now;
16741705
if (!inputs.json_schema.empty()) {
@@ -1702,7 +1733,7 @@ static common_chat_params common_chat_templates_apply_jinja(
17021733
}
17031734

17041735
// Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
1705-
if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null() && params.tools.is_array() && params.json_schema.is_null()) {
1736+
if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null()) {
17061737
return common_chat_params_init_hermes_2_pro(tmpl, params);
17071738
}
17081739

@@ -1821,7 +1852,7 @@ static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
18211852
}
18221853

18231854
static void common_chat_parse(common_chat_msg_parser & builder, common_chat_format format) {
1824-
LOG_DBG("Parsing input with format %s: %s\n", common_chat_format_name(format).c_str(), builder.input().c_str());
1855+
LOG_DBG("Parsing input with format %s: %s\n", common_chat_format_name(format), builder.input().c_str());
18251856

18261857
switch (format) {
18271858
case COMMON_CHAT_FORMAT_CONTENT_ONLY:
@@ -1858,7 +1889,7 @@ static void common_chat_parse(common_chat_msg_parser & builder, common_chat_form
18581889
common_chat_parse_command_r7b(builder);
18591890
break;
18601891
default:
1861-
throw std::runtime_error("Unsupported format: " + common_chat_format_name(format));
1892+
throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(format));
18621893
}
18631894
builder.finish();
18641895
}

‎common/chat.h

Copy file name to clipboard | Expand all lines: common/chat.h
+3 -1 | Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -123,6 +123,7 @@ struct common_chat_templates_inputs {
123123
common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
124124
bool parallel_tool_calls = false;
125125
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
126+
bool enable_thinking = true;
126127
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
127128
};
128129

@@ -181,7 +182,8 @@ std::string common_chat_format_example(
181182
const struct common_chat_templates * tmpls,
182183
bool use_jinja);
183184

184-
std::string common_chat_format_name(common_chat_format format);
185+
const char* common_chat_format_name(common_chat_format format);
186+
const char* common_reasoning_format_name(common_reasoning_format format);
185187
common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
186188

187189
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);

‎common/common.h

Copy file name to clipboard | Expand all lines: common/common.h
+1 | Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -368,6 +368,7 @@ struct common_params {
368368
bool use_jinja = false; // NOLINT
369369
bool enable_chat_template = true;
370370
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
371+
int reasoning_budget = -1;
371372
bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
372373

373374
std::vector<std::string> api_keys;

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.