feat: Jinja template thinking toggle, Qwen3.5-9B GGUF Q8_0

- Thinking/Instruct toggle via Jinja template patching in llama-cpp
  backend: creates separate handlers for thinking-enabled and
  thinking-disabled modes
- Replace lovedheart/Qwen3.5-9B-FP8 (safetensors, 15.8GB OOM) with
  unsloth/Qwen3.5-9B-GGUF Q8_0 (9.2GB, fits)
- Enable flash_attn in llama-cpp for better performance
- GGUF path resolution checks the flat gguf/ directory first, then
  falls back to the HF cache

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
tlg
2026-04-06 09:44:02 +02:00
parent 7a0ff55eb5
commit 7c4bbe0b29
2 changed files with 68 additions and 19 deletions

View File

@@ -1,10 +1,11 @@
physical_models:
qwen3.5-9b-fp8:
type: llm
backend: transformers
model_id: "lovedheart/Qwen3.5-9B-FP8"
estimated_vram_gb: 9
supports_vision: true
backend: llamacpp
model_id: "unsloth/Qwen3.5-9B-GGUF"
model_file: "Qwen3.5-9B-Q8_0.gguf"
estimated_vram_gb: 10
supports_vision: false
supports_tools: true
qwen3.5-9b-fp8-uncensored:

View File

@@ -1,4 +1,5 @@
import asyncio
import gc
import json
import logging
import time
@@ -20,9 +21,13 @@ class LlamaCppBackend(BaseBackend):
self._loaded: dict[str, dict] = {}
def _resolve_gguf_path(self, physical: PhysicalModel, filename: str) -> str:
"""Resolve a GGUF filename to its path in the HF cache."""
"""Resolve a GGUF filename — check flat gguf/ dir first, then HF cache."""
# Check flat gguf/ directory
flat_path = self._models_dir / "gguf" / filename
if flat_path.exists():
return str(flat_path)
# Fall back to HF cache resolution
from huggingface_hub import hf_hub_download
# model_id stores the HF repo, model_file/mmproj_file store the filenames
return hf_hub_download(
repo_id=physical.model_id,
filename=filename,
@@ -42,36 +47,53 @@ class LlamaCppBackend(BaseBackend):
"model_path": model_path,
"n_gpu_layers": n_gpu_layers,
"n_ctx": 4096,
"flash_attn": True,
"verbose": False,
}
if physical.mmproj_file:
mmproj_path = self._resolve_gguf_path(physical, physical.mmproj_file)
kwargs["chat_handler"] = _create_vision_handler(mmproj_path)
return Llama(**kwargs)
llm = Llama(**kwargs)
return llm
loop = asyncio.get_event_loop()
llm = await loop.run_in_executor(None, _load)
self._loaded[model_id] = {"llm": llm, "n_gpu_layers": n_gpu_layers}
# Create thinking-enabled and thinking-disabled chat handlers from Jinja template
think_handler = _create_think_handler(llm, enable_thinking=True)
no_think_handler = _create_think_handler(llm, enable_thinking=False)
self._loaded[model_id] = {
"llm": llm,
"n_gpu_layers": n_gpu_layers,
"think_handler": think_handler,
"no_think_handler": no_think_handler,
}
async def unload(self, model_id: str) -> None:
    """Drop a previously loaded GGUF model and reclaim its memory.

    No-op if the model is not currently loaded.
    """
    entry = self._loaded.pop(model_id, None)
    if entry is None:
        return
    # Release the llama_cpp handle explicitly before collecting so the
    # native model memory is freed as soon as possible.
    del entry["llm"]
    del entry
    gc.collect()
    logger.info(f"Unloaded GGUF model {model_id}")
async def generate(self, model_id, messages, params, stream=False, tools=None):
entry = self._loaded[model_id]
llm = entry["llm"]
effective_messages = list(messages)
# Swap chat handler based on thinking mode
original_handler = llm.chat_handler
if "enable_thinking" in params:
if not params["enable_thinking"]:
if effective_messages and effective_messages[0].get("role") == "system":
effective_messages[0] = dict(effective_messages[0])
effective_messages[0]["content"] = "/no_think\n" + effective_messages[0]["content"]
if params["enable_thinking"]:
handler = entry.get("think_handler")
else:
effective_messages.insert(0, {"role": "system", "content": "/no_think"})
handler = entry.get("no_think_handler")
if handler:
llm.chat_handler = handler
effective_messages = list(messages)
if "system_prompt_prefix" in params:
prefix = params["system_prompt_prefix"]
if effective_messages and effective_messages[0].get("role") == "system":
@@ -80,10 +102,14 @@ class LlamaCppBackend(BaseBackend):
else:
effective_messages.insert(0, {"role": "system", "content": prefix})
try:
if stream:
return self._stream_generate(llm, effective_messages, model_id, tools)
else:
return await self._full_generate(llm, effective_messages, model_id, tools)
finally:
# Restore original handler
llm.chat_handler = original_handler
async def _full_generate(self, llm, messages, model_id, tools):
def _run():
@@ -116,6 +142,28 @@ class LlamaCppBackend(BaseBackend):
return _iter()
def _create_think_handler(llm, enable_thinking: bool):
    """Build a chat handler whose Jinja template forces thinking on or off.

    Prepends a `set enable_thinking = true|false` statement to the model's
    embedded chat template so the template's thinking branch is selected
    deterministically, then compiles it into a llama-cpp chat handler.
    Returns None on any failure (no embedded template, formatter error).
    """
    mode = "enabled" if enable_thinking else "disabled"
    try:
        from llama_cpp.llama_chat_format import Jinja2ChatFormatter

        template_str = llm.metadata.get("tokenizer.chat_template", "")
        if not template_str:
            logger.warning("Model has no embedded chat template")
            return None

        flag = "true" if enable_thinking else "false"
        patched = f"{{%- set enable_thinking = {flag} %}}\n{template_str}"

        # NOTE(review): _model is a private llama_cpp attribute; needed to
        # recover the literal eos/bos token strings for the formatter.
        model = llm._model
        formatter = Jinja2ChatFormatter(
            template=patched,
            eos_token=model.token_get_text(model.token_eos()),
            bos_token=model.token_get_text(model.token_bos()),
        )
        handler = formatter.to_chat_handler()
        logger.info(f"Created chat handler with thinking {mode}")
        return handler
    except Exception as e:
        logger.error(f"Failed to create thinking-{mode} handler: {e}")
        return None
def _create_vision_handler(mmproj_path: str):
    """Return a LLaVA-1.6 chat handler bound to the given mmproj CLIP model."""
    from llama_cpp.llama_chat_format import Llava16ChatHandler

    handler = Llava16ChatHandler(clip_model_path=mmproj_path)
    return handler