diff --git a/kischdle/llmux/config/models.yaml b/kischdle/llmux/config/models.yaml
index e565961..1b0db5a 100644
--- a/kischdle/llmux/config/models.yaml
+++ b/kischdle/llmux/config/models.yaml
@@ -1,10 +1,11 @@
 physical_models:
   qwen3.5-9b-fp8:
     type: llm
-    backend: transformers
-    model_id: "lovedheart/Qwen3.5-9B-FP8"
-    estimated_vram_gb: 9
-    supports_vision: true
+    backend: llamacpp
+    model_id: "unsloth/Qwen3.5-9B-GGUF"
+    model_file: "Qwen3.5-9B-Q8_0.gguf"
+    estimated_vram_gb: 10
+    supports_vision: false
     supports_tools: true

   qwen3.5-9b-fp8-uncensored:
diff --git a/kischdle/llmux/llmux/backends/llamacpp.py b/kischdle/llmux/llmux/backends/llamacpp.py
index aae0991..9834ad5 100644
--- a/kischdle/llmux/llmux/backends/llamacpp.py
+++ b/kischdle/llmux/llmux/backends/llamacpp.py
@@ -1,4 +1,5 @@
 import asyncio
+import gc
 import json
 import logging
 import time
@@ -20,9 +21,13 @@ class LlamaCppBackend(BaseBackend):
         self._loaded: dict[str, dict] = {}

     def _resolve_gguf_path(self, physical: PhysicalModel, filename: str) -> str:
-        """Resolve a GGUF filename to its path in the HF cache."""
+        """Resolve a GGUF filename — check flat gguf/ dir first, then HF cache."""
+        # Check flat gguf/ directory
+        flat_path = self._models_dir / "gguf" / filename
+        if flat_path.exists():
+            return str(flat_path)
+        # Fall back to HF cache resolution
         from huggingface_hub import hf_hub_download
-        # model_id stores the HF repo, model_file/mmproj_file store the filenames
         return hf_hub_download(
             repo_id=physical.model_id,
             filename=filename,
@@ -42,36 +47,53 @@ class LlamaCppBackend(BaseBackend):
                 "model_path": model_path,
                 "n_gpu_layers": n_gpu_layers,
                 "n_ctx": 4096,
+                "flash_attn": True,
                 "verbose": False,
             }
             if physical.mmproj_file:
                 mmproj_path = self._resolve_gguf_path(physical, physical.mmproj_file)
                 kwargs["chat_handler"] = _create_vision_handler(mmproj_path)
-            return Llama(**kwargs)
+            llm = Llama(**kwargs)
+            return llm

         loop = asyncio.get_event_loop()
         llm = await loop.run_in_executor(None, _load)
-        self._loaded[model_id] = {"llm": llm, "n_gpu_layers": n_gpu_layers}
+
+        # Create thinking-enabled and thinking-disabled chat handlers from Jinja template
+        think_handler = _create_think_handler(llm, enable_thinking=True)
+        no_think_handler = _create_think_handler(llm, enable_thinking=False)
+
+        self._loaded[model_id] = {
+            "llm": llm,
+            "n_gpu_layers": n_gpu_layers,
+            "think_handler": think_handler,
+            "no_think_handler": no_think_handler,
+        }

     async def unload(self, model_id: str) -> None:
         if model_id not in self._loaded:
             return
         entry = self._loaded.pop(model_id)
         del entry["llm"]
+        del entry
+        gc.collect()
+        logger.info(f"Unloaded GGUF model {model_id}")

     async def generate(self, model_id, messages, params, stream=False, tools=None):
         entry = self._loaded[model_id]
         llm = entry["llm"]

-        effective_messages = list(messages)
+        # Swap chat handler based on thinking mode
+        original_handler = llm.chat_handler
         if "enable_thinking" in params:
-            if not params["enable_thinking"]:
-                if effective_messages and effective_messages[0].get("role") == "system":
-                    effective_messages[0] = dict(effective_messages[0])
-                    effective_messages[0]["content"] = "/no_think\n" + effective_messages[0]["content"]
-                else:
-                    effective_messages.insert(0, {"role": "system", "content": "/no_think"})
+            if params["enable_thinking"]:
+                handler = entry.get("think_handler")
+            else:
+                handler = entry.get("no_think_handler")
+            if handler:
+                llm.chat_handler = handler

+        effective_messages = list(messages)
         if "system_prompt_prefix" in params:
             prefix = params["system_prompt_prefix"]
             if effective_messages and effective_messages[0].get("role") == "system":
@@ -80,10 +102,14 @@ class LlamaCppBackend(BaseBackend):
             else:
                 effective_messages.insert(0, {"role": "system", "content": prefix})

-        if stream:
-            return self._stream_generate(llm, effective_messages, model_id, tools)
-        else:
-            return await self._full_generate(llm, effective_messages, model_id, tools)
+        try:
+            if stream:
+                return self._stream_generate(llm, effective_messages, model_id, tools)
+            else:
+                return await self._full_generate(llm, effective_messages, model_id, tools)
+        finally:
+            # Restore original handler
+            llm.chat_handler = original_handler

     async def _full_generate(self, llm, messages, model_id, tools):
         def _run():
@@ -116,6 +142,28 @@
         return _iter()


+def _create_think_handler(llm, enable_thinking: bool):
+    """Create a chat handler with thinking enabled or disabled via Jinja template."""
+    mode = "enabled" if enable_thinking else "disabled"
+    try:
+        from llama_cpp.llama_chat_format import Jinja2ChatFormatter
+        template_str = llm.metadata.get("tokenizer.chat_template", "")
+        if not template_str:
+            logger.warning("Model has no embedded chat template")
+            return None
+        value = "true" if enable_thinking else "false"
+        patched = "{%- set enable_thinking = " + value + " %}\n" + template_str
+        eos = llm._model.token_get_text(llm._model.token_eos())
+        bos = llm._model.token_get_text(llm._model.token_bos())
+        formatter = Jinja2ChatFormatter(template=patched, eos_token=eos, bos_token=bos)
+        handler = formatter.to_chat_handler()
+        logger.info(f"Created chat handler with thinking {mode}")
+        return handler
+    except Exception as e:
+        logger.error(f"Failed to create thinking-{mode} handler: {e}")
+        return None
+
+
 def _create_vision_handler(mmproj_path: str):
     from llama_cpp.llama_chat_format import Llava16ChatHandler
     return Llava16ChatHandler(clip_model_path=mmproj_path)
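Note (not part of the patch): the core trick in `_create_think_handler` is prepending a Jinja `set` statement to the model's embedded chat template so `enable_thinking` is pinned before the rest of the template runs. A minimal standalone sketch of that idea follows; the `chat_template` string is a made-up stand-in, not the real Qwen template or llmux code.

```python
# Sketch only: `chat_template` is a hypothetical stand-in template. It shows how a
# prepended `{%- set ... %}` fixes the `enable_thinking` flag for later references.
from jinja2 import Template

chat_template = (
    "{%- if enable_thinking %}<think>\n{%- endif %}"
    "{%- for m in messages %}{{ m['role'] }}: {{ m['content'] }}\n{%- endfor %}"
)

def patch_thinking(template_str: str, enable_thinking: bool) -> str:
    # Same string construction as the patch: prepend a Jinja `set` statement.
    value = "true" if enable_thinking else "false"
    return "{%- set enable_thinking = " + value + " %}\n" + template_str

messages = [{"role": "user", "content": "hi"}]
print(Template(patch_thinking(chat_template, True)).render(messages=messages))   # includes <think>
print(Template(patch_thinking(chat_template, False)).render(messages=messages))  # omits <think>
```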