From c6677dcab346f603a8789d6205530a8a76fc100deec350a69e275e843861fe53 Mon Sep 17 00:00:00 2001
From: tlg
Date: Sat, 4 Apr 2026 09:40:40 +0200
Subject: [PATCH] feat: llama-cpp-python backend with GGUF, vision, and tool
 support

---
 kischdle/llmux/llmux/backends/llamacpp.py | 131 ++++++++++++++++++++++
 1 file changed, 131 insertions(+)
 create mode 100644 kischdle/llmux/llmux/backends/llamacpp.py

diff --git a/kischdle/llmux/llmux/backends/llamacpp.py b/kischdle/llmux/llmux/backends/llamacpp.py
new file mode 100644
index 0000000..123bc1d
--- /dev/null
+++ b/kischdle/llmux/llmux/backends/llamacpp.py
@@ -0,0 +1,131 @@
+import asyncio
+import json
+import logging
+from pathlib import Path
+from typing import AsyncIterator
+
+from llama_cpp import Llama
+
+from llmux.backends.base import BaseBackend
+from llmux.config import PhysicalModel
+
+logger = logging.getLogger(__name__)
+
+# Sentinel marking exhaustion of the blocking stream iterator.
+_DONE = object()
+
+
+class LlamaCppBackend(BaseBackend):
+    def __init__(self, models_dir: str = "/models"):
+        self._models_dir = Path(models_dir)
+        self._loaded: dict[str, dict] = {}
+
+    async def load(self, model_id: str, n_gpu_layers: int = -1) -> None:
+        if model_id in self._loaded:
+            return
+        physical = _get_physical_config(model_id)
+        model_path = self._models_dir / physical.model_file
+        logger.info(f"Loading GGUF model {model_path} with n_gpu_layers={n_gpu_layers}")
+
+        def _load():
+            kwargs = {
+                "model_path": str(model_path),
+                "n_gpu_layers": n_gpu_layers,
+                "n_ctx": 8192,
+                "verbose": False,
+            }
+            if physical.mmproj_file:
+                mmproj_path = self._models_dir / physical.mmproj_file
+                kwargs["chat_handler"] = _create_vision_handler(str(mmproj_path))
+            return Llama(**kwargs)
+
+        # Llama() blocks while loading weights, so keep it off the event loop.
+        loop = asyncio.get_running_loop()
+        llm = await loop.run_in_executor(None, _load)
+        self._loaded[model_id] = {"llm": llm, "n_gpu_layers": n_gpu_layers}
+
+    async def unload(self, model_id: str) -> None:
+        if model_id not in self._loaded:
+            return
+        entry = self._loaded.pop(model_id)
+        # Dropping the last reference lets llama-cpp-python's finalizer free the weights.
+        del entry["llm"]
+
+    async def generate(self, model_id, messages, params, stream=False, tools=None):
+        entry = self._loaded[model_id]
+        llm = entry["llm"]
+
+        effective_messages = list(messages)
+        if "enable_thinking" in params and not params["enable_thinking"]:
+            # Qwen-style soft switch: /no_think in the system prompt disables reasoning.
+            _prepend_to_system(effective_messages, "/no_think", sep="\n")
+
+        if "system_prompt_prefix" in params:
+            _prepend_to_system(effective_messages, params["system_prompt_prefix"], sep="\n\n")
+
+        if stream:
+            return self._stream_generate(llm, effective_messages, model_id, tools)
+        return await self._full_generate(llm, effective_messages, model_id, tools)
+
+    async def _full_generate(self, llm, messages, model_id, tools):
+        def _run():
+            kwargs = {"messages": messages, "max_tokens": 4096}
+            if tools:
+                kwargs["tools"] = tools
+            return llm.create_chat_completion(**kwargs)
+
+        loop = asyncio.get_running_loop()
+        result = await loop.run_in_executor(None, _run)
+        result["model"] = model_id
+        return result
+
+    async def _stream_generate(self, llm, messages, model_id, tools) -> AsyncIterator[str]:
+        def _run():
"max_tokens": 4096, "stream": True} + if tools: + kwargs["tools"] = tools + return llm.create_chat_completion(**kwargs) + + loop = asyncio.get_event_loop() + stream = await loop.run_in_executor(None, _run) + + async def _iter(): + for chunk in stream: + chunk["model"] = model_id + yield f"data: {json.dumps(chunk)}\n\n" + yield "data: [DONE]\n\n" + + return _iter() + + +def _create_vision_handler(mmproj_path: str): + from llama_cpp.llama_chat_format import Llava16ChatHandler + return Llava16ChatHandler(clip_model_path=mmproj_path) + + +_physical_models: dict[str, PhysicalModel] = {} + +def set_physical_models(models: dict[str, PhysicalModel]) -> None: + global _physical_models + _physical_models = models + +def _get_physical_config(model_id: str) -> PhysicalModel: + return _physical_models[model_id]