From f24a225bafa2378e85259b959adcba7223b411aa412558b311bc5a17751e4ab3 Mon Sep 17 00:00:00 2001 From: tlg Date: Sun, 5 Apr 2026 21:33:36 +0200 Subject: [PATCH] fix: resolve GGUF paths through HF cache, add model_id to GGUF config llama-cpp-python backend now uses huggingface_hub to resolve GGUF file paths within the HF cache structure instead of assuming a flat /models/ directory. Co-Authored-By: Claude Opus 4.6 (1M context) --- kischdle/llmux/config/models.yaml | 1 + kischdle/llmux/llmux/backends/llamacpp.py | 19 +++++++++++++++---- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/kischdle/llmux/config/models.yaml b/kischdle/llmux/config/models.yaml index ce769a8..9f28812 100644 --- a/kischdle/llmux/config/models.yaml +++ b/kischdle/llmux/config/models.yaml @@ -10,6 +10,7 @@ physical_models: qwen3.5-9b-fp8-uncensored: type: llm backend: llamacpp + model_id: "HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive" model_file: "Qwen3.5-9B-Uncensored-HauhauCS-Aggressive-Q8_0.gguf" mmproj_file: "mmproj-Qwen3.5-9B-Uncensored-HauhauCS-Aggressive-BF16.gguf" estimated_vram_gb: 9 diff --git a/kischdle/llmux/llmux/backends/llamacpp.py b/kischdle/llmux/llmux/backends/llamacpp.py index 123bc1d..b362221 100644 --- a/kischdle/llmux/llmux/backends/llamacpp.py +++ b/kischdle/llmux/llmux/backends/llamacpp.py @@ -19,23 +19,34 @@ class LlamaCppBackend(BaseBackend): self._models_dir = Path(models_dir) self._loaded: dict[str, dict] = {} + def _resolve_gguf_path(self, physical: PhysicalModel, filename: str) -> str: + """Resolve a GGUF filename to its path in the HF cache.""" + from huggingface_hub import hf_hub_download + # model_id stores the HF repo, model_file/mmproj_file store the filenames + return hf_hub_download( + repo_id=physical.model_id, + filename=filename, + cache_dir=str(self._models_dir), + local_files_only=True, + ) + async def load(self, model_id: str, n_gpu_layers: int = -1) -> None: if model_id in self._loaded: return physical = _get_physical_config(model_id) 
- model_path = self._models_dir / physical.model_file + model_path = self._resolve_gguf_path(physical, physical.model_file) logger.info(f"Loading GGUF model {model_path} with n_gpu_layers={n_gpu_layers}") def _load(): kwargs = { - "model_path": str(model_path), + "model_path": model_path, "n_gpu_layers": n_gpu_layers, "n_ctx": 8192, "verbose": False, } if physical.mmproj_file: - mmproj_path = self._models_dir / physical.mmproj_file - kwargs["chat_handler"] = _create_vision_handler(str(mmproj_path)) + mmproj_path = self._resolve_gguf_path(physical, physical.mmproj_file) + kwargs["chat_handler"] = _create_vision_handler(mmproj_path) return Llama(**kwargs) loop = asyncio.get_event_loop()