From f24a225bafa2378e85259b959adcba7223b411aa412558b311bc5a17751e4ab3 Mon Sep 17 00:00:00 2001
From: tlg <thomas.langer@destengs.com>
Date: Sun, 5 Apr 2026 21:33:36 +0200
Subject: [PATCH] fix: resolve GGUF paths through HF cache, add model_id to
 GGUF config

The llama-cpp-python backend now uses huggingface_hub to resolve GGUF
file paths within the HF cache structure instead of assuming a flat
/models/ directory.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 kischdle/llmux/config/models.yaml         |  1 +
 kischdle/llmux/llmux/backends/llamacpp.py | 19 +++++++++++++++----
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/kischdle/llmux/config/models.yaml b/kischdle/llmux/config/models.yaml
index ce769a8..9f28812 100644
--- a/kischdle/llmux/config/models.yaml
+++ b/kischdle/llmux/config/models.yaml
@@ -10,6 +10,7 @@ physical_models:
   qwen3.5-9b-fp8-uncensored:
     type: llm
     backend: llamacpp
+    model_id: "HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive"
     model_file: "Qwen3.5-9B-Uncensored-HauhauCS-Aggressive-Q8_0.gguf"
     mmproj_file: "mmproj-Qwen3.5-9B-Uncensored-HauhauCS-Aggressive-BF16.gguf"
     estimated_vram_gb: 9
diff --git a/kischdle/llmux/llmux/backends/llamacpp.py b/kischdle/llmux/llmux/backends/llamacpp.py
index 123bc1d..b362221 100644
--- a/kischdle/llmux/llmux/backends/llamacpp.py
+++ b/kischdle/llmux/llmux/backends/llamacpp.py
@@ -19,23 +19,34 @@ class LlamaCppBackend(BaseBackend):
         self._models_dir = Path(models_dir)
         self._loaded: dict[str, dict] = {}
 
+    def _resolve_gguf_path(self, physical: PhysicalModel, filename: str) -> str:
+        """Return the local HF-cache path of `filename` from the repo `physical.model_id`."""
+        from huggingface_hub import hf_hub_download
+        # model_id is the HF repo; filename is model_file or mmproj_file. Cache lookup only, no download.
+        return hf_hub_download(
+            repo_id=physical.model_id,
+            filename=filename,
+            cache_dir=str(self._models_dir),
+            local_files_only=True,
+        )
+
     async def load(self, model_id: str, n_gpu_layers: int = -1) -> None:
         if model_id in self._loaded:
             return
         physical = _get_physical_config(model_id)
-        model_path = self._models_dir / physical.model_file
+        model_path = self._resolve_gguf_path(physical, physical.model_file)
         logger.info(f"Loading GGUF model {model_path} with n_gpu_layers={n_gpu_layers}")
 
         def _load():
             kwargs = {
-                "model_path": str(model_path),
+                "model_path": model_path,
                 "n_gpu_layers": n_gpu_layers,
                 "n_ctx": 8192,
                 "verbose": False,
             }
             if physical.mmproj_file:
-                mmproj_path = self._models_dir / physical.mmproj_file
-                kwargs["chat_handler"] = _create_vision_handler(str(mmproj_path))
+                mmproj_path = self._resolve_gguf_path(physical, physical.mmproj_file)
+                kwargs["chat_handler"] = _create_vision_handler(mmproj_path)
             return Llama(**kwargs)
 
         loop = asyncio.get_event_loop()