diff --git a/kischdle/llmux/Dockerfile b/kischdle/llmux/Dockerfile
index 2146a22..3f2e108 100644
--- a/kischdle/llmux/Dockerfile
+++ b/kischdle/llmux/Dockerfile
@@ -57,6 +57,9 @@ COPY --from=builder /usr/local/lib/python3.12/dist-packages/llama_cpp_python* /u
 COPY llmux/ /app/llmux/
 WORKDIR /app
 
+# Avoid CUDA memory fragmentation when swapping models
+ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+
 # Run the server
 EXPOSE 8081
 CMD ["uvicorn", "llmux.main:app", "--host", "0.0.0.0", "--port", "8081"]
diff --git a/kischdle/llmux/llmux/backends/transformers_llm.py b/kischdle/llmux/llmux/backends/transformers_llm.py
index ca1814f..d290554 100644
--- a/kischdle/llmux/llmux/backends/transformers_llm.py
+++ b/kischdle/llmux/llmux/backends/transformers_llm.py
@@ -45,12 +45,18 @@ class TransformersLLMBackend(BaseBackend):
     async def unload(self, model_id: str) -> None:
         if model_id not in self._loaded:
             return
+        import gc
         entry = self._loaded.pop(model_id)
-        del entry["model"]
-        del entry["tokenizer"]
-        if entry.get("processor"):
-            del entry["processor"]
+        model = entry.pop("model")
+        tokenizer = entry.pop("tokenizer")
+        processor = entry.pop("processor", None)
+        del model
+        del tokenizer
+        del processor
+        del entry
+        gc.collect()
         torch.cuda.empty_cache()
+        logger.info(f"Unloaded {model_id}, VRAM freed")
 
     async def generate(self, model_id, messages, params, stream=False, tools=None):
         entry = self._loaded[model_id]