fix: proper VRAM cleanup on model unload + CUDA alloc config

- Force gc.collect() before torch.cuda.empty_cache() to ensure all
  model references are released
- Set PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True in container

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
tlg
2026-04-05 17:59:23 +02:00
parent d3285bad8a
commit aa7a160118
2 changed files with 13 additions and 4 deletions

View File

@@ -57,6 +57,9 @@ COPY --from=builder /usr/local/lib/python3.12/dist-packages/llama_cpp_python* /u
COPY llmux/ /app/llmux/
WORKDIR /app
# Avoid CUDA memory fragmentation when swapping models
ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
# Run the server
EXPOSE 8081
CMD ["uvicorn", "llmux.main:app", "--host", "0.0.0.0", "--port", "8081"]

View File

@@ -45,12 +45,18 @@ class TransformersLLMBackend(BaseBackend):
async def unload(self, model_id: str) -> None:
    """Unload a model and release its VRAM.

    Drops every reference held in the registry entry (model, tokenizer,
    and optional processor), forces a GC pass so the CUDA tensors are
    actually collected, and only then flushes the allocator cache —
    otherwise ``torch.cuda.empty_cache()`` has nothing to return to the
    driver.

    Args:
        model_id: Key of a previously loaded model. No-op if the model
            is not currently loaded.
    """
    if model_id not in self._loaded:
        return
    import gc

    entry = self._loaded.pop(model_id)
    model = entry.pop("model")
    tokenizer = entry.pop("tokenizer")
    processor = entry.pop("processor", None)
    del model
    del tokenizer
    del processor
    del entry
    # Collect first: references must be gone before the cache flush frees VRAM.
    gc.collect()
    # Guard: empty_cache() raises on CPU-only torch builds / hosts without CUDA.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    # Lazy %-args: message is only formatted if INFO is enabled.
    logger.info("Unloaded %s, VRAM freed", model_id)
    async def generate(self, model_id, messages, params, stream=False, tools=None):
        entry = self._loaded[model_id]