fix: proper VRAM cleanup on model unload + CUDA alloc config

- Force gc.collect() before torch.cuda.empty_cache() to ensure all model
  references are released
- Set PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True in the container

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
@@ -57,6 +57,9 @@ COPY --from=builder /usr/local/lib/python3.12/dist-packages/llama_cpp_python* /u
 COPY llmux/ /app/llmux/
 WORKDIR /app
 
+# Avoid CUDA memory fragmentation when swapping models
+ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+
 # Run the server
 EXPOSE 8081
 CMD ["uvicorn", "llmux.main:app", "--host", "0.0.0.0", "--port", "8081"]
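Note on the allocator setting: PyTorch reads PYTORCH_CUDA_ALLOC_CONF when the caching allocator first initializes, i.e. at the first CUDA allocation, so the variable must already be in the environment by then. Baking it into the image with ENV guarantees that. A minimal sketch of the equivalent in-process approach (only valid if this module runs before anything else imports torch and touches CUDA):

    import os

    # Must be set before the first CUDA allocation: the caching allocator
    # reads PYTORCH_CUDA_ALLOC_CONF once, when it initializes.
    os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

    import torch  # noqa: E402 -- deliberately imported after the env var is set

Setting it in the Dockerfile sidesteps the import-order fragility entirely, which is presumably why the commit takes that route.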
@@ -45,12 +45,18 @@ class TransformersLLMBackend(BaseBackend):
     async def unload(self, model_id: str) -> None:
         if model_id not in self._loaded:
             return
+        import gc
         entry = self._loaded.pop(model_id)
-        del entry["model"]
-        del entry["tokenizer"]
-        if entry.get("processor"):
-            del entry["processor"]
+        model = entry.pop("model")
+        tokenizer = entry.pop("tokenizer")
+        processor = entry.pop("processor", None)
+        del model
+        del tokenizer
+        del processor
+        del entry
+        gc.collect()
         torch.cuda.empty_cache()
+        logger.info(f"Unloaded {model_id}, VRAM freed")
 
     async def generate(self, model_id, messages, params, stream=False, tools=None):
         entry = self._loaded[model_id]
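Why both gc.collect() and empty_cache(): del only drops a reference, and loaded models can sit in reference cycles (hooks, tied weights, cached attributes), so their tensors may survive until the cyclic garbage collector runs. gc.collect() forces that pass; only then does torch.cuda.empty_cache() have dead blocks to hand back to the driver. One way to verify the cleanup works, as a sketch (backend and model_id are placeholders for whatever is deployed):

    import torch

    async def unload_and_report(backend, model_id: str) -> None:
        # memory_allocated() counts bytes held by live tensors;
        # memory_reserved() counts bytes the caching allocator still holds.
        # gc.collect() shrinks the former, empty_cache() the latter.
        before = torch.cuda.memory_allocated()
        await backend.unload(model_id)
        after = torch.cuda.memory_allocated()
        print(f"freed {(before - after) / 2**20:.1f} MiB of live tensor memory")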