import asyncio

import pytest

from llmux.vram_manager import VRAMManager, ModelSlot


class FakeBackend:
    """Test double for a model backend.

    Keeps a dict of currently "loaded" model ids plus counters for how many
    times ``load`` and ``unload`` were awaited, so tests can assert on them.
    """

    def __init__(self):
        self.loaded = {}  # model_id -> True while the model counts as loaded
        self.load_count = 0
        self.unload_count = 0

    async def load(self, model_id: str):
        """Record *model_id* as loaded and count the call."""
        self.loaded[model_id] = True
        self.load_count += 1

    async def unload(self, model_id: str):
        """Drop *model_id* if present (unknown ids are ignored) and count the call."""
        self.loaded.pop(model_id, None)
        self.unload_count += 1


@pytest.fixture
def manager():
    """Provide a fresh ``VRAMManager`` configured with ``total_vram_gb=16.0``."""
    return VRAMManager(total_vram_gb=16.0)


async def _load(manager, backend, model_id, model_type, vram_gb):
    """Ask *manager* to load one model, using the call shape shared by every test."""
    await manager.load_model(
        model_id, model_type=model_type, vram_gb=vram_gb, backend=backend
    )


def test_priority_ordering():
    """``ModelSlot.priority_rank`` maps llm -> 0, tts -> 1, asr -> 2."""
    expected_ranks = {"llm": 0, "tts": 1, "asr": 2}
    for model_type, rank in expected_ranks.items():
        assert ModelSlot.priority_rank(model_type) == rank


@pytest.mark.asyncio
async def test_load_into_empty_vram(manager):
    """Loading one 4 GB model into an empty manager leaves 12 GB available."""
    fake = FakeBackend()

    await _load(manager, fake, "qwen3.5-4b", "llm", 4.0)

    assert manager.is_loaded("qwen3.5-4b")
    assert manager.available_vram_gb == pytest.approx(12.0)


@pytest.mark.asyncio
async def test_load_alongside_when_fits(manager):
    """Two 4 GB models that both fit stay loaded side by side (8 GB left)."""
    fake = FakeBackend()

    await _load(manager, fake, "cohere-transcribe", "asr", 4.0)
    await _load(manager, fake, "qwen3.5-4b", "llm", 4.0)

    assert manager.is_loaded("cohere-transcribe")
    assert manager.is_loaded("qwen3.5-4b")
    assert manager.available_vram_gb == pytest.approx(8.0)


@pytest.mark.asyncio
async def test_evict_llm_first(manager):
    """Loading a bigger LLM is expected to evict only the existing LLM."""
    fake = FakeBackend()
    await _load(manager, fake, "cohere-transcribe", "asr", 4.0)
    await _load(manager, fake, "chatterbox-multilingual", "tts", 2.0)
    await _load(manager, fake, "qwen3.5-4b", "llm", 4.0)

    # 10 GB used, 6 GB free. The 9B model needs 9 GB: evicting the 4 GB LLM
    # raises free VRAM to 10 GB, and ASR + TTS + 9B = 15 GB fits in 16 GB.
    await _load(manager, fake, "qwen3.5-9b-fp8", "llm", 9.0)

    assert not manager.is_loaded("qwen3.5-4b")
    assert manager.is_loaded("cohere-transcribe")
    assert manager.is_loaded("chatterbox-multilingual")
    assert manager.is_loaded("qwen3.5-9b-fp8")


@pytest.mark.asyncio
async def test_evict_cascade_for_large_llm(manager):
    """A 12 GB LLM is expected to push out the old LLM and TTS, but keep ASR."""
    fake = FakeBackend()
    await _load(manager, fake, "cohere-transcribe", "asr", 4.0)
    await _load(manager, fake, "chatterbox-multilingual", "tts", 2.0)
    await _load(manager, fake, "qwen3.5-4b", "llm", 4.0)

    # 10 GB used, 6 GB free. gpt-oss-20b needs 12 GB: evicting the LLM (4 GB)
    # gives 10 GB free, evicting TTS (2 GB) gives 12 GB free, then it loads.
    await _load(manager, fake, "gpt-oss-20b", "llm", 12.0)

    assert not manager.is_loaded("qwen3.5-4b")
    assert not manager.is_loaded("chatterbox-multilingual")
    assert manager.is_loaded("cohere-transcribe")  # ASR survives if possible
    assert manager.is_loaded("gpt-oss-20b")


@pytest.mark.asyncio
async def test_asr_evicts_llm_not_reversed(manager):
    """When ASR request arrives and LLM is loaded, evict LLM (lower priority)."""
    fake = FakeBackend()
    await _load(manager, fake, "gpt-oss-20b", "llm", 13.0)

    # 13 GB used leaves 3 GB free; ASR needs 4 GB, so the LLM has to go.
    await _load(manager, fake, "cohere-transcribe", "asr", 4.0)

    assert not manager.is_loaded("gpt-oss-20b")
    assert manager.is_loaded("cohere-transcribe")


@pytest.mark.asyncio
async def test_already_loaded_is_noop(manager):
    """Requesting an already-loaded model must not hit the backend again."""
    fake = FakeBackend()

    for _ in range(2):
        await _load(manager, fake, "qwen3.5-4b", "llm", 4.0)

    assert fake.load_count == 1


@pytest.mark.asyncio
async def test_spec_scenario_switch_to_9b(manager):
    """Switching from the 4B to the 9B LLM keeps ASR and TTS, leaving 1 GB free."""
    fake = FakeBackend()
    await _load(manager, fake, "cohere-transcribe", "asr", 4.0)
    await _load(manager, fake, "chatterbox-multilingual", "tts", 2.0)
    await _load(manager, fake, "qwen3.5-4b", "llm", 4.0)

    await _load(manager, fake, "qwen3.5-9b-fp8", "llm", 9.0)

    assert manager.is_loaded("cohere-transcribe")
    assert manager.is_loaded("chatterbox-multilingual")
    assert manager.is_loaded("qwen3.5-9b-fp8")
    assert not manager.is_loaded("qwen3.5-4b")
    # 4 (ASR) + 2 (TTS) + 9 (LLM) = 15 GB of the 16 GB budget.
    assert manager.available_vram_gb == pytest.approx(1.0)


@pytest.mark.asyncio
async def test_get_loaded_models(manager):
    """``get_loaded_models`` is keyed by exactly the ids that were loaded."""
    fake = FakeBackend()
    await _load(manager, fake, "cohere-transcribe", "asr", 4.0)
    await _load(manager, fake, "qwen3.5-4b", "llm", 4.0)

    loaded_ids = set(manager.get_loaded_models().keys())

    assert loaded_ids == {"cohere-transcribe", "qwen3.5-4b"}