- Add harmony.py: strip GPT-OSS-20B analysis/thinking channel from both streaming and non-streaming responses (HarmonyStreamFilter + extract_final_text)
- Add per-model asyncio.Lock in llamacpp backend to prevent concurrent C++ access that caused container segfaults (exit 139); see the first sketch below
- Fix chat handler swap for streaming: move inside _stream_generate within lock scope (was broken by try/finally running before the stream was consumed)
- Filter /v1/models to return only LLM models (hide ASR/TTS from chat dropdown)
- Correct Qwen3.5-4B estimated_vram_gb: 4 → 9 (actual allocation ~8GB)
- Add GPU memory verification after eviction with retry loop in vram_manager; see the second sketch below
- Add HF_TOKEN_PATH support in main.py for gated model access
- Add /v1/audio/models and /v1/audio/voices discovery endpoints (no auth)
- Add OOM error handling in both backends and chat route
- Add AUDIO_STT_SUPPORTED_CONTENT_TYPES for webm/wav/mp3/ogg
- Add performance test script (scripts/perf_test.py)
- Update tests to match current config (42 tests pass)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
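A minimal sketch of the per-model lock described above, assuming the backend keeps one llama.cpp handle per model_id; the class and method names here are illustrative, not llmux's actual code:

```python
import asyncio
from collections import defaultdict


class LlamaCppBackend:
    def __init__(self):
        self._models: dict[str, object] = {}
        # One asyncio.Lock per model: llama.cpp contexts are not safe under
        # concurrent access, and two in-flight requests against the same
        # handle can corrupt C++ state and segfault the container (exit 139).
        self._locks: dict[str, asyncio.Lock] = defaultdict(asyncio.Lock)

    async def generate(self, model_id: str, prompt: str) -> str:
        # Serialize access per model; other models still run concurrently.
        async with self._locks[model_id]:
            return await asyncio.to_thread(self._infer, model_id, prompt)

    def _infer(self, model_id: str, prompt: str) -> str:
        raise NotImplementedError  # blocking llama.cpp call would go here
```

The streaming fix follows the same reasoning: the lock (and the swapped-in chat handler) must stay in effect until the stream is fully consumed, so the swap lives inside _stream_generate itself rather than in a try/finally that unwinds as soon as the generator object is returned.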
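And a hedged sketch of the post-eviction GPU memory verification: the function name and retry parameters are invented for illustration, but the idea is to poll the driver until the freed memory is actually visible, since unloading a model releases VRAM asynchronously:

```python
import asyncio

import torch


async def wait_for_vram(needed_gb: float, retries: int = 10, delay_s: float = 0.5) -> bool:
    """Poll the GPU after eviction until enough memory is really free."""
    needed_bytes = int(needed_gb * 1024**3)
    for _ in range(retries):
        torch.cuda.empty_cache()  # hand cached allocator blocks back to the driver
        free_bytes, _total = torch.cuda.mem_get_info()  # bytes free per the driver
        if free_bytes >= needed_bytes:
            return True
        await asyncio.sleep(delay_s)  # give the allocator/driver time to settle
    return False
```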
155 lines · 6.9 KiB · Python
import asyncio

import pytest

from llmux.vram_manager import VRAMManager, ModelSlot
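
# NOTE (inferred from the tests below, not from vram_manager's source):
# priority_rank orders models by how expendable they are under VRAM pressure.
# llm (rank 0) is evicted first, then tts (rank 1); asr (rank 2) goes only as
# a last resort, so a higher rank means the model survives longer.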


class FakeBackend:
    """Simulates a backend that tracks load/unload calls."""

    def __init__(self):
        self.loaded = {}
        self.load_count = 0
        self.unload_count = 0

    async def load(self, model_id: str):
        self.loaded[model_id] = True
        self.load_count += 1

    async def unload(self, model_id: str):
        self.loaded.pop(model_id, None)
        self.unload_count += 1
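
# FakeBackend mirrors the minimal protocol the manager presumably expects of
# real backends: async load(model_id) / unload(model_id). The real ones would
# allocate and free GPU memory here; the fake only does bookkeeping.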


@pytest.fixture
def manager():
    return VRAMManager(total_vram_gb=16.0, verify_gpu=False)
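
# verify_gpu=False skips the post-eviction GPU memory verification (the retry
# loop mentioned in the commit message), so the suite runs without a GPU and
# exercises only the manager's bookkeeping.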


def test_priority_ordering():
    assert ModelSlot.priority_rank("llm") == 0
    assert ModelSlot.priority_rank("tts") == 1
    assert ModelSlot.priority_rank("asr") == 2


@pytest.mark.asyncio
async def test_load_into_empty_vram(manager):
    backend = FakeBackend()
    await manager.load_model("qwen3.5-4b", model_type="llm", vram_gb=4.0, backend=backend)
    assert manager.is_loaded("qwen3.5-4b")
    assert manager.available_vram_gb == pytest.approx(12.0)


@pytest.mark.asyncio
async def test_load_alongside_when_fits(manager):
    backend = FakeBackend()
    await manager.load_model("cohere-transcribe", model_type="asr", vram_gb=4.0, backend=backend)
    await manager.load_model("qwen3.5-4b", model_type="llm", vram_gb=4.0, backend=backend)
    assert manager.is_loaded("cohere-transcribe")
    assert manager.is_loaded("qwen3.5-4b")
    assert manager.available_vram_gb == pytest.approx(8.0)


@pytest.mark.asyncio
async def test_evict_llm_first(manager):
    backend = FakeBackend()
    await manager.load_model("cohere-transcribe", model_type="asr", vram_gb=4.0, backend=backend)
    await manager.load_model("chatterbox-multilingual", model_type="tts", vram_gb=2.0, backend=backend)
    await manager.load_model("qwen3.5-4b", model_type="llm", vram_gb=4.0, backend=backend)
    # 10 GB used, 6 free. Loading the 9B needs 9GB: evict the 4B LLM -> free=10.
    # ASR(4) + TTS(2) + 9B(9) = 15 <= 16, fits.
    await manager.load_model("qwen3.5-9b-fp8", model_type="llm", vram_gb=9.0, backend=backend)
    assert not manager.is_loaded("qwen3.5-4b")
    assert manager.is_loaded("cohere-transcribe")
    assert manager.is_loaded("chatterbox-multilingual")
    assert manager.is_loaded("qwen3.5-9b-fp8")


@pytest.mark.asyncio
async def test_evict_cascade_asr_survives(manager):
    """When LLM fits alongside ASR after evicting LLM+TTS, ASR survives."""
    backend = FakeBackend()
    await manager.load_model("cohere-transcribe", model_type="asr", vram_gb=4.0, backend=backend)
    await manager.load_model("chatterbox-multilingual", model_type="tts", vram_gb=2.0, backend=backend)
    await manager.load_model("qwen3.5-4b", model_type="llm", vram_gb=4.0, backend=backend)
    # 10 GB used. Need 12GB. Evict LLM(4)->free=10. Evict TTS(2)->free=12. ASR+12=16, fits.
    await manager.load_model("large-llm", model_type="llm", vram_gb=12.0, backend=backend)
    assert not manager.is_loaded("qwen3.5-4b")
    assert not manager.is_loaded("chatterbox-multilingual")
    assert manager.is_loaded("cohere-transcribe")  # ASR survives
    assert manager.is_loaded("large-llm")


@pytest.mark.asyncio
async def test_evict_cascade_full_for_huge_llm(manager):
    """When LLM is too large to fit alongside ASR, everything gets evicted."""
    backend = FakeBackend()
    await manager.load_model("cohere-transcribe", model_type="asr", vram_gb=4.0, backend=backend)
    await manager.load_model("chatterbox-multilingual", model_type="tts", vram_gb=2.0, backend=backend)
    await manager.load_model("qwen3.5-4b", model_type="llm", vram_gb=4.0, backend=backend)
    # 10 GB used. gpt-oss-20b needs 13GB. Evict LLM(4)->free=10. TTS(2)->free=12.
    # ASR(4)->free=16. Load alone.
    await manager.load_model("gpt-oss-20b", model_type="llm", vram_gb=13.0, backend=backend)
    assert not manager.is_loaded("qwen3.5-4b")
    assert not manager.is_loaded("chatterbox-multilingual")
    assert not manager.is_loaded("cohere-transcribe")  # ASR evicted as last resort
    assert manager.is_loaded("gpt-oss-20b")


@pytest.mark.asyncio
async def test_tts_cannot_evict_asr(manager):
    """A TTS request must never evict ASR; it reclaims VRAM from other models
    (here, the previously loaded TTS) instead."""
    backend = FakeBackend()
    await manager.load_model("cohere-transcribe", model_type="asr", vram_gb=4.0, backend=backend)
    await manager.load_model("qwen3.5-9b-fp8", model_type="llm", vram_gb=9.0, backend=backend)
    # 13GB used, 3GB free. TTS needs 2GB, so it fits and loads alongside.
    await manager.load_model("chatterbox", model_type="tts", vram_gb=2.0, backend=backend)
    assert manager.is_loaded("cohere-transcribe")
    assert manager.is_loaded("qwen3.5-9b-fp8")
    assert manager.is_loaded("chatterbox")
    # Now replace TTS with another one that needs eviction:
    # 15GB used, 1GB free. New TTS needs 2GB. Evict old TTS(2)->free=3. Load.
    await manager.load_model("chatterbox-ml", model_type="tts", vram_gb=2.0, backend=backend)
    assert manager.is_loaded("cohere-transcribe")  # ASR must survive
    assert manager.is_loaded("chatterbox-ml")


@pytest.mark.asyncio
async def test_asr_evicts_llm_not_reversed(manager):
    """When ASR request arrives and LLM is loaded, evict LLM (lower priority)."""
    backend = FakeBackend()
    await manager.load_model("gpt-oss-20b", model_type="llm", vram_gb=13.0, backend=backend)
    # 13GB used, 3GB free. ASR needs 4GB. Must evict LLM.
    await manager.load_model("cohere-transcribe", model_type="asr", vram_gb=4.0, backend=backend)
    assert not manager.is_loaded("gpt-oss-20b")
    assert manager.is_loaded("cohere-transcribe")


@pytest.mark.asyncio
async def test_already_loaded_is_noop(manager):
    backend = FakeBackend()
    await manager.load_model("qwen3.5-4b", model_type="llm", vram_gb=4.0, backend=backend)
    await manager.load_model("qwen3.5-4b", model_type="llm", vram_gb=4.0, backend=backend)
    assert backend.load_count == 1


@pytest.mark.asyncio
async def test_spec_scenario_switch_to_9b(manager):
    backend = FakeBackend()
    await manager.load_model("cohere-transcribe", model_type="asr", vram_gb=4.0, backend=backend)
    await manager.load_model("chatterbox-multilingual", model_type="tts", vram_gb=2.0, backend=backend)
    await manager.load_model("qwen3.5-4b", model_type="llm", vram_gb=4.0, backend=backend)
    await manager.load_model("qwen3.5-9b-fp8", model_type="llm", vram_gb=9.0, backend=backend)
    assert manager.is_loaded("cohere-transcribe")
    assert manager.is_loaded("chatterbox-multilingual")
    assert manager.is_loaded("qwen3.5-9b-fp8")
    assert not manager.is_loaded("qwen3.5-4b")
    assert manager.available_vram_gb == pytest.approx(1.0)


@pytest.mark.asyncio
async def test_get_loaded_models(manager):
    backend = FakeBackend()
    await manager.load_model("cohere-transcribe", model_type="asr", vram_gb=4.0, backend=backend)
    await manager.load_model("qwen3.5-4b", model_type="llm", vram_gb=4.0, backend=backend)
    loaded = manager.get_loaded_models()
    assert set(loaded.keys()) == {"cohere-transcribe", "qwen3.5-4b"}