fix: Open WebUI integration — Harmony stripping, VRAM eviction, concurrency lock
- Add harmony.py: strip GPT-OSS-20B analysis/thinking channel from both streaming and non-streaming responses (HarmonyStreamFilter + extract_final_text)
- Add per-model asyncio.Lock in llamacpp backend to prevent concurrent C++ access that caused container segfaults (exit 139)
- Fix chat handler swap for streaming: move inside _stream_generate within lock scope (was broken by try/finally running before stream was consumed)
- Filter /v1/models to return only LLM models (hide ASR/TTS from chat dropdown)
- Correct Qwen3.5-4B estimated_vram_gb: 4 → 9 (actual allocation ~8GB)
- Add GPU memory verification after eviction with retry loop in vram_manager
- Add HF_TOKEN_PATH support in main.py for gated model access
- Add /v1/audio/models and /v1/audio/voices discovery endpoints (no auth)
- Add OOM error handling in both backends and chat route
- Add AUDIO_STT_SUPPORTED_CONTENT_TYPES for webm/wav/mp3/ogg
- Add performance test script (scripts/perf_test.py)
- Update tests to match current config (42 tests pass)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -10,12 +10,12 @@ def registry():
|
||||
|
||||
def test_list_virtual_models(registry):
|
||||
models = registry.list_virtual_models()
|
||||
assert len(models) == 15
|
||||
assert len(models) == 12 # only LLM models, not ASR/TTS
|
||||
names = [m["id"] for m in models]
|
||||
assert "Qwen3.5-9B-FP8-Thinking" in names
|
||||
assert "GPT-OSS-20B-High" in names
|
||||
assert "cohere-transcribe" in names
|
||||
assert "Chatterbox-Multilingual" in names
|
||||
assert "cohere-transcribe" not in names
|
||||
assert "Chatterbox-Multilingual" not in names
|
||||
|
||||
|
||||
def test_virtual_model_openai_format(registry):
|
||||
@@ -28,7 +28,7 @@ def test_virtual_model_openai_format(registry):
|
||||
def test_resolve_virtual_to_physical(registry):
|
||||
physical_id, physical, params = registry.resolve("Qwen3.5-9B-FP8-Thinking")
|
||||
assert physical_id == "qwen3.5-9b-fp8"
|
||||
assert physical.backend == "transformers"
|
||||
assert physical.backend == "llamacpp"
|
||||
assert params == {"enable_thinking": True}
|
||||
|
||||
|
||||
@@ -58,7 +58,7 @@ def test_resolve_unknown_model_raises(registry):
|
||||
def test_get_physical(registry):
|
||||
physical = registry.get_physical("qwen3.5-9b-fp8")
|
||||
assert physical.type == "llm"
|
||||
assert physical.estimated_vram_gb == 9
|
||||
assert physical.estimated_vram_gb == 10
|
||||
|
||||
|
||||
def test_get_physical_unknown_raises(registry):
|
||||
|
||||
Reference in New Issue
Block a user