fix: Open WebUI integration — Harmony stripping, VRAM eviction, concurrency lock
- Add harmony.py: strip GPT-OSS-20B analysis/thinking channel from both streaming and non-streaming responses (HarmonyStreamFilter + extract_final_text)
- Add per-model asyncio.Lock in llamacpp backend to prevent concurrent C++ access that caused container segfaults (exit 139)
- Fix chat handler swap for streaming: move inside _stream_generate within lock scope (was broken by try/finally running before stream was consumed)
- Filter /v1/models to return only LLM models (hide ASR/TTS from chat dropdown)
- Correct Qwen3.5-4B estimated_vram_gb: 4 → 9 (actual allocation ~8GB)
- Add GPU memory verification after eviction with retry loop in vram_manager
- Add HF_TOKEN_PATH support in main.py for gated model access
- Add /v1/audio/models and /v1/audio/voices discovery endpoints (no auth)
- Add OOM error handling in both backends and chat route
- Add AUDIO_STT_SUPPORTED_CONTENT_TYPES for webm/wav/mp3/ogg
- Add performance test script (scripts/perf_test.py)
- Update tests to match current config (42 tests pass)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
55
kischdle/llmux/tests/test_harmony.py
Normal file
55
kischdle/llmux/tests/test_harmony.py
Normal file
@@ -0,0 +1,55 @@
|
||||
from llmux.harmony import extract_final_text, HarmonyStreamFilter
|
||||
|
||||
|
||||
def test_extract_llamacpp_format():
    """Tagged analysis + final channels reduce to the final-channel message only."""
    raw = '<|channel|>analysis<|message|>User greeting. Simple.<|end|><|start|>assistant<|channel|>final<|message|>Hello! How can I help you today?'
    expected = "Hello! How can I help you today?"
    assert extract_final_text(raw) == expected
|
||||
|
||||
|
||||
def test_extract_llamacpp_with_end_tag():
    """A trailing <|end|> tag after the final message is not part of the extracted text."""
    raw = '<|channel|>analysis<|message|>thinking...<|end|><|start|>assistant<|channel|>final<|message|>The answer is 42.<|end|>'
    expected = "The answer is 42."
    assert extract_final_text(raw) == expected
|
||||
|
||||
|
||||
def test_extract_transformers_format():
    """Untagged 'analysis...assistantfinal...' text reduces to what follows 'assistantfinal'."""
    raw = 'analysisUser greeting. Just respond friendly.assistantfinalHello! I am doing great.'
    expected = "Hello! I am doing great."
    assert extract_final_text(raw) == expected
|
||||
|
||||
|
||||
def test_extract_non_harmony_passthrough():
    """Text without any channel markers comes back unchanged."""
    plain = "Hello! I'm doing well, thanks for asking."
    result = extract_final_text(plain)
    assert result == plain
|
||||
|
||||
|
||||
def test_stream_filter_llamacpp():
    """Feeding tagged chunks one at a time emits only the final-channel text."""
    stream_filter = HarmonyStreamFilter()
    pieces = [
        "<|channel|>",
        "analysis",
        "<|message|>",
        "User ",
        "greeting.",
        "<|end|>",
        "<|start|>",
        "assistant",
        "<|channel|>",
        "final",
        "<|message|>",
        "Hello!",
        " How ",
        "are you?",
    ]
    # Feed every chunk in order first, then flush, so the call sequence
    # matches a consumer draining a stream to completion.
    emitted = [stream_filter.feed(piece) for piece in pieces]
    emitted.append(stream_filter.flush())
    assert "".join(emitted) == "Hello! How are you?"
|
||||
|
||||
|
||||
def test_stream_filter_transformers():
    """Feeding untagged 'analysis' / 'assistant' / 'final' chunks emits only the text after 'final'."""
    stream_filter = HarmonyStreamFilter()
    pieces = [
        "analysis",
        "User ",
        "greeting.",
        "assistant",
        "final",
        "Hello!",
        " Great day!",
    ]
    # All feed() calls happen before flush(), in chunk order.
    emitted = [stream_filter.feed(piece) for piece in pieces]
    emitted.append(stream_filter.flush())
    assert "".join(emitted) == "Hello! Great day!"
|
||||
|
||||
|
||||
def test_stream_filter_non_harmony():
    """Chunks with no channel markers pass through the filter intact once flushed."""
    stream_filter = HarmonyStreamFilter()
    pieces = ["Hello", " world", "!"]
    # All feed() calls happen before flush(), in chunk order.
    emitted = [stream_filter.feed(piece) for piece in pieces]
    emitted.append(stream_filter.flush())
    assert "".join(emitted) == "Hello world!"
|
||||
Reference in New Issue
Block a user