fix: Open WebUI integration — Harmony stripping, VRAM eviction, concurrency lock
- Add harmony.py: strip GPT-OSS-20B analysis/thinking channel from both streaming and non-streaming responses (HarmonyStreamFilter + extract_final_text)
- Add per-model asyncio.Lock in llamacpp backend to prevent concurrent C++ access that caused container segfaults (exit 139)
- Fix chat handler swap for streaming: move inside _stream_generate within lock scope (was broken by try/finally running before stream was consumed)
- Filter /v1/models to return only LLM models (hide ASR/TTS from chat dropdown)
- Correct Qwen3.5-4B estimated_vram_gb: 4 → 9 (actual allocation ~8GB)
- Add GPU memory verification after eviction with retry loop in vram_manager
- Add HF_TOKEN_PATH support in main.py for gated model access
- Add /v1/audio/models and /v1/audio/voices discovery endpoints (no auth)
- Add OOM error handling in both backends and chat route
- Add AUDIO_STT_SUPPORTED_CONTENT_TYPES for webm/wav/mp3/ogg
- Add performance test script (scripts/perf_test.py)
- Update tests to match current config (42 tests pass)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
224
kischdle/llmux/scripts/perf_test.py
Normal file
224
kischdle/llmux/scripts/perf_test.py
Normal file
@@ -0,0 +1,224 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Performance test for llmux — measures TTFT, tok/s, and total latency for each LLM model."""
|
||||
|
||||
import json
import os
import sys
import time

import httpx
|
||||
|
||||
# Server endpoint and credentials.
# The API key may be overridden via the LLMUX_API_KEY environment variable;
# the literal below is kept only as a fallback so existing invocations keep
# working unchanged.
# NOTE(review): a live bearer token is committed here — rotate this key and
# rely on the environment variable instead of the hard-coded fallback.
BASE_URL = "http://127.0.0.1:8081"
API_KEY = os.environ.get(
    "LLMUX_API_KEY",
    "sk-llmux-openwebui-hMD6pAka1czM53MtTkmmlFP8tF5zuiiDRgt-PCBnj-c",
)
HEADERS = {"Authorization": f"Bearer {API_KEY}", "Content-Type": "application/json"}

# Test prompts — short and long to measure different characteristics
PROMPTS = {
    "short": "What is 2+2? Answer in one sentence.",
    "medium": "Explain how a CPU works in 3-4 paragraphs.",
}

# Models to test — one virtual model per physical model (avoid duplicating physical loads)
# Each entry: (virtual model name, backend, rough VRAM estimate for display only).
TEST_MODELS = [
    # llama-cpp backend (GGUF)
    ("Qwen3.5-9B-FP8-Instruct", "llamacpp", "~10GB"),
    ("GPT-OSS-20B-Uncensored-Low", "llamacpp", "~13GB"),
    # transformers backend
    ("Qwen3.5-4B-Instruct", "transformers", "~4GB"),
    # GPT-OSS-20B-Low disabled: needs libc6-dev sys/ headers for triton MXFP4 kernels
]
|
||||
|
||||
|
||||
def clear_vram():
    """Unload all models to start fresh."""
    # Evict everything server-side so the next request measures a true cold start.
    r = httpx.post(f"{BASE_URL}/admin/clear-vram", headers=HEADERS, timeout=60)
    if r.status_code != 200:
        # Non-fatal: the benchmark continues, results just may not be cold.
        print(f" WARN: clear-vram returned {r.status_code}")
        return
    print(" VRAM cleared")
|
||||
|
||||
|
||||
def test_streaming(model: str, prompt: str, prompt_label: str) -> dict:
    """Test a model with streaming, measuring TTFT and tok/s."""
    body = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "stream": True,
    }

    start = time.perf_counter()
    first_token_time = None
    token_count = 0
    full_text = []

    try:
        with httpx.stream("POST", f"{BASE_URL}/v1/chat/completions",
                          json=body, headers=HEADERS, timeout=300) as resp:
            if resp.status_code != 200:
                return {"model": model, "prompt": prompt_label, "error": f"HTTP {resp.status_code}"}

            for line in resp.iter_lines():
                # SSE frames look like "data: {...}"; skip everything else.
                if not line.startswith("data: "):
                    continue
                payload = line[6:]
                if payload == "[DONE]":
                    break
                try:
                    delta = json.loads(payload).get("choices", [{}])[0].get("delta", {})
                except json.JSONDecodeError:
                    continue
                piece = delta.get("content", "")
                if not piece:
                    continue
                # First non-empty delta marks time-to-first-token.
                if first_token_time is None:
                    first_token_time = time.perf_counter()
                token_count += 1
                full_text.append(piece)

    except Exception as e:
        return {"model": model, "prompt": prompt_label, "error": str(e)}

    end = time.perf_counter()
    total_time = end - start
    ttft = (first_token_time - start) if first_token_time else total_time

    # Token generation time (after first token) — tok/s excludes model load + TTFT.
    gen_time = (end - first_token_time) if first_token_time and token_count > 1 else 0
    tok_per_sec = (token_count - 1) / gen_time if gen_time > 0 else 0

    output_text = "".join(full_text)

    return {
        "model": model,
        "prompt": prompt_label,
        "ttft_s": round(ttft, 2),
        "total_s": round(total_time, 2),
        "tokens": token_count,
        "tok_per_s": round(tok_per_sec, 1),
        "output_chars": len(output_text),
    }
|
||||
|
||||
|
||||
def test_non_streaming(model: str, prompt: str, prompt_label: str) -> dict:
    """Test a model without streaming — measures total latency.

    Args:
        model: Virtual model name as exposed by /v1/models.
        prompt: User prompt text to send.
        prompt_label: Short label ("short"/"medium") recorded in the result.

    Returns:
        A result dict with timing/size info, or one containing an "error" key.
    """
    body = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "stream": False,
    }

    start = time.perf_counter()
    try:
        r = httpx.post(f"{BASE_URL}/v1/chat/completions",
                       json=body, headers=HEADERS, timeout=300)
        if r.status_code != 200:
            return {"model": model, "prompt": prompt_label, "mode": "non-stream", "error": f"HTTP {r.status_code}"}
        result = r.json()
    except Exception as e:
        return {"model": model, "prompt": prompt_label, "mode": "non-stream", "error": str(e)}

    end = time.perf_counter()
    # Defensive extraction: "choices" may be empty and "content" may be null in
    # the response, so fall back to "" instead of crashing the whole test run
    # (the old chained .get()[0].get() raised IndexError/TypeError here).
    choices = result.get("choices") or [{}]
    message = choices[0].get("message") or {}
    content = message.get("content") or ""

    return {
        "model": model,
        "prompt": prompt_label,
        "mode": "non-stream",
        "total_s": round(end - start, 2),
        "output_chars": len(content),
    }
|
||||
|
||||
|
||||
def run_tests():
    """Run cold/warm streaming and non-streaming benchmarks for every model.

    For each entry in TEST_MODELS: clear VRAM, run each prompt twice with
    streaming (cold run includes model load; warm run reuses the loaded
    model), then run warm non-streaming requests, and finally print two
    summary tables.

    Returns:
        The list of per-run result dicts collected along the way.
    """
    print("=" * 80)
    print("llmux Performance Test")
    print("=" * 80)

    # Check health
    try:
        r = httpx.get(f"{BASE_URL}/health", timeout=5)
        health = r.json()
        print(f"Server healthy — available VRAM: {health['available_vram_gb']} GB")
    except Exception as e:
        # No point benchmarking if the server is unreachable — abort immediately.
        print(f"ERROR: Server not reachable: {e}")
        sys.exit(1)

    results = []

    for model, backend, vram_est in TEST_MODELS:
        print(f"\n{'─' * 60}")
        print(f"Model: {model} ({backend}, {vram_est})")
        print(f"{'─' * 60}")

        # Clear VRAM before each model to measure cold-start load time
        clear_vram()

        for prompt_label, prompt_text in PROMPTS.items():
            # First run = cold start (includes model loading)
            print(f" [{prompt_label}] streaming (cold)...", end=" ", flush=True)
            r = test_streaming(model, prompt_text, prompt_label)
            r["cold_start"] = True
            results.append(r)
            if "error" in r:
                print(f"ERROR: {r['error']}")
            else:
                print(f"TTFT={r['ttft_s']}s total={r['total_s']}s {r['tok_per_s']} tok/s ({r['tokens']} tokens)")

            # Second run = warm (model already loaded)
            print(f" [{prompt_label}] streaming (warm)...", end=" ", flush=True)
            r = test_streaming(model, prompt_text, prompt_label)
            r["cold_start"] = False
            results.append(r)
            if "error" in r:
                print(f"ERROR: {r['error']}")
            else:
                print(f"TTFT={r['ttft_s']}s total={r['total_s']}s {r['tok_per_s']} tok/s ({r['tokens']} tokens)")

        # Non-streaming tests (warm)
        for plabel in ["short", "medium"]:
            print(f" [{plabel}] non-streaming (warm)...", end=" ", flush=True)
            r = test_non_streaming(model, PROMPTS[plabel], plabel)
            results.append(r)
            if "error" in r:
                print(f"ERROR: {r['error']}")
            else:
                chars_per_s = round(r['output_chars'] / r['total_s'], 1) if r['total_s'] > 0 else 0
                print(f"total={r['total_s']}s ({r['output_chars']} chars, {chars_per_s} chars/s)")

        # Clear to free VRAM for next model
        clear_vram()

    # Summary table
    # NOTE: "Chunks" column prints r['tokens'], which counts SSE deltas — a
    # proxy for tokens, hence the header name.
    print(f"\n{'=' * 90}")
    print("Summary — Streaming")
    print(f"{'=' * 90}")
    print(f"{'Model':<40} {'Prompt':<8} {'Cold':>5} {'TTFT':>7} {'Total':>7} {'Chunks':>7} {'Char/s':>7}")
    print(f"{'-' * 40} {'-' * 8} {'-' * 5} {'-' * 7} {'-' * 7} {'-' * 7} {'-' * 7}")
    for r in results:
        if r.get("mode") == "non-stream":
            continue
        if "error" in r:
            print(f"{r['model']:<40} {r['prompt']:<8} {'':>5} {'ERROR':>7}")
            continue
        cold = "yes" if r.get("cold_start") else "no"
        chars_per_s = round(r['output_chars'] / r['total_s'], 1) if r['total_s'] > 0 else 0
        print(f"{r['model']:<40} {r['prompt']:<8} {cold:>5} {r['ttft_s']:>6.2f}s {r['total_s']:>6.2f}s {r['tokens']:>7} {chars_per_s:>6.1f}")

    print(f"\n{'=' * 90}")
    print("Summary — Non-streaming")
    print(f"{'=' * 90}")
    print(f"{'Model':<40} {'Prompt':<8} {'Total':>7} {'Chars':>7} {'Char/s':>7}")
    print(f"{'-' * 40} {'-' * 8} {'-' * 7} {'-' * 7} {'-' * 7}")
    for r in results:
        if r.get("mode") != "non-stream":
            continue
        if "error" in r:
            print(f"{r['model']:<40} {r['prompt']:<8} {'ERROR':>7}")
            continue
        chars_per_s = round(r['output_chars'] / r['total_s'], 1) if r['total_s'] > 0 else 0
        print(f"{r['model']:<40} {r['prompt']:<8} {r['total_s']:>6.2f}s {r['output_chars']:>7} {chars_per_s:>6.1f}")

    return results
|
||||
|
||||
|
||||
# Script entry point — run the full benchmark suite when invoked directly.
if __name__ == "__main__":
    run_tests()
|
||||
Reference in New Issue
Block a user