fix: Open WebUI integration — Harmony stripping, VRAM eviction, concurrency lock
- Add harmony.py: strip GPT-OSS-20B analysis/thinking channel from both streaming and non-streaming responses (HarmonyStreamFilter + extract_final_text)
- Add per-model asyncio.Lock in llamacpp backend to prevent concurrent C++ access that caused container segfaults (exit 139)
- Fix chat handler swap for streaming: move inside _stream_generate within lock scope (was broken by try/finally running before stream was consumed)
- Filter /v1/models to return only LLM models (hide ASR/TTS from chat dropdown)
- Correct Qwen3.5-4B estimated_vram_gb: 4 → 9 (actual allocation ~8GB)
- Add GPU memory verification after eviction with retry loop in vram_manager
- Add HF_TOKEN_PATH support in main.py for gated model access
- Add /v1/audio/models and /v1/audio/voices discovery endpoints (no auth)
- Add OOM error handling in both backends and chat route
- Add AUDIO_STT_SUPPORTED_CONTENT_TYPES for webm/wav/mp3/ogg
- Add performance test script (scripts/perf_test.py)
- Update tests to match current config (42 tests pass)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
224
kischdle/llmux/scripts/perf_test.py
Normal file
224
kischdle/llmux/scripts/perf_test.py
Normal file
@@ -0,0 +1,224 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Performance test for llmux — measures TTFT, tok/s, and total latency for each LLM model."""
|
||||
|
||||
import json
import os
import sys
import time

import httpx
|
||||
|
||||
# Server endpoint and credentials.
# The API key may be overridden via the LLMUX_API_KEY environment variable;
# the literal below is kept only as a fallback so existing invocations keep
# working unchanged.
# NOTE(review): a live bearer token is committed here — rotate this key and
# rely on the environment variable instead of the hard-coded fallback.
BASE_URL = "http://127.0.0.1:8081"
API_KEY = os.environ.get(
    "LLMUX_API_KEY",
    "sk-llmux-openwebui-hMD6pAka1czM53MtTkmmlFP8tF5zuiiDRgt-PCBnj-c",
)
HEADERS = {"Authorization": f"Bearer {API_KEY}", "Content-Type": "application/json"}

# Test prompts — short and long to measure different characteristics
PROMPTS = {
    "short": "What is 2+2? Answer in one sentence.",
    "medium": "Explain how a CPU works in 3-4 paragraphs.",
}

# Models to test — one virtual model per physical model (avoid duplicating physical loads)
# Each entry: (virtual model name, backend, rough VRAM estimate for display only).
TEST_MODELS = [
    # llama-cpp backend (GGUF)
    ("Qwen3.5-9B-FP8-Instruct", "llamacpp", "~10GB"),
    ("GPT-OSS-20B-Uncensored-Low", "llamacpp", "~13GB"),
    # transformers backend
    ("Qwen3.5-4B-Instruct", "transformers", "~4GB"),
    # GPT-OSS-20B-Low disabled: needs libc6-dev sys/ headers for triton MXFP4 kernels
]
|
||||
|
||||
|
||||
def clear_vram():
    """Unload all models to start fresh."""
    # Evict everything server-side so the next request measures a true cold start.
    r = httpx.post(f"{BASE_URL}/admin/clear-vram", headers=HEADERS, timeout=60)
    if r.status_code != 200:
        # Non-fatal: the benchmark continues, results just may not be cold.
        print(f" WARN: clear-vram returned {r.status_code}")
        return
    print(" VRAM cleared")
|
||||
|
||||
|
||||
def test_streaming(model: str, prompt: str, prompt_label: str) -> dict:
    """Test a model with streaming, measuring TTFT and tok/s."""
    body = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "stream": True,
    }

    start = time.perf_counter()
    first_token_time = None
    token_count = 0
    full_text = []

    try:
        with httpx.stream("POST", f"{BASE_URL}/v1/chat/completions",
                          json=body, headers=HEADERS, timeout=300) as resp:
            if resp.status_code != 200:
                return {"model": model, "prompt": prompt_label, "error": f"HTTP {resp.status_code}"}

            for line in resp.iter_lines():
                # SSE frames look like "data: {...}"; skip everything else.
                if not line.startswith("data: "):
                    continue
                payload = line[6:]
                if payload == "[DONE]":
                    break
                try:
                    delta = json.loads(payload).get("choices", [{}])[0].get("delta", {})
                except json.JSONDecodeError:
                    continue
                piece = delta.get("content", "")
                if not piece:
                    continue
                # First non-empty delta marks time-to-first-token.
                if first_token_time is None:
                    first_token_time = time.perf_counter()
                token_count += 1
                full_text.append(piece)

    except Exception as e:
        return {"model": model, "prompt": prompt_label, "error": str(e)}

    end = time.perf_counter()
    total_time = end - start
    ttft = (first_token_time - start) if first_token_time else total_time

    # Token generation time (after first token) — tok/s excludes model load + TTFT.
    gen_time = (end - first_token_time) if first_token_time and token_count > 1 else 0
    tok_per_sec = (token_count - 1) / gen_time if gen_time > 0 else 0

    output_text = "".join(full_text)

    return {
        "model": model,
        "prompt": prompt_label,
        "ttft_s": round(ttft, 2),
        "total_s": round(total_time, 2),
        "tokens": token_count,
        "tok_per_s": round(tok_per_sec, 1),
        "output_chars": len(output_text),
    }
|
||||
|
||||
|
||||
def test_non_streaming(model: str, prompt: str, prompt_label: str) -> dict:
    """Test a model without streaming — measures total latency.

    Args:
        model: Virtual model name as exposed by /v1/models.
        prompt: User prompt text to send.
        prompt_label: Short label ("short"/"medium") recorded in the result.

    Returns:
        A result dict with timing/size info, or one containing an "error" key.
    """
    body = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "stream": False,
    }

    start = time.perf_counter()
    try:
        r = httpx.post(f"{BASE_URL}/v1/chat/completions",
                       json=body, headers=HEADERS, timeout=300)
        if r.status_code != 200:
            return {"model": model, "prompt": prompt_label, "mode": "non-stream", "error": f"HTTP {r.status_code}"}
        result = r.json()
    except Exception as e:
        return {"model": model, "prompt": prompt_label, "mode": "non-stream", "error": str(e)}

    end = time.perf_counter()
    # Defensive extraction: "choices" may be empty and "content" may be null in
    # the response, so fall back to "" instead of crashing the whole test run
    # (the old chained .get()[0].get() raised IndexError/TypeError here).
    choices = result.get("choices") or [{}]
    message = choices[0].get("message") or {}
    content = message.get("content") or ""

    return {
        "model": model,
        "prompt": prompt_label,
        "mode": "non-stream",
        "total_s": round(end - start, 2),
        "output_chars": len(content),
    }
|
||||
|
||||
|
||||
def run_tests():
    """Run cold/warm streaming and non-streaming benchmarks for every model.

    For each entry in TEST_MODELS: clear VRAM, run each prompt twice with
    streaming (cold run includes model load; warm run reuses the loaded
    model), then run warm non-streaming requests, and finally print two
    summary tables.

    Returns:
        The list of per-run result dicts collected along the way.
    """
    print("=" * 80)
    print("llmux Performance Test")
    print("=" * 80)

    # Check health
    try:
        r = httpx.get(f"{BASE_URL}/health", timeout=5)
        health = r.json()
        print(f"Server healthy — available VRAM: {health['available_vram_gb']} GB")
    except Exception as e:
        # No point benchmarking if the server is unreachable — abort immediately.
        print(f"ERROR: Server not reachable: {e}")
        sys.exit(1)

    results = []

    for model, backend, vram_est in TEST_MODELS:
        print(f"\n{'─' * 60}")
        print(f"Model: {model} ({backend}, {vram_est})")
        print(f"{'─' * 60}")

        # Clear VRAM before each model to measure cold-start load time
        clear_vram()

        for prompt_label, prompt_text in PROMPTS.items():
            # First run = cold start (includes model loading)
            print(f" [{prompt_label}] streaming (cold)...", end=" ", flush=True)
            r = test_streaming(model, prompt_text, prompt_label)
            r["cold_start"] = True
            results.append(r)
            if "error" in r:
                print(f"ERROR: {r['error']}")
            else:
                print(f"TTFT={r['ttft_s']}s total={r['total_s']}s {r['tok_per_s']} tok/s ({r['tokens']} tokens)")

            # Second run = warm (model already loaded)
            print(f" [{prompt_label}] streaming (warm)...", end=" ", flush=True)
            r = test_streaming(model, prompt_text, prompt_label)
            r["cold_start"] = False
            results.append(r)
            if "error" in r:
                print(f"ERROR: {r['error']}")
            else:
                print(f"TTFT={r['ttft_s']}s total={r['total_s']}s {r['tok_per_s']} tok/s ({r['tokens']} tokens)")

        # Non-streaming tests (warm)
        for plabel in ["short", "medium"]:
            print(f" [{plabel}] non-streaming (warm)...", end=" ", flush=True)
            r = test_non_streaming(model, PROMPTS[plabel], plabel)
            results.append(r)
            if "error" in r:
                print(f"ERROR: {r['error']}")
            else:
                chars_per_s = round(r['output_chars'] / r['total_s'], 1) if r['total_s'] > 0 else 0
                print(f"total={r['total_s']}s ({r['output_chars']} chars, {chars_per_s} chars/s)")

        # Clear to free VRAM for next model
        clear_vram()

    # Summary table
    # NOTE: "Chunks" column prints r['tokens'], which counts SSE deltas — a
    # proxy for tokens, hence the header name.
    print(f"\n{'=' * 90}")
    print("Summary — Streaming")
    print(f"{'=' * 90}")
    print(f"{'Model':<40} {'Prompt':<8} {'Cold':>5} {'TTFT':>7} {'Total':>7} {'Chunks':>7} {'Char/s':>7}")
    print(f"{'-' * 40} {'-' * 8} {'-' * 5} {'-' * 7} {'-' * 7} {'-' * 7} {'-' * 7}")
    for r in results:
        if r.get("mode") == "non-stream":
            continue
        if "error" in r:
            print(f"{r['model']:<40} {r['prompt']:<8} {'':>5} {'ERROR':>7}")
            continue
        cold = "yes" if r.get("cold_start") else "no"
        chars_per_s = round(r['output_chars'] / r['total_s'], 1) if r['total_s'] > 0 else 0
        print(f"{r['model']:<40} {r['prompt']:<8} {cold:>5} {r['ttft_s']:>6.2f}s {r['total_s']:>6.2f}s {r['tokens']:>7} {chars_per_s:>6.1f}")

    print(f"\n{'=' * 90}")
    print("Summary — Non-streaming")
    print(f"{'=' * 90}")
    print(f"{'Model':<40} {'Prompt':<8} {'Total':>7} {'Chars':>7} {'Char/s':>7}")
    print(f"{'-' * 40} {'-' * 8} {'-' * 7} {'-' * 7} {'-' * 7}")
    for r in results:
        if r.get("mode") != "non-stream":
            continue
        if "error" in r:
            print(f"{r['model']:<40} {r['prompt']:<8} {'ERROR':>7}")
            continue
        chars_per_s = round(r['output_chars'] / r['total_s'], 1) if r['total_s'] > 0 else 0
        print(f"{r['model']:<40} {r['prompt']:<8} {r['total_s']:>6.2f}s {r['output_chars']:>7} {chars_per_s:>6.1f}")

    return results
|
||||
|
||||
|
||||
# Script entry point — run the full benchmark suite when invoked directly.
if __name__ == "__main__":
    run_tests()
|
||||
Reference in New Issue
Block a user