- Add harmony.py: strip GPT-OSS-20B analysis/thinking channel from both streaming and non-streaming responses (HarmonyStreamFilter + extract_final_text) - Add per-model asyncio.Lock in llamacpp backend to prevent concurrent C++ access that caused container segfaults (exit 139) - Fix chat handler swap for streaming: move inside _stream_generate within lock scope (was broken by try/finally running before stream was consumed) - Filter /v1/models to return only LLM models (hide ASR/TTS from chat dropdown) - Correct Qwen3.5-4B estimated_vram_gb: 4 → 9 (actual allocation ~8GB) - Add GPU memory verification after eviction with retry loop in vram_manager - Add HF_TOKEN_PATH support in main.py for gated model access - Add /v1/audio/models and /v1/audio/voices discovery endpoints (no auth) - Add OOM error handling in both backends and chat route - Add AUDIO_STT_SUPPORTED_CONTENT_TYPES for webm/wav/mp3/ogg - Add performance test script (scripts/perf_test.py) - Update tests to match current config (42 tests pass) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
225 lines
8.1 KiB
Python
225 lines
8.1 KiB
Python
#!/usr/bin/env python3
|
|
"""Performance test for llmux — measures TTFT, tok/s, and total latency for each LLM model."""
|
|
|
|
import json
import os
import sys
import time

import httpx
|
|
|
|
# Server endpoint under test.
BASE_URL = "http://127.0.0.1:8081"

# NOTE(security): the API key used to be hardcoded here, committing a live
# secret to source control. Prefer the LLMUX_API_KEY environment variable;
# the old literal remains as a backward-compatible fallback.
API_KEY = os.environ.get(
    "LLMUX_API_KEY",
    "sk-llmux-openwebui-hMD6pAka1czM53MtTkmmlFP8tF5zuiiDRgt-PCBnj-c",
)
HEADERS = {"Authorization": f"Bearer {API_KEY}", "Content-Type": "application/json"}

# Test prompts — short and long to measure different characteristics
PROMPTS = {
    "short": "What is 2+2? Answer in one sentence.",
    "medium": "Explain how a CPU works in 3-4 paragraphs.",
}

# Models to test — one virtual model per physical model (avoid duplicating physical loads)
# Each entry: (virtual model name, backend, rough VRAM footprint for display only).
TEST_MODELS = [
    # llama-cpp backend (GGUF)
    ("Qwen3.5-9B-FP8-Instruct", "llamacpp", "~10GB"),
    ("GPT-OSS-20B-Uncensored-Low", "llamacpp", "~13GB"),
    # transformers backend
    ("Qwen3.5-4B-Instruct", "transformers", "~4GB"),
    # GPT-OSS-20B-Low disabled: needs libc6-dev sys/ headers for triton MXFP4 kernels
]
|
|
|
|
|
|
def clear_vram():
    """Ask the server to unload every model so the next run starts cold."""
    resp = httpx.post(f"{BASE_URL}/admin/clear-vram", headers=HEADERS, timeout=60)
    # Non-200 is reported but not fatal — the benchmark keeps going.
    if resp.status_code != 200:
        print(f" WARN: clear-vram returned {resp.status_code}")
        return
    print(" VRAM cleared")
|
|
|
|
|
|
def test_streaming(model: str, prompt: str, prompt_label: str) -> dict:
    """Stream one chat completion, measuring TTFT, chunk rate, and latency.

    Returns a result dict with timing fields on success; on any failure the
    dict carries an "error" key instead.
    """
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "stream": True,
    }

    t_start = time.perf_counter()
    t_first = None      # perf_counter() at the first non-empty content delta
    n_chunks = 0        # SSE chunks that carried content (proxy for tokens)
    pieces = []         # accumulated content fragments

    try:
        with httpx.stream("POST", f"{BASE_URL}/v1/chat/completions",
                          json=payload, headers=HEADERS, timeout=300) as resp:
            if resp.status_code != 200:
                return {"model": model, "prompt": prompt_label, "error": f"HTTP {resp.status_code}"}

            for raw in resp.iter_lines():
                # Only SSE data lines matter; skip keep-alives and comments.
                if not raw.startswith("data: "):
                    continue
                body = raw[6:]
                if body == "[DONE]":
                    break
                try:
                    parsed = json.loads(body)
                except json.JSONDecodeError:
                    continue
                text = parsed.get("choices", [{}])[0].get("delta", {}).get("content", "")
                if not text:
                    continue
                if t_first is None:
                    t_first = time.perf_counter()
                n_chunks += 1
                pieces.append(text)

    except Exception as e:
        return {"model": model, "prompt": prompt_label, "error": str(e)}

    t_end = time.perf_counter()
    elapsed = t_end - t_start
    # If no content ever arrived, report total time as the TTFT.
    ttft = (t_first - t_start) if t_first else elapsed

    # Generation rate is measured from the first chunk onward, so it excludes
    # model-load and prefill time.
    gen_time = (t_end - t_first) if t_first and n_chunks > 1 else 0
    rate = (n_chunks - 1) / gen_time if gen_time > 0 else 0

    joined = "".join(pieces)

    return {
        "model": model,
        "prompt": prompt_label,
        "ttft_s": round(ttft, 2),
        "total_s": round(elapsed, 2),
        "tokens": n_chunks,
        "tok_per_s": round(rate, 1),
        "output_chars": len(joined),
    }
|
|
|
|
|
|
def test_non_streaming(model: str, prompt: str, prompt_label: str) -> dict:
    """Issue one blocking chat completion and measure end-to-end latency.

    Returns a result dict tagged mode="non-stream"; on failure the dict
    carries an "error" key instead of timing fields.
    """
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "stream": False,
    }

    t0 = time.perf_counter()
    try:
        resp = httpx.post(f"{BASE_URL}/v1/chat/completions",
                          json=payload, headers=HEADERS, timeout=300)
        if resp.status_code != 200:
            return {"model": model, "prompt": prompt_label, "mode": "non-stream", "error": f"HTTP {resp.status_code}"}
        data = resp.json()
    except Exception as e:
        return {"model": model, "prompt": prompt_label, "mode": "non-stream", "error": str(e)}
    t1 = time.perf_counter()

    text = data.get("choices", [{}])[0].get("message", {}).get("content", "")

    return {
        "model": model,
        "prompt": prompt_label,
        "mode": "non-stream",
        "total_s": round(t1 - t0, 2),
        "output_chars": len(text),
    }
|
|
|
|
|
|
def run_tests():
    """Run the full benchmark and print two summary tables.

    For each model in TEST_MODELS: clear VRAM, then per prompt run a cold
    streaming pass (includes model load), a warm streaming pass, and warm
    non-streaming passes. Returns the raw list of per-run result dicts.
    """
    print("=" * 80)
    print("llmux Performance Test")
    print("=" * 80)

    # Check health — abort early if the server is down rather than failing
    # on the first completion request.
    try:
        r = httpx.get(f"{BASE_URL}/health", timeout=5)
        health = r.json()
        # NOTE(review): assumes /health always includes available_vram_gb —
        # a schema change would raise KeyError here; confirm against server.
        print(f"Server healthy — available VRAM: {health['available_vram_gb']} GB")
    except Exception as e:
        print(f"ERROR: Server not reachable: {e}")
        sys.exit(1)

    results = []

    for model, backend, vram_est in TEST_MODELS:
        print(f"\n{'─' * 60}")
        print(f"Model: {model} ({backend}, {vram_est})")
        print(f"{'─' * 60}")

        # Clear VRAM before each model to measure cold-start load time
        clear_vram()

        for prompt_label, prompt_text in PROMPTS.items():
            # First run = cold start (includes model loading)
            print(f" [{prompt_label}] streaming (cold)...", end=" ", flush=True)
            r = test_streaming(model, prompt_text, prompt_label)
            r["cold_start"] = True
            results.append(r)
            if "error" in r:
                print(f"ERROR: {r['error']}")
            else:
                print(f"TTFT={r['ttft_s']}s total={r['total_s']}s {r['tok_per_s']} tok/s ({r['tokens']} tokens)")

            # Second run = warm (model already loaded)
            print(f" [{prompt_label}] streaming (warm)...", end=" ", flush=True)
            r = test_streaming(model, prompt_text, prompt_label)
            r["cold_start"] = False
            results.append(r)
            if "error" in r:
                print(f"ERROR: {r['error']}")
            else:
                print(f"TTFT={r['ttft_s']}s total={r['total_s']}s {r['tok_per_s']} tok/s ({r['tokens']} tokens)")

        # Non-streaming tests (warm) — model is already loaded by the
        # streaming passes above.
        for plabel in ["short", "medium"]:
            print(f" [{plabel}] non-streaming (warm)...", end=" ", flush=True)
            r = test_non_streaming(model, PROMPTS[plabel], plabel)
            results.append(r)
            if "error" in r:
                print(f"ERROR: {r['error']}")
            else:
                chars_per_s = round(r['output_chars'] / r['total_s'], 1) if r['total_s'] > 0 else 0
                print(f"total={r['total_s']}s ({r['output_chars']} chars, {chars_per_s} chars/s)")

        # Clear to free VRAM for next model
        clear_vram()

    # Summary table — streaming runs only (non-stream rows are skipped).
    print(f"\n{'=' * 90}")
    print("Summary — Streaming")
    print(f"{'=' * 90}")
    print(f"{'Model':<40} {'Prompt':<8} {'Cold':>5} {'TTFT':>7} {'Total':>7} {'Chunks':>7} {'Char/s':>7}")
    print(f"{'-' * 40} {'-' * 8} {'-' * 5} {'-' * 7} {'-' * 7} {'-' * 7} {'-' * 7}")
    for r in results:
        if r.get("mode") == "non-stream":
            continue
        if "error" in r:
            print(f"{r['model']:<40} {r['prompt']:<8} {'':>5} {'ERROR':>7}")
            continue
        cold = "yes" if r.get("cold_start") else "no"
        chars_per_s = round(r['output_chars'] / r['total_s'], 1) if r['total_s'] > 0 else 0
        print(f"{r['model']:<40} {r['prompt']:<8} {cold:>5} {r['ttft_s']:>6.2f}s {r['total_s']:>6.2f}s {r['tokens']:>7} {chars_per_s:>6.1f}")

    # Summary table — non-streaming runs only.
    print(f"\n{'=' * 90}")
    print("Summary — Non-streaming")
    print(f"{'=' * 90}")
    print(f"{'Model':<40} {'Prompt':<8} {'Total':>7} {'Chars':>7} {'Char/s':>7}")
    print(f"{'-' * 40} {'-' * 8} {'-' * 7} {'-' * 7} {'-' * 7}")
    for r in results:
        if r.get("mode") != "non-stream":
            continue
        if "error" in r:
            print(f"{r['model']:<40} {r['prompt']:<8} {'ERROR':>7}")
            continue
        chars_per_s = round(r['output_chars'] / r['total_s'], 1) if r['total_s'] > 0 else 0
        print(f"{r['model']:<40} {r['prompt']:<8} {r['total_s']:>6.2f}s {r['output_chars']:>7} {chars_per_s:>6.1f}")

    return results
|
|
|
|
|
|
# Script entry point — run the full benchmark when invoked directly.
if __name__ == "__main__":
    run_tests()
|