- Add harmony.py: strip GPT-OSS-20B analysis/thinking channel from both streaming and non-streaming responses (HarmonyStreamFilter + extract_final_text)
- Add per-model asyncio.Lock in llamacpp backend to prevent concurrent C++ access that caused container segfaults (exit 139)
- Fix chat handler swap for streaming: move inside _stream_generate within lock scope (was broken by try/finally running before stream was consumed)
- Filter /v1/models to return only LLM models (hide ASR/TTS from chat dropdown)
- Correct Qwen3.5-4B estimated_vram_gb: 4 → 9 (actual allocation ~8GB)
- Add GPU memory verification after eviction with retry loop in vram_manager
- Add HF_TOKEN_PATH support in main.py for gated model access
- Add /v1/audio/models and /v1/audio/voices discovery endpoints (no auth)
- Add OOM error handling in both backends and chat route
- Add AUDIO_STT_SUPPORTED_CONTENT_TYPES for webm/wav/mp3/ogg
- Add performance test script (scripts/perf_test.py)
- Update tests to match current config (42 tests pass)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
63 lines
1.6 KiB
Python
63 lines
1.6 KiB
Python
import pytest
|
|
from fastapi import FastAPI
|
|
from fastapi.testclient import TestClient
|
|
|
|
from llmux.config import ApiKey
|
|
from llmux.auth import create_api_key_dependency
|
|
from llmux.model_registry import ModelRegistry
|
|
from llmux.vram_manager import VRAMManager
|
|
from llmux.routes.models import create_models_router
|
|
|
|
# Bearer token shared by the auth fixtures and asserted against by the routes under test.
API_KEY = "sk-test-key"
|
|
|
|
|
|
@pytest.fixture
def registry():
    """Model registry built from the project's default configuration."""
    loaded = ModelRegistry.from_config()
    return loaded
|
@pytest.fixture
def vram_manager():
    """VRAM manager configured with a 16 GB budget for the tests."""
    manager = VRAMManager(total_vram_gb=16.0)
    return manager
@pytest.fixture
def app(registry, vram_manager):
    """FastAPI application wired with the models router and a single test API key.

    NOTE(review): `vram_manager` is requested but not referenced below —
    presumably kept so its construction runs per test; confirm it is needed.
    """
    auth_dependency = create_api_key_dependency([ApiKey(key=API_KEY, name="Test")])
    test_app = FastAPI()
    test_app.include_router(create_models_router(registry, auth_dependency))
    return test_app
@pytest.fixture
def client(app):
    """Synchronous HTTP test client bound to the app fixture."""
    test_client = TestClient(app)
    return test_client
@pytest.fixture
def auth_headers():
    """Authorization header carrying the shared test bearer token."""
    headers = {"Authorization": f"Bearer {API_KEY}"}
    return headers
def test_list_models_returns_only_llm(client, auth_headers):
    """GET /v1/models succeeds and lists exactly the 12 LLM entries."""
    response = client.get("/v1/models", headers=auth_headers)
    assert response.status_code == 200
    payload = response.json()
    assert payload["object"] == "list"
    # ASR/TTS models are filtered out of the chat-facing listing.
    assert len(payload["data"]) == 12
def test_list_models_contains_expected_names(client, auth_headers):
    """Known LLM ids are present while audio model ids are filtered out."""
    response = client.get("/v1/models", headers=auth_headers)
    model_ids = {entry["id"] for entry in response.json()["data"]}
    assert "Qwen3.5-9B-FP8-Thinking" in model_ids
    assert "GPT-OSS-20B-High" in model_ids
    # ASR and TTS models must not leak into the chat model dropdown.
    assert "cohere-transcribe" not in model_ids
    assert "Chatterbox-Multilingual" not in model_ids
def test_list_models_requires_auth(client):
    """Requests without a bearer token are rejected with 401."""
    response = client.get("/v1/models")
    assert response.status_code == 401