Files
DesTEngSsv006_swd/kischdle/llmux/llmux/backends/chatterbox_tts.py
tlg d615bb4553 fix: Chatterbox uses separate classes per variant, remove turbo
ChatterboxTTS and ChatterboxMultilingualTTS are separate classes.
Turbo variant doesn't exist in chatterbox-tts 0.1.7.
Multilingual generate() requires language_id parameter.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-05 21:43:40 +02:00

82 lines
2.7 KiB
Python

import asyncio
import gc
import io
import logging
import soundfile as sf
import torch
from llmux.backends.base import BaseBackend
from llmux.config import PhysicalModel
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class ChatterboxTTSBackend(BaseBackend):
    """Text-to-speech backend built on the Chatterbox model classes.

    Loaded models are tracked per ``model_id`` in ``self._loaded``; each
    entry records the model object, its variant name and the device it was
    loaded to. Blocking model calls (loading, synthesis) are dispatched to
    the running event loop's default executor.
    """

    # Sample rate used when a loaded model does not expose an ``sr`` attribute.
    _FALLBACK_SAMPLE_RATE = 24000

    def __init__(self, models_dir: str = "/models"):
        """Create a backend with no models loaded.

        Args:
            models_dir: Directory for model files. It is stored on the
                instance but not otherwise read by the code in this class.
        """
        self._models_dir = models_dir
        self._loaded: dict[str, dict] = {}

    async def load(self, model_id: str, device: str = "cuda") -> None:
        """Load the Chatterbox model registered under *model_id*.

        Returns immediately when the model is already loaded. The variant is
        read from the physical-model config: ``"multilingual"`` selects
        ``ChatterboxMultilingualTTS``, any other value selects
        ``ChatterboxTTS``.

        Args:
            model_id: Key of the model in the physical-model registry.
            device: Device string passed to ``from_pretrained``.

        Raises:
            KeyError: If no physical config is registered for *model_id*.
        """
        if model_id in self._loaded:
            return

        physical = _get_physical_config(model_id)
        variant = physical.variant
        logger.info("Loading Chatterbox %s to %s", variant, device)

        def _load():
            # chatterbox is imported here, inside the worker, rather than at
            # module import time.
            if variant == "multilingual":
                from chatterbox import ChatterboxMultilingualTTS

                return ChatterboxMultilingualTTS.from_pretrained(device=device)

            from chatterbox.tts import ChatterboxTTS

            return ChatterboxTTS.from_pretrained(device=device)

        # get_running_loop() is the supported way to obtain the loop from
        # inside a coroutine.
        loop = asyncio.get_running_loop()
        model = await loop.run_in_executor(None, _load)
        self._loaded[model_id] = {"model": model, "variant": variant, "device": device}

    async def unload(self, model_id: str) -> None:
        """Drop the model stored under *model_id* and release cached memory.

        Does nothing when *model_id* is not currently loaded.
        """
        entry = self._loaded.pop(model_id, None)
        if entry is None:
            return

        # Remove our references before collecting so the model object can be
        # reclaimed, then ask torch to release its cached CUDA memory.
        del entry["model"]
        del entry
        gc.collect()
        torch.cuda.empty_cache()
        logger.info("Unloaded Chatterbox %s", model_id)

    async def generate(self, model_id, messages, params, stream=False, tools=None):
        """Reject chat generation; this backend only synthesizes speech.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError("TTS backend does not support chat generation")

    async def synthesize(self, model_id: str, text: str, voice: str = "default") -> bytes:
        """Synthesize *text* with a loaded model and return WAV-encoded bytes.

        Args:
            model_id: Identifier of a model previously passed to ``load``.
            text: Text to speak.
            voice: For the multilingual variant this value is passed as the
                ``language_id``; ``"default"`` maps to ``"en"``. It is not
                used for other variants.

        Returns:
            The generated audio serialized as a WAV file.

        Raises:
            KeyError: If *model_id* has not been loaded.
        """
        entry = self._loaded[model_id]
        model = entry["model"]
        variant = entry["variant"]
        # NOTE(review): Chatterbox models are expected to expose their output
        # sample rate as ``sr`` - confirm against the installed version.
        # Falls back to the previously hard-coded 24000 when it is absent.
        sample_rate = getattr(model, "sr", self._FALLBACK_SAMPLE_RATE)

        def _synthesize() -> bytes:
            if variant == "multilingual":
                # Default to English; the voice param doubles as language id.
                lang = "en" if voice == "default" else voice
                wav = model.generate(text, language_id=lang)
            else:
                wav = model.generate(text)

            buf = io.BytesIO()
            # detach() keeps numpy() working even if the tensor tracks grads.
            samples = wav.detach().cpu().numpy().squeeze()
            sf.write(buf, samples, samplerate=sample_rate, format="WAV")
            return buf.getvalue()

        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, _synthesize)
# Registry of physical model configurations, looked up by model id.
_physical_models: dict[str, PhysicalModel] = {}


def set_physical_models(models: dict[str, PhysicalModel]) -> None:
    """Install *models* as the module-wide physical-model registry.

    The mapping is stored by reference (not copied) and replaces whatever
    registry was set before.
    """
    global _physical_models
    _physical_models = models
def _get_physical_config(model_id: str) -> PhysicalModel:
    """Look up the registered physical config for *model_id*.

    Raises:
        KeyError: If *model_id* is absent from the registry.
    """
    config = _physical_models[model_id]
    return config