# DesTEngSsv006_swd/kischdle/llmux/llmux/backends/chatterbox_tts.py

import asyncio
import gc
import io
import logging

import soundfile as sf
import torch

from llmux.backends.base import BaseBackend
from llmux.config import PhysicalModel

logger = logging.getLogger(__name__)


class ChatterboxTTSBackend(BaseBackend):
    """Backend that loads Chatterbox text-to-speech models and synthesizes audio.

    Loaded models are tracked in ``self._loaded``, keyed by model id. Each
    entry is a dict with the keys ``"model"``, ``"variant"`` and ``"device"``.
    Blocking work (model loading and synthesis) is pushed to the event loop's
    default executor so the loop itself is not blocked.
    """

    # Sample rate used for the WAV output when the model object does not
    # expose its own ``sr`` attribute.
    _DEFAULT_SAMPLE_RATE = 24000

    def __init__(self, models_dir: str = "/models"):
        """Create an empty backend.

        Args:
            models_dir: Directory path stored on the instance. It is not read
                by any other method of this class.
        """
        self._models_dir = models_dir
        self._loaded: dict[str, dict] = {}

    async def load(self, model_id: str, device: str = "cuda") -> None:
        """Load the Chatterbox model registered under ``model_id``.

        Does nothing if the model is already loaded. The variant to load is
        taken from the physical configuration returned by
        ``_get_physical_config``.

        Args:
            model_id: Key identifying the model in the physical-model registry.
            device: Device string forwarded to ``from_pretrained``.
        """
        if model_id in self._loaded:
            return
        physical = _get_physical_config(model_id)
        variant = physical.variant
        logger.info("Loading Chatterbox %s to %s", variant, device)

        def _load():
            # Imported lazily so the chatterbox package is only required once
            # a model is actually loaded.
            if variant == "multilingual":
                from chatterbox import ChatterboxMultilingualTTS
                return ChatterboxMultilingualTTS.from_pretrained(device=device)
            from chatterbox.tts import ChatterboxTTS
            return ChatterboxTTS.from_pretrained(device=device)

        # get_running_loop() is the supported way to obtain the loop from
        # inside a coroutine; get_event_loop() is deprecated for this use.
        loop = asyncio.get_running_loop()
        model = await loop.run_in_executor(None, _load)
        self._loaded[model_id] = {"model": model, "variant": variant, "device": device}

    async def unload(self, model_id: str) -> None:
        """Drop the model registered under ``model_id`` and release cached memory.

        Does nothing if the model is not currently loaded.
        """
        if model_id not in self._loaded:
            return
        entry = self._loaded.pop(model_id)
        # Remove this method's references to the model before collecting, so
        # the garbage collector can reclaim it if nothing else holds it.
        del entry["model"]
        del entry
        gc.collect()
        torch.cuda.empty_cache()
        logger.info("Unloaded Chatterbox %s", model_id)

    async def generate(self, model_id, messages, params, stream=False, tools=None):
        """Chat generation is not available on this backend.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("TTS backend does not support chat generation")

    async def synthesize(self, model_id: str, text: str, voice: str = "default") -> bytes:
        """Synthesize ``text`` with a loaded model and return WAV-encoded bytes.

        Args:
            model_id: Id of a model previously loaded with ``load``.
            text: Text to speak.
            voice: For the ``"multilingual"`` variant this is passed to the
                model as ``language_id`` (``"default"`` maps to ``"en"``).
                Other variants ignore it.

        Returns:
            The generated audio encoded as a WAV file.

        Raises:
            KeyError: if ``model_id`` has not been loaded.
        """
        try:
            entry = self._loaded[model_id]
        except KeyError:
            raise KeyError(
                f"Chatterbox model {model_id!r} is not loaded; call load() first"
            ) from None
        model = entry["model"]
        variant = entry["variant"]
        # Prefer the sample rate reported by the model object; fall back to
        # the previous fixed value when the attribute is missing.
        sample_rate = getattr(model, "sr", self._DEFAULT_SAMPLE_RATE)

        def _synthesize():
            if variant == "multilingual":
                # Default to English; voice param could encode language
                lang = "en" if voice == "default" else voice
                wav = model.generate(text, language_id=lang)
            else:
                wav = model.generate(text)
            # detach() guards against a tensor that still tracks gradients,
            # for which .numpy() would raise.
            samples = wav.detach().cpu().numpy().squeeze()
            buf = io.BytesIO()
            sf.write(buf, samples, samplerate=sample_rate, format="WAV")
            return buf.getvalue()

        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, _synthesize)


# Module-level registry mapping model ids to their physical configuration.
# It starts out empty and is replaced wholesale by set_physical_models().
_physical_models: dict[str, PhysicalModel] = {}


def set_physical_models(models: dict[str, PhysicalModel]) -> None:
    """Install ``models`` as the registry consulted by ``_get_physical_config``.

    The dict is stored by reference (it is not copied) and replaces whatever
    registry was installed before.
    """
    global _physical_models
    _physical_models = models

def _get_physical_config(model_id: str) -> PhysicalModel:
    """Return the physical configuration registered for ``model_id``.

    Args:
        model_id: Key to look up in the module-level ``_physical_models``
            registry.

    Raises:
        KeyError: if no configuration is registered under ``model_id``. The
            message names the missing id so the failure is easy to diagnose.
    """
    try:
        return _physical_models[model_id]
    except KeyError:
        raise KeyError(
            f"No physical model configuration registered for {model_id!r}"
        ) from None