# voice.sori.studio/backend/app/tts/engines_subprocess.py

from __future__ import annotations

import subprocess
import sys
from pathlib import Path

from backend.app.config import project_root
from backend.app.tts.base import TTSEngine

# Value of project_root(), resolved once at import time. The venv interpreter
# and worker script paths are built from it, and workers run with it as cwd.
ROOT = project_root()


class SubprocessEngine(TTSEngine):
    """TTS engine that delegates inference to a worker script in its own venv.

    The interpreter is looked up at ``ROOT/.venvs/<venv_name>/bin/python`` and
    the worker at ``ROOT/scripts/workers/<worker_name>``. Subclasses build the
    worker's command-line arguments and hand them to :meth:`_run`.
    """

    def __init__(self, venv_name: str, worker_name: str) -> None:
        """Resolve the interpreter and worker script paths.

        Args:
            venv_name: Directory name of the virtual environment under
                ``ROOT/.venvs``.
            worker_name: File name of the worker script under
                ``ROOT/scripts/workers``.
        """
        self._python = ROOT / ".venvs" / venv_name / "bin" / "python"
        self._worker = ROOT / "scripts" / "workers" / worker_name

    def _run(self, args: list[str]) -> None:
        """Run the worker script with ``args`` and wait for it to finish.

        Args:
            args: Command-line arguments appended after the worker script path.

        Raises:
            RuntimeError: If the venv interpreter does not exist, or if the
                worker exits with a non-zero status (the message carries the
                worker's stderr, falling back to stdout when stderr is empty).
        """
        if not self._python.is_file():
            # The venv directory name is two levels above ".../bin/python".
            venv_name = self._python.parent.parent.name
            raise RuntimeError(
                f"{venv_name} venv 없음. "
                f"scripts/setup_{venv_name}.sh 실행"
            )
        cmd = [str(self._python), str(self._worker), *args]
        proc = subprocess.run(
            cmd,
            cwd=str(ROOT),
            capture_output=True,
            text=True,
            # Undecodable bytes in the worker's output must not raise
            # UnicodeDecodeError here: that would hide the real error text
            # and could even fail a run that actually succeeded.
            errors="replace",
        )
        if proc.returncode != 0:
            # ``name`` is read from the instance; the engine subclasses in
            # this file define it as a class attribute.
            raise RuntimeError(
                f"{self.name} inference failed:\n{proc.stderr or proc.stdout}"
            )


class F5TTSEngine(SubprocessEngine):
    """Engine that runs the ``f5_infer.py`` worker from the ``f5tts`` venv."""

    name = "f5_tts"

    def __init__(self) -> None:
        """Bind this engine to the ``f5tts`` venv and ``f5_infer.py`` worker."""
        super().__init__("f5tts", "f5_infer.py")

    def synthesize(
        self,
        text: str,
        ref_audio: Path,
        ref_text: str,
        out_path: Path,
    ) -> Path:
        """Invoke the worker to generate ``text`` into ``out_path``.

        The parent directory of ``out_path`` is created if needed. When
        ``ref_text`` is empty, a fixed placeholder transcript is passed to the
        worker instead.

        Args:
            text: Text passed to the worker as ``--gen-text``.
            ref_audio: Path passed to the worker as ``--ref-audio``.
            ref_text: Transcript passed as ``--ref-text``; may be empty.
            out_path: Output path passed to the worker as ``--out``.

        Returns:
            ``out_path``, unchanged.

        Raises:
            RuntimeError: Propagated from ``_run`` when the venv is missing or
                the worker exits with a non-zero status.
        """
        out_path.parent.mkdir(parents=True, exist_ok=True)

        # Flag -> value pairs, in the order they appear on the command line.
        options = {
            "--ref-audio": str(ref_audio),
            "--ref-text": ref_text if ref_text else "reference audio transcript",
            "--gen-text": text,
            "--out": str(out_path),
        }
        worker_args = []
        for flag, value in options.items():
            worker_args.extend((flag, value))

        self._run(worker_args)
        return out_path


class CosyVoiceEngine(SubprocessEngine):
    """Engine that runs the ``cosy_infer.py`` worker from the ``cosyvoice`` venv."""

    name = "cosyvoice"

    def __init__(self, model_dir: Path, prompt_prefix: str) -> None:
        """Bind the engine to its venv/worker and remember per-engine options.

        Args:
            model_dir: Path forwarded to the worker as ``--model-dir``.
            prompt_prefix: String forwarded to the worker as ``--prompt-prefix``.
        """
        super().__init__("cosyvoice", "cosy_infer.py")
        self._model_dir = model_dir
        self._prompt_prefix = prompt_prefix

    def synthesize(
        self,
        text: str,
        ref_audio: Path,
        ref_text: str,
        out_path: Path,
    ) -> Path:
        """Invoke the worker to generate ``text`` into ``out_path``.

        The parent directory of ``out_path`` is created if needed. A falsy
        ``ref_text`` is sent to the worker as an empty string.

        Args:
            text: Text passed to the worker as ``--gen-text``.
            ref_audio: Path passed to the worker as ``--ref-audio``.
            ref_text: Transcript passed as ``--prompt-text``; may be empty.
            out_path: Output path passed to the worker as ``--out``.

        Returns:
            ``out_path``, unchanged.

        Raises:
            RuntimeError: Propagated from ``_run`` when the venv is missing or
                the worker exits with a non-zero status.
        """
        out_path.parent.mkdir(parents=True, exist_ok=True)

        # Per-call arguments first, then the options fixed at construction.
        per_call = [
            "--ref-audio", str(ref_audio),
            "--gen-text", text,
            "--prompt-text", ref_text if ref_text else "",
            "--out", str(out_path),
        ]
        per_engine = [
            "--model-dir", str(self._model_dir),
            "--prompt-prefix", self._prompt_prefix,
        ]
        self._run(per_call + per_engine)
        return out_path


def create_engine(model: str, model_dir: Path, prompt_prefix: str) -> TTSEngine:
    """Build the engine registered under ``model``.

    Args:
        model: Engine identifier, either ``"f5_tts"`` or ``"cosyvoice"``.
        model_dir: Forwarded to ``CosyVoiceEngine``; unused for ``"f5_tts"``.
        prompt_prefix: Forwarded to ``CosyVoiceEngine``; unused for ``"f5_tts"``.

    Returns:
        A new ``F5TTSEngine`` or ``CosyVoiceEngine`` instance.

    Raises:
        ValueError: If ``model`` is not one of the known identifiers.
    """
    # Reject unknown identifiers up front so the rest is a plain two-way branch.
    if model != "f5_tts" and model != "cosyvoice":
        raise ValueError(f"Unknown model: {model}. Use cosyvoice or f5_tts.")

    if model == "f5_tts":
        return F5TTSEngine()
    return CosyVoiceEngine(model_dir, prompt_prefix)