FastAPI backend, web UI, CosyVoice3/F5-TTS setup scripts, and handoff docs for GPU PC continuation. Co-authored-by: Cursor <cursoragent@cursor.com>
102 lines
2.9 KiB
Python
102 lines
2.9 KiB
Python
from __future__ import annotations
|
|
|
|
import subprocess
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
from backend.app.config import project_root
|
|
from backend.app.tts.base import TTSEngine
|
|
|
|
# Base directory returned by project_root(). In this module the per-engine
# venv interpreters (.venvs/...), the worker scripts (scripts/workers/...),
# and the subprocess working directory are all resolved against it.
ROOT = project_root()
|
|
|
|
|
|
class SubprocessEngine(TTSEngine):
    """TTS engine that delegates inference to a worker script in its own venv.

    The interpreter is looked up at ``ROOT/.venvs/<venv_name>/bin/python`` and
    the worker at ``ROOT/scripts/workers/<worker_name>``. ``self.name`` is read
    when building the failure message; the subclasses in this file set it as a
    class attribute.
    """

    def __init__(self, venv_name: str, worker_name: str) -> None:
        """Record the interpreter and worker script paths for this engine.

        Args:
            venv_name: Directory name of the virtualenv under ``ROOT/.venvs``.
            worker_name: File name of the script under ``ROOT/scripts/workers``.
        """
        self._python = ROOT / ".venvs" / venv_name / "bin" / "python"
        self._worker = ROOT / "scripts" / "workers" / worker_name

    def _run(self, args: list[str]) -> None:
        """Run the worker script with ``args`` and block until it exits.

        Args:
            args: Command-line arguments appended after the worker script path.

        Raises:
            RuntimeError: If the venv interpreter does not exist, or if the
                worker exits with a non-zero status (the message carries the
                worker's stderr, falling back to stdout when stderr is empty).
        """
        if not self._python.is_file():
            # <venv>/bin/python -> two levels up is the venv directory itself.
            venv_name = self._python.parent.parent.name
            # Message (Korean): "<venv> venv missing. Run scripts/setup_<venv>.sh".
            raise RuntimeError(
                f"{venv_name} venv 없음. "
                f"scripts/setup_{venv_name}.sh 실행"
            )
        cmd = [str(self._python), str(self._worker), *args]
        # errors="replace": worker output is only used for diagnostics, so bytes
        # that are invalid in the locale encoding must not raise
        # UnicodeDecodeError (which would fail an otherwise successful run and
        # hide the real error text on a failed one).
        proc = subprocess.run(
            cmd,
            cwd=str(ROOT),
            capture_output=True,
            text=True,
            errors="replace",
        )
        if proc.returncode != 0:
            raise RuntimeError(
                f"{self.name} inference failed:\n{proc.stderr or proc.stdout}"
            )
|
|
|
|
|
|
class F5TTSEngine(SubprocessEngine):
    """Engine that runs the ``f5_infer.py`` worker inside the ``f5tts`` venv."""

    name = "f5_tts"

    def __init__(self) -> None:
        """Bind this engine to the ``f5tts`` venv and its worker script."""
        super().__init__("f5tts", "f5_infer.py")

    def synthesize(
        self,
        text: str,
        ref_audio: Path,
        ref_text: str,
        out_path: Path,
    ) -> Path:
        """Invoke the worker to synthesize ``text`` and return ``out_path``.

        Args:
            text: Passed to the worker as ``--gen-text``.
            ref_audio: Passed to the worker as ``--ref-audio``.
            ref_text: Passed as ``--ref-text``; a fixed placeholder string is
                substituted when it is empty.
            out_path: Passed as ``--out``; its parent directory is created
                first if missing.

        Returns:
            ``out_path``, unchanged.
        """
        target_dir = out_path.parent
        target_dir.mkdir(parents=True, exist_ok=True)

        # Falsy ref_text (e.g. "") is replaced by the placeholder transcript.
        transcript = ref_text if ref_text else "reference audio transcript"

        worker_args = ["--ref-audio", str(ref_audio)]
        worker_args += ["--ref-text", transcript]
        worker_args += ["--gen-text", text]
        worker_args += ["--out", str(out_path)]
        self._run(worker_args)
        return out_path
|
|
|
|
|
|
class CosyVoiceEngine(SubprocessEngine):
    """Engine that runs the ``cosy_infer.py`` worker inside the ``cosyvoice`` venv."""

    name = "cosyvoice"

    def __init__(self, model_dir: Path, prompt_prefix: str) -> None:
        """Bind to the ``cosyvoice`` venv and remember the worker options.

        Args:
            model_dir: Forwarded to the worker as ``--model-dir`` on each call.
            prompt_prefix: Forwarded to the worker as ``--prompt-prefix``.
        """
        super().__init__("cosyvoice", "cosy_infer.py")
        self._model_dir = model_dir
        self._prompt_prefix = prompt_prefix

    def synthesize(
        self,
        text: str,
        ref_audio: Path,
        ref_text: str,
        out_path: Path,
    ) -> Path:
        """Invoke the worker to synthesize ``text`` and return ``out_path``.

        Args:
            text: Passed to the worker as ``--gen-text``.
            ref_audio: Passed to the worker as ``--ref-audio``.
            ref_text: Passed as ``--prompt-text``; a falsy value becomes ``""``.
            out_path: Passed as ``--out``; its parent directory is created
                first if missing.

        Returns:
            ``out_path``, unchanged.
        """
        target_dir = out_path.parent
        target_dir.mkdir(parents=True, exist_ok=True)

        prompt_text = ref_text if ref_text else ""

        worker_args = ["--ref-audio", str(ref_audio)]
        worker_args += ["--gen-text", text]
        worker_args += ["--prompt-text", prompt_text]
        worker_args += ["--out", str(out_path)]
        worker_args += ["--model-dir", str(self._model_dir)]
        worker_args += ["--prompt-prefix", self._prompt_prefix]
        self._run(worker_args)
        return out_path
|
|
|
|
|
|
def create_engine(model: str, model_dir: Path, prompt_prefix: str) -> TTSEngine:
    """Build the TTS engine selected by ``model``.

    Args:
        model: Engine identifier, either ``"cosyvoice"`` or ``"f5_tts"``.
        model_dir: Handed to ``CosyVoiceEngine``; not used for ``"f5_tts"``.
        prompt_prefix: Handed to ``CosyVoiceEngine``; not used for ``"f5_tts"``.

    Returns:
        A newly constructed engine instance.

    Raises:
        ValueError: If ``model`` is not one of the two known identifiers.
    """
    engine = None
    if model == "f5_tts":
        engine = F5TTSEngine()
    elif model == "cosyvoice":
        engine = CosyVoiceEngine(model_dir, prompt_prefix)

    if engine is None:
        raise ValueError(f"Unknown model: {model}. Use cosyvoice or f5_tts.")
    return engine
|