Initial commit: Korean voice-cloning TTS prototype

FastAPI backend, web UI, CosyVoice3/F5-TTS setup scripts, and handoff docs for GPU PC continuation.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-06-04 13:36:37 +09:00
commit 7101fdcd65
36 changed files with 1937 additions and 0 deletions
--- a/backend/app/tts/engines_subprocess.py
+++ b/backend/app/tts/engines_subprocess.py
@@ -0,0 +1,101 @@
+from __future__ import annotations
+
+import subprocess
+import sys
+from pathlib import Path
+
+from backend.app.config import project_root
+from backend.app.tts.base import TTSEngine
+
+# Project root directory as returned by project_root(); the venv interpreter
+# path, the worker script path, and the worker's working directory in this
+# module are all derived from it.
+ROOT = project_root()
+
+
+class SubprocessEngine(TTSEngine):
+    """Base for TTS engines that run inference in a separate virtualenv.
+
+    An engine pairs the interpreter at ``ROOT/.venvs/<venv_name>/bin/python``
+    with a worker script under ``ROOT/scripts/workers``. ``_run`` reads
+    ``self.name`` for its failure message; the subclasses in this module set
+    it as a class attribute.
+    """
+
+    def __init__(self, venv_name: str, worker_name: str) -> None:
+        """Resolve the interpreter and worker script paths.
+
+        Args:
+            venv_name: Directory name of the virtualenv under ``ROOT/.venvs``.
+            worker_name: File name of the worker script under
+                ``ROOT/scripts/workers``.
+        """
+        # NOTE(review): ``bin/python`` is the POSIX venv layout -- confirm
+        # whether Windows hosts (``Scripts/python.exe``) must be supported.
+        self._python = ROOT / ".venvs" / venv_name / "bin" / "python"
+        self._worker = ROOT / "scripts" / "workers" / worker_name
+
+    def _run(self, args: list[str]) -> None:
+        """Run the worker script with ``args`` and wait for it to finish.
+
+        The worker is started with ``ROOT`` as its working directory and its
+        stdout/stderr are captured rather than inherited.
+
+        Args:
+            args: Command-line arguments appended after the worker script path.
+
+        Raises:
+            RuntimeError: If the venv interpreter does not exist, or if the
+                worker exits with a non-zero status (the message carries its
+                stderr, falling back to stdout when stderr is empty).
+        """
+        if not self._python.is_file():
+            # <venv>/bin/python -> two parents up is the venv directory itself.
+            venv_name = self._python.parent.parent.name
+            raise RuntimeError(
+                f"{venv_name} venv 없음. "
+                f"scripts/setup_{venv_name}.sh 실행"
+            )
+        cmd = [str(self._python), str(self._worker), *args]
+        # text=True decodes the captured output with the default encoding.
+        # errors="replace" keeps a single undecodable byte in the worker's
+        # output from raising UnicodeDecodeError here, which would otherwise
+        # mask the worker's real outcome (even a successful one).
+        proc = subprocess.run(
+            cmd,
+            cwd=str(ROOT),
+            capture_output=True,
+            text=True,
+            errors="replace",
+        )
+        if proc.returncode != 0:
+            raise RuntimeError(
+                f"{self.name} inference failed:\n{proc.stderr or proc.stdout}"
+            )
+
+
+class F5TTSEngine(SubprocessEngine):
+    """Engine that runs the ``f5_infer.py`` worker inside the ``f5tts`` venv."""
+
+    name = "f5_tts"
+
+    def __init__(self) -> None:
+        """Bind the engine to its venv name and worker script name."""
+        super().__init__("f5tts", "f5_infer.py")
+
+    def synthesize(
+        self,
+        text: str,
+        ref_audio: Path,
+        ref_text: str,
+        out_path: Path,
+    ) -> Path:
+        """Invoke the worker for ``text`` and return ``out_path``.
+
+        Args:
+            text: Text handed to the worker as ``--gen-text``.
+            ref_audio: Path handed to the worker as ``--ref-audio``.
+            ref_text: Value for ``--ref-text``; when empty, a fixed
+                placeholder string is sent instead.
+            out_path: Path handed to the worker as ``--out``. Its parent
+                directory is created beforehand if missing.
+
+        Returns:
+            ``out_path`` unchanged (presumably the file the worker wrote --
+            the worker script is not part of this module).
+
+        Raises:
+            RuntimeError: Propagated from ``_run`` when the venv is missing
+                or the worker exits with a non-zero status.
+        """
+        out_path.parent.mkdir(parents=True, exist_ok=True)
+
+        # An empty transcript is never forwarded; the placeholder stands in.
+        transcript = ref_text if ref_text else "reference audio transcript"
+        flag_values = (
+            ("--ref-audio", str(ref_audio)),
+            ("--ref-text", transcript),
+            ("--gen-text", text),
+            ("--out", str(out_path)),
+        )
+        # Flatten the (flag, value) pairs into one argv-style list, in order.
+        worker_args = [token for pair in flag_values for token in pair]
+
+        self._run(worker_args)
+        return out_path
+
+
+class CosyVoiceEngine(SubprocessEngine):
+    """Engine that runs the ``cosy_infer.py`` worker inside the ``cosyvoice`` venv."""
+
+    name = "cosyvoice"
+
+    def __init__(self, model_dir: Path, prompt_prefix: str) -> None:
+        """Bind the engine to its venv/worker and remember per-engine settings.
+
+        Args:
+            model_dir: Path forwarded to the worker as ``--model-dir``.
+            prompt_prefix: String forwarded to the worker as ``--prompt-prefix``.
+        """
+        super().__init__("cosyvoice", "cosy_infer.py")
+        self._model_dir = model_dir
+        self._prompt_prefix = prompt_prefix
+
+    def synthesize(
+        self,
+        text: str,
+        ref_audio: Path,
+        ref_text: str,
+        out_path: Path,
+    ) -> Path:
+        """Invoke the worker for ``text`` and return ``out_path``.
+
+        Args:
+            text: Text handed to the worker as ``--gen-text``.
+            ref_audio: Path handed to the worker as ``--ref-audio``.
+            ref_text: Value for ``--prompt-text``; a falsy value is sent as
+                an empty string.
+            out_path: Path handed to the worker as ``--out``. Its parent
+                directory is created beforehand if missing.
+
+        Returns:
+            ``out_path`` unchanged (presumably the file the worker wrote --
+            the worker script is not part of this module).
+
+        Raises:
+            RuntimeError: Propagated from ``_run`` when the venv is missing
+                or the worker exits with a non-zero status.
+        """
+        out_path.parent.mkdir(parents=True, exist_ok=True)
+
+        prompt_text = ref_text if ref_text else ""
+        # Per-call arguments first, then the settings fixed at construction.
+        request_args = [
+            "--ref-audio", str(ref_audio),
+            "--gen-text", text,
+            "--prompt-text", prompt_text,
+            "--out", str(out_path),
+        ]
+        engine_args = [
+            "--model-dir", str(self._model_dir),
+            "--prompt-prefix", self._prompt_prefix,
+        ]
+
+        self._run(request_args + engine_args)
+        return out_path
+
+
+def create_engine(model: str, model_dir: Path, prompt_prefix: str) -> TTSEngine:
+    """Build the TTS engine selected by ``model``.
+
+    Args:
+        model: Engine identifier, either ``"f5_tts"`` or ``"cosyvoice"``.
+        model_dir: Passed to ``CosyVoiceEngine``; unused for ``"f5_tts"``.
+        prompt_prefix: Passed to ``CosyVoiceEngine``; unused for ``"f5_tts"``.
+
+    Returns:
+        A new ``F5TTSEngine`` or ``CosyVoiceEngine`` instance.
+
+    Raises:
+        ValueError: If ``model`` is neither of the two known identifiers.
+    """
+    # Reject unknown identifiers up front so the happy path stays flat.
+    if model not in ("f5_tts", "cosyvoice"):
+        raise ValueError(f"Unknown model: {model}. Use cosyvoice or f5_tts.")
+    return (
+        F5TTSEngine()
+        if model == "f5_tts"
+        else CosyVoiceEngine(model_dir, prompt_prefix)
+    )