Initial commit: Korean voice-cloning TTS prototype

Adds the FastAPI backend, web UI, CosyVoice3/F5-TTS setup scripts, and handoff docs for continuing the work on a GPU PC.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-06-04 13:36:37 +09:00
commit 7101fdcd65
36 changed files with 1937 additions and 0 deletions
--- a/backend/app/tts/__init__.py
+++ b/backend/app/tts/__init__.py
@@ -0,0 +1,3 @@
+# Re-export TTSService as this module's only public name.
+from backend.app.tts.service import TTSService
+
+__all__ = ["TTSService"]
--- a/backend/app/tts/base.py
+++ b/backend/app/tts/base.py
@@ -0,0 +1,18 @@
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from pathlib import Path
+
+
+class TTSEngine(ABC):
+    """Abstract interface for a text-to-speech engine.
+
+    Concrete engines must implement ``synthesize``.
+    """
+
+    # Engine identifier; only declared here, so concrete subclasses are
+    # expected to assign a value.
+    name: str
+
+    @abstractmethod
+    def synthesize(
+        self,
+        text: str,
+        ref_audio: Path,
+        ref_text: str,
+        out_path: Path,
+    ) -> Path:
+        """Generate a WAV file for a single text chunk.
+
+        Args:
+            text: The text chunk to synthesize.
+            ref_audio: Path to the reference audio.
+            ref_text: Text accompanying the reference audio (presumably its
+                transcript -- confirm against the engine implementations).
+            out_path: Destination path for the generated WAV.
+
+        Returns:
+            A path to the generated audio, per the return annotation.
+        """
--- a/backend/app/tts/engines_subprocess.py
+++ b/backend/app/tts/engines_subprocess.py
@@ -0,0 +1,101 @@
+from __future__ import annotations
+
+import subprocess
+import sys
+from pathlib import Path
+
+from backend.app.config import project_root
+from backend.app.tts.base import TTSEngine
+
+# Project root as returned by backend.app.config.project_root(); the venv
+# interpreters and worker scripts used below are located relative to it.
+ROOT = project_root()
+
+
+class SubprocessEngine(TTSEngine):
+    """Base class for engines whose inference runs in a separate process.
+
+    Each engine uses the interpreter of a dedicated virtualenv under
+    ``.venvs/<venv_name>/`` to execute a worker script from
+    ``scripts/workers/``. Subclasses are expected to define ``name`` (used in
+    error messages) and to implement ``synthesize``.
+    """
+
+    def __init__(self, venv_name: str, worker_name: str) -> None:
+        """Resolve the interpreter and worker-script paths.
+
+        Args:
+            venv_name: Directory name of the virtualenv under ``.venvs/``.
+            worker_name: File name of the worker script under
+                ``scripts/workers/``.
+        """
+        self._venv_name = venv_name
+        venv_dir = ROOT / ".venvs" / venv_name
+        # The venv module puts the interpreter in Scripts\python.exe on
+        # Windows and in bin/python everywhere else.
+        if sys.platform == "win32":
+            self._python = venv_dir / "Scripts" / "python.exe"
+        else:
+            self._python = venv_dir / "bin" / "python"
+        self._worker = ROOT / "scripts" / "workers" / worker_name
+
+    def _run(self, args: list[str]) -> None:
+        """Run the worker script with ``args`` and wait for it to finish.
+
+        The child process is started with the project root as its working
+        directory and its output is captured rather than inherited.
+
+        Args:
+            args: Command-line arguments appended after the worker script.
+
+        Raises:
+            RuntimeError: If the venv interpreter does not exist, or if the
+                worker exits with a non-zero status (its stderr, or stdout
+                when stderr is empty, is included in the message).
+        """
+        if not self._python.is_file():
+            raise RuntimeError(
+                f"{self._venv_name} venv 없음. "
+                f"scripts/setup_{self._venv_name}.sh 실행"
+            )
+        cmd = [str(self._python), str(self._worker), *args]
+        # errors="replace" keeps undecodable worker output from raising
+        # UnicodeDecodeError here and hiding the actual inference failure.
+        proc = subprocess.run(
+            cmd,
+            cwd=str(ROOT),
+            capture_output=True,
+            text=True,
+            errors="replace",
+        )
+        if proc.returncode != 0:
+            raise RuntimeError(
+                f"{self.name} inference failed:\n{proc.stderr or proc.stdout}"
+            )
+
+
+class F5TTSEngine(SubprocessEngine):
+    """F5-TTS engine run through the ``f5tts`` venv and ``f5_infer.py``."""
+
+    name = "f5_tts"
+
+    def __init__(self) -> None:
+        """Bind this engine to its virtualenv and worker script."""
+        super().__init__("f5tts", "f5_infer.py")
+
+    def synthesize(
+        self,
+        text: str,
+        ref_audio: Path,
+        ref_text: str,
+        out_path: Path,
+    ) -> Path:
+        """Synthesize ``text`` into ``out_path`` via the F5-TTS worker.
+
+        The parent directory of ``out_path`` is created if needed. When
+        ``ref_text`` is empty, a fixed placeholder string is passed as
+        ``--ref-text`` instead.
+
+        Returns:
+            ``out_path``, unchanged.
+        """
+        out_path.parent.mkdir(parents=True, exist_ok=True)
+        transcript = ref_text if ref_text else "reference audio transcript"
+        worker_args = ["--ref-audio", str(ref_audio)]
+        worker_args += ["--ref-text", transcript]
+        worker_args += ["--gen-text", text]
+        worker_args += ["--out", str(out_path)]
+        self._run(worker_args)
+        return out_path
+
+
+class CosyVoiceEngine(SubprocessEngine):
+    """CosyVoice engine run through the ``cosyvoice`` venv and ``cosy_infer.py``."""
+
+    name = "cosyvoice"
+
+    def __init__(self, model_dir: Path, prompt_prefix: str) -> None:
+        """Bind the engine to its venv/worker and remember model options.
+
+        Args:
+            model_dir: Value forwarded to the worker as ``--model-dir``.
+            prompt_prefix: Value forwarded to the worker as
+                ``--prompt-prefix``.
+        """
+        super().__init__("cosyvoice", "cosy_infer.py")
+        self._model_dir = model_dir
+        self._prompt_prefix = prompt_prefix
+
+    def synthesize(
+        self,
+        text: str,
+        ref_audio: Path,
+        ref_text: str,
+        out_path: Path,
+    ) -> Path:
+        """Synthesize ``text`` into ``out_path`` via the CosyVoice worker.
+
+        The parent directory of ``out_path`` is created if needed. An empty
+        ``ref_text`` is passed to the worker as an empty ``--prompt-text``.
+
+        Returns:
+            ``out_path``, unchanged.
+        """
+        out_path.parent.mkdir(parents=True, exist_ok=True)
+        prompt_text = ref_text if ref_text else ""
+        worker_args = ["--ref-audio", str(ref_audio)]
+        worker_args += ["--gen-text", text]
+        worker_args += ["--prompt-text", prompt_text]
+        worker_args += ["--out", str(out_path)]
+        worker_args += ["--model-dir", str(self._model_dir)]
+        worker_args += ["--prompt-prefix", self._prompt_prefix]
+        self._run(worker_args)
+        return out_path
+
+
+def create_engine(model: str, model_dir: Path, prompt_prefix: str) -> TTSEngine:
+    """Build the engine selected by ``model``.
+
+    Args:
+        model: Either ``"cosyvoice"`` or ``"f5_tts"``.
+        model_dir: Passed to ``CosyVoiceEngine``; unused for ``"f5_tts"``.
+        prompt_prefix: Passed to ``CosyVoiceEngine``; unused for ``"f5_tts"``.
+
+    Raises:
+        ValueError: If ``model`` is not one of the two known names.
+    """
+    if model == "cosyvoice":
+        engine = CosyVoiceEngine(model_dir, prompt_prefix)
+    elif model == "f5_tts":
+        engine = F5TTSEngine()
+    else:
+        raise ValueError(f"Unknown model: {model}. Use cosyvoice or f5_tts.")
+    return engine
--- a/backend/app/tts/service.py
+++ b/backend/app/tts/service.py
@@ -0,0 +1,97 @@
+from __future__ import annotations
+
+import uuid
+import wave
+from pathlib import Path
+
+from backend.app.config import AppSettings, get_settings, project_root
+from backend.app.text_preprocess import split_sentences
+from backend.app.tts.engines_subprocess import create_engine
+
+# Project root as returned by backend.app.config.project_root().
+ROOT = project_root()
+
+
+class TTSService:
+    """Synthesis front end: resolves the reference voice, splits the text
+    into chunks, runs the configured engine per chunk, and joins the result.
+    """
+
+    def __init__(self, settings: AppSettings | None = None) -> None:
+        """Create the engine from settings and ensure working dirs exist.
+
+        Args:
+            settings: Settings to use; when omitted, ``get_settings()`` is
+                called.
+        """
+        self.settings = settings or get_settings()
+        self.engine = create_engine(
+            self.settings.tts_model,
+            self.settings.cosyvoice_model_dir,
+            self.settings.cosyvoice_prompt_prefix,
+        )
+        self.settings.outputs_dir.mkdir(parents=True, exist_ok=True)
+        self.settings.uploads_dir.mkdir(parents=True, exist_ok=True)
+
+    def resolve_reference(
+        self,
+        ref_audio: Path | None = None,
+        ref_text: str | None = None,
+    ) -> tuple[Path, str]:
+        """Pick the reference audio and its accompanying text.
+
+        Audio is chosen in this order: ``ref_audio`` if it is an existing
+        file, then ``settings.default_ref_audio``, then the first ``*.wav``
+        (sorted by path) in ``settings.samples_dir``.
+
+        Text is chosen in this order: ``ref_text``, then
+        ``settings.default_ref_text``, then the contents of a ``.txt`` file
+        next to the audio, then ``samples_dir/my_voice_ref.txt``. If nothing
+        is found the text is empty, except for the ``f5_tts`` model, which
+        gets a fixed placeholder sentence.
+
+        Returns:
+            ``(audio_path, text)``.
+
+        Raises:
+            FileNotFoundError: If no reference audio can be found.
+        """
+        if ref_audio and ref_audio.is_file():
+            audio = ref_audio
+        elif self.settings.default_ref_audio:
+            audio = Path(self.settings.default_ref_audio)
+        else:
+            samples = sorted(self.settings.samples_dir.glob("*.wav"))
+            if not samples:
+                raise FileNotFoundError(
+                    "reference WAV 없음. samples/에 녹음하거나 TTS_REF_AUDIO 설정"
+                )
+            audio = samples[0]
+
+        text = ref_text or self.settings.default_ref_text or ""
+        if not text:
+            # Fall back to a transcript file stored next to the audio, then
+            # to the shared samples/my_voice_ref.txt.
+            for candidate in (
+                audio.with_suffix(".txt"),
+                self.settings.samples_dir / "my_voice_ref.txt",
+            ):
+                if candidate.is_file():
+                    text = candidate.read_text(encoding="utf-8").strip()
+                    break
+        if not text and self.settings.tts_model == "f5_tts":
+            text = "참조 음성의 대본을 samples/my_voice_ref.txt 에 저장하세요."
+        return audio, text
+
+    def synthesize_to_file(
+        self,
+        text: str,
+        ref_audio: Path | None = None,
+        ref_text: str | None = None,
+        job_id: str | None = None,
+    ) -> tuple[str, Path]:
+        """Synthesize ``text`` into ``<outputs_dir>/<job_id>/output.wav``.
+
+        The text is split with ``split_sentences`` and each chunk is rendered
+        to ``part_NNN.wav`` inside the job directory. A single chunk is
+        renamed to ``output.wav``; several chunks are concatenated into it.
+
+        Args:
+            text: Text to speak; must contain non-whitespace characters.
+            ref_audio: Optional reference audio, see ``resolve_reference``.
+            ref_text: Optional reference text, see ``resolve_reference``.
+            job_id: Job directory name; a random 12-hex-digit id is generated
+                when omitted.
+
+        Returns:
+            ``(job_id, path_to_output_wav)``.
+
+        Raises:
+            ValueError: If ``text`` is blank or yields no chunks.
+            FileNotFoundError: If no reference audio can be found.
+        """
+        # Reject blank input up front instead of failing later with an
+        # opaque IndexError when there is nothing to concatenate.
+        if not text or not text.strip():
+            raise ValueError("text to synthesize is empty")
+
+        ref_path, ref_txt = self.resolve_reference(ref_audio, ref_text)
+        chunks = split_sentences(text, self.settings.chunk_max_chars)
+        if not chunks:
+            raise ValueError("text produced no chunks to synthesize")
+
+        job_id = job_id or uuid.uuid4().hex[:12]
+        job_dir = self.settings.outputs_dir / job_id
+        job_dir.mkdir(parents=True, exist_ok=True)
+
+        chunk_paths: list[Path] = []
+        for i, chunk in enumerate(chunks):
+            out = job_dir / f"part_{i:03d}.wav"
+            self.engine.synthesize(chunk, ref_path, ref_txt, out)
+            chunk_paths.append(out)
+
+        final = job_dir / "output.wav"
+        if len(chunk_paths) == 1:
+            # Nothing to join: the only part becomes the final file.
+            chunk_paths[0].replace(final)
+        else:
+            _concat_wav(chunk_paths, final)
+
+        return job_id, final
+
+
+def _concat_wav(paths: list[Path], out: Path) -> None:
+    """Concatenate WAV files that share one audio format into ``out``.
+
+    All inputs are read before ``out`` is opened, so a format mismatch does
+    not leave a partially written output file. The parent directory of
+    ``out`` is created if needed.
+
+    Args:
+        paths: Input WAV files, in playback order; must not be empty.
+        out: Destination WAV file.
+
+    Raises:
+        ValueError: If ``paths`` is empty, or if a file's channel count,
+            sample width, frame rate, or compression type differs from the
+            first file's.
+    """
+    if not paths:
+        raise ValueError("no WAV files to concatenate")
+
+    def audio_format(reader):
+        # getparams() also carries nframes, which legitimately differs
+        # between chunks, so compare only the fields describing the format.
+        p = reader.getparams()
+        return (p.nchannels, p.sampwidth, p.framerate, p.comptype)
+
+    with wave.open(str(paths[0]), "rb") as first:
+        expected = audio_format(first)
+        frames = [first.readframes(first.getnframes())]
+    for path in paths[1:]:
+        with wave.open(str(path), "rb") as reader:
+            if audio_format(reader) != expected:
+                raise ValueError(f"WAV format mismatch: {path}")
+            frames.append(reader.readframes(reader.getnframes()))
+
+    out.parent.mkdir(parents=True, exist_ok=True)
+    nchannels, sampwidth, framerate, _comptype = expected
+    with wave.open(str(out), "wb") as writer:
+        writer.setnchannels(nchannels)
+        writer.setsampwidth(sampwidth)
+        writer.setframerate(framerate)
+        # The frame count in the header is filled in by the wave module
+        # from the data actually written.
+        writer.writeframes(b"".join(frames))