Initial commit: Korean voice-cloning TTS prototype

FastAPI backend, web UI, CosyVoice3/F5-TTS setup scripts, and handoff docs for GPU PC continuation.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
2026-06-04 13:36:37 +09:00
commit 7101fdcd65
36 changed files with 1937 additions and 0 deletions

View File

@@ -0,0 +1,3 @@
# Re-export TTSService so it can be imported directly from this package;
# __all__ limits the package's star-import surface to that one name.
from backend.app.tts.service import TTSService
__all__ = ["TTSService"]

18
backend/app/tts/base.py Normal file
View File

@@ -0,0 +1,18 @@
from __future__ import annotations
from abc import ABC, abstractmethod
from pathlib import Path
class TTSEngine(ABC):
    """Abstract interface implemented by every TTS backend.

    A concrete engine provides a ``name`` and a ``synthesize`` method;
    the class cannot be instantiated until ``synthesize`` is overridden.
    """

    # Identifier of the engine; declared here, assigned by subclasses.
    name: str

    @abstractmethod
    def synthesize(
        self, text: str, ref_audio: Path, ref_text: str, out_path: Path
    ) -> Path:
        """Generate a WAV file for a single chunk of text.

        Args:
            text: The text chunk to synthesize.
            ref_audio: Path to the reference audio.
            ref_text: Text that accompanies the reference audio.
            out_path: Path the WAV should be written to.

        Returns:
            A path to the generated WAV file.
        """

View File

@@ -0,0 +1,101 @@
from __future__ import annotations
import subprocess
import sys
from pathlib import Path
from backend.app.config import project_root
from backend.app.tts.base import TTSEngine
ROOT = project_root()
class SubprocessEngine(TTSEngine):
    """Base class for engines that run inference in a separate interpreter.

    Each engine has its own virtualenv under ``<ROOT>/.venvs/<venv_name>``
    and a worker script under ``<ROOT>/scripts/workers/<worker_name>``.
    Subclasses supply ``name`` and build the worker's command-line
    arguments in ``synthesize``.
    """

    def __init__(self, venv_name: str, worker_name: str) -> None:
        """Store the interpreter and worker-script paths for this engine.

        Args:
            venv_name: Directory name of the virtualenv inside ``.venvs``.
            worker_name: File name of the worker inside ``scripts/workers``.
        """
        self._python = ROOT / ".venvs" / venv_name / "bin" / "python"
        self._worker = ROOT / "scripts" / "workers" / worker_name

    def _run(self, args: list[str]) -> None:
        """Run the worker script with ``args`` and fail loudly on error.

        Args:
            args: Command-line arguments appended after the worker path.

        Raises:
            RuntimeError: If the virtualenv interpreter is missing, or the
                worker exits with a non-zero status (the message carries
                the worker's stderr, or stdout when stderr is empty).
        """
        if not self._python.is_file():
            # .venvs/<venv>/bin/python -> two parents up is the venv dir.
            venv = self._python.parent.parent.name
            # Message text (Korean): "<venv> venv missing. Run
            # scripts/setup_<venv>.sh".
            raise RuntimeError(
                f"{venv} venv 없음. "
                f"scripts/setup_{venv}.sh 실행"
            )
        cmd = [str(self._python), str(self._worker), *args]
        # errors="replace": output is still decoded with the locale codec,
        # but bytes it cannot decode no longer raise UnicodeDecodeError and
        # hide the worker's real exit status / error text.
        proc = subprocess.run(
            cmd,
            cwd=str(ROOT),
            capture_output=True,
            text=True,
            errors="replace",
        )
        if proc.returncode != 0:
            raise RuntimeError(
                f"{self.name} inference failed:\n{proc.stderr or proc.stdout}"
            )
class F5TTSEngine(SubprocessEngine):
    """Engine that runs the ``f5_infer.py`` worker in the ``f5tts`` venv."""

    name = "f5_tts"

    def __init__(self) -> None:
        """Bind the engine to its virtualenv and worker script."""
        super().__init__("f5tts", "f5_infer.py")

    def synthesize(
        self, text: str, ref_audio: Path, ref_text: str, out_path: Path
    ) -> Path:
        """Synthesize ``text`` into ``out_path`` and return ``out_path``.

        An empty ``ref_text`` is replaced by a fixed placeholder string
        before it is handed to the worker.
        """
        out_path.parent.mkdir(parents=True, exist_ok=True)
        transcript = ref_text if ref_text else "reference audio transcript"
        worker_args = [
            "--ref-audio", str(ref_audio),
            "--ref-text", transcript,
            "--gen-text", text,
            "--out", str(out_path),
        ]
        self._run(worker_args)
        return out_path
class CosyVoiceEngine(SubprocessEngine):
    """Engine that runs the ``cosy_infer.py`` worker in the ``cosyvoice`` venv."""

    name = "cosyvoice"

    def __init__(self, model_dir: Path, prompt_prefix: str) -> None:
        """Bind the engine to its worker and remember model settings.

        Args:
            model_dir: Passed to the worker as ``--model-dir``.
            prompt_prefix: Passed to the worker as ``--prompt-prefix``.
        """
        super().__init__("cosyvoice", "cosy_infer.py")
        self._model_dir = model_dir
        self._prompt_prefix = prompt_prefix

    def synthesize(
        self, text: str, ref_audio: Path, ref_text: str, out_path: Path
    ) -> Path:
        """Synthesize ``text`` into ``out_path`` and return ``out_path``.

        A falsy ``ref_text`` is passed to the worker as an empty string.
        """
        out_path.parent.mkdir(parents=True, exist_ok=True)
        prompt_text = ref_text if ref_text else ""
        worker_args = [
            "--ref-audio", str(ref_audio),
            "--gen-text", text,
            "--prompt-text", prompt_text,
            "--out", str(out_path),
            "--model-dir", str(self._model_dir),
            "--prompt-prefix", self._prompt_prefix,
        ]
        self._run(worker_args)
        return out_path
def create_engine(model: str, model_dir: Path, prompt_prefix: str) -> TTSEngine:
    """Build the TTS engine selected by ``model``.

    Args:
        model: Either ``"f5_tts"`` or ``"cosyvoice"``.
        model_dir: Model directory, used only by the CosyVoice engine.
        prompt_prefix: Prompt prefix, used only by the CosyVoice engine.

    Returns:
        A new engine instance for the requested model.

    Raises:
        ValueError: If ``model`` is not one of the two known names.
    """
    # Reject unknown names up front; tuple membership compares with ==.
    if model not in ("f5_tts", "cosyvoice"):
        raise ValueError(f"Unknown model: {model}. Use cosyvoice or f5_tts.")
    if model == "f5_tts":
        return F5TTSEngine()
    return CosyVoiceEngine(model_dir, prompt_prefix)

View File

@@ -0,0 +1,97 @@
from __future__ import annotations
import uuid
import wave
from pathlib import Path
from backend.app.config import AppSettings, get_settings, project_root
from backend.app.text_preprocess import split_sentences
from backend.app.tts.engines_subprocess import create_engine
ROOT = project_root()
class TTSService:
    """Turn text into one WAV file using the configured TTS engine.

    The service picks a reference recording and its transcript, splits
    the input text into chunks, synthesizes each chunk with the engine,
    and joins the chunk WAVs into ``<outputs_dir>/<job_id>/output.wav``.
    """

    def __init__(self, settings: AppSettings | None = None) -> None:
        """Create the engine and make sure the working directories exist.

        Args:
            settings: Settings to use; when falsy, ``get_settings()`` is
                called instead.
        """
        self.settings = settings or get_settings()
        self.engine = create_engine(
            self.settings.tts_model,
            self.settings.cosyvoice_model_dir,
            self.settings.cosyvoice_prompt_prefix,
        )
        self.settings.outputs_dir.mkdir(parents=True, exist_ok=True)
        self.settings.uploads_dir.mkdir(parents=True, exist_ok=True)

    def resolve_reference(
        self,
        ref_audio: Path | None = None,
        ref_text: str | None = None,
    ) -> tuple[Path, str]:
        """Choose the reference audio file and its transcript.

        Audio is taken from, in order: ``ref_audio`` when it is an
        existing file, ``settings.default_ref_audio``, then the
        alphabetically first ``*.wav`` in ``settings.samples_dir``.

        Text is taken from, in order: ``ref_text``,
        ``settings.default_ref_text``, a ``.txt`` file next to the chosen
        audio, then ``samples_dir/my_voice_ref.txt``. If the text is still
        empty and the model is ``"f5_tts"``, a fixed instruction string is
        used instead.

        Returns:
            ``(audio_path, transcript)``; the transcript may be empty.

        Raises:
            FileNotFoundError: If no reference audio can be found.
        """
        if ref_audio and ref_audio.is_file():
            audio = ref_audio
        elif self.settings.default_ref_audio:
            audio = Path(self.settings.default_ref_audio)
        else:
            samples = sorted(self.settings.samples_dir.glob("*.wav"))
            if not samples:
                raise FileNotFoundError(
                    "reference WAV 없음. samples/에 녹음하거나 TTS_REF_AUDIO 설정"
                )
            audio = samples[0]

        text = ref_text or self.settings.default_ref_text or ""
        if not text:
            # Fall back to a transcript stored on disk: first the sidecar
            # of the chosen audio, then the shared my_voice_ref.txt.
            for candidate in (
                audio.with_suffix(".txt"),
                self.settings.samples_dir / "my_voice_ref.txt",
            ):
                if candidate.is_file():
                    text = candidate.read_text(encoding="utf-8").strip()
                    break
        if not text and self.settings.tts_model == "f5_tts":
            # Korean: "Save the reference audio's script to
            # samples/my_voice_ref.txt."
            text = "참조 음성의 대본을 samples/my_voice_ref.txt 에 저장하세요."
        return audio, text

    def synthesize_to_file(
        self,
        text: str,
        ref_audio: Path | None = None,
        ref_text: str | None = None,
        job_id: str | None = None,
    ) -> tuple[str, Path]:
        """Synthesize ``text`` and return the job id and final WAV path.

        Args:
            text: Text to speak; it is split into chunks of at most
                ``settings.chunk_max_chars`` by ``split_sentences``.
            ref_audio: Optional reference recording.
            ref_text: Optional transcript of the reference recording.
            job_id: Output sub-directory name; a random 12-character hex
                id is generated when omitted.

        Returns:
            ``(job_id, path_to_output_wav)``.

        Raises:
            FileNotFoundError: If no reference audio can be found.
            ValueError: If splitting ``text`` produces no chunks.
        """
        ref_path, ref_txt = self.resolve_reference(ref_audio, ref_text)
        chunks = list(split_sentences(text, self.settings.chunk_max_chars))
        if not chunks:
            # Without this guard an empty chunk list reaches the WAV
            # concatenation step and fails there with an unrelated error,
            # after a job directory has already been created.
            raise ValueError("no text to synthesize")

        job_id = job_id or uuid.uuid4().hex[:12]
        job_dir = self.settings.outputs_dir / job_id
        job_dir.mkdir(parents=True, exist_ok=True)

        chunk_paths: list[Path] = []
        for i, chunk in enumerate(chunks):
            out = job_dir / f"part_{i:03d}.wav"
            self.engine.synthesize(chunk, ref_path, ref_txt, out)
            chunk_paths.append(out)

        final = job_dir / "output.wav"
        if len(chunk_paths) == 1:
            # A single chunk is already the final audio: just rename it.
            chunk_paths[0].replace(final)
        else:
            _concat_wav(chunk_paths, final)
        return job_id, final
def _concat_wav(paths: list[Path], out: Path) -> None:
"""동일 포맷 WAV 단순 연결."""
with wave.open(str(paths[0]), "rb") as w0:
params = w0.getparams()
frames = [w0.readframes(w0.getnframes())]
for p in paths[1:]:
with wave.open(str(p), "rb") as w:
if w.getparams() != params:
raise ValueError(f"WAV format mismatch: {p}")
frames.append(w.readframes(w.getframes()))
out.parent.mkdir(parents=True, exist_ok=True)
with wave.open(str(out), "wb") as wo:
wo.setparams(params)
for f in frames:
wo.writeframes(f)