Initial commit: Korean voice-cloning TTS prototype

FastAPI backend, web UI, CosyVoice3/F5-TTS setup scripts, and handoff docs for GPU PC continuation. Co-authored-by: Cursor <cursoragent@cursor.com>
2026-06-04 13:36:37 +09:00
commit 7101fdcd65
36 changed files with 1937 additions and 0 deletions
--- a/backend/app/__init__.py
+++ b/backend/app/__init__.py
@@ -0,0 +1 @@
+"""Korean voice-cloning TTS API."""
--- a/backend/app/config.py
+++ b/backend/app/config.py
@@ -0,0 +1,65 @@
+from __future__ import annotations
+
+from functools import lru_cache
+from pathlib import Path
+
+import yaml
+from pydantic import Field
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+# Repository root: this module lives at <root>/backend/app/config.py, so the
+# root is two directories above the one containing this file.
+ROOT = Path(__file__).resolve().parents[2]
+
+
+class AppSettings(BaseSettings):
+    """Application settings for the TTS backend.
+
+    Values come from init kwargs, the process environment and ``ROOT/.env``
+    (read as UTF-8); unknown keys are ignored.
+    """
+
+    model_config = SettingsConfigDict(
+        env_file=str(ROOT / ".env"),
+        env_file_encoding="utf-8",
+        extra="ignore",
+    )
+
+    # NOTE(review): fields declared with ``validation_alias`` may not accept
+    # init kwargs passed by *field name* unless ``populate_by_name`` is
+    # enabled -- confirm that the overrides built in get_settings() for
+    # tts_model / host / port actually take effect.
+
+    # Engine selector, read from TTS_MODEL.
+    tts_model: str = Field(default="cosyvoice", validation_alias="TTS_MODEL")
+    # Bind address and port, read from TTS_HOST / TTS_PORT.
+    host: str = Field(default="0.0.0.0", validation_alias="TTS_HOST")
+    port: int = Field(default=8000, validation_alias="TTS_PORT")
+    # Directory of bundled reference recordings.
+    samples_dir: Path = Field(default=ROOT / "samples")
+    # Directory where API synthesis jobs write their output.
+    outputs_dir: Path = Field(default=ROOT / "outputs" / "api")
+    # Directory where uploaded reference recordings are stored.
+    uploads_dir: Path = Field(default=ROOT / "backend" / "data" / "uploads")
+    # Optional default reference audio path / transcript (TTS_REF_AUDIO, TTS_REF_TEXT).
+    default_ref_audio: str | None = Field(default=None, validation_alias="TTS_REF_AUDIO")
+    default_ref_text: str | None = Field(default=None, validation_alias="TTS_REF_TEXT")
+    # CosyVoice model directory and the prefix string handed to its worker.
+    cosyvoice_model_dir: Path = Field(default=ROOT / "models" / "Fun-CosyVoice3-0.5B")
+    cosyvoice_prompt_prefix: str = (
+        "You are a helpful assistant.<|endofprompt|>"
+    )
+    # Maximum characters per synthesis chunk.
+    chunk_max_chars: int = 120
+
+
+@lru_cache
+def get_settings() -> AppSettings:
+    """Build the cached application settings.
+
+    Values found in ``ROOT/config/settings.yaml`` are passed to
+    ``AppSettings`` as init kwargs. Only keys that are actually present in
+    the YAML file are forwarded, so a missing key no longer injects a
+    hard-coded default that would shadow the environment / ``.env`` value
+    or the field default.
+
+    Returns:
+        The ``AppSettings`` instance (memoised by ``lru_cache``).
+
+    Raises:
+        ValueError: if the YAML document is not a mapping at top level.
+    """
+    yaml_path = ROOT / "config" / "settings.yaml"
+    data: dict = {}
+    if yaml_path.is_file():
+        with open(yaml_path, encoding="utf-8") as f:
+            raw = yaml.safe_load(f) or {}
+        if not isinstance(raw, dict):
+            raise ValueError(f"{yaml_path}: top-level YAML value must be a mapping")
+
+        gen = raw.get("generation") or {}
+        cv = raw.get("cosyvoice") or {}
+        srv = raw.get("server") or {}
+        paths = raw.get("paths") or {}
+
+        # Scalars: absent keys yield None and are filtered out below.
+        # NOTE(review): tts_model / host / port carry a validation_alias on
+        # AppSettings; confirm that kwargs passed by field name are honoured.
+        data["tts_model"] = raw.get("default_model")
+        data["chunk_max_chars"] = gen.get("chunk_max_chars")
+        data["host"] = srv.get("host")
+        data["port"] = srv.get("port")
+        if cv.get("prompt_prefix"):
+            data["cosyvoice_prompt_prefix"] = cv["prompt_prefix"]
+
+        # Paths in the YAML file are joined onto the project root.
+        if cv.get("model_dir"):
+            data["cosyvoice_model_dir"] = ROOT / cv["model_dir"]
+        if paths.get("samples_dir"):
+            data["samples_dir"] = ROOT / paths["samples_dir"]
+        if paths.get("outputs_dir"):
+            # API jobs live in an "api" subdirectory of the configured outputs dir.
+            data["outputs_dir"] = ROOT / paths["outputs_dir"] / "api"
+        if paths.get("uploads_dir"):
+            data["uploads_dir"] = ROOT / paths["uploads_dir"]
+
+    return AppSettings(**{k: v for k, v in data.items() if v is not None})
+
+
+def project_root() -> Path:
+    """Return the repository root directory (the module-level ``ROOT``)."""
+    root: Path = ROOT
+    return root
--- a/backend/app/main.py
+++ b/backend/app/main.py
@@ -0,0 +1,170 @@
+from __future__ import annotations
+
+import shutil
+import uuid
+from pathlib import Path
+
+from fastapi import FastAPI, File, Form, HTTPException, UploadFile
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import FileResponse
+from fastapi.staticfiles import StaticFiles
+from pydantic import BaseModel, Field
+
+from backend.app.config import get_settings, project_root
+from backend.app.text_preprocess import preprocess_korean
+from backend.app.tts.service import TTSService
+
+# Project root and the directory holding the static web UI.
+ROOT = project_root()
+WEB_DIR = ROOT / "web"
+
+# FastAPI application object; routes are registered on it below.
+app = FastAPI(
+    title="Korean Voice Cloning TTS",
+    description="CosyVoice / F5-TTS 기반 한국어 보이스 클로닝 API",
+    version="0.1.0",
+)
+# CORS is fully open: any origin, method and header is allowed.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+# Lazily created TTSService singleton; see get_tts().
+_tts: TTSService | None = None
+
+
+def get_tts() -> TTSService:
+    """Return the module-wide ``TTSService``, constructing it on first use."""
+    global _tts
+    service = _tts
+    if service is None:
+        service = TTSService()
+        _tts = service
+    return service
+
+
+class TTSRequest(BaseModel):
+    """Request body for POST /api/tts."""
+
+    # Text to synthesize; must be 1..5000 characters.
+    text: str = Field(..., min_length=1, max_length=5000)
+    # Optional reference recording; resolved by create_tts().
+    ref_audio: str | None = Field(
+        default=None, description="samples/ 또는 uploads/ 기준 상대/절대 경로"
+    )
+    # Optional transcript of the reference recording.
+    ref_text: str | None = None
+    # When true, create_tts() runs preprocess_korean() on the text first.
+    preprocess: bool = True
+
+
+class TTSResponse(BaseModel):
+    """Response body for POST /api/tts."""
+
+    # Identifier of the synthesis job.
+    job_id: str
+    # URL path from which the generated audio can be fetched.
+    audio_url: str
+    # Name of the configured TTS model.
+    model: str
+    # Leading part of the text that was synthesized.
+    text_preview: str
+
+
+class HealthResponse(BaseModel):
+    """Response body for GET /api/health."""
+
+    # Service status string.
+    status: str
+    # Name of the configured TTS model.
+    model: str
+    # Number of *.wav files found in the samples directory.
+    samples_count: int
+
+
+@app.get("/api/health", response_model=HealthResponse)
+def health() -> HealthResponse:
+    """Report service status, the configured model and the sample WAV count."""
+    settings = get_settings()
+    wav_count = sum(1 for _ in settings.samples_dir.glob("*.wav"))
+    return HealthResponse(status="ok", model=settings.tts_model, samples_count=wav_count)
+
+
+@app.post("/api/tts", response_model=TTSResponse)
+def create_tts(body: TTSRequest) -> TTSResponse:
+    """Synthesize speech for the request text and return the job descriptor.
+
+    ``body.ref_audio`` comes from the client, so it is only accepted when it
+    resolves to a file inside the samples or uploads directory. Relative
+    values are tried against both directories and then as given; absolute
+    values are checked as given.
+
+    Raises:
+        HTTPException 400: the text is empty after normalization.
+        HTTPException 404: the reference audio is missing or lies outside
+            the allowed directories, or no reference could be resolved.
+        HTTPException 503: the TTS engine reported a runtime failure.
+    """
+    settings = get_settings()
+    text = preprocess_korean(body.text) if body.preprocess else body.text.strip()
+    if not text:
+        raise HTTPException(400, "text is empty")
+
+    ref_path: Path | None = None
+    if body.ref_audio:
+        allowed_dirs = [
+            d.resolve() for d in (settings.samples_dir, settings.uploads_dir)
+        ]
+        requested = Path(body.ref_audio)
+        if requested.is_absolute():
+            candidates = [requested]
+        else:
+            candidates = [base / requested for base in allowed_dirs] + [requested]
+        for candidate in candidates:
+            try:
+                # resolve() collapses ".." and symlinks before the containment check.
+                resolved = candidate.resolve()
+            except (OSError, ValueError):
+                continue
+            inside = any(resolved.is_relative_to(d) for d in allowed_dirs)
+            if inside and resolved.is_file():
+                ref_path = resolved
+                break
+        if ref_path is None:
+            raise HTTPException(404, f"ref_audio not found: {body.ref_audio}")
+
+    try:
+        job_id, _ = get_tts().synthesize_to_file(
+            text, ref_audio=ref_path, ref_text=body.ref_text
+        )
+    except FileNotFoundError as e:
+        raise HTTPException(404, str(e)) from e
+    except RuntimeError as e:
+        raise HTTPException(503, str(e)) from e
+
+    return TTSResponse(
+        job_id=job_id,
+        audio_url=f"/api/audio/{job_id}",
+        model=settings.tts_model,
+        text_preview=text[:80] + ("…" if len(text) > 80 else ""),
+    )
+
+
+@app.get("/api/audio/{job_id}")
+def get_audio(job_id: str) -> FileResponse:
+    """Serve the WAV produced by a synthesis job.
+
+    Looks for ``output.wav`` in the job directory and falls back to
+    ``part_000.wav``.
+
+    Raises:
+        HTTPException 404: the job id is not a plain directory name, or no
+            audio file exists for it.
+    """
+    # job_id is joined into a filesystem path: reject anything that is not a
+    # single path component so it cannot escape outputs_dir (e.g. "..").
+    if job_id in {"", ".", ".."} or Path(job_id).name != job_id:
+        raise HTTPException(404, "audio not found")
+
+    job_dir = get_settings().outputs_dir / job_id
+    for filename in ("output.wav", "part_000.wav"):
+        path = job_dir / filename
+        if path.is_file():
+            return FileResponse(
+                path, media_type="audio/wav", filename=f"{job_id}.wav"
+            )
+    raise HTTPException(404, "audio not found")
+
+
+@app.get("/api/voice-samples")
+def list_voice_samples() -> dict:
+    """List the WAV files in the samples and uploads directories.
+
+    Each entry carries the file stem, its path, which directory it came from
+    and whether a sibling ``.txt`` transcript exists.
+    """
+    settings = get_settings()
+    sources = (
+        (settings.samples_dir, "samples"),
+        (settings.uploads_dir, "uploads"),
+    )
+    entries = [
+        {
+            "id": wav.stem,
+            "path": str(wav),
+            "label": label,
+            "has_transcript": wav.with_suffix(".txt").is_file(),
+        }
+        for directory, label in sources
+        for wav in sorted(directory.glob("*.wav"))
+    ]
+    return {"samples": entries, "default_model": settings.tts_model}
+
+
+@app.post("/api/voice-sample")
+async def upload_voice_sample(
+    file: UploadFile = File(...),
+    ref_text: str = Form(""),
+) -> dict:
+    """Store an uploaded WAV (and optional transcript) in the uploads directory.
+
+    The file is saved under a fresh 10-character hex id; a non-blank
+    ``ref_text`` is written next to it as ``<id>.txt``.
+
+    Raises:
+        HTTPException 400: the upload has no filename or is not ``.wav``.
+    """
+    original_name = file.filename
+    if not (original_name and original_name.lower().endswith(".wav")):
+        raise HTTPException(400, "WAV 파일만 업로드 가능합니다")
+
+    sample_id = uuid.uuid4().hex[:10]
+    dest = get_settings().uploads_dir / f"{sample_id}.wav"
+    with open(dest, "wb") as sink:
+        shutil.copyfileobj(file.file, sink)
+
+    transcript = ref_text.strip()
+    if transcript:
+        dest.with_suffix(".txt").write_text(transcript, encoding="utf-8")
+
+    response = {
+        "id": sample_id,
+        "path": str(dest),
+        "message": "업로드 완료. TTS 요청 시 ref_audio에 이 path를 사용하세요.",
+    }
+    return response
+
+
+# Serve the static web UI at "/" when the web/ directory exists. The mount is
+# added after the /api routes defined above.
+if WEB_DIR.is_dir():
+    app.mount("/", StaticFiles(directory=str(WEB_DIR), html=True), name="web")
+
+
+@app.on_event("startup")
+def startup() -> None:
+    """Create the outputs and uploads directories when the app starts."""
+    settings = get_settings()
+    for directory in (settings.outputs_dir, settings.uploads_dir):
+        directory.mkdir(parents=True, exist_ok=True)
--- a/backend/app/text_preprocess.py
+++ b/backend/app/text_preprocess.py
@@ -0,0 +1,95 @@
+"""한국어 TTS용 간단한 텍스트 정규화."""
+from __future__ import annotations
+
+import re
+
+
+# Any run of whitespace (collapsed to a single space by preprocess_korean).
+_RE_MULTI_SPACE = re.compile(r"\s+")
+# Loose e-mail address matcher: local part, "@", domain with at least one dot.
+_RE_EMAIL = re.compile(r"[\w.+-]+@[\w.-]+\.\w+")
+# http/https URL up to the next whitespace character.
+_RE_URL = re.compile(r"https?://\S+")
+
+
+def _digits_to_korean(num_str: str) -> str:
+    """Convert a non-negative integer string to its Sino-Korean reading.
+
+    Thousands separators (",") are ignored. Input that is not purely decimal
+    digits after removing commas, or whose value is too large for the known
+    units, is returned unchanged.
+
+    A digit 1 is silent before 십/백/천 ("110" -> "백십") but is spoken in the
+    ones place ("11" -> "십일"). A leading 1 before 만 is silent
+    ("10000" -> "만"); elsewhere the group value is spoken
+    ("100000000" -> "일억").
+    """
+    digits = num_str.replace(",", "")
+    if not digits.isdecimal():
+        return num_str
+    n = int(digits)
+    if n == 0:
+        return "영"
+    units = ["", "만", "억", "조", "경", "해"]  # one unit per 4-digit group
+    small = ["", "일", "이", "삼", "사", "오", "육", "칠", "팔", "구"]
+    ten = ["", "십", "백", "천"]
+
+    def chunk_to_korean(x: int) -> str:
+        """Read a value in 1..9999."""
+        parts: list[str] = []
+        for i, d in enumerate(f"{x:04d}"):
+            di = int(d)
+            if di == 0:
+                continue
+            place = ten[3 - i]
+            if di == 1 and place:
+                # "일" is dropped before 십/백/천.
+                parts.append(place)
+            else:
+                parts.append(small[di] + place)
+        return "".join(parts)
+
+    if n < 10000:
+        return chunk_to_korean(n)
+
+    result: list[str] = []
+    u = 0
+    while n > 0:
+        if u >= len(units):
+            # Larger than the biggest supported unit: keep the digits rather
+            # than silently dropping the high-order groups.
+            return num_str
+        n, part = divmod(n, 10000)
+        if part:
+            word = chunk_to_korean(part)
+            if part == 1 and units[u] == "만" and n == 0:
+                # A leading 10000 is read "만", not "일만".
+                word = ""
+            result.append(word + units[u])
+        u += 1
+    return "".join(reversed(result))
+
+
+def _replace_numbers(text: str) -> str:
+    """Replace every integer in ``text`` with its Korean reading.
+
+    A comma is treated as part of a number only when it is a thousands
+    separator, i.e. followed by exactly three digits ("1,000"). Commas used
+    as punctuation ("3, 4" or "1,2") are left in place instead of being
+    swallowed into the number.
+    """
+
+    def repl(m: re.Match[str]) -> str:
+        return _digits_to_korean(m.group(0).replace(",", ""))
+
+    return re.sub(r"\d+(?:,\d{3}(?!\d))*", repl, text)
+
+
+def preprocess_korean(text: str) -> str:
+    """Normalize text for Korean TTS.
+
+    URLs and e-mail addresses become spoken placeholders, "&" and "%" are
+    spelled out, integers are converted to Korean words, and whitespace runs
+    collapse to single spaces.
+    """
+    normalized = _RE_URL.sub(" 링크 ", text.strip())
+    normalized = _RE_EMAIL.sub(" 이메일 ", normalized)
+    for symbol, spoken in (("&", " 앤드 "), ("%", " 퍼센트 ")):
+        normalized = normalized.replace(symbol, spoken)
+    normalized = _replace_numbers(normalized)
+    return _RE_MULTI_SPACE.sub(" ", normalized).strip()
+
+
+def split_sentences(text: str, max_chars: int = 120) -> list[str]:
+    """Split long text into chunks of at most ``max_chars`` characters.
+
+    The text is cut at newlines and after sentence-ending punctuation
+    (. ! ? …). Consecutive pieces are packed into one chunk while they fit;
+    a single piece longer than ``max_chars`` is cut at fixed offsets.
+
+    Newlines are split on *before* normalization, because
+    ``preprocess_korean`` collapses all whitespace (including newlines) into
+    single spaces and would otherwise erase those boundaries.
+
+    Returns:
+        The list of chunks, or ``[text]`` when nothing remains after
+        normalization.
+    """
+    parts: list[str] = []
+    for line in re.split(r"\n+", text):
+        parts.extend(re.split(r"(?<=[.!?…])\s+", preprocess_korean(line)))
+
+    chunks: list[str] = []
+    buf = ""
+    for p in parts:
+        p = p.strip()
+        if not p:
+            continue
+        if len(buf) + len(p) + 1 <= max_chars:
+            buf = f"{buf} {p}".strip() if buf else p
+        else:
+            if buf:
+                chunks.append(buf)
+            if len(p) <= max_chars:
+                buf = p
+            else:
+                # Over-long piece with no usable boundary: hard split.
+                for i in range(0, len(p), max_chars):
+                    chunks.append(p[i : i + max_chars])
+                buf = ""
+    if buf:
+        chunks.append(buf)
+    return chunks or [text]
--- a/backend/app/tts/__init__.py
+++ b/backend/app/tts/__init__.py
@@ -0,0 +1,3 @@
+from backend.app.tts.service import TTSService
+
+__all__ = ["TTSService"]
--- a/backend/app/tts/base.py
+++ b/backend/app/tts/base.py
@@ -0,0 +1,18 @@
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from pathlib import Path
+
+
+class TTSEngine(ABC):
+    """Abstract interface implemented by every TTS backend."""
+
+    # Engine identifier; declared here without a value.
+    name: str
+
+    @abstractmethod
+    def synthesize(
+        self,
+        text: str,
+        ref_audio: Path,
+        ref_text: str,
+        out_path: Path,
+    ) -> Path:
+        """Generate a WAV file for a single text chunk.
+
+        Args:
+            text: The chunk to speak.
+            ref_audio: Reference recording of the voice to clone.
+            ref_text: Transcript of the reference recording.
+            out_path: Where the WAV should be written.
+
+        Returns:
+            A path to the generated WAV (presumably ``out_path``).
+        """
--- a/backend/app/tts/engines_subprocess.py
+++ b/backend/app/tts/engines_subprocess.py
@@ -0,0 +1,101 @@
+from __future__ import annotations
+
+import subprocess
+import sys
+from pathlib import Path
+
+from backend.app.config import project_root
+from backend.app.tts.base import TTSEngine
+
+# Project root; venvs and worker scripts are located relative to it.
+ROOT = project_root()
+
+
+class SubprocessEngine(TTSEngine):
+    """Base class for engines that run a worker script in a dedicated venv.
+
+    The interpreter is taken from ``ROOT/.venvs/<venv_name>`` and the script
+    from ``ROOT/scripts/workers/<worker_name>``. Subclasses provide ``name``
+    and ``synthesize``.
+    """
+
+    def __init__(self, venv_name: str, worker_name: str) -> None:
+        venv_dir = ROOT / ".venvs" / venv_name
+        posix_python = venv_dir / "bin" / "python"
+        # venvs created on Windows put the interpreter in Scripts\python.exe.
+        if sys.platform == "win32" and not posix_python.is_file():
+            self._python = venv_dir / "Scripts" / "python.exe"
+        else:
+            self._python = posix_python
+        self._venv_name = venv_name
+        self._worker = ROOT / "scripts" / "workers" / worker_name
+
+    def _run(self, args: list[str]) -> None:
+        """Run the worker with ``args`` from the project root.
+
+        Raises:
+            RuntimeError: the venv interpreter is missing, or the worker
+                exited with a non-zero status (its stderr, or stdout when
+                stderr is empty, is included in the message).
+        """
+        if not self._python.is_file():
+            raise RuntimeError(
+                f"{self._venv_name} venv 없음. "
+                f"scripts/setup_{self._venv_name}.sh 실행"
+            )
+        cmd = [str(self._python), str(self._worker), *args]
+        # errors="replace": worker output is only used for diagnostics, so an
+        # undecodable byte must not turn into a UnicodeDecodeError here.
+        proc = subprocess.run(
+            cmd,
+            cwd=str(ROOT),
+            capture_output=True,
+            text=True,
+            errors="replace",
+        )
+        if proc.returncode != 0:
+            raise RuntimeError(
+                f"{self.name} inference failed:\n{proc.stderr or proc.stdout}"
+            )
+
+
+class F5TTSEngine(SubprocessEngine):
+    """F5-TTS engine: runs ``f5_infer.py`` inside the ``f5tts`` venv."""
+
+    name = "f5_tts"
+
+    def __init__(self) -> None:
+        super().__init__("f5tts", "f5_infer.py")
+
+    def synthesize(
+        self,
+        text: str,
+        ref_audio: Path,
+        ref_text: str,
+        out_path: Path,
+    ) -> Path:
+        """Synthesize ``text`` to ``out_path`` and return ``out_path``.
+
+        An empty ``ref_text`` is replaced by a fixed placeholder transcript.
+        """
+        out_path.parent.mkdir(parents=True, exist_ok=True)
+        transcript = ref_text if ref_text else "reference audio transcript"
+        cli_args = ["--ref-audio", str(ref_audio)]
+        cli_args += ["--ref-text", transcript]
+        cli_args += ["--gen-text", text]
+        cli_args += ["--out", str(out_path)]
+        self._run(cli_args)
+        return out_path
+
+
+class CosyVoiceEngine(SubprocessEngine):
+    """CosyVoice engine: runs ``cosy_infer.py`` inside the ``cosyvoice`` venv."""
+
+    name = "cosyvoice"
+
+    def __init__(self, model_dir: Path, prompt_prefix: str) -> None:
+        super().__init__("cosyvoice", "cosy_infer.py")
+        self._model_dir = model_dir
+        self._prompt_prefix = prompt_prefix
+
+    def synthesize(
+        self,
+        text: str,
+        ref_audio: Path,
+        ref_text: str,
+        out_path: Path,
+    ) -> Path:
+        """Synthesize ``text`` to ``out_path`` and return ``out_path``.
+
+        The model directory and prompt prefix given at construction are
+        forwarded to the worker; a falsy ``ref_text`` is sent as "".
+        """
+        out_path.parent.mkdir(parents=True, exist_ok=True)
+        prompt_text = ref_text if ref_text else ""
+        cli_args = ["--ref-audio", str(ref_audio)]
+        cli_args += ["--gen-text", text]
+        cli_args += ["--prompt-text", prompt_text]
+        cli_args += ["--out", str(out_path)]
+        cli_args += ["--model-dir", str(self._model_dir)]
+        cli_args += ["--prompt-prefix", self._prompt_prefix]
+        self._run(cli_args)
+        return out_path
+
+
+def create_engine(model: str, model_dir: Path, prompt_prefix: str) -> TTSEngine:
+    """Instantiate the engine selected by ``model``.
+
+    ``model_dir`` and ``prompt_prefix`` are only used for "cosyvoice".
+
+    Raises:
+        ValueError: ``model`` is neither "cosyvoice" nor "f5_tts".
+    """
+    factories = {
+        "f5_tts": lambda: F5TTSEngine(),
+        "cosyvoice": lambda: CosyVoiceEngine(model_dir, prompt_prefix),
+    }
+    factory = factories.get(model)
+    if factory is None:
+        raise ValueError(f"Unknown model: {model}. Use cosyvoice or f5_tts.")
+    return factory()
--- a/backend/app/tts/service.py
+++ b/backend/app/tts/service.py
@@ -0,0 +1,97 @@
+from __future__ import annotations
+
+import uuid
+import wave
+from pathlib import Path
+
+from backend.app.config import AppSettings, get_settings, project_root
+from backend.app.text_preprocess import split_sentences
+from backend.app.tts.engines_subprocess import create_engine
+
+# Project root directory, as reported by backend.app.config.
+ROOT = project_root()
+
+
+class TTSService:
+    """Chunk text, run the configured engine per chunk, and join the audio."""
+
+    def __init__(self, settings: AppSettings | None = None) -> None:
+        """Create the engine named by the settings and ensure the output and
+        upload directories exist."""
+        self.settings = settings or get_settings()
+        self.engine = create_engine(
+            self.settings.tts_model,
+            self.settings.cosyvoice_model_dir,
+            self.settings.cosyvoice_prompt_prefix,
+        )
+        self.settings.outputs_dir.mkdir(parents=True, exist_ok=True)
+        self.settings.uploads_dir.mkdir(parents=True, exist_ok=True)
+
+    @staticmethod
+    def _locate_default_audio(configured: str) -> Path:
+        """Resolve the configured default reference audio to an existing file.
+
+        A relative value is tried as given and then relative to the project
+        root, so it works regardless of the server's working directory.
+        """
+        path = Path(configured)
+        candidates = [path] if path.is_absolute() else [path, ROOT / path]
+        for candidate in candidates:
+            if candidate.is_file():
+                return candidate
+        raise FileNotFoundError(f"TTS_REF_AUDIO 파일 없음: {configured}")
+
+    def resolve_reference(
+        self,
+        ref_audio: Path | None = None,
+        ref_text: str | None = None,
+    ) -> tuple[Path, str]:
+        """Pick the reference audio and its transcript.
+
+        Audio: ``ref_audio`` when it is an existing file, else the configured
+        default, else the first ``*.wav`` (sorted) in the samples directory.
+        Transcript: ``ref_text``, else the configured default, else a ``.txt``
+        next to the audio, else ``samples/my_voice_ref.txt``. For the
+        ``f5_tts`` model an empty transcript is replaced by a placeholder.
+
+        Raises:
+            FileNotFoundError: the configured default audio does not exist,
+                or no reference WAV is available at all.
+        """
+        if ref_audio and ref_audio.is_file():
+            audio = ref_audio
+        elif self.settings.default_ref_audio:
+            # Fail early with a clear error instead of handing a missing
+            # file to the worker process.
+            audio = self._locate_default_audio(self.settings.default_ref_audio)
+        else:
+            samples = sorted(self.settings.samples_dir.glob("*.wav"))
+            if not samples:
+                raise FileNotFoundError(
+                    "reference WAV 없음. samples/에 녹음하거나 TTS_REF_AUDIO 설정"
+                )
+            audio = samples[0]
+
+        text = ref_text or self.settings.default_ref_text or ""
+        if not text:
+            for candidate in (
+                audio.with_suffix(".txt"),
+                self.settings.samples_dir / "my_voice_ref.txt",
+            ):
+                if candidate.is_file():
+                    text = candidate.read_text(encoding="utf-8").strip()
+                    break
+        if not text and self.settings.tts_model == "f5_tts":
+            text = "참조 음성의 대본을 samples/my_voice_ref.txt 에 저장하세요."
+        return audio, text
+
+    def synthesize_to_file(
+        self,
+        text: str,
+        ref_audio: Path | None = None,
+        ref_text: str | None = None,
+        job_id: str | None = None,
+    ) -> tuple[str, Path]:
+        """Synthesize ``text`` into ``<outputs_dir>/<job_id>/output.wav``.
+
+        The text is split into chunks, each chunk is rendered to
+        ``part_NNN.wav``, and the parts are combined: a single part is
+        renamed to ``output.wav``, several parts are concatenated.
+
+        Returns:
+            ``(job_id, path_to_output_wav)``; a 12-character hex id is
+            generated when ``job_id`` is not given.
+        """
+        ref_path, ref_txt = self.resolve_reference(ref_audio, ref_text)
+        chunks = split_sentences(text, self.settings.chunk_max_chars)
+        job_id = job_id or uuid.uuid4().hex[:12]
+        job_dir = self.settings.outputs_dir / job_id
+        job_dir.mkdir(parents=True, exist_ok=True)
+
+        chunk_paths: list[Path] = []
+        for i, chunk in enumerate(chunks):
+            out = job_dir / f"part_{i:03d}.wav"
+            self.engine.synthesize(chunk, ref_path, ref_txt, out)
+            chunk_paths.append(out)
+
+        final = job_dir / "output.wav"
+        if len(chunk_paths) == 1:
+            chunk_paths[0].replace(final)
+        else:
+            _concat_wav(chunk_paths, final)
+
+        return job_id, final
+
+
+def _concat_wav(paths: list[Path], out: Path) -> None:
+    """Concatenate WAV files that share one audio format into ``out``.
+
+    Files must agree on channel count, sample width, frame rate and
+    compression type; their lengths may differ. The parent directory of
+    ``out`` is created if needed.
+
+    Raises:
+        ValueError: ``paths`` is empty, or a file's format differs from the
+            first file's.
+    """
+    if not paths:
+        raise ValueError("no WAV files to concatenate")
+
+    fmt: tuple | None = None
+    frames: list[bytes] = []
+    for p in paths:
+        with wave.open(str(p), "rb") as w:
+            params = w.getparams()
+            # Compare only the format fields: getparams() also carries
+            # nframes, which legitimately differs between chunks.
+            this_fmt = (
+                params.nchannels,
+                params.sampwidth,
+                params.framerate,
+                params.comptype,
+            )
+            if fmt is None:
+                fmt = this_fmt
+            elif this_fmt != fmt:
+                raise ValueError(f"WAV format mismatch: {p}")
+            frames.append(w.readframes(w.getnframes()))
+
+    out.parent.mkdir(parents=True, exist_ok=True)
+    with wave.open(str(out), "wb") as wo:
+        wo.setnchannels(fmt[0])
+        wo.setsampwidth(fmt[1])
+        wo.setframerate(fmt[2])
+        wo.writeframes(b"".join(frames))
--- a/backend/data/uploads/.gitkeep
+++ b/backend/data/uploads/.gitkeep
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -0,0 +1,9 @@
+fastapi>=0.115.0
+uvicorn[standard]>=0.32.0
+python-multipart>=0.0.12
+pydantic>=2.9.0
+pydantic-settings>=2.6.0
+pyyaml>=6.0.2
+aiofiles>=24.1.0
+soundfile>=0.12.1
+librosa>=0.10.2