Initial commit: Korean voice-cloning TTS prototype

FastAPI backend, web UI, CosyVoice3/F5-TTS setup scripts, and handoff docs for GPU PC continuation. Co-authored-by: Cursor <cursoragent@cursor.com>
2026-06-04 13:36:37 +09:00
commit 7101fdcd65
36 changed files with 1937 additions and 0 deletions
--- a/.env.example
+++ b/.env.example
@@ -0,0 +1,9 @@
 # 기본 TTS 엔진: cosyvoice | f5_tts
 TTS_MODEL=cosyvoice
 # 기본 reference (선택)
 # TTS_REF_AUDIO=samples/my_voice_30s.wav
 # TTS_REF_TEXT=참조 음성 대본...
 TTS_HOST=0.0.0.0
 TTS_PORT=8000
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,16 @@
 .venvs/
 venv/
 __pycache__/
 *.py[cod]
 .env
 *.wav
 !samples/.gitkeep
 outputs/
 models/
 external/CosyVoice/
 *.egg-info/
 .DS_Store
 .pytest_cache/
 .mypy_cache/
 backend/data/uploads/*
 !backend/data/uploads/.gitkeep
--- a/README.md
+++ b/README.md
@@ -0,0 +1,81 @@
 # 한국어 보이스 클로닝 TTS
 한국어 자연스러움과 **내 목소리 유사도**를 우선하는 로컬 TTS 프로토타입입니다.  
 기본 엔진은 **CosyVoice3**, 비교·대안으로 **F5-TTS**를 지원합니다.
 ## 빠른 시작 (NVIDIA GPU Linux)
 ```bash
 cd /path/to/tts
 # 1) 환경 점검
 chmod +x scripts/*.sh
 ./scripts/check_env.sh
 # 2) API + 모델 venv (모델별 격리)
 ./scripts/setup_api.sh
 ./scripts/setup_f5tts.sh      # 약 5~15분, GPU/CUDA 필요
 ./scripts/setup_cosyvoice.sh  # 레포 클론 + 모델 다운로드, 시간 소요
 # 3) A/B 비교 (동일 텍스트·reference)
 ./scripts/run_ab_compare.py --ref-audio auto
 # 본인 목소리:
 # samples/my_voice_30s.wav 녹음 + my_voice_ref.txt 작성 후
 ./scripts/run_ab_compare.py --ref-audio samples/my_voice_30s.wav
 # 4) API + 웹 UI
 cp .env.example .env   # 필요 시 TTS_MODEL 수정
 ./scripts/run_server.sh
 # 브라우저: http://localhost:8000
 ```
 ## 디렉터리
 | 경로 | 설명 |
 |------|------|
 | `config/` | 설정, 테스트 문장, 모델 선택 |
 | `samples/` | reference WAV + 대본 |
 | `outputs/` | A/B 비교 및 API 생성 결과 |
 | `models/` | CosyVoice3 체크포인트 |
 | `backend/` | FastAPI |
 | `web/` | 간단한 웹 UI |
 | `.venvs/f5tts`, `.venvs/cosyvoice`, `.venvs/api` | 격리 Python 환경 |
 ## 모델 선택
 `config/model_choice.json` — 품질 우선 시 **cosyvoice** 권장.  
 F5-TTS가 더 나으면 `.env`에서 `TTS_MODEL=f5_tts`로 변경.
 ## API
 | Method | Path | 설명 |
 |--------|------|------|
 | GET | `/api/health` | 상태 |
 | POST | `/api/tts` | 텍스트 → WAV |
 | GET | `/api/audio/{job_id}` | 결과 재생 |
 | GET | `/api/voice-samples` | 샘플 목록 |
 | POST | `/api/voice-sample` | WAV 업로드 |
 ### 예시
 ```bash
 curl -X POST http://localhost:8000/api/tts \
  -H "Content-Type: application/json" \
  -d '{"text":"안녕하세요. 테스트입니다."}'
 ```
 ## 내 목소리 녹음
 [samples/README.md](samples/README.md) 참고.  
 30초 / 1분 / 3분 샘플을 각각 녹음해 A/B 비교하는 것을 권장합니다.
 ## 문제 해결
 - `nvidia-smi` 없음 → NVIDIA 드라이버/CUDA 설치 후 재시도
 - CosyVoice import/sox 오류 → `sudo apt install sox libsox-dev`
 - F5-TTS `ref_text` 필수 → `samples/my_voice_ref.txt` 작성
 - API 503 → 해당 모델 venv setup 스크립트 재실행
 ## 라이선스
 각 모델(F5-TTS, CosyVoice)의 원저작권·라이선스를 따릅니다.
--- a/backend/app/__init__.py
+++ b/backend/app/__init__.py
@@ -0,0 +1 @@
 """Korean voice-cloning TTS API."""
--- a/backend/app/config.py
+++ b/backend/app/config.py
@@ -0,0 +1,65 @@
 from __future__ import annotations
 from functools import lru_cache
 from pathlib import Path
 import yaml
 from pydantic import Field
 from pydantic_settings import BaseSettings, SettingsConfigDict
 ROOT = Path(__file__).resolve().parents[2]
 class AppSettings(BaseSettings):
    """Runtime configuration for the TTS API.

    Built on pydantic-settings: the project-root ``.env`` file is read as
    UTF-8 and unknown keys are ignored. Fields that declare a
    ``validation_alias`` are keyed by that upper-case name (e.g. ``TTS_MODEL``).
    """
    model_config = SettingsConfigDict(
        env_file=str(ROOT / ".env"),
        env_file_encoding="utf-8",
        extra="ignore",
    )
    # Engine to use; create_engine accepts "cosyvoice" or "f5_tts".
    tts_model: str = Field(default="cosyvoice", validation_alias="TTS_MODEL")
    # Bind address and port for the HTTP server.
    host: str = Field(default="0.0.0.0", validation_alias="TTS_HOST")
    port: int = Field(default=8000, validation_alias="TTS_PORT")
    # Directories for bundled reference samples, generated audio and user uploads.
    samples_dir: Path = Field(default=ROOT / "samples")
    outputs_dir: Path = Field(default=ROOT / "outputs" / "api")
    uploads_dir: Path = Field(default=ROOT / "backend" / "data" / "uploads")
    # Optional default reference audio path and its transcript.
    default_ref_audio: str | None = Field(default=None, validation_alias="TTS_REF_AUDIO")
    default_ref_text: str | None = Field(default=None, validation_alias="TTS_REF_TEXT")
    # CosyVoice checkpoint directory and the prefix handed to the worker as --prompt-prefix.
    cosyvoice_model_dir: Path = Field(default=ROOT / "models" / "Fun-CosyVoice3-0.5B")
    cosyvoice_prompt_prefix: str = (
        "You are a helpful assistant.<|endofprompt|>"
    )
    # Maximum characters per synthesis chunk (used by the sentence splitter).
    chunk_max_chars: int = 120
@lru_cache
 def get_settings() -> AppSettings:
    yaml_path = ROOT / "config" / "settings.yaml"
    data: dict = {}
    if yaml_path.is_file():
        with open(yaml_path, encoding="utf-8") as f:
            raw = yaml.safe_load(f) or {}
        data["tts_model"] = raw.get("default_model", "cosyvoice")
        gen = raw.get("generation") or {}
        data["chunk_max_chars"] = gen.get("chunk_max_chars", 120)
        cv = raw.get("cosyvoice") or {}
        if cv.get("model_dir"):
            data["cosyvoice_model_dir"] = ROOT / cv["model_dir"]
        if cv.get("prompt_prefix"):
            data["cosyvoice_prompt_prefix"] = cv["prompt_prefix"]
        srv = raw.get("server") or {}
        data["host"] = srv.get("host", "0.0.0.0")
        data["port"] = srv.get("port", 8000)
        paths = raw.get("paths") or {}
        if paths.get("samples_dir"):
            data["samples_dir"] = ROOT / paths["samples_dir"]
        if paths.get("outputs_dir"):
            data["outputs_dir"] = ROOT / paths["outputs_dir"] / "api"
        if paths.get("uploads_dir"):
            data["uploads_dir"] = ROOT / paths["uploads_dir"]
    return AppSettings(**{k: v for k, v in data.items() if v is not None})
 def project_root() -> Path:
    """Return the repository root (two directories above this package)."""
    return ROOT
--- a/backend/app/main.py
+++ b/backend/app/main.py
@@ -0,0 +1,170 @@
 from __future__ import annotations
 import shutil
 import uuid
 from pathlib import Path
 from fastapi import FastAPI, File, Form, HTTPException, UploadFile
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import FileResponse
 from fastapi.staticfiles import StaticFiles
 from pydantic import BaseModel, Field
 from backend.app.config import get_settings, project_root
 from backend.app.text_preprocess import preprocess_korean
 from backend.app.tts.service import TTSService
 # Repository root and the directory that holds the static web UI.
 ROOT = project_root()
 WEB_DIR = ROOT / "web"
 app = FastAPI(
    title="Korean Voice Cloning TTS",
    description="CosyVoice / F5-TTS 기반 한국어 보이스 클로닝 API",
    version="0.1.0",
 )
 # NOTE(review): CORS is open to every origin, method and header. Fine for a
 # local prototype; tighten before exposing the server beyond localhost.
 app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
 )
 # Process-wide TTSService instance; populated on first use by get_tts().
 _tts: TTSService | None = None
 def get_tts() -> TTSService:
    """Return the shared TTSService, constructing it on the first call."""
    global _tts
    service = _tts
    if service is None:
        service = _tts = TTSService()
    return service
 # Request body for POST /api/tts.
 class TTSRequest(BaseModel):
    # Text to synthesize; validated to 1-5000 characters.
    text: str = Field(..., min_length=1, max_length=5000)
    # Optional reference WAV; relative paths are looked up under the samples
    # and uploads directories by create_tts.
    ref_audio: str | None = Field(
        default=None, description="samples/ 또는 uploads/ 기준 상대/절대 경로"
    )
    # Optional transcript of the reference audio.
    ref_text: str | None = None
    # When true, create_tts runs preprocess_korean on the text before synthesis.
    preprocess: bool = True
 # Response body for POST /api/tts.
 class TTSResponse(BaseModel):
    # Identifier of the synthesis job (its output directory name).
    job_id: str
    # Relative URL from which the generated WAV can be fetched.
    audio_url: str
    # Name of the engine configured when the request was served.
    model: str
    # First 80 characters of the synthesized text, with an ellipsis if truncated.
    text_preview: str
 # Response body for GET /api/health.
 class HealthResponse(BaseModel):
    # Always "ok" when the endpoint responds.
    status: str
    # Configured engine name.
    model: str
    # Number of *.wav files found directly in the samples directory.
    samples_count: int
@app.get("/api/health", response_model=HealthResponse)
 def health() -> HealthResponse:
    s = get_settings()
    samples = list(s.samples_dir.glob("*.wav"))
    return HealthResponse(
        status="ok",
        model=s.tts_model,
        samples_count=len(samples),
    )
@app.post("/api/tts", response_model=TTSResponse)
 def create_tts(body: TTSRequest) -> TTSResponse:
    text = preprocess_korean(body.text) if body.preprocess else body.text.strip()
    if not text:
        raise HTTPException(400, "text is empty")
    ref_path: Path | None = None
    if body.ref_audio:
        p = Path(body.ref_audio)
        if not p.is_absolute():
            for base in (get_settings().samples_dir, get_settings().uploads_dir):
                candidate = base / p
                if candidate.is_file():
                    p = candidate
                    break
        if not p.is_file():
            raise HTTPException(404, f"ref_audio not found: {body.ref_audio}")
        ref_path = p
    try:
        job_id, _ = get_tts().synthesize_to_file(
            text, ref_audio=ref_path, ref_text=body.ref_text
        )
    except FileNotFoundError as e:
        raise HTTPException(404, str(e)) from e
    except RuntimeError as e:
        raise HTTPException(503, str(e)) from e
    return TTSResponse(
        job_id=job_id,
        audio_url=f"/api/audio/{job_id}",
        model=get_settings().tts_model,
        text_preview=text[:80] + ("…" if len(text) > 80 else ""),
    )
@app.get("/api/audio/{job_id}")
 def get_audio(job_id: str) -> FileResponse:
    path = get_settings().outputs_dir / job_id / "output.wav"
    if not path.is_file():
        alt = get_settings().outputs_dir / job_id / "part_000.wav"
        path = alt if alt.is_file() else path
    if not path.is_file():
        raise HTTPException(404, "audio not found")
    return FileResponse(path, media_type="audio/wav", filename=f"{job_id}.wav")
@app.get("/api/voice-samples")
 def list_voice_samples() -> dict:
    s = get_settings()
    samples = []
    for d, label in ((s.samples_dir, "samples"), (s.uploads_dir, "uploads")):
        for wav in sorted(d.glob("*.wav")):
            txt = wav.with_suffix(".txt")
            samples.append(
                {
                    "id": wav.stem,
                    "path": str(wav),
                    "label": label,
                    "has_transcript": txt.is_file(),
                }
            )
    return {"samples": samples, "default_model": s.tts_model}
@app.post("/api/voice-sample")
 async def upload_voice_sample(
    file: UploadFile = File(...),
    ref_text: str = Form(""),
 ) -> dict:
    if not file.filename or not file.filename.lower().endswith(".wav"):
        raise HTTPException(400, "WAV 파일만 업로드 가능합니다")
    sample_id = uuid.uuid4().hex[:10]
    dest = get_settings().uploads_dir / f"{sample_id}.wav"
    with open(dest, "wb") as f:
        shutil.copyfileobj(file.file, f)
    if ref_text.strip():
        (dest.with_suffix(".txt")).write_text(ref_text.strip(), encoding="utf-8")
    return {
        "id": sample_id,
        "path": str(dest),
        "message": "업로드 완료. TTS 요청 시 ref_audio에 이 path를 사용하세요.",
    }
 # Serve the static web UI at "/" when the web/ directory exists. This runs
 # after the /api routes above have been declared.
 if WEB_DIR.is_dir():
    app.mount("/", StaticFiles(directory=str(WEB_DIR), html=True), name="web")
@app.on_event("startup")
 def startup() -> None:
    get_settings().outputs_dir.mkdir(parents=True, exist_ok=True)
    get_settings().uploads_dir.mkdir(parents=True, exist_ok=True)
--- a/backend/app/text_preprocess.py
+++ b/backend/app/text_preprocess.py
@@ -0,0 +1,95 @@
 """한국어 TTS용 간단한 텍스트 정규화."""
 from __future__ import annotations
 import re
 _RE_MULTI_SPACE = re.compile(r"\s+")
 _RE_EMAIL = re.compile(r"[\w.+-]+@[\w.-]+\.\w+")
 _RE_URL = re.compile(r"https?://\S+")
 def _digits_to_korean(num_str: str) -> str:
    """Convert a non-negative integer string to its Sino-Korean reading.

    Thousands separators (",") are ignored. Input that is not purely decimal
    digits, or that has more than 16 significant digits (beyond the 조 unit),
    is returned unchanged.

    Examples: "1" -> "일", "11" -> "십일", "10000" -> "만",
    "1250000" -> "백이십오만", "100000000" -> "일억".
    """
    digits = num_str.replace(",", "")
    # isdecimal() only admits characters int() can parse (unlike isdigit(),
    # which also accepts e.g. superscript digits).
    if not digits.isdecimal():
        return num_str
    units = ["", "만", "억", "조"]
    small = ["", "일", "이", "삼", "사", "오", "육", "칠", "팔", "구"]
    ten = ["", "십", "백", "천"]
    significant = digits.lstrip("0")
    # Four digits per unit: longer numbers cannot be named with `units`, so
    # leave them untouched instead of silently dropping the high digits.
    if len(significant) > 4 * len(units):
        return num_str
    n = int(significant or "0")
    if n == 0:
        return "영"
    def chunk_to_korean(x: int) -> str:
        """Read a value in 1..9999 (e.g. 2024 -> "이천이십사")."""
        parts: list[str] = []
        for i, d in enumerate(f"{x:04d}"):
            di = int(d)
            if di == 0:
                continue
            place = ten[3 - i]
            if di == 1 and place:
                # "일" is silent before 십/백/천 ...
                parts.append(place)
            else:
                # ... but must be spoken in the ones place (11 -> "십일").
                parts.append(small[di] + place)
        return "".join(parts)
    result: list[str] = []
    for unit in units:
        n, part = divmod(n, 10000)
        if part == 1 and unit == "만":
            # 10000 is read "만", whereas 억/조 keep the leading "일".
            result.append(unit)
        elif part:
            result.append(chunk_to_korean(part) + unit)
        if n == 0:
            break
    return "".join(reversed(result))
 def _replace_numbers(text: str) -> str:
    """Replace every integer in ``text`` with its Korean reading.

    Commas are treated as part of a number only when they form proper
    thousands groups ("1,250,000"). A comma that merely follows or separates
    numbers ("3, 4" or "1,2,3") is punctuation and is left in place.
    """
    def repl(m: re.Match[str]) -> str:
        return _digits_to_korean(m.group(0).replace(",", ""))
    # First alternative: comma-grouped thousands; second: a plain digit run.
    return re.sub(r"\d{1,3}(?:,\d{3})+(?!\d)|\d+", repl, text)
 def preprocess_korean(text: str) -> str:
    """Normalize text for Korean TTS.

    URLs and e-mail addresses become the spoken placeholders "링크" / "이메일",
    "&" and "%" are spelled out, integers are converted to Korean readings,
    and runs of whitespace collapse to single spaces.
    """
    cleaned = _RE_URL.sub(" 링크 ", text.strip())
    cleaned = _RE_EMAIL.sub(" 이메일 ", cleaned)
    for symbol, spoken in (("&", " 앤드 "), ("%", " 퍼센트 ")):
        cleaned = cleaned.replace(symbol, spoken)
    cleaned = _replace_numbers(cleaned)
    return _RE_MULTI_SPACE.sub(" ", cleaned).strip()
 def split_sentences(text: str, max_chars: int = 120) -> list[str]:
    """Split long text into synthesis chunks of at most ``max_chars`` characters.

    The text is first passed through preprocess_korean, then split after
    sentence-ending punctuation. Consecutive sentences are packed into one
    chunk while they fit; a single sentence longer than ``max_chars`` is cut
    into fixed-size slices. Returns ``[text]`` when nothing was produced.
    """
    normalized = preprocess_korean(text)
    pieces = (piece.strip() for piece in re.split(r"(?<=[.!?…])\s+|\n+", normalized))
    out: list[str] = []
    pending = ""
    for piece in pieces:
        if not piece:
            continue
        if len(pending) + len(piece) + 1 <= max_chars:
            pending = f"{pending} {piece}".strip() if pending else piece
            continue
        # The piece does not fit: flush what has been gathered so far.
        if pending:
            out.append(pending)
        if len(piece) <= max_chars:
            pending = piece
            continue
        out.extend(piece[start : start + max_chars] for start in range(0, len(piece), max_chars))
        pending = ""
    if pending:
        out.append(pending)
    return out or [text]
--- a/backend/app/tts/__init__.py
+++ b/backend/app/tts/__init__.py
@@ -0,0 +1,3 @@
 from backend.app.tts.service import TTSService
 __all__ = ["TTSService"]
--- a/backend/app/tts/base.py
+++ b/backend/app/tts/base.py
@@ -0,0 +1,18 @@
 from __future__ import annotations
 from abc import ABC, abstractmethod
 from pathlib import Path
 class TTSEngine(ABC):
    """Abstract interface implemented by every TTS backend."""
    # Engine identifier; concrete engines define it as a class attribute.
    name: str
    @abstractmethod
    def synthesize(
        self,
        text: str,
        ref_audio: Path,
        ref_text: str,
        out_path: Path,
    ) -> Path:
        """Generate a WAV file for a single text chunk.

        Args:
            text: The chunk to speak.
            ref_audio: Reference recording of the voice to clone.
            ref_text: Transcript of the reference recording.
            out_path: Destination WAV path.

        Returns:
            The path of the written WAV file.
        """
--- a/backend/app/tts/engines_subprocess.py
+++ b/backend/app/tts/engines_subprocess.py
@@ -0,0 +1,101 @@
 from __future__ import annotations
 import subprocess
 import sys
 from pathlib import Path
 from backend.app.config import project_root
 from backend.app.tts.base import TTSEngine
 ROOT = project_root()
 class SubprocessEngine(TTSEngine):
    """Base for engines that run a worker script inside a dedicated venv.

    The interpreter is ``.venvs/<venv_name>/bin/python`` and the script is
    ``scripts/workers/<worker_name>``, both relative to the project root.
    """
    def __init__(self, venv_name: str, worker_name: str) -> None:
        venv_root = ROOT / ".venvs" / venv_name
        self._python = venv_root / "bin" / "python"
        self._worker = ROOT / "scripts" / "workers" / worker_name
    def _run(self, args: list[str]) -> None:
        """Run the worker with ``args`` from the project root.

        Raises:
            RuntimeError: the venv interpreter is missing, or the worker
                exited non-zero (its stderr, else stdout, is included).
        """
        if not self._python.is_file():
            venv = self._python.parent.parent.name
            raise RuntimeError(f"{venv} venv 없음. scripts/setup_{venv}.sh 실행")
        completed = subprocess.run(
            [str(self._python), str(self._worker), *args],
            cwd=str(ROOT),
            capture_output=True,
            text=True,
        )
        if completed.returncode == 0:
            return
        detail = completed.stderr or completed.stdout
        raise RuntimeError(f"{self.name} inference failed:\n{detail}")
 class F5TTSEngine(SubprocessEngine):
    """F5-TTS backend, executed via ``f5_infer.py`` in the ``f5tts`` venv."""
    name = "f5_tts"
    def __init__(self) -> None:
        super().__init__("f5tts", "f5_infer.py")
    def synthesize(
        self,
        text: str,
        ref_audio: Path,
        ref_text: str,
        out_path: Path,
    ) -> Path:
        """Synthesize ``text`` into ``out_path`` and return that path.

        An empty ``ref_text`` is replaced by a generic placeholder transcript.
        Raises RuntimeError (from ``_run``) when the worker fails.
        """
        out_path.parent.mkdir(parents=True, exist_ok=True)
        transcript = ref_text or "reference audio transcript"
        # Values are attached as "--flag=value" so text that begins with "-"
        # (e.g. a bullet line) cannot be mistaken for an option.
        # NOTE(review): assumes the worker accepts the "=" form, as argparse
        # does — confirm against scripts/workers/f5_infer.py.
        self._run(
            [
                f"--ref-audio={ref_audio}",
                f"--ref-text={transcript}",
                f"--gen-text={text}",
                f"--out={out_path}",
            ]
        )
        return out_path
 class CosyVoiceEngine(SubprocessEngine):
    """CosyVoice backend, executed via ``cosy_infer.py`` in the ``cosyvoice`` venv."""
    name = "cosyvoice"
    def __init__(self, model_dir: Path, prompt_prefix: str) -> None:
        super().__init__("cosyvoice", "cosy_infer.py")
        self._model_dir = model_dir
        self._prompt_prefix = prompt_prefix
    def synthesize(
        self,
        text: str,
        ref_audio: Path,
        ref_text: str,
        out_path: Path,
    ) -> Path:
        """Synthesize ``text`` into ``out_path`` and return that path.

        ``ref_text`` is forwarded as the prompt text (empty string when
        falsy), together with the configured model directory and prompt
        prefix. Raises RuntimeError (from ``_run``) when the worker fails.
        """
        out_path.parent.mkdir(parents=True, exist_ok=True)
        # Values are attached as "--flag=value" so text that begins with "-"
        # (e.g. a bullet line) cannot be mistaken for an option.
        # NOTE(review): assumes the worker accepts the "=" form, as argparse
        # does — confirm against scripts/workers/cosy_infer.py.
        self._run(
            [
                f"--ref-audio={ref_audio}",
                f"--gen-text={text}",
                f"--prompt-text={ref_text or ''}",
                f"--out={out_path}",
                f"--model-dir={self._model_dir}",
                f"--prompt-prefix={self._prompt_prefix}",
            ]
        )
        return out_path
 def create_engine(model: str, model_dir: Path, prompt_prefix: str) -> TTSEngine:
    """Instantiate the engine named by ``model``.

    ``model_dir`` and ``prompt_prefix`` are only used by the CosyVoice engine.
    Raises ValueError for any name other than "cosyvoice" or "f5_tts".
    """
    factories = {
        "f5_tts": lambda: F5TTSEngine(),
        "cosyvoice": lambda: CosyVoiceEngine(model_dir, prompt_prefix),
    }
    factory = factories.get(model)
    if factory is None:
        raise ValueError(f"Unknown model: {model}. Use cosyvoice or f5_tts.")
    return factory()
--- a/backend/app/tts/service.py
+++ b/backend/app/tts/service.py
@@ -0,0 +1,97 @@
 from __future__ import annotations
 import uuid
 import wave
 from pathlib import Path
 from backend.app.config import AppSettings, get_settings, project_root
 from backend.app.text_preprocess import split_sentences
 from backend.app.tts.engines_subprocess import create_engine
 ROOT = project_root()
 class TTSService:
    """Synthesis pipeline: pick a reference voice, chunk the text, run the engine."""
    def __init__(self, settings: AppSettings | None = None) -> None:
        """Create the configured engine and ensure the output/upload dirs exist."""
        self.settings = settings or get_settings()
        self.engine = create_engine(
            self.settings.tts_model,
            self.settings.cosyvoice_model_dir,
            self.settings.cosyvoice_prompt_prefix,
        )
        self.settings.outputs_dir.mkdir(parents=True, exist_ok=True)
        self.settings.uploads_dir.mkdir(parents=True, exist_ok=True)
    def resolve_reference(
        self,
        ref_audio: Path | None = None,
        ref_text: str | None = None,
    ) -> tuple[Path, str]:
        """Choose the reference audio and its transcript.

        Audio: ``ref_audio`` if it is an existing file, else the configured
        default, else the alphabetically first ``*.wav`` in the samples dir.
        Transcript: ``ref_text``, else the configured default, else a ``.txt``
        next to the audio, else ``samples/my_voice_ref.txt``. For the f5_tts
        model an instructional placeholder is used when none is found.

        Raises:
            FileNotFoundError: no reference audio is available, or the
                configured default does not exist.
        """
        if ref_audio and ref_audio.is_file():
            audio = ref_audio
        elif self.settings.default_ref_audio:
            audio = Path(self.settings.default_ref_audio)
            # Workers run with cwd=ROOT, so anchor relative paths there rather
            # than at whatever directory the server was started from.
            if not audio.is_absolute():
                audio = ROOT / audio
            if not audio.is_file():
                raise FileNotFoundError(f"TTS_REF_AUDIO not found: {audio}")
        else:
            samples = sorted(self.settings.samples_dir.glob("*.wav"))
            if not samples:
                raise FileNotFoundError(
                    "reference WAV 없음. samples/에 녹음하거나 TTS_REF_AUDIO 설정"
                )
            audio = samples[0]
        text = ref_text or self.settings.default_ref_text or ""
        if not text:
            for candidate in (
                audio.with_suffix(".txt"),
                self.settings.samples_dir / "my_voice_ref.txt",
            ):
                if candidate.is_file():
                    text = candidate.read_text(encoding="utf-8").strip()
                    break
        if not text and self.settings.tts_model == "f5_tts":
            text = "참조 음성의 대본을 samples/my_voice_ref.txt 에 저장하세요."
        return audio, text
    def synthesize_to_file(
        self,
        text: str,
        ref_audio: Path | None = None,
        ref_text: str | None = None,
        job_id: str | None = None,
    ) -> tuple[str, Path]:
        """Synthesize ``text`` into ``<outputs_dir>/<job_id>/output.wav``.

        The text is split into chunks, each chunk is rendered to
        ``part_NNN.wav``, and the parts are joined (a single part is simply
        renamed). Returns ``(job_id, output_path)``; a 12-character hex id is
        generated when ``job_id`` is not given.
        """
        ref_path, ref_txt = self.resolve_reference(ref_audio, ref_text)
        chunks = split_sentences(text, self.settings.chunk_max_chars)
        job_id = job_id or uuid.uuid4().hex[:12]
        job_dir = self.settings.outputs_dir / job_id
        job_dir.mkdir(parents=True, exist_ok=True)
        chunk_paths: list[Path] = []
        for i, chunk in enumerate(chunks):
            out = job_dir / f"part_{i:03d}.wav"
            self.engine.synthesize(chunk, ref_path, ref_txt, out)
            chunk_paths.append(out)
        final = job_dir / "output.wav"
        if len(chunk_paths) == 1:
            chunk_paths[0].replace(final)
        else:
            _concat_wav(chunk_paths, final)
        return job_id, final
 def _concat_wav(paths: list[Path], out: Path) -> None:
    """Concatenate same-format WAV files into ``out``.

    Files must agree on channel count, sample width, frame rate and
    compression type; their lengths may differ. The parent directory of
    ``out`` is created if needed.

    Raises:
        ValueError: ``paths`` is empty, or a file's format differs from the
            first file's.
    """
    if not paths:
        raise ValueError("no WAV files to concatenate")
    expected = None
    frames: list[bytes] = []
    for p in paths:
        with wave.open(str(p), "rb") as w:
            params = w.getparams()
            # Compare only the audio format: the full params tuple also holds
            # nframes, which legitimately differs from chunk to chunk.
            fmt = (params.nchannels, params.sampwidth, params.framerate, params.comptype)
            if expected is None:
                expected = fmt
            elif fmt != expected:
                raise ValueError(f"WAV format mismatch: {p}")
            frames.append(w.readframes(w.getnframes()))
    out.parent.mkdir(parents=True, exist_ok=True)
    with wave.open(str(out), "wb") as wo:
        nchannels, sampwidth, framerate, _ = expected
        wo.setnchannels(nchannels)
        wo.setsampwidth(sampwidth)
        wo.setframerate(framerate)
        for f in frames:
            wo.writeframes(f)
--- a/backend/data/uploads/.gitkeep
+++ b/backend/data/uploads/.gitkeep
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -0,0 +1,9 @@
 fastapi>=0.115.0
 uvicorn[standard]>=0.32.0
 python-multipart>=0.0.12
 pydantic>=2.9.0
 pydantic-settings>=2.6.0
 pyyaml>=6.0.2
 aiofiles>=24.1.0
 soundfile>=0.12.1
 librosa>=0.10.2
--- a/config/model_choice.json
+++ b/config/model_choice.json
@@ -0,0 +1,10 @@
 {
  "selected_model": "cosyvoice",
  "selection_criteria": [
    "korean_naturalness",
    "prosody",
    "speaker_similarity",
    "long_sentence_stability"
  ],
  "notes": "품질 우선 기준으로 CosyVoice3를 기본 엔진으로 사용합니다. F5-TTS는 scripts/run_ab_compare.py로 동일 조건 비교 후 변경 가능합니다."
 }
--- a/config/settings.yaml
+++ b/config/settings.yaml
@@ -0,0 +1,26 @@
 # TTS 프로토타입 설정 (한국어 품질 우선)
 default_model: cosyvoice  # cosyvoice | f5_tts
 paths:
  samples_dir: samples
  outputs_dir: outputs
  models_dir: models
  uploads_dir: backend/data/uploads
 cosyvoice:
  repo_dir: external/CosyVoice
  model_dir: models/Fun-CosyVoice3-0.5B
  # reference WAV에 대응하는 프롬프트 텍스트 (CosyVoice3 zero-shot 형식)
  prompt_prefix: "You are a helpful assistant.<|endofprompt|>"
 f5_tts:
  model: F5TTS_v1_Base
 generation:
  chunk_max_chars: 120
  cross_fade_duration: 0.15
  speed: 1.0
 server:
  host: 0.0.0.0
  port: 8000
--- a/config/test_sentences.json
+++ b/config/test_sentences.json
@@ -0,0 +1,29 @@
 {
  "cases": [
    {
      "id": "short",
      "label": "짧은 문장",
      "text": "안녕하세요. 오늘 날씨가 정말 좋네요."
    },
    {
      "id": "long",
      "label": "긴 문장",
      "text": "인공지능 음성 합성 기술은 짧은 문장뿐 아니라 긴 설명문에서도 자연스러운 억양과 호흡을 유지해야 하며, 특히 한국어에서는 조사와 어미 변화가 발음 품질에 큰 영향을 줍니다."
    },
    {
      "id": "numbers",
      "label": "숫자/단위",
      "text": "회의는 3월 15일 오후 2시 30분에 시작하며, 예산은 약 1,250,000원입니다."
    },
    {
      "id": "mixed",
      "label": "영어/기호 혼합",
      "text": "GitHub에서 API 키를 발급받은 뒤, README.md 파일을 확인해 주세요."
    },
    {
      "id": "emotion",
      "label": "감정/강조",
      "text": "정말 기뻐요! 드디어 프로젝트가 완성됐어요. 고생 많으셨습니다."
    }
  ]
 }
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -0,0 +1,28 @@
 # NVIDIA GPU 서버용 (모델 venv는 호스트에서 setup 후 볼륨 마운트 권장)
 services:
  tts-api:
    image: nvidia/cuda:12.4.1-runtime-ubuntu22.04
    working_dir: /app
    volumes:
      - .:/app
      - ./.venvs:/app/.venvs
      - ./models:/app/models
      - ./samples:/app/samples
      - ./outputs:/app/outputs
    ports:
      - "8000:8000"
    environment:
      - TTS_MODEL=cosyvoice
      - PYTHONPATH=/app
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    command: >
      bash -c "
        apt-get update -qq && apt-get install -y -qq python3 python3-venv python3-pip sox libsox-dev ffmpeg git &&
        ./scripts/run_server.sh
      "
--- a/docs/HANDOFF.md
+++ b/docs/HANDOFF.md
@@ -0,0 +1,274 @@
 # 다음 작업자 인수인계 문서
 이 문서는 한국어 보이스 클로닝 TTS 프로젝트를 다른 PC 또는 다음 AI가 이어받을 때 가장 먼저 읽어야 하는 문서입니다.
 ## 한 줄 요약
 한국어 자연스러움과 본인 목소리 유사도를 우선하는 로컬 TTS 프로토타입입니다. 기본 모델은 `CosyVoice3`, 비교 모델은 `F5-TTS`이며, FastAPI 백엔드와 간단한 웹 UI까지 만들어져 있습니다.
 ## 현재 작업 상태
 - 프로젝트 뼈대 생성 완료
 - FastAPI 서버 구현 완료
 - 웹 UI 구현 완료
 - 한국어 텍스트 전처리 구현 완료
 - 내 목소리 reference 녹음 가이드 작성 완료
 - `CosyVoice3` / `F5-TTS` A/B 비교 스크립트 작성 완료
 - 모델별 격리 venv 설치 스크립트 작성 완료
 - 현재 Mac에서 API/UI smoke test 완료
 - 실제 고품질 TTS 추론은 아직 NVIDIA GPU 환경에서 검증 필요
 ## 중요한 전제
 현재 작업한 Mac은 Apple Silicon 환경이고 `nvidia-smi`가 없습니다. 따라서 여기서는 API/UI 구조 확인은 가능하지만, 계획한 고품질 보이스 클로닝 추론은 Windows 또는 Linux의 NVIDIA GPU PC에서 이어가는 것이 맞습니다.
 메인 PC가 Windows + NVIDIA GPU라면 다음 순서로 진행하세요. 가능하면 WSL2 Ubuntu 환경을 권장합니다. Windows 네이티브도 가능할 수 있지만, `CosyVoice`는 Linux/WSL2 쪽이 문제 해결 자료가 많습니다.
 ## 깃에 올릴 때 주의
 올리면 좋은 것:
 - `README.md`
 - `docs/HANDOFF.md`
 - `backend/`
 - `web/`
 - `scripts/`
 - `config/`
 - `samples/README.md`
 - `samples/my_voice_ref.txt`
 - `.env.example`
 - `.gitignore`
 - `docker-compose.yml`
 - `tests/`
 올리지 말아야 할 것:
 - `.venvs/`
 - `models/`
 - `outputs/`
 - `external/CosyVoice/`
 - 실제 내 목소리 WAV 파일
 - `.env` 안에 개인 경로나 민감한 값이 들어간 경우
 현재 `.gitignore`에는 `.venvs/`, `models/`, `outputs/`, `external/CosyVoice/`, `.env`, WAV 파일 등이 제외되도록 설정되어 있습니다.
 ## 메인 PC에서 처음 할 일
 ```bash
 git clone <repo-url>
 cd tts
 chmod +x scripts/*.sh
 ./scripts/check_env.sh
 ```
 Windows라면 WSL2 Ubuntu에서 실행하는 것을 권장합니다.
 ```bash
 sudo apt update
 sudo apt install -y git python3 python3-venv python3-pip ffmpeg sox libsox-dev
 ```
 NVIDIA GPU가 제대로 잡히는지 확인하세요.
 ```bash
 nvidia-smi
 ```
 ## 환경 설치 순서
 API 서버용 venv:
 ```bash
 ./scripts/setup_api.sh
 ```
 F5-TTS 비교용 venv:
 ```bash
 ./scripts/setup_f5tts.sh
 ```
 CosyVoice3 기본 모델용 venv:
 ```bash
 ./scripts/setup_cosyvoice.sh
 ```
 `setup_cosyvoice.sh`는 `external/CosyVoice` 레포를 클론하고 `models/Fun-CosyVoice3-0.5B` 모델을 다운로드합니다. 시간이 오래 걸릴 수 있습니다.
 ## 현재 기본 모델
 기본값은 `cosyvoice`입니다.
 관련 파일:
 - `.env.example`
 - `.env`
 - `config/settings.yaml`
 - `config/model_choice.json`
 모델 변경:
 ```bash
 ./scripts/select_model.sh cosyvoice
 ./scripts/select_model.sh f5_tts
 ```
 또는 `.env`에서 직접 변경:
 ```env
 TTS_MODEL=cosyvoice
 ```
 ## 내 목소리 샘플 준비
 자세한 가이드는 `samples/README.md`에 있습니다.
 권장 파일:
 ```text
 samples/my_voice_30s.wav
 samples/my_voice_1m.wav
 samples/my_voice_3m.wav
 samples/my_voice_ref.txt
 ```
 중요:
 - WAV는 mono, 24kHz 또는 16kHz 권장
 - 녹음 대본과 `my_voice_ref.txt` 내용은 일치해야 함
 - 조용한 환경에서 마이크 거리 일정하게 유지
 - 30초, 1분, 3분 샘플을 각각 비교
 reference WAV 전처리:
 ```bash
 ./scripts/prepare_reference.sh samples/my_voice_30s.wav
 ```
 ## 모델 A/B 비교
 설치 검증용:
 ```bash
 ./scripts/run_ab_compare.py --ref-audio auto
 ```
 본인 목소리 비교:
 ```bash
 ./scripts/run_ab_compare.py --ref-audio samples/my_voice_30s.wav
 ```
 길이별 reference 비교:
 ```bash
 ./scripts/compare_voice_lengths.sh
 ```
 결과는 보통 아래에 생성됩니다.
 ```text
 outputs/ab_compare/
 outputs/voice_length_compare/
 ```
 평가 기준:
 - 한국어 발음 정확도
 - 조사/어미 자연스러움
 - 억양과 호흡
 - 내 목소리 유사도
 - 긴 문장 안정성
 - 숫자, 영어, 기호 포함 문장 처리
 ## 서버 실행
 ```bash
 cp .env.example .env
 ./scripts/run_server.sh
 ```
 브라우저:
 ```text
 http://localhost:8000
 ```
 주요 API:
 ```text
 GET  /api/health
 POST /api/tts
 GET  /api/audio/{job_id}
 GET  /api/voice-samples
 POST /api/voice-sample
 ```
 간단 테스트:
 ```bash
 curl -X POST http://localhost:8000/api/tts \
  -H "Content-Type: application/json" \
  -d '{"text":"안녕하세요. 한국어 음성 합성 테스트입니다."}'
 ```
 ## Mac에서 가능한 것과 불가능한 것
 현재 Mac에서 가능한 것:
 - FastAPI 서버 실행
 - 웹 UI 확인
 - `/api/health` 확인
 - 샘플 업로드 UI 확인
 - 텍스트 전처리 테스트
 현재 Mac에서 어려운 것:
 - `CosyVoice3` 고품질 추론
 - `F5-TTS` CUDA 기반 추론
 - 최종 품질 평가
 즉, Mac은 개발/구조 확인용이고, 최종 모델 품질 검증은 NVIDIA GPU PC에서 해야 합니다.
 ## 다음 AI에게 요청할 때 권장 문장
 다음 AI에게는 아래처럼 시작하면 됩니다.
 ```text
 먼저 docs/HANDOFF.md를 읽고, 현재 한국어 보이스 클로닝 TTS 프로젝트 상태를 파악한 뒤 이어서 작업해줘.
 목표는 Windows/NVIDIA GPU PC에서 CosyVoice3와 F5-TTS를 설치하고, 내 목소리 샘플로 A/B 비교 후 더 자연스러운 모델을 선택하는 거야.
 ```
 ## 다음 작업자가 우선 확인할 파일
 1. `docs/HANDOFF.md`
 2. `README.md`
 3. `config/settings.yaml`
 4. `config/model_choice.json`
 5. `.env.example`
 6. `samples/README.md`
 7. `scripts/setup_cosyvoice.sh`
 8. `scripts/setup_f5tts.sh`
 9. `scripts/run_ab_compare.py`
 10. `backend/app/main.py`
 ## 알려진 주의점
 - `FastAPI TestClient`는 현재 API venv에서 `httpx` 계열 패키지가 없어 직접 테스트가 실패했습니다. 대신 실제 `uvicorn` 서버를 띄우고 `/api/health`와 웹 UI 200 응답은 확인했습니다.
 - `scripts/setup_f5tts.sh`와 `scripts/setup_cosyvoice.sh`는 CUDA 12.4 PyTorch index를 사용합니다. 메인 PC CUDA 버전에 맞지 않으면 `cu124`를 `cu121`, `cu126`, `cu128` 등으로 조정해야 할 수 있습니다.
 - `CosyVoice`는 `sox`, `libsox-dev`, `ffmpeg`가 필요할 수 있습니다.
 - 실제 음성 파일은 개인정보성이 있으므로 깃에 올리지 않는 것을 권장합니다.
 - `.env`도 깃에 올리지 말고 `.env.example`만 공유하세요.
 ## 완료 기준
 이 프로젝트의 다음 큰 완료 기준은 다음과 같습니다.
 - NVIDIA GPU PC에서 `./scripts/setup_cosyvoice.sh` 성공
 - NVIDIA GPU PC에서 `./scripts/setup_f5tts.sh` 성공
 - 본인 목소리 샘플로 `./scripts/run_ab_compare.py` 실행 성공
 - `outputs/ab_compare/` 결과를 듣고 모델 선택
 - 선택 모델로 `./scripts/run_server.sh` 실행
 - 웹 UI에서 텍스트 입력 후 본인 목소리 WAV 재생 성공
--- a/samples/.gitkeep
+++ b/samples/.gitkeep
--- a/samples/README.md
+++ b/samples/README.md
@@ -0,0 +1,42 @@
 # Reference 음성 샘플
 내 목소리로 TTS를 만들려면 **조용한 환경**에서 아래 길이별로 녹음하세요.
 ## 권장 녹음 방식
 1. 마이크와 입 사이 거리를 일정하게 유지 (15~20cm)
 2. 평서문으로 자연스럽게 읽기 (연기·과장 금지)
 3. 포맷: **mono WAV, 24kHz** (또는 16kHz)
 4. 파일명 예시:
   - `my_voice_30s.wav`
   - `my_voice_1m.wav`
   - `my_voice_3m.wav`
 ## reference 텍스트
 녹음한 내용과 **동일한 대본**을 `my_voice_ref.txt`에 저장하세요.  
 F5-TTS는 이 텍스트가 필수이고, CosyVoice는 WAV만으로도 동작하지만 품질 비교 시 동일 샘플을 사용하세요.
 ### 예시 대본 (약 30초)
 ```
 안녕하세요. 저는 한국어 음성 합성 테스트를 위한 참조 음성을 녹음하고 있습니다.
 오늘은 날씨가 맑고, 목소리가 자연스럽게 들리도록 천천히 말하겠습니다.
 숫자도 포함해 볼게요. 회의는 3월 15일 오후 2시에 있습니다.
 ```
 ## 전처리
 ```bash
 ./scripts/prepare_reference.sh samples/my_voice_30s.wav
 ```
 ## 기본 샘플 (모델 설치 검증용)
 모델 설치 직후에는 F5-TTS 기본 예제 음성으로 먼저 테스트할 수 있습니다:
 ```bash
 ./scripts/run_ab_compare.py --ref-audio auto
 ```
 `auto`는 F5-TTS 패키지 내장 영어 샘플을 사용합니다. 한국어 품질 비교는 **본인 녹음 샘플**로 다시 실행하세요.
--- a/samples/my_voice_ref.txt
+++ b/samples/my_voice_ref.txt
@@ -0,0 +1,3 @@
 안녕하세요. 저는 한국어 음성 합성 테스트를 위한 참조 음성을 녹음하고 있습니다.
 오늘은 날씨가 맑고, 목소리가 자연스럽게 들리도록 천천히 말하겠습니다.
 숫자도 포함해 볼게요. 회의는 3월 15일 오후 2시에 있습니다.
--- a/scripts/check_env.sh
+++ b/scripts/check_env.sh
@@ -0,0 +1,77 @@
 #!/usr/bin/env bash
 # Environment check: OS, Python, NVIDIA GPU, CUDA toolkit, PyTorch in the API
 # venv, per-model venvs and the expected project directories.
 # NOTE(review): with `set -e`, a non-zero exit from nvidia-smi or nvcc aborts
 # the remaining checks — confirm that is the intended behavior.
 set -euo pipefail
 # Resolve the project root (parent of this script's directory) and work from it.
 ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
 cd "$ROOT"
 echo "=== TTS 환경 점검 ==="
 echo "프로젝트: $ROOT"
 echo
 echo "--- OS / CPU ---"
 uname -a
 echo
 echo "--- Python ---"
 if command -v python3 &>/dev/null; then
  python3 --version
  which python3
 else
  echo "python3: 없음"
 fi
 echo
 echo "--- NVIDIA GPU ---"
 if command -v nvidia-smi &>/dev/null; then
  nvidia-smi
  echo
  nvidia-smi --query-gpu=name,memory.total,driver_version --format=csv
 else
  echo "nvidia-smi: 사용 불가 (NVIDIA GPU 서버에서 실행하세요)"
 fi
 echo
 echo "--- CUDA (nvcc) ---"
 if command -v nvcc &>/dev/null; then
  nvcc --version | head -4
 else
  echo "nvcc: 없음 (PyTorch CUDA 빌드로도 동작 가능)"
 fi
 echo
 echo "--- PyTorch (API venv) ---"
 API_VENV="$ROOT/.venvs/api"
 if [[ -x "$API_VENV/bin/python" ]]; then
  # Best-effort probe: stderr is discarded and failures are ignored.
  "$API_VENV/bin/python" -c "
 import sys
 try:
    import torch
    print('torch:', torch.__version__)
    print('cuda available:', torch.cuda.is_available())
    if torch.cuda.is_available():
        print('device:', torch.cuda.get_device_name(0))
 except ImportError:
    print('torch: 미설치 (API만 사용 시 정상)')
 " 2>/dev/null || true
 else
  echo "API venv 없음 → ./scripts/setup_api.sh 실행"
 fi
 echo
 echo "--- 모델 venv ---"
 # A model venv counts as present when its python binary is executable.
 for name in f5tts cosyvoice; do
  V="$ROOT/.venvs/$name"
  if [[ -x "$V/bin/python" ]]; then
    echo "[$name] OK: $V"
  else
    echo "[$name] 없음 → setup_${name}.sh (f5tts는 setup_f5tts.sh)"
  fi
 done
 echo
 echo "--- 디렉터리 ---"
 for d in samples outputs models config backend web; do
  [[ -d "$ROOT/$d" ]] && echo "  $d: OK" || echo "  $d: MISSING"
 done
 echo "점검 완료."
--- a/scripts/compare_voice_lengths.sh
+++ b/scripts/compare_voice_lengths.sh
@@ -0,0 +1,45 @@
 #!/usr/bin/env bash
 # Print the recording guide for "my voice" references, then synthesize one
 # test sentence per samples/my_voice_*.wav with the CosyVoice worker.
 set -euo pipefail
 ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
 echo "=== 내 목소리 녹음 가이드 ==="
 echo "자세한 내용: $ROOT/samples/README.md"
 echo
 echo "1) 30초 / 1분 / 3분 WAV를 samples/ 에 저장"
 echo "2) my_voice_ref.txt 에 녹음 대본 작성"
 echo "3) ./scripts/prepare_reference.sh samples/my_voice_30s.wav"
 echo
 # nullglob: an unmatched pattern yields an empty array instead of the literal glob.
 shopt -s nullglob
 WAVS=("$ROOT"/samples/my_voice_*.wav)
 if [[ ${#WAVS[@]} -eq 0 ]]; then
  echo "아직 my_voice_*.wav 없음. 녹음 후 다시 실행하세요."
  exit 0
 fi
 OUT="$ROOT/outputs/voice_length_compare"
 mkdir -p "$OUT"
 PY="$ROOT/.venvs/cosyvoice/bin/python"
 WORKER="$ROOT/scripts/workers/cosy_infer.py"
 TEXT="안녕하세요. 이 문장은 reference 길이별 품질 비교를 위한 테스트입니다."
 if [[ ! -x "$PY" ]]; then
  echo "cosyvoice venv 없음. ./scripts/setup_cosyvoice.sh 후 재실행"
  exit 1
 fi
 # The same transcript file is used as prompt text for every reference WAV.
 REF_TXT=""
 [[ -f "$ROOT/samples/my_voice_ref.txt" ]] && REF_TXT=$(cat "$ROOT/samples/my_voice_ref.txt")
 for wav in "${WAVS[@]}"; do
  name=$(basename "$wav" .wav)
  echo "생성: $name"
  # `|| true`: a failure for one sample does not stop the remaining ones.
  "$PY" "$WORKER" \
    --ref-audio "$wav" \
    --gen-text "$TEXT" \
    --prompt-text "$REF_TXT" \
    --out "$OUT/${name}_test.wav" || true
 done
 echo "결과: $OUT"
--- a/scripts/prepare_reference.sh
+++ b/scripts/prepare_reference.sh
@@ -0,0 +1,38 @@
 #!/usr/bin/env bash
 # Normalize a reference WAV to mono / 24 kHz / 16-bit PCM.
 # Usage: prepare_reference.sh input.wav [output.wav]
 # Default output: <input without .wav>_24k_mono.wav
 set -euo pipefail
 if [[ $# -lt 1 ]]; then
   echo "Usage: $0 input.wav [output.wav]"
   exit 1
 fi
 IN="$1"
 OUT="${2:-${IN%.wav}_24k_mono.wav}"
 if [[ ! -f "$IN" ]]; then
   echo "input not found: $IN" >&2
   exit 1
 fi
 ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
 # Prefer the API venv interpreter; fall back to whatever python3 is on PATH.
 PY="${ROOT}/.venvs/api/bin/python"
 if [[ ! -x "$PY" ]]; then
   PY=python3
 fi
 # The heredoc delimiter is quoted so the shell expands nothing inside the
 # Python source. The paths travel as argv, which keeps file names containing
 # quotes, backslashes or "$" from corrupting (or injecting into) the program.
 "$PY" - "$IN" "$OUT" <<'PY'
 import sys
 try:
     import soundfile as sf
     import numpy as np
 except ImportError:
     print("soundfile 필요: pip install soundfile")
     sys.exit(1)
 src, dst = sys.argv[1], sys.argv[2]
 data, sr = sf.read(src, always_2d=False)
 # Multi-channel input is averaged across axis 1 into a single channel.
 if data.ndim > 1:
     data = data.mean(axis=1)
 target_sr = 24000
 if sr != target_sr:
     # librosa is only needed when the sample rate actually has to change.
     try:
         import librosa
     except ImportError:
         print("librosa 필요: pip install librosa")
         sys.exit(1)
     data = librosa.resample(data.astype(float), orig_sr=sr, target_sr=target_sr)
     sr = target_sr
 sf.write(dst, data, sr, subtype="PCM_16")
 print(f"Saved: {dst} ({sr} Hz mono)")
 PY
--- a/scripts/run_ab_compare.py
+++ b/scripts/run_ab_compare.py
@@ -0,0 +1,149 @@
 #!/usr/bin/env python3
 """
 F5-TTS vs CosyVoice3 A/B 비교.
 각 모델 전용 venv의 worker를 subprocess로 호출합니다.
 """
 from __future__ import annotations
 import argparse
 import json
 import subprocess
 import sys
 from pathlib import Path
 ROOT = Path(__file__).resolve().parents[1]
 CONFIG = ROOT / "config"
 F5_PY = ROOT / ".venvs" / "f5tts" / "bin" / "python"
 COSY_PY = ROOT / ".venvs" / "cosyvoice" / "bin" / "python"
 F5_WORKER = ROOT / "scripts" / "workers" / "f5_infer.py"
 COSY_WORKER = ROOT / "scripts" / "workers" / "cosy_infer.py"
 def load_sentences() -> list[dict]:
     """Return the ``cases`` list stored in ``config/test_sentences.json``."""
     sentences_file = CONFIG / "test_sentences.json"
     payload = json.loads(sentences_file.read_text(encoding="utf-8"))
     return payload["cases"]
 def _bundled_f5_reference() -> "Path | None":
     """Locate the English example clip shipped inside the f5-tts package.
     Looks in the running interpreter first, then in the project's
     ``.venvs/f5tts`` site-packages, because this script is normally started
     by the system Python where ``f5_tts`` is not importable.
     Returns the clip path, or ``None`` when no existing file is found.
     """
     relative = "infer/examples/basic/basic_ref_en.wav"
     try:
         from importlib.resources import files
         packaged = Path(str(files("f5_tts").joinpath(relative)))
     except (ImportError, TypeError, ValueError):
         # f5_tts (or importlib.resources.files) is unavailable here.
         packaged = None
     if packaged is not None and packaged.is_file():
         return packaged
     venv_lib = ROOT / ".venvs" / "f5tts" / "lib"
     matches = sorted(venv_lib.glob(f"python*/site-packages/f5_tts/{relative}"))
     return matches[0] if matches else None
 def resolve_ref_audio(ref_arg: str) -> tuple[Path, str]:
     """Resolve ``--ref-audio`` into ``(wav_path, reference_transcript)``.
     ``ref_arg == "auto"`` prefers the example clip bundled with f5-tts and
     otherwise falls back to the alphabetically first ``samples/*.wav``.
     Any other value is treated as a path to an existing WAV file.
     The transcript is read from ``<wav>.txt`` or, failing that, from
     ``samples/my_voice_ref.txt``. For an explicit path without a usable
     transcript a placeholder sentence is returned; in ``auto`` mode the
     transcript may be an empty string.
     Raises:
         SystemExit: no reference could be found, or the given path is not a file.
     """
     if ref_arg == "auto":
         bundled = _bundled_f5_reference()
         if bundled is not None:
             return bundled, "some call me nature, others call me mother nature."
         # Sorted so repeated runs pick the same reference; glob order is arbitrary.
         samples = sorted((ROOT / "samples").glob("*.wav"))
         if not samples:
             raise SystemExit(
                 "reference 없음: samples/*.wav 녹음하거나 f5-tts venv 설치 후 --ref-audio auto"
             )
         ref_path = samples[0]
     else:
         ref_path = Path(ref_arg)
         if not ref_path.is_file():
             raise SystemExit(f"ref audio not found: {ref_path}")
     ref_text = ""
     txt_candidates = (
         ref_path.with_suffix(".txt"),
         ROOT / "samples" / "my_voice_ref.txt",
     )
     for transcript in txt_candidates:
         if transcript.is_file():
             # The first existing candidate wins, even if it turns out to be empty.
             ref_text = transcript.read_text(encoding="utf-8").strip()
             break
     if not ref_text and ref_arg != "auto":
         ref_text = "참조 음성의 대본을 여기에 입력하세요."
     return ref_path, ref_text
 def run_worker(python: Path, worker: Path, cmd: list[str]) -> bool:
     """Run ``worker`` under the interpreter ``python`` with arguments ``cmd``.
     The child process is started with the project root as working directory.
     Returns True when it exits with status 0. When the interpreter file does
     not exist, a SKIP line is written to stderr and False is returned.
     """
     if python.is_file():
         argv = [str(python), str(worker)]
         argv.extend(cmd)
         completed = subprocess.run(argv, cwd=str(ROOT))
         return completed.returncode == 0
     venv_name = python.parent.parent.name
     print(f"SKIP: venv missing ({venv_name})", file=sys.stderr)
     return False
 def main() -> int:
     """Synthesize every test sentence with the selected engines and write a manifest.
     For each case the F5-TTS worker runs first and the CosyVoice worker second
     (subject to ``--models``). Outputs go to ``<out-dir>/<engine>/<case id>.wav``.
     Returns 0 when every worker invocation succeeded, otherwise 1.
     """
     cli = argparse.ArgumentParser()
     cli.add_argument("--ref-audio", default="auto", help="WAV path or 'auto'")
     cli.add_argument("--models", default="both", choices=("both", "f5_tts", "cosyvoice"))
     cli.add_argument("--out-dir", default=str(ROOT / "outputs" / "ab_compare"))
     opts = cli.parse_args()
     ref_path, ref_text = resolve_ref_audio(opts.ref_audio)
     dest_root = Path(opts.out_dir)
     dest_root.mkdir(parents=True, exist_ok=True)
     cases = load_sentences()
     print(f"Reference: {ref_path}")
     print(f"Cases: {len(cases)}")
     print(f"Output: {dest_root}\n")
     # ``choices`` restricts --models to three values, so each engine runs
     # unless the other one was selected exclusively.
     use_f5 = opts.models != "cosyvoice"
     use_cosy = opts.models != "f5_tts"
     outcomes: list[bool] = []
     for case in cases:
         case_id = case["id"]
         sentence = case["text"]
         print(f"=== {case_id}: {case['label']} ===")
         wav_name = f"{case_id}.wav"
         if use_f5:
             f5_cmd = [
                 "--ref-audio",
                 str(ref_path),
                 "--ref-text",
                 ref_text,
                 "--gen-text",
                 sentence,
                 "--out",
                 str(dest_root / "f5_tts" / wav_name),
             ]
             outcomes.append(run_worker(F5_PY, F5_WORKER, f5_cmd))
         if use_cosy:
             cosy_cmd = [
                 "--ref-audio",
                 str(ref_path),
                 "--gen-text",
                 sentence,
                 "--prompt-text",
                 ref_text,
                 "--out",
                 str(dest_root / "cosyvoice" / wav_name),
             ]
             outcomes.append(run_worker(COSY_PY, COSY_WORKER, cosy_cmd))
     ok = sum(outcomes)
     fail = len(outcomes) - ok
     manifest_path = dest_root / "manifest.json"
     manifest_body = json.dumps(
         {
             "ref_audio": str(ref_path),
             "ref_text": ref_text,
             "cases": cases,
             "output_dir": str(dest_root),
         },
         ensure_ascii=False,
         indent=2,
     )
     manifest_path.write_text(manifest_body, encoding="utf-8")
     print(f"\n완료: success={ok} fail={fail}")
     print(f"manifest: {manifest_path}")
     return 1 if fail else 0
 if __name__ == "__main__":
    raise SystemExit(main())
--- a/scripts/run_server.sh
+++ b/scripts/run_server.sh
@@ -0,0 +1,19 @@
 #!/usr/bin/env bash
 # Start the FastAPI app (backend.app.main:app) with uvicorn from the API venv.
 # Host and port come from TTS_HOST / TTS_PORT (defaults 0.0.0.0 / 8000).
 set -euo pipefail
 ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
 cd "$ROOT"
 if [[ ! -x "$ROOT/.venvs/api/bin/uvicorn" ]]; then
   echo "API venv 없음. ./scripts/setup_api.sh 실행"
   exit 1
 fi
 export PYTHONPATH="$ROOT"
 if [[ -f "$ROOT/.env" ]]; then
   # allexport: plain KEY=VALUE lines in .env would otherwise stay shell-local
   # and never reach the uvicorn process (TTS_MODEL, TTS_REF_AUDIO, ...).
   set -a
   # shellcheck disable=SC1091
   source "$ROOT/.env"
   set +a
 fi
 exec "$ROOT/.venvs/api/bin/uvicorn" backend.app.main:app \
   --host "${TTS_HOST:-0.0.0.0}" \
   --port "${TTS_PORT:-8000}" \
   --reload
--- a/scripts/select_model.sh
+++ b/scripts/select_model.sh
@@ -0,0 +1,34 @@
 #!/usr/bin/env bash
 # Persist the final model choice in .env (TTS_MODEL) and in
 # config/model_choice.json (selected_model).
 # Usage: select_model.sh cosyvoice|f5_tts
 set -euo pipefail
 MODEL="${1:-}"
 if [[ -z "$MODEL" || ! "$MODEL" =~ ^(cosyvoice|f5_tts)$ ]]; then
   echo "Usage: $0 cosyvoice|f5_tts"
   exit 1
 fi
 ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
 ENV_FILE="$ROOT/.env"
 # Seed .env from the template first so the replace-or-append below is the
 # only code path. Appending blindly after the copy would leave two
 # TTS_MODEL= lines, because the template already defines one.
 if [[ ! -f "$ENV_FILE" ]]; then
   cp "$ROOT/.env.example" "$ENV_FILE"
 fi
 if grep -q '^TTS_MODEL=' "$ENV_FILE"; then
   # MODEL was validated against a fixed whitelist above, so it is safe in sed.
   sed -i.bak "s/^TTS_MODEL=.*/TTS_MODEL=$MODEL/" "$ENV_FILE"
   rm -f "$ENV_FILE.bak"
 else
   echo "TTS_MODEL=$MODEL" >> "$ENV_FILE"
 fi
 # Quoted heredoc + argv: the project path is never pasted into Python source,
 # so a checkout directory containing quotes or backslashes cannot break it.
 python3 - "$ROOT/config/model_choice.json" "$MODEL" <<'PY'
 import json
 import sys
 from pathlib import Path
 choice_file = Path(sys.argv[1])
 model = sys.argv[2]
 data = json.loads(choice_file.read_text(encoding="utf-8"))
 data["selected_model"] = model
 choice_file.write_text(json.dumps(data, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
 print(f"selected_model={model}")
 PY
--- a/scripts/setup_api.sh
+++ b/scripts/setup_api.sh
@@ -0,0 +1,13 @@
 #!/usr/bin/env bash
 # Build the lightweight virtualenv used by the FastAPI server.
 set -euo pipefail
 SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")"
 ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
 API_VENV="$ROOT/.venvs/api"
 python3 -m venv "$API_VENV"
 # Upgrade the packaging tools, then install the backend requirements.
 API_PIP="$API_VENV/bin/pip"
 "$API_PIP" install -U pip wheel
 "$API_PIP" install -r "$ROOT/backend/requirements.txt"
 echo "API venv 준비 완료: $API_VENV"
 echo "실행: $API_VENV/bin/uvicorn backend.app.main:app --host 0.0.0.0 --port 8000"
--- a/scripts/setup_cosyvoice.sh
+++ b/scripts/setup_cosyvoice.sh
@@ -0,0 +1,38 @@
 #!/usr/bin/env bash
 # CosyVoice3 setup: clone the upstream repo, create a dedicated venv,
 # install dependencies and download the model checkpoint.
 set -euo pipefail
 ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
 VENV="$ROOT/.venvs/cosyvoice"
 REPO="$ROOT/external/CosyVoice"
 MODEL_DIR="$ROOT/models/Fun-CosyVoice3-0.5B"
 mkdir -p "$ROOT/external" "$ROOT/models"
 if [[ ! -d "$REPO/.git" ]]; then
   echo "CosyVoice 레포 클론..."
   git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git "$REPO"
 else
   echo "CosyVoice 레포 이미 존재: $REPO"
 fi
 # Sync submodules on every run, not only right after the clone: a first run
 # interrupted mid-clone leaves .git in place, and the branch above would then
 # skip submodule initialisation forever. "git -C" avoids changing this
 # script's working directory.
 git -C "$REPO" submodule update --init --recursive
 python3 -m venv "$VENV"
 "$VENV/bin/pip" install -U pip wheel
 "$VENV/bin/pip" install torch torchaudio --index-url https://download.pytorch.org/whl/cu124
 "$VENV/bin/pip" install -r "$REPO/requirements.txt"
 "$VENV/bin/pip" install huggingface_hub modelscope
 echo "CosyVoice3 모델 다운로드 (Hugging Face)..."
 # Quoted heredoc + argv: the target directory is passed as data instead of
 # being pasted into a Python string literal.
 "$VENV/bin/python" - "$MODEL_DIR" <<'PY'
 import sys
 from huggingface_hub import snapshot_download
 model_dir = sys.argv[1]
 snapshot_download(
     "FunAudioLLM/Fun-CosyVoice3-0.5B-2512",
     local_dir=model_dir,
 )
 print(f"Model saved to {model_dir}")
 PY
 echo "CosyVoice venv 준비 완료: $VENV"
 echo "테스트: $VENV/bin/python $ROOT/scripts/workers/cosy_infer.py --help"
--- a/scripts/setup_f5tts.sh
+++ b/scripts/setup_f5tts.sh
@@ -0,0 +1,16 @@
 #!/usr/bin/env bash
 # Dedicated virtualenv for F5-TTS (NVIDIA CUDA).
 set -euo pipefail
 SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")"
 ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
 F5_VENV="$ROOT/.venvs/f5tts"
 python3 -m venv "$F5_VENV"
 F5_PIP="$F5_VENV/bin/pip"
 "$F5_PIP" install -U pip wheel
 # CUDA 12.x PyTorch wheels; switch cu124/cu128 to match the server's CUDA version.
 "$F5_PIP" install torch torchaudio --index-url https://download.pytorch.org/whl/cu124
 "$F5_PIP" install f5-tts
 echo "F5-TTS venv 준비 완료: $F5_VENV"
 echo "테스트: $F5_VENV/bin/python $ROOT/scripts/workers/f5_infer.py --help"
--- a/scripts/workers/cosy_infer.py
+++ b/scripts/workers/cosy_infer.py
@@ -0,0 +1,73 @@
 #!/usr/bin/env python3
 """CosyVoice3 zero-shot 추론 워커 (cosyvoice venv에서 실행)."""
 from __future__ import annotations
 import argparse
 import sys
 from pathlib import Path
 def main() -> int:
     """Run one CosyVoice3 zero-shot synthesis described by command-line arguments.
     The prompt handed to the model is ``--prompt-prefix`` followed by
     ``--prompt-text``. The first result yielded by the model is written to
     ``--out``; every further result goes to ``<stem>_<index>`` next to it.
     Returns:
         0 when at least one audio file was written; 1 when the CosyVoice
         repo, the model directory or the reference audio is missing, when
         the CosyVoice imports fail, or when inference yields no audio.
     """
     parser = argparse.ArgumentParser(description="CosyVoice3 inference worker")
     parser.add_argument("--ref-audio", required=True)
     parser.add_argument("--prompt-text", default="", help="Text spoken in ref audio (with prefix)")
     parser.add_argument("--gen-text", required=True)
     parser.add_argument("--out", required=True)
     parser.add_argument(
         "--model-dir",
         default=None,
         help="Path to Fun-CosyVoice3-0.5B (default: PROJECT/models/Fun-CosyVoice3-0.5B)",
     )
     parser.add_argument(
         "--prompt-prefix",
         default="You are a helpful assistant.<|endofprompt|>",
     )
     args = parser.parse_args()
     # This file lives in <project>/scripts/workers/, so parents[2] is the project root.
     root = Path(__file__).resolve().parents[2]
     repo = root / "external" / "CosyVoice"
     model_dir = Path(args.model_dir or root / "models" / "Fun-CosyVoice3-0.5B")
     ref = Path(args.ref_audio)
     out = Path(args.out)
     if not repo.is_dir():
         print(f"CosyVoice repo missing: {repo}. Run ./scripts/setup_cosyvoice.sh", file=sys.stderr)
         return 1
     if not model_dir.is_dir():
         print(f"Model dir missing: {model_dir}", file=sys.stderr)
         return 1
     if not ref.is_file():
         print(f"ref audio not found: {ref}", file=sys.stderr)
         return 1
     # CosyVoice is used from the cloned checkout, not from an installed package.
     sys.path.insert(0, str(repo))
     sys.path.append(str(repo / "third_party" / "Matcha-TTS"))
     try:
         import torchaudio
         from cosyvoice.cli.cosyvoice import AutoModel
     except ImportError as e:
         print("CosyVoice import failed. Run ./scripts/setup_cosyvoice.sh", file=sys.stderr)
         print(e, file=sys.stderr)
         return 1
     prompt = args.prompt_prefix + (args.prompt_text or "")
     out.parent.mkdir(parents=True, exist_ok=True)
     cosyvoice = AutoModel(model_dir=str(model_dir))
     saved = 0
     for i, result in enumerate(
         cosyvoice.inference_zero_shot(
             args.gen_text,
             prompt,
             str(ref),
             stream=False,
         )
     ):
         path = out if i == 0 else out.with_stem(f"{out.stem}_{i}")
         torchaudio.save(str(path), result["tts_speech"], cosyvoice.sample_rate)
         print(f"OK: {path}")
         saved += 1
     if saved == 0:
         # Without this check a run that produced nothing still exited 0 and
         # callers would look for an output file that was never written.
         print(f"CosyVoice produced no audio; nothing written to {out}", file=sys.stderr)
         return 1
     return 0
 if __name__ == "__main__":
    raise SystemExit(main())
--- a/scripts/workers/f5_infer.py
+++ b/scripts/workers/f5_infer.py
@@ -0,0 +1,47 @@
 #!/usr/bin/env python3
 """F5-TTS 추론 워커 (f5tts venv에서 실행)."""
 from __future__ import annotations
 import argparse
 import sys
 from pathlib import Path
 def main() -> int:
     """Synthesize ``--gen-text`` with F5-TTS using the voice in ``--ref-audio``.
     The result is written to ``--out`` (parent directories are created).
     Returns 0 on success and 1 when the reference audio does not exist or
     the ``f5_tts`` package cannot be imported.
     """
     cli = argparse.ArgumentParser(description="F5-TTS inference worker")
     required_flags = (
         ("--ref-audio", "Reference WAV path"),
         ("--ref-text", "Transcript of reference audio"),
         ("--gen-text", "Text to synthesize"),
         ("--out", "Output WAV path"),
     )
     for flag, description in required_flags:
         cli.add_argument(flag, required=True, help=description)
     cli.add_argument("--model", default="F5TTS_v1_Base")
     opts = cli.parse_args()
     ref_wav = Path(opts.ref_audio)
     if not ref_wav.is_file():
         print(f"ref audio not found: {ref_wav}", file=sys.stderr)
         return 1
     target = Path(opts.out)
     target.parent.mkdir(parents=True, exist_ok=True)
     # Imported lazily so that a missing package becomes a readable message.
     try:
         from f5_tts.api import F5TTS
     except ImportError as exc:
         print("f5-tts not installed. Run: ./scripts/setup_f5tts.sh", file=sys.stderr)
         print(exc, file=sys.stderr)
         return 1
     engine = F5TTS(model=opts.model)
     engine.infer(
         ref_file=str(ref_wav),
         ref_text=opts.ref_text,
         gen_text=opts.gen_text,
         file_wave=str(target),
         remove_silence=True,
     )
     print(f"OK: {target}")
     return 0
 if __name__ == "__main__":
    raise SystemExit(main())
--- a/tests/test_preprocess.py
+++ b/tests/test_preprocess.py
@@ -0,0 +1,13 @@
 from backend.app.text_preprocess import preprocess_korean, split_sentences
 def test_preprocess_numbers():
     """The comma-grouped amount is rewritten while the currency unit is kept."""
     normalized = preprocess_korean("예산은 1,250,000원입니다.")
     unit_kept = "원" in normalized
     assert unit_kept
     digits_left = "1,250,000" in normalized
     assert not digits_left
 def test_split_sentences():
     """Splitting returns at least one chunk and no chunk exceeds ``max_chars``."""
     limit = 50
     pieces = split_sentences("첫 문장입니다. 두 번째 문장입니다.", max_chars=limit)
     assert len(pieces) >= 1
     longest = max((len(piece) for piece in pieces), default=0)
     assert longest <= limit
--- a/web/app.js
+++ b/web/app.js
@@ -0,0 +1,101 @@
 // Shorthand for document.getElementById; yields null when no element has that id.
 const $ = (id) => document.getElementById(id);
 /**
  * Query /api/health and show the active model and sample count in the footer.
  * Any failure (network error, non-2xx status, unparsable body) shows the
  * "cannot connect" message instead.
  */
 async function fetchHealth() {
   try {
     const res = await fetch("/api/health");
     // fetch() resolves on HTTP errors too; without this check an error
     // payload would be rendered as "모델: undefined".
     if (!res.ok) throw new Error(`health check failed: ${res.status}`);
     const data = await res.json();
     $("healthInfo").textContent = `모델: ${data.model} · 샘플 ${data.samples_count}개`;
   } catch {
     $("healthInfo").textContent = "API 서버에 연결할 수 없습니다.";
   }
 }
 /**
  * Fill the reference-voice dropdown from /api/voice-samples.
  * Each option's value is the sample path; entries without a transcript are
  * marked in the label. Failures are only logged to the console.
  */
 async function loadSamples() {
   const dropdown = $("sampleSelect");
   try {
     const response = await fetch("/api/voice-samples");
     const payload = await response.json();
     for (const sample of payload.samples) {
       const option = document.createElement("option");
       const transcriptNote = sample.has_transcript ? "" : " (대본 없음)";
       option.value = sample.path;
       option.textContent = `${sample.label}/${sample.id}${transcriptNote}`;
       dropdown.appendChild(option);
     }
   } catch (e) {
     console.warn("samples load failed", e);
   }
 }
 /**
  * Upload the file chosen in #fileUpload (plus the optional reference
  * transcript) to /api/voice-sample.
  *
  * @returns {Promise<string|null>} the `path` field of the response, or null
  *   when no file is selected.
  * @throws {Error} when the server answers with a non-2xx status.
  */
 async function uploadIfNeeded() {
   const fileInput = $("fileUpload");
   if (!fileInput.files?.length) return null;
   const form = new FormData();
   form.append("file", fileInput.files[0]);
   const refText = $("refText").value.trim();
   if (refText) form.append("ref_text", refText);
   const res = await fetch("/api/voice-sample", { method: "POST", body: form });
   if (!res.ok) {
     const err = await res.json().catch(() => null);
     const detail = err?.detail;
     // `detail` is not guaranteed to be a string; serialise structured
     // details so the user never sees "[object Object]".
     let message = "업로드 실패";
     if (typeof detail === "string") {
       if (detail) message = detail;
     } else if (detail) {
       message = JSON.stringify(detail);
     }
     throw new Error(message);
   }
   const data = await res.json();
   return data.path;
 }
 // Click handler for "generate": optionally uploads a new reference file,
 // posts the synthesis request to /api/tts and wires the resulting audio into
 // the player and download link. The button stays disabled while in flight.
 $("generateBtn").addEventListener("click", async () => {
   const text = $("text").value.trim();
   if (!text) {
     $("status").textContent = "텍스트를 입력하세요.";
     return;
   }
   const btn = $("generateBtn");
   btn.disabled = true;
   $("status").textContent = "음성 생성 중… (GPU 추론은 수십 초 걸릴 수 있습니다)";
   $("resultSection").hidden = true;
   try {
     // A freshly uploaded file takes precedence over the dropdown selection.
     let refAudio = $("sampleSelect").value || null;
     const uploaded = await uploadIfNeeded();
     if (uploaded) refAudio = uploaded;
     const body = {
       text,
       preprocess: true,
       ref_text: $("refText").value.trim() || null,
       ref_audio: refAudio,
     };
     const res = await fetch("/api/tts", {
       method: "POST",
       headers: { "Content-Type": "application/json" },
       body: JSON.stringify(body),
     });
     if (!res.ok) {
       // A non-JSON error body yields null here, so the status-text fallback
       // below is actually reachable (JSON.stringify({}) is the truthy "{}").
       const err = await res.json().catch(() => null);
       const raw = err?.detail ?? err;
       let detail = "";
       if (typeof raw === "string") {
         detail = raw;
       } else if (raw !== null && raw !== undefined) {
         detail = JSON.stringify(raw);
       }
       throw new Error(detail || res.statusText || `HTTP ${res.status}`);
     }
     const data = await res.json();
     // Cache-buster so the player reloads; keep any existing query string valid.
     const separator = data.audio_url.includes("?") ? "&" : "?";
     const url = `${data.audio_url}${separator}t=${Date.now()}`;
     $("player").src = url;
     $("downloadLink").href = url;
     $("downloadLink").download = `${data.job_id}.wav`;
     $("resultSection").hidden = false;
     $("status").textContent = `완료 (모델: ${data.model}, job: ${data.job_id})`;
   } catch (e) {
     $("status").textContent = `오류: ${e.message}`;
   } finally {
     btn.disabled = false;
   }
 });
 fetchHealth();
 loadSamples();
--- a/web/index.html
+++ b/web/index.html
@@ -0,0 +1,64 @@
 <!DOCTYPE html>
 <html lang="ko">
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>한국어 보이스 클로닝 TTS</title>
    <link rel="stylesheet" href="/style.css" />
  </head>
  <body>
    <main class="container">
      <header>
        <h1>한국어 보이스 클로닝 TTS</h1>
        <p class="subtitle">텍스트를 입력하면 reference 음성을 바탕으로 음성을 생성합니다.</p>
      </header>
      <section class="card">
        <label for="text">읽을 텍스트</label>
        <textarea
          id="text"
          rows="5"
          placeholder="안녕하세요. 오늘 날씨가 정말 좋네요."
        ></textarea>
        <div class="row">
          <div class="field">
            <label for="sampleSelect">Reference 음성</label>
            <select id="sampleSelect">
              <option value="">기본 샘플 사용</option>
            </select>
          </div>
          <div class="field">
            <label for="refText">Reference 대본 (선택)</label>
            <input
              id="refText"
              type="text"
              placeholder="녹음한 내용과 동일한 텍스트"
            />
          </div>
        </div>
        <div class="field">
          <label for="fileUpload">새 음성 업로드 (WAV)</label>
          <input id="fileUpload" type="file" accept=".wav,audio/wav" />
        </div>
        <button id="generateBtn" type="button">음성 생성</button>
        <p id="status" class="status" aria-live="polite"></p>
      </section>
      <section class="card" id="resultSection" hidden>
        <h2>결과</h2>
        <audio id="player" controls></audio>
        <p>
          <a id="downloadLink" href="#" download>WAV 다운로드</a>
        </p>
      </section>
      <footer>
        <span id="healthInfo">서버 확인 중…</span>
      </footer>
    </main>
    <script src="/app.js"></script>
  </body>
 </html>
--- a/web/style.css
+++ b/web/style.css
@@ -0,0 +1,133 @@
 :root {
  --bg: #0f1419;
  --card: #1a2332;
  --text: #e7ecf3;
  --muted: #8b9bb4;
  --accent: #3d8bfd;
  --accent-hover: #5ca0ff;
  --border: #2a3a52;
 }
 * {
  box-sizing: border-box;
 }
 body {
  margin: 0;
  font-family: "Pretendard", "Apple SD Gothic Neo", system-ui, sans-serif;
  background: var(--bg);
  color: var(--text);
  line-height: 1.5;
 }
 .container {
  max-width: 720px;
  margin: 0 auto;
  padding: 2rem 1.25rem 3rem;
 }
 header h1 {
  margin: 0 0 0.25rem;
  font-size: 1.75rem;
 }
 .subtitle {
  color: var(--muted);
  margin: 0 0 1.5rem;
 }
 .card {
  background: var(--card);
  border: 1px solid var(--border);
  border-radius: 12px;
  padding: 1.25rem;
  margin-bottom: 1rem;
 }
 label {
  display: block;
  font-size: 0.875rem;
  color: var(--muted);
  margin-bottom: 0.35rem;
 }
 textarea,
 input,
 select {
  width: 100%;
  padding: 0.65rem 0.75rem;
  border-radius: 8px;
  border: 1px solid var(--border);
  background: #0d1218;
  color: var(--text);
  font-size: 1rem;
 }
 textarea {
  resize: vertical;
  min-height: 120px;
 }
 .row {
  display: grid;
  grid-template-columns: 1fr 1fr;
  gap: 1rem;
  margin-top: 1rem;
 }
@media (max-width: 600px) {
  .row {
    grid-template-columns: 1fr;
  }
 }
 .field {
  margin-bottom: 1rem;
 }
 button {
  width: 100%;
  padding: 0.85rem;
  border: none;
  border-radius: 8px;
  background: var(--accent);
  color: #fff;
  font-size: 1rem;
  font-weight: 600;
  cursor: pointer;
 }
 button:hover:not(:disabled) {
  background: var(--accent-hover);
 }
 button:disabled {
  opacity: 0.6;
  cursor: not-allowed;
 }
 .status {
  margin-top: 0.75rem;
  font-size: 0.9rem;
  color: var(--muted);
  min-height: 1.25rem;
 }
 footer {
  font-size: 0.8rem;
  color: var(--muted);
 }
 #resultSection h2 {
  margin-top: 0;
  font-size: 1.1rem;
 }
 audio {
  width: 100%;
  margin-bottom: 0.5rem;
 }
 a {
  color: var(--accent);
 }
@@ -0,0 +1,3 @@
 from backend.app.tts.service import TTSService

 __all__ = ["TTSService"]