Initial commit: Korean voice-cloning TTS prototype

FastAPI backend, web UI, CosyVoice3/F5-TTS setup scripts, and handoff docs for GPU PC continuation. Co-authored-by: Cursor <cursoragent@cursor.com>
2026-06-04 13:36:37 +09:00
commit 7101fdcd65
36 changed files with 1937 additions and 0 deletions
--- a/scripts/check_env.sh
+++ b/scripts/check_env.sh
@@ -0,0 +1,77 @@
+#!/usr/bin/env bash
+# Environment check: OS, Python, NVIDIA GPU, CUDA toolkit, PyTorch in the API
+# venv, the per-model venvs, and the expected project directories.
+#
+# This is a diagnostic script, so each probe is best-effort: a probe that
+# exits non-zero is reported and the remaining sections still run, instead of
+# `set -e` aborting the whole report at the first broken tool.
+set -euo pipefail
+
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+cd "$ROOT"
+
+echo "=== TTS 환경 점검 ==="
+echo "프로젝트: $ROOT"
+echo
+
+echo "--- OS / CPU ---"
+uname -a
+echo
+
+echo "--- Python ---"
+if command -v python3 &>/dev/null; then
+  python3 --version
+  # `command -v` is a shell builtin, so it works where `which` is not installed.
+  command -v python3
+else
+  echo "python3: 없음"
+fi
+echo
+
+echo "--- NVIDIA GPU ---"
+if command -v nvidia-smi &>/dev/null; then
+  # Run inside `if` so a non-zero exit is reported rather than fatal.
+  if nvidia-smi; then
+    echo
+    nvidia-smi --query-gpu=name,memory.total,driver_version --format=csv \
+      || echo "nvidia-smi: GPU 정보 조회 실패"
+  else
+    echo "nvidia-smi: 실행 실패 (드라이버 상태를 확인하세요)"
+  fi
+else
+  echo "nvidia-smi: 사용 불가 (NVIDIA GPU 서버에서 실행하세요)"
+fi
+echo
+
+echo "--- CUDA (nvcc) ---"
+if command -v nvcc &>/dev/null; then
+  # With `pipefail`, a non-zero status from either side of the pipe would
+  # otherwise terminate the script.
+  nvcc --version | head -4 || echo "nvcc: 실행 실패"
+else
+  echo "nvcc: 없음 (PyTorch CUDA 빌드로도 동작 가능)"
+fi
+echo
+
+echo "--- PyTorch (API venv) ---"
+API_VENV="$ROOT/.venvs/api"
+if [[ -x "$API_VENV/bin/python" ]]; then
+  # stderr is discarded to keep the report short, so any failure other than
+  # "not installed" is printed on stdout by the snippet itself.
+  "$API_VENV/bin/python" -c "
+import sys
+try:
+    import torch
+    print('torch:', torch.__version__)
+    print('cuda available:', torch.cuda.is_available())
+    if torch.cuda.is_available():
+        print('device:', torch.cuda.get_device_name(0))
+except ImportError:
+    print('torch: 미설치 (API만 사용 시 정상)')
+except Exception as exc:
+    print('torch: 확인 실패:', exc)
+" 2>/dev/null || true
+else
+  echo "API venv 없음 → ./scripts/setup_api.sh 실행"
+fi
+echo
+
+echo "--- 모델 venv ---"
+for name in f5tts cosyvoice; do
+  V="$ROOT/.venvs/$name"
+  if [[ -x "$V/bin/python" ]]; then
+    echo "[$name] OK: $V"
+  else
+    echo "[$name] 없음 → setup_${name}.sh (f5tts는 setup_f5tts.sh)"
+  fi
+done
+echo
+
+echo "--- 디렉터리 ---"
+for d in samples outputs models config backend web; do
+  # Explicit if/else: with `A && B || C`, C also runs when B itself fails.
+  if [[ -d "$ROOT/$d" ]]; then
+    echo "  $d: OK"
+  else
+    echo "  $d: MISSING"
+  fi
+done
+echo "점검 완료."
--- a/scripts/compare_voice_lengths.sh
+++ b/scripts/compare_voice_lengths.sh
@@ -0,0 +1,45 @@
+#!/usr/bin/env bash
+# Print the recording guide for the "my voice" reference clips, then
+# synthesize the same test sentence once per samples/my_voice_*.wav with the
+# CosyVoice worker so the results for different reference lengths can be
+# compared.
+#
+# Exit status: 0 when there is nothing to do or every clip succeeded,
+# 1 when the cosyvoice venv is missing or at least one clip failed.
+set -euo pipefail
+
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+echo "=== 내 목소리 녹음 가이드 ==="
+echo "자세한 내용: $ROOT/samples/README.md"
+echo
+echo "1) 30초 / 1분 / 3분 WAV를 samples/ 에 저장"
+echo "2) my_voice_ref.txt 에 녹음 대본 작성"
+echo "3) ./scripts/prepare_reference.sh samples/my_voice_30s.wav"
+echo
+
+# nullglob: an unmatched pattern expands to an empty array instead of itself.
+shopt -s nullglob
+WAVS=("$ROOT"/samples/my_voice_*.wav)
+if [[ ${#WAVS[@]} -eq 0 ]]; then
+  echo "아직 my_voice_*.wav 없음. 녹음 후 다시 실행하세요."
+  exit 0
+fi
+
+OUT="$ROOT/outputs/voice_length_compare"
+mkdir -p "$OUT"
+PY="$ROOT/.venvs/cosyvoice/bin/python"
+WORKER="$ROOT/scripts/workers/cosy_infer.py"
+TEXT="안녕하세요. 이 문장은 reference 길이별 품질 비교를 위한 테스트입니다."
+
+if [[ ! -x "$PY" ]]; then
+  echo "cosyvoice venv 없음. ./scripts/setup_cosyvoice.sh 후 재실행"
+  exit 1
+fi
+
+# The transcript is optional; an empty prompt text is passed when it is absent.
+REF_TXT=""
+if [[ -f "$ROOT/samples/my_voice_ref.txt" ]]; then
+  REF_TXT=$(cat "$ROOT/samples/my_voice_ref.txt")
+fi
+
+failed=0
+for wav in "${WAVS[@]}"; do
+  name=$(basename "$wav" .wav)
+  echo "생성: $name"
+  # Best-effort: one failing clip must not stop the remaining comparisons,
+  # but the failure is reported instead of being silently discarded.
+  if ! "$PY" "$WORKER" \
+    --ref-audio "$wav" \
+    --gen-text "$TEXT" \
+    --prompt-text "$REF_TXT" \
+    --out "$OUT/${name}_test.wav"; then
+    echo "실패: $name" >&2
+    failed=$((failed + 1))
+  fi
+done
+
+echo "결과: $OUT"
+if [[ $failed -gt 0 ]]; then
+  echo "실패한 항목 수: $failed" >&2
+  exit 1
+fi
--- a/scripts/prepare_reference.sh
+++ b/scripts/prepare_reference.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+# Normalize a reference WAV to mono, 24 kHz, 16-bit PCM.
+#
+# Usage: prepare_reference.sh input.wav [output.wav]
+# The output defaults to the input path with a trailing ".wav" stripped and
+# "_24k_mono.wav" appended.
+set -euo pipefail
+
+if [[ $# -lt 1 ]]; then
+  echo "Usage: $0 input.wav [output.wav]"
+  exit 1
+fi
+
+IN="$1"
+OUT="${2:-${IN%.wav}_24k_mono.wav}"
+
+# Prefer the API venv interpreter; fall back to whatever python3 is on PATH.
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+PY="${ROOT}/.venvs/api/bin/python"
+if [[ ! -x "$PY" ]]; then
+  PY=python3
+fi
+
+# The heredoc delimiter is quoted so the shell expands nothing inside the
+# Python source; the two paths travel as argv instead. Interpolating them
+# into Python string literals would break (or run arbitrary code) for paths
+# that contain quotes or backslashes.
+"$PY" - "$IN" "$OUT" <<'PY'
+import sys
+try:
+    import soundfile as sf
+    import numpy as np
+except ImportError:
+    print("soundfile 필요: pip install soundfile")
+    sys.exit(1)
+
+in_path, out_path = sys.argv[1], sys.argv[2]
+
+data, sr = sf.read(in_path, always_2d=False)
+# Multi-channel input: average the channels down to mono.
+if data.ndim > 1:
+    data = data.mean(axis=1)
+target_sr = 24000
+if sr != target_sr:
+    # librosa is only needed when the sample rate actually has to change.
+    import librosa
+    data = librosa.resample(data.astype(float), orig_sr=sr, target_sr=target_sr)
+    sr = target_sr
+sf.write(out_path, data, sr, subtype="PCM_16")
+print(f"Saved: {out_path} ({sr} Hz mono)")
+PY
--- a/scripts/run_ab_compare.py
+++ b/scripts/run_ab_compare.py
@@ -0,0 +1,149 @@
+#!/usr/bin/env python3
+"""
+F5-TTS vs CosyVoice3 A/B 비교.
+각 모델 전용 venv의 worker를 subprocess로 호출합니다.
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+ROOT = Path(__file__).resolve().parents[1]
+CONFIG = ROOT / "config"
+F5_PY = ROOT / ".venvs" / "f5tts" / "bin" / "python"
+COSY_PY = ROOT / ".venvs" / "cosyvoice" / "bin" / "python"
+F5_WORKER = ROOT / "scripts" / "workers" / "f5_infer.py"
+COSY_WORKER = ROOT / "scripts" / "workers" / "cosy_infer.py"
+
+
+def load_sentences() -> list[dict]:
+    """Return the ``cases`` list stored in ``config/test_sentences.json``."""
+    raw = (CONFIG / "test_sentences.json").read_text(encoding="utf-8")
+    return json.loads(raw)["cases"]
+
+
+def resolve_ref_audio(ref_arg: str) -> tuple[Path, str]:
+    """Resolve the reference clip and the transcript that goes with it.
+
+    Args:
+        ref_arg: Path to a WAV file, or ``"auto"``. With ``"auto"`` the
+            English example bundled in the ``f5_tts`` package is used when it
+            can be located; otherwise the first ``samples/*.wav`` (in sorted
+            order) is used.
+
+    Returns:
+        ``(wav_path, ref_text)``. The transcript is read from the first
+        existing file among ``<wav>.txt`` and ``samples/my_voice_ref.txt``.
+        When neither exists, an explicit path gets a placeholder transcript
+        and ``"auto"`` gets an empty string.
+
+    Raises:
+        SystemExit: If an explicit path is not a file, or ``"auto"`` finds
+            neither the packaged example nor any ``samples/*.wav``.
+    """
+    if ref_arg == "auto":
+        try:
+            from importlib.resources import files
+
+            wav = files("f5_tts").joinpath("infer/examples/basic/basic_ref_en.wav")
+            packaged = Path(str(wav))
+        except Exception:
+            # Best-effort probe: f5_tts may not be importable from the
+            # interpreter running this script, and any failure here simply
+            # means "fall back to samples/".
+            packaged = None
+        # Only trust the packaged example if the file is really there.
+        if packaged is not None and packaged.is_file():
+            return packaged, "some call me nature, others call me mother nature."
+
+        # sorted() makes the pick reproducible; glob order is arbitrary.
+        samples = sorted((ROOT / "samples").glob("*.wav"))
+        if not samples:
+            raise SystemExit(
+                "reference 없음: samples/*.wav 녹음하거나 f5-tts venv 설치 후 --ref-audio auto"
+            )
+        ref_path = samples[0]
+    else:
+        ref_path = Path(ref_arg)
+        if not ref_path.is_file():
+            raise SystemExit(f"ref audio not found: {ref_path}")
+
+    ref_text = ""
+    txt_candidates = [
+        ref_path.with_suffix(".txt"),
+        ROOT / "samples" / "my_voice_ref.txt",
+    ]
+    for t in txt_candidates:
+        if t.is_file():
+            ref_text = t.read_text(encoding="utf-8").strip()
+            break
+    if not ref_text and ref_arg != "auto":
+        ref_text = "참조 음성의 대본을 여기에 입력하세요."
+    return ref_path, ref_text
+
+
+def run_worker(python: Path, worker: Path, cmd: list[str]) -> bool:
+    """Run *worker* under the interpreter *python* and report success.
+
+    A missing interpreter is reported on stderr as a skip and yields False
+    without launching anything; otherwise the result is whether the worker
+    process exited with status 0. The worker runs with the project root as
+    its working directory.
+    """
+    if python.is_file():
+        argv = [str(python), str(worker)]
+        argv.extend(cmd)
+        completed = subprocess.run(argv, cwd=str(ROOT))
+        return completed.returncode == 0
+
+    venv_name = python.parent.parent.name
+    print(f"SKIP: venv missing ({venv_name})", file=sys.stderr)
+    return False
+
+
+def main() -> int:
+    """Run every test sentence through the selected model workers.
+
+    For each case the F5-TTS worker (if selected) runs first, then the
+    CosyVoice worker (if selected); each is asked to write
+    ``<out-dir>/<model>/<case id>.wav``. A ``manifest.json`` describing the
+    run is written to the output directory afterwards.
+
+    Returns:
+        0 when every worker call succeeded, otherwise 1.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--ref-audio", default="auto", help="WAV path or 'auto'")
+    parser.add_argument("--models", default="both", choices=("both", "f5_tts", "cosyvoice"))
+    parser.add_argument("--out-dir", default=str(ROOT / "outputs" / "ab_compare"))
+    args = parser.parse_args()
+
+    ref_path, ref_text = resolve_ref_audio(args.ref_audio)
+    out_base = Path(args.out_dir)
+    out_base.mkdir(parents=True, exist_ok=True)
+
+    cases = load_sentences()
+    print(f"Reference: {ref_path}")
+    print(f"Cases: {len(cases)}")
+    print(f"Output: {out_base}\n")
+
+    # argparse restricts --models to three values, so each flag is simply
+    # "not the other single-model choice".
+    use_f5 = args.models != "cosyvoice"
+    use_cosy = args.models != "f5_tts"
+    ref_wav = str(ref_path)
+
+    outcomes: list[bool] = []
+    for case in cases:
+        case_id = case["id"]
+        sentence = case["text"]
+        print(f"=== {case_id}: {case['label']} ===")
+
+        jobs = []
+        if use_f5:
+            f5_target = out_base / "f5_tts" / f"{case_id}.wav"
+            f5_argv = ["--ref-audio", ref_wav, "--ref-text", ref_text]
+            f5_argv += ["--gen-text", sentence, "--out", str(f5_target)]
+            jobs.append((F5_PY, F5_WORKER, f5_argv))
+        if use_cosy:
+            cosy_target = out_base / "cosyvoice" / f"{case_id}.wav"
+            cosy_argv = ["--ref-audio", ref_wav, "--gen-text", sentence]
+            cosy_argv += ["--prompt-text", ref_text, "--out", str(cosy_target)]
+            jobs.append((COSY_PY, COSY_WORKER, cosy_argv))
+
+        # Workers run one after another, in the order queued above.
+        for interpreter, worker, argv in jobs:
+            outcomes.append(run_worker(interpreter, worker, argv))
+
+    ok = sum(outcomes)
+    fail = len(outcomes) - ok
+
+    manifest_path = out_base / "manifest.json"
+    manifest = {
+        "ref_audio": str(ref_path),
+        "ref_text": ref_text,
+        "cases": cases,
+        "output_dir": str(out_base),
+    }
+    manifest_path.write_text(
+        json.dumps(manifest, ensure_ascii=False, indent=2), encoding="utf-8"
+    )
+    print(f"\n완료: success={ok} fail={fail}")
+    print(f"manifest: {manifest_path}")
+    return 1 if fail else 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/scripts/run_server.sh
+++ b/scripts/run_server.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+# Start the FastAPI backend with uvicorn from the API venv.
+# Host and port come from TTS_HOST / TTS_PORT (defaults 0.0.0.0 and 8000);
+# both may be set in the environment or in $ROOT/.env.
+set -euo pipefail
+
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+cd "$ROOT"
+
+if [[ ! -x "$ROOT/.venvs/api/bin/uvicorn" ]]; then
+  echo "API venv 없음. ./scripts/setup_api.sh 실행"
+  exit 1
+fi
+
+export PYTHONPATH="$ROOT"
+if [[ -f "$ROOT/.env" ]]; then
+  # `set -a` exports every variable assigned while sourcing, so plain
+  # KEY=value lines (e.g. TTS_MODEL) reach the uvicorn process as well
+  # instead of staying private to this shell.
+  set -a
+  # shellcheck disable=SC1091
+  source "$ROOT/.env"
+  set +a
+fi
+
+# exec replaces this shell, so signals go straight to uvicorn.
+exec "$ROOT/.venvs/api/bin/uvicorn" backend.app.main:app \
+  --host "${TTS_HOST:-0.0.0.0}" \
+  --port "${TTS_PORT:-8000}" \
+  --reload
--- a/scripts/select_model.sh
+++ b/scripts/select_model.sh
@@ -0,0 +1,34 @@
+#!/usr/bin/env bash
+# Record the final model choice in .env (TTS_MODEL) and in
+# config/model_choice.json (selected_model).
+# Usage: select_model.sh cosyvoice|f5_tts
+set -euo pipefail
+
+MODEL="${1:-}"
+if [[ -z "$MODEL" || ! "$MODEL" =~ ^(cosyvoice|f5_tts)$ ]]; then
+  echo "Usage: $0 cosyvoice|f5_tts"
+  exit 1
+fi
+
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+ENV_FILE="$ROOT/.env"
+
+# Seed .env from the template first, then run the same replace-or-append step
+# in both cases. Appending unconditionally after the copy would leave two
+# TTS_MODEL lines whenever .env.example already defines one.
+if [[ ! -f "$ENV_FILE" ]]; then
+  cp "$ROOT/.env.example" "$ENV_FILE"
+fi
+
+if grep -q '^TTS_MODEL=' "$ENV_FILE"; then
+  # MODEL was validated against a fixed list above, so it cannot contain
+  # characters that are special in the sed expression.
+  sed -i.bak "s/^TTS_MODEL=.*/TTS_MODEL=$MODEL/" "$ENV_FILE"
+  rm -f "$ENV_FILE.bak"
+else
+  echo "TTS_MODEL=$MODEL" >> "$ENV_FILE"
+fi
+
+# Quoted heredoc: nothing is expanded inside the Python source; the config
+# path and model name are passed as argv so a project path containing quotes
+# or backslashes cannot corrupt the script.
+python3 - "$ROOT/config/model_choice.json" "$MODEL" <<'PY'
+import json
+import sys
+from pathlib import Path
+
+p = Path(sys.argv[1])
+model = sys.argv[2]
+data = json.loads(p.read_text(encoding="utf-8"))
+data["selected_model"] = model
+p.write_text(json.dumps(data, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
+print(f"selected_model={model}")
+PY
--- a/scripts/setup_api.sh
+++ b/scripts/setup_api.sh
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+# Create the lightweight virtualenv used by the FastAPI server and install
+# backend/requirements.txt into it.
+set -euo pipefail
+
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+VENV="$ROOT/.venvs/api"
+PIP="$VENV/bin/pip"
+
+python3 -m venv "$VENV"
+"$PIP" install -U pip wheel
+"$PIP" install -r "$ROOT/backend/requirements.txt"
+
+printf '%s\n' \
+  "API venv 준비 완료: $VENV" \
+  "실행: $VENV/bin/uvicorn backend.app.main:app --host 0.0.0.0 --port 8000"
--- a/scripts/setup_cosyvoice.sh
+++ b/scripts/setup_cosyvoice.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+# CosyVoice3 setup: clone the upstream repo (with submodules), create a
+# dedicated venv, install dependencies, and download the model snapshot.
+set -euo pipefail
+
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+VENV="$ROOT/.venvs/cosyvoice"
+REPO="$ROOT/external/CosyVoice"
+MODEL_DIR="$ROOT/models/Fun-CosyVoice3-0.5B"
+
+mkdir -p "$ROOT/external" "$ROOT/models"
+
+if [[ ! -d "$REPO/.git" ]]; then
+  echo "CosyVoice 레포 클론..."
+  git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git "$REPO"
+else
+  echo "CosyVoice 레포 이미 존재: $REPO"
+fi
+# Run for existing checkouts too, so a clone whose submodules were never
+# fully fetched gets repaired on re-run. `git -C` keeps this script's working
+# directory unchanged.
+git -C "$REPO" submodule update --init --recursive
+
+python3 -m venv "$VENV"
+"$VENV/bin/pip" install -U pip wheel
+"$VENV/bin/pip" install torch torchaudio --index-url https://download.pytorch.org/whl/cu124
+"$VENV/bin/pip" install -r "$REPO/requirements.txt"
+"$VENV/bin/pip" install huggingface_hub modelscope
+
+echo "CosyVoice3 모델 다운로드 (Hugging Face)..."
+# Quoted heredoc: the target directory is passed as argv rather than being
+# interpolated into a Python string literal, so paths containing quotes or
+# backslashes are handled correctly.
+"$VENV/bin/python" - "$MODEL_DIR" <<'PY'
+import sys
+
+from huggingface_hub import snapshot_download
+
+model_dir = sys.argv[1]
+snapshot_download(
+    'FunAudioLLM/Fun-CosyVoice3-0.5B-2512',
+    local_dir=model_dir,
+)
+print(f'Model saved to {model_dir}')
+PY
+
+echo "CosyVoice venv 준비 완료: $VENV"
+echo "테스트: $VENV/bin/python $ROOT/scripts/workers/cosy_infer.py --help"
--- a/scripts/setup_f5tts.sh
+++ b/scripts/setup_f5tts.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+# Create the dedicated F5-TTS virtualenv (NVIDIA CUDA build of PyTorch).
+set -euo pipefail
+
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+VENV="$ROOT/.venvs/f5tts"
+PIP="$VENV/bin/pip"
+# CUDA 12.x PyTorch wheels; change cu124 (e.g. to cu128) to match the
+# server's CUDA version.
+TORCH_INDEX="https://download.pytorch.org/whl/cu124"
+
+python3 -m venv "$VENV"
+"$PIP" install -U pip wheel
+"$PIP" install torch torchaudio --index-url "$TORCH_INDEX"
+"$PIP" install f5-tts
+
+printf '%s\n' \
+  "F5-TTS venv 준비 완료: $VENV" \
+  "테스트: $VENV/bin/python $ROOT/scripts/workers/f5_infer.py --help"
--- a/scripts/workers/cosy_infer.py
+++ b/scripts/workers/cosy_infer.py
@@ -0,0 +1,73 @@
+#!/usr/bin/env python3
+"""CosyVoice3 zero-shot 추론 워커 (cosyvoice venv에서 실행)."""
+from __future__ import annotations
+
+import argparse
+import sys
+from pathlib import Path
+
+
+def main() -> int:
+    """Synthesize ``--gen-text`` in the voice of ``--ref-audio`` with CosyVoice3.
+
+    The CosyVoice checkout (``external/CosyVoice``) and its Matcha-TTS
+    submodule are put on ``sys.path`` before importing, so the repository and
+    the model directory must already exist.
+
+    The first audio segment yielded by the model is written to ``--out``; any
+    further segments go to ``<stem>_<i><suffix>`` next to it.
+
+    Returns:
+        0 when at least one audio file was written; 1 when the repo, model
+        directory or reference audio is missing, the CosyVoice import fails,
+        or the model yields no audio.
+    """
+    parser = argparse.ArgumentParser(description="CosyVoice3 inference worker")
+    parser.add_argument("--ref-audio", required=True)
+    parser.add_argument(
+        "--prompt-text",
+        default="",
+        help="Transcript of the ref audio (--prompt-prefix is prepended automatically)",
+    )
+    parser.add_argument("--gen-text", required=True)
+    parser.add_argument("--out", required=True)
+    parser.add_argument(
+        "--model-dir",
+        default=None,
+        help="Path to Fun-CosyVoice3-0.5B (default: PROJECT/models/Fun-CosyVoice3-0.5B)",
+    )
+    parser.add_argument(
+        "--prompt-prefix",
+        default="You are a helpful assistant.<|endofprompt|>",
+    )
+    args = parser.parse_args()
+
+    # This file lives in PROJECT/scripts/workers/, hence two levels up.
+    root = Path(__file__).resolve().parents[2]
+    repo = root / "external" / "CosyVoice"
+    model_dir = Path(args.model_dir or root / "models" / "Fun-CosyVoice3-0.5B")
+    ref = Path(args.ref_audio)
+    out = Path(args.out)
+
+    if not repo.is_dir():
+        print(f"CosyVoice repo missing: {repo}. Run ./scripts/setup_cosyvoice.sh", file=sys.stderr)
+        return 1
+    if not model_dir.is_dir():
+        print(f"Model dir missing: {model_dir}", file=sys.stderr)
+        return 1
+    if not ref.is_file():
+        print(f"ref audio not found: {ref}", file=sys.stderr)
+        return 1
+
+    # The checkout takes precedence over any installed package of the same name.
+    sys.path.insert(0, str(repo))
+    sys.path.append(str(repo / "third_party" / "Matcha-TTS"))
+
+    try:
+        import torchaudio
+        from cosyvoice.cli.cosyvoice import AutoModel
+    except ImportError as e:
+        print("CosyVoice import failed. Run ./scripts/setup_cosyvoice.sh", file=sys.stderr)
+        print(e, file=sys.stderr)
+        return 1
+
+    prompt = args.prompt_prefix + (args.prompt_text or "")
+    out.parent.mkdir(parents=True, exist_ok=True)
+
+    cosyvoice = AutoModel(model_dir=str(model_dir))
+    written = 0
+    for i, result in enumerate(
+        cosyvoice.inference_zero_shot(
+            args.gen_text,
+            prompt,
+            str(ref),
+            stream=False,
+        )
+    ):
+        # Path.with_stem needs Python 3.9+.
+        path = out if i == 0 else out.with_stem(f"{out.stem}_{i}")
+        torchaudio.save(str(path), result["tts_speech"], cosyvoice.sample_rate)
+        print(f"OK: {path}")
+        written += 1
+
+    if written == 0:
+        # Without this check the caller would see success although --out was
+        # never created.
+        print("CosyVoice produced no audio", file=sys.stderr)
+        return 1
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/scripts/workers/f5_infer.py
+++ b/scripts/workers/f5_infer.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python3
+"""F5-TTS 추론 워커 (f5tts venv에서 실행)."""
+from __future__ import annotations
+
+import argparse
+import sys
+from pathlib import Path
+
+
+def main() -> int:
+    """Synthesize ``--gen-text`` in the voice of ``--ref-audio`` with F5-TTS.
+
+    The output directory is created before the ``f5_tts`` import is attempted.
+
+    Returns:
+        0 after the output WAV path has been reported; 1 when the reference
+        audio is missing or ``f5_tts`` cannot be imported.
+    """
+    cli = argparse.ArgumentParser(description="F5-TTS inference worker")
+    cli.add_argument("--ref-audio", required=True, help="Reference WAV path")
+    cli.add_argument("--ref-text", required=True, help="Transcript of reference audio")
+    cli.add_argument("--gen-text", required=True, help="Text to synthesize")
+    cli.add_argument("--out", required=True, help="Output WAV path")
+    cli.add_argument("--model", default="F5TTS_v1_Base")
+    opts = cli.parse_args()
+
+    ref_wav = Path(opts.ref_audio)
+    if not ref_wav.is_file():
+        print(f"ref audio not found: {ref_wav}", file=sys.stderr)
+        return 1
+
+    target = Path(opts.out)
+    target.parent.mkdir(parents=True, exist_ok=True)
+
+    # Imported late so the argument and path checks above work without f5-tts.
+    try:
+        from f5_tts.api import F5TTS
+    except ImportError as exc:
+        print("f5-tts not installed. Run: ./scripts/setup_f5tts.sh", file=sys.stderr)
+        print(exc, file=sys.stderr)
+        return 1
+
+    engine = F5TTS(model=opts.model)
+    infer_kwargs = {
+        "ref_file": str(ref_wav),
+        "ref_text": opts.ref_text,
+        "gen_text": opts.gen_text,
+        "file_wave": str(target),
+        "remove_silence": True,
+    }
+    engine.infer(**infer_kwargs)
+    print(f"OK: {target}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())