Initial commit: Korean voice-cloning TTS prototype

FastAPI backend, web UI, CosyVoice3/F5-TTS setup scripts, and handoff docs for GPU PC continuation. Co-authored-by: Cursor <cursoragent@cursor.com>
2026-06-04 13:36:37 +09:00
commit 7101fdcd65
36 changed files with 1937 additions and 0 deletions
--- a/scripts/run_ab_compare.py
+++ b/scripts/run_ab_compare.py
@@ -0,0 +1,149 @@
+#!/usr/bin/env python3
+"""
+F5-TTS vs CosyVoice3 A/B 비교.
+각 모델 전용 venv의 worker를 subprocess로 호출합니다.
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+# Project root: parents[1] is the directory that contains this script's folder.
+ROOT = Path(__file__).resolve().parents[1]
+CONFIG = ROOT / "config"
+
+
+def _venv_python(name: str) -> Path:
+    """Return the interpreter path inside the venv ``ROOT/.venvs/<name>``.
+
+    The venv module puts the interpreter at ``Scripts/python.exe`` on Windows
+    and at ``bin/python`` on other platforms.
+    """
+    venv_dir = ROOT / ".venvs" / name
+    if sys.platform == "win32":
+        return venv_dir / "Scripts" / "python.exe"
+    return venv_dir / "bin" / "python"
+
+
+# Interpreters of the per-model virtual environments.
+F5_PY = _venv_python("f5tts")
+COSY_PY = _venv_python("cosyvoice")
+# Worker scripts that are executed with the interpreters above.
+F5_WORKER = ROOT / "scripts" / "workers" / "f5_infer.py"
+COSY_WORKER = ROOT / "scripts" / "workers" / "cosy_infer.py"
+
+
+def load_sentences() -> list[dict]:
+    """Return the ``"cases"`` entry of ``test_sentences.json`` in CONFIG."""
+    raw = (CONFIG / "test_sentences.json").read_text(encoding="utf-8")
+    payload = json.loads(raw)
+    return payload["cases"]
+
+
+def resolve_ref_audio(ref_arg: str) -> tuple[Path, str]:
+    """Resolve the reference WAV and the transcript that goes with it.
+
+    Args:
+        ref_arg: Path to a WAV file, or ``"auto"``. In ``auto`` mode the
+            example clip packaged with ``f5_tts`` is used when it can be
+            located; otherwise the first WAV (sorted by path) in
+            ``ROOT/samples`` is used.
+
+    Returns:
+        ``(wav_path, ref_text)``. ``ref_text`` is read from a ``.txt`` file
+        next to the WAV or from ``samples/my_voice_ref.txt``. When neither
+        exists it is a placeholder sentence for an explicit path and an empty
+        string in ``auto`` mode.
+
+    Raises:
+        SystemExit: if no reference audio can be found.
+    """
+    if ref_arg == "auto":
+        try:
+            from importlib.resources import files
+
+            wav = files("f5_tts").joinpath("infer/examples/basic/basic_ref_en.wav")
+            bundled = Path(str(wav))
+        except Exception:
+            # Deliberate best effort: f5_tts may not be importable from the
+            # interpreter running this script, so fall back to samples/.
+            bundled = None
+        # Only trust the packaged example if the file is really on disk.
+        if bundled is not None and bundled.is_file():
+            return bundled, "some call me nature, others call me mother nature."
+
+        # Sort so the chosen sample does not depend on directory listing order.
+        samples = sorted((ROOT / "samples").glob("*.wav"))
+        if not samples:
+            raise SystemExit(
+                "reference 없음: samples/*.wav 녹음하거나 f5-tts venv 설치 후 --ref-audio auto"
+            )
+        ref_path = samples[0]
+    else:
+        ref_path = Path(ref_arg)
+        if not ref_path.is_file():
+            raise SystemExit(f"ref audio not found: {ref_path}")
+
+    ref_text = ""
+    txt_candidates = [
+        ref_path.with_suffix(".txt"),
+        ROOT / "samples" / "my_voice_ref.txt",
+    ]
+    # The first transcript file that exists wins.
+    for candidate in txt_candidates:
+        if candidate.is_file():
+            ref_text = candidate.read_text(encoding="utf-8").strip()
+            break
+    if not ref_text and ref_arg != "auto":
+        # Keep the placeholder for compatibility, but do not use it silently.
+        print(
+            f"WARN: no transcript found for {ref_path}; using placeholder ref_text",
+            file=sys.stderr,
+        )
+        ref_text = "참조 음성의 대본을 여기에 입력하세요."
+    return ref_path, ref_text
+
+
+def run_worker(python: Path, worker: Path, cmd: list[str]) -> bool:
+    """Run ``worker`` with the interpreter ``python`` and report success.
+
+    Args:
+        python: Interpreter to launch; the call is skipped if it is not a file.
+        worker: Script passed to the interpreter as its first argument.
+        cmd: Extra command-line arguments appended after the script path.
+
+    Returns:
+        True if the subprocess exited with code 0; False if it exited with
+        another code or the interpreter is missing.
+    """
+    if python.is_file():
+        argv = [str(python), str(worker)]
+        argv.extend(cmd)
+        # Workers run with the project root as their working directory.
+        completed = subprocess.run(argv, cwd=str(ROOT))
+        return completed.returncode == 0
+
+    # The venv directory is two levels above the interpreter.
+    venv_name = python.parent.parent.name
+    print(f"SKIP: venv missing ({venv_name})", file=sys.stderr)
+    return False
+
+
+def main() -> int:
+    """Synthesize every test sentence with the selected models.
+
+    Parses the command line, resolves the reference audio, invokes one worker
+    per (case, model) pair and writes ``manifest.json`` into the output
+    directory.
+
+    Returns:
+        0 if no worker invocation failed, otherwise 1. A worker skipped
+        because its venv is missing counts as a failure.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--ref-audio", default="auto", help="WAV path or 'auto'")
+    parser.add_argument("--models", default="both", choices=("both", "f5_tts", "cosyvoice"))
+    parser.add_argument("--out-dir", default=str(ROOT / "outputs" / "ab_compare"))
+    args = parser.parse_args()
+
+    ref_path, ref_text = resolve_ref_audio(args.ref_audio)
+    out_base = Path(args.out_dir)
+    out_base.mkdir(parents=True, exist_ok=True)
+
+    cases = load_sentences()
+    print(f"Reference: {ref_path}")
+    print(f"Cases: {len(cases)}")
+    print(f"Output: {out_base}\n")
+
+    use_f5 = args.models in ("both", "f5_tts")
+    use_cosy = args.models in ("both", "cosyvoice")
+    f5_dir = out_base / "f5_tts"
+    cosy_dir = out_base / "cosyvoice"
+    # Create the per-model folders up front so a worker cannot fail merely
+    # because the parent directory of its --out file does not exist.
+    if use_f5:
+        f5_dir.mkdir(parents=True, exist_ok=True)
+    if use_cosy:
+        cosy_dir.mkdir(parents=True, exist_ok=True)
+
+    ok = 0
+    fail = 0
+    for case in cases:
+        cid = case["id"]
+        text = case["text"]
+        print(f"=== {cid}: {case['label']} ===")
+
+        if use_f5:
+            out_f5 = f5_dir / f"{cid}.wav"
+            if run_worker(
+                F5_PY,
+                F5_WORKER,
+                [
+                    "--ref-audio",
+                    str(ref_path),
+                    "--ref-text",
+                    ref_text,
+                    "--gen-text",
+                    text,
+                    "--out",
+                    str(out_f5),
+                ],
+            ):
+                ok += 1
+            else:
+                fail += 1
+
+        if use_cosy:
+            out_cosy = cosy_dir / f"{cid}.wav"
+            if run_worker(
+                COSY_PY,
+                COSY_WORKER,
+                [
+                    "--ref-audio",
+                    str(ref_path),
+                    "--gen-text",
+                    text,
+                    "--prompt-text",
+                    ref_text,
+                    "--out",
+                    str(out_cosy),
+                ],
+            ):
+                ok += 1
+            else:
+                fail += 1
+
+    # Record the inputs of this run next to the generated audio.
+    manifest = {
+        "ref_audio": str(ref_path),
+        "ref_text": ref_text,
+        "cases": cases,
+        "output_dir": str(out_base),
+    }
+    manifest_path = out_base / "manifest.json"
+    manifest_path.write_text(
+        json.dumps(manifest, ensure_ascii=False, indent=2), encoding="utf-8"
+    )
+    print(f"\n완료: success={ok} fail={fail}")
+    print(f"manifest: {manifest_path}")
+    return 0 if fail == 0 else 1
+
+
+if __name__ == "__main__":
+    # Use main()'s return value as the process exit status.
+    sys.exit(main())