#!/usr/bin/env python3
"""
F5-TTS vs CosyVoice3 A/B comparison.

Each model's worker script is invoked as a subprocess using the Python
interpreter of that model's dedicated virtualenv.
"""
from __future__ import annotations

import argparse
import json
import subprocess
import sys
from pathlib import Path

ROOT = Path(__file__).resolve().parents[1]
CONFIG = ROOT / "config"
F5_PY = ROOT / ".venvs" / "f5tts" / "bin" / "python"
COSY_PY = ROOT / ".venvs" / "cosyvoice" / "bin" / "python"
F5_WORKER = ROOT / "scripts" / "workers" / "f5_infer.py"
COSY_WORKER = ROOT / "scripts" / "workers" / "cosy_infer.py"


def load_sentences() -> list[dict]:
    """Return the ``cases`` list stored in ``config/test_sentences.json``."""
    with open(CONFIG / "test_sentences.json", encoding="utf-8") as f:
        return json.load(f)["cases"]


def _bundled_f5_reference() -> Path | None:
    """Return the example WAV shipped inside the ``f5_tts`` package, if usable.

    ``None`` is returned when the package cannot be located from the current
    interpreter or when the expected file is not present on disk.
    """
    try:
        # Imported lazily: this is an optional, best-effort probe.
        from importlib.resources import files

        wav = files("f5_tts").joinpath("infer/examples/basic/basic_ref_en.wav")
    except (ImportError, TypeError, OSError):
        return None
    candidate = Path(str(wav))
    # Never hand a non-existent path to the workers.
    return candidate if candidate.is_file() else None


def resolve_ref_audio(ref_arg: str) -> tuple[Path, str]:
    """Resolve the reference audio and its transcript.

    Args:
        ref_arg: Path to a WAV file, or ``"auto"``. With ``"auto"`` the example
            clip bundled with ``f5_tts`` is preferred; otherwise the first
            (alphabetically) ``samples/*.wav`` under the project root is used.

    Returns:
        ``(wav_path, ref_text)``. For the bundled clip ``ref_text`` is its
        known transcript. Otherwise it is read from a ``.txt`` next to the WAV
        or from ``samples/my_voice_ref.txt``; if neither yields text, an
        explicit path gets a placeholder string and ``"auto"`` gets ``""``.

    Raises:
        SystemExit: if no reference audio can be found.
    """
    if ref_arg == "auto":
        bundled = _bundled_f5_reference()
        if bundled is not None:
            return bundled, "some call me nature, others call me mother nature."
        # glob() order is unspecified; sort so the chosen sample is stable.
        samples = sorted((ROOT / "samples").glob("*.wav"))
        if not samples:
            raise SystemExit(
                "reference 없음: samples/*.wav 녹음하거나 f5-tts venv 설치 후 --ref-audio auto"
            )
        ref_path = samples[0]
    else:
        ref_path = Path(ref_arg)
        if not ref_path.is_file():
            raise SystemExit(f"ref audio not found: {ref_path}")

    ref_text = ""
    txt_candidates = (
        ref_path.with_suffix(".txt"),
        ROOT / "samples" / "my_voice_ref.txt",
    )
    for candidate in txt_candidates:
        if candidate.is_file():
            # The first existing file wins, even if it turns out to be empty.
            ref_text = candidate.read_text(encoding="utf-8").strip()
            break
    if not ref_text and ref_arg != "auto":
        # Placeholder transcript (runtime string, intentionally left as-is).
        ref_text = "참조 음성의 대본을 여기에 입력하세요."
    return ref_path, ref_text


def run_worker(python: Path, worker: Path, cmd: list[str]) -> bool:
    """Run ``worker`` with interpreter ``python`` and extra arguments ``cmd``.

    The subprocess runs with the project root as its working directory.

    Returns:
        ``True`` if the worker exited with status 0. ``False`` if it exited
        non-zero or if the interpreter is missing (a SKIP note is written to
        stderr in that case).
    """
    if not python.is_file():
        print(f"SKIP: venv missing ({python.parent.parent.name})", file=sys.stderr)
        return False
    r = subprocess.run([str(python), str(worker), *cmd], cwd=str(ROOT))
    return r.returncode == 0


def _worker_args(
    model: str, ref_path: Path, ref_text: str, text: str, out_path: Path
) -> list[str]:
    """Build the worker argv for ``model``, keeping each worker's flag order."""
    if model == "f5_tts":
        return [
            "--ref-audio",
            str(ref_path),
            "--ref-text",
            ref_text,
            "--gen-text",
            text,
            "--out",
            str(out_path),
        ]
    return [
        "--ref-audio",
        str(ref_path),
        "--gen-text",
        text,
        "--prompt-text",
        ref_text,
        "--out",
        str(out_path),
    ]


def main() -> int:
    """Run every test case through the selected models and write a manifest.

    Returns:
        Process exit code: 0 when every worker call succeeded, 1 otherwise.
        A missing venv counts as a failure.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--ref-audio", default="auto", help="WAV path or 'auto'")
    parser.add_argument("--models", default="both", choices=("both", "f5_tts", "cosyvoice"))
    parser.add_argument("--out-dir", default=str(ROOT / "outputs" / "ab_compare"))
    args = parser.parse_args()

    ref_path, ref_text = resolve_ref_audio(args.ref_audio)
    out_base = Path(args.out_dir)
    out_base.mkdir(parents=True, exist_ok=True)

    # (output sub-directory / model key, interpreter, worker script)
    jobs: list[tuple[str, Path, Path]] = []
    if args.models in ("both", "f5_tts"):
        jobs.append(("f5_tts", F5_PY, F5_WORKER))
    if args.models in ("both", "cosyvoice"):
        jobs.append(("cosyvoice", COSY_PY, COSY_WORKER))

    # Create per-model output folders here rather than relying on the workers.
    for model, _, _ in jobs:
        (out_base / model).mkdir(parents=True, exist_ok=True)

    cases = load_sentences()
    print(f"Reference: {ref_path}")
    print(f"Cases: {len(cases)}")
    print(f"Output: {out_base}\n")

    ok = 0
    fail = 0
    for case in cases:
        cid = case["id"]
        text = case["text"]
        print(f"=== {cid}: {case['label']} ===")
        for model, python, worker in jobs:
            out_path = out_base / model / f"{cid}.wav"
            argv = _worker_args(model, ref_path, ref_text, text, out_path)
            if run_worker(python, worker, argv):
                ok += 1
            else:
                fail += 1

    manifest = {
        "ref_audio": str(ref_path),
        "ref_text": ref_text,
        "cases": cases,
        "output_dir": str(out_base),
    }
    (out_base / "manifest.json").write_text(
        json.dumps(manifest, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    print(f"\n완료: success={ok} fail={fail}")
    print(f"manifest: {out_base / 'manifest.json'}")
    return 0 if fail == 0 else 1


if __name__ == "__main__":
    raise SystemExit(main())