Initial commit: Korean voice-cloning TTS prototype

FastAPI backend, web UI, CosyVoice3/F5-TTS setup scripts, and handoff docs for GPU PC continuation.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
2026-06-04 13:36:37 +09:00
commit 7101fdcd65
36 changed files with 1937 additions and 0 deletions

77
scripts/check_env.sh Executable file
View File

@@ -0,0 +1,77 @@
#!/usr/bin/env bash
# Environment check for the TTS project: OS, Python, NVIDIA GPU, CUDA toolkit,
# PyTorch inside the API venv, the per-model venvs and the expected project
# directories.
#
# This is a diagnostic report, so every probe is best-effort: a tool that
# exists but fails is reported and the script carries on, instead of letting
# `set -e` abort before the remaining sections are printed.
set -euo pipefail

ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$ROOT"

echo "=== TTS 환경 점검 ==="
echo "프로젝트: $ROOT"
echo

echo "--- OS / CPU ---"
uname -a
echo

echo "--- Python ---"
if command -v python3 &>/dev/null; then
  python3 --version
  # `command -v` is a shell builtin; `which` may be missing on minimal images
  # and would then abort the script under `set -e`.
  command -v python3
else
  echo "python3: 없음"
fi
echo

echo "--- NVIDIA GPU ---"
if command -v nvidia-smi &>/dev/null; then
  # nvidia-smi can be installed and still exit non-zero (e.g. driver problems).
  if nvidia-smi; then
    echo
    nvidia-smi --query-gpu=name,memory.total,driver_version --format=csv \
      || echo "nvidia-smi: GPU 정보 조회 실패"
  else
    echo "nvidia-smi: 실행 실패 (드라이버 상태를 확인하세요)"
  fi
else
  echo "nvidia-smi: 사용 불가 (NVIDIA GPU 서버에서 실행하세요)"
fi
echo

echo "--- CUDA (nvcc) ---"
if command -v nvcc &>/dev/null; then
  # With pipefail a failing nvcc would otherwise end the whole check.
  nvcc --version | head -4 || echo "nvcc: 버전 조회 실패"
else
  echo "nvcc: 없음 (PyTorch CUDA 빌드로도 동작 가능)"
fi
echo

echo "--- PyTorch (API venv) ---"
API_VENV="$ROOT/.venvs/api"
if [[ -x "$API_VENV/bin/python" ]]; then
  # stderr is left visible so a broken torch/CUDA install shows its real
  # error; a missing torch is still reported as a normal condition.
  "$API_VENV/bin/python" -c "
try:
    import torch
except ImportError:
    print('torch: 미설치 (API만 사용 시 정상)')
else:
    print('torch:', torch.__version__)
    print('cuda available:', torch.cuda.is_available())
    if torch.cuda.is_available():
        print('device:', torch.cuda.get_device_name(0))
" || echo "torch 점검 실패 (위 오류 메시지 참고)"
else
  echo "API venv 없음 → ./scripts/setup_api.sh 실행"
fi
echo

echo "--- 모델 venv ---"
for name in f5tts cosyvoice; do
  V="$ROOT/.venvs/$name"
  if [[ -x "$V/bin/python" ]]; then
    echo "[$name] OK: $V"
  else
    echo "[$name] 없음 → setup_${name}.sh (f5tts는 setup_f5tts.sh)"
  fi
done
echo

echo "--- 디렉터리 ---"
for d in samples outputs models config backend web; do
  if [[ -d "$ROOT/$d" ]]; then
    echo " $d: OK"
  else
    echo " $d: MISSING"
  fi
done
echo "점검 완료."

View File

@@ -0,0 +1,45 @@
#!/usr/bin/env bash
# Print the voice-recording guide, then run one CosyVoice test synthesis for
# each samples/my_voice_*.wav so the results can be compared across reference
# lengths.
set -euo pipefail

PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
SAMPLES_DIR="$PROJECT_DIR/samples"

cat <<EOF
=== 내 목소리 녹음 가이드 ===
자세한 내용: $PROJECT_DIR/samples/README.md

1) 30초 / 1분 / 3분 WAV를 samples/ 에 저장
2) my_voice_ref.txt 에 녹음 대본 작성
3) ./scripts/prepare_reference.sh samples/my_voice_30s.wav

EOF

# nullglob: an unmatched pattern expands to an empty array, not to itself.
shopt -s nullglob
recordings=("$SAMPLES_DIR"/my_voice_*.wav)
if (( ${#recordings[@]} == 0 )); then
  echo "아직 my_voice_*.wav 없음. 녹음 후 다시 실행하세요."
  exit 0
fi

compare_dir="$PROJECT_DIR/outputs/voice_length_compare"
mkdir -p "$compare_dir"

cosy_python="$PROJECT_DIR/.venvs/cosyvoice/bin/python"
cosy_worker="$PROJECT_DIR/scripts/workers/cosy_infer.py"
test_sentence="안녕하세요. 이 문장은 reference 길이별 품질 비교를 위한 테스트입니다."

if ! [[ -x "$cosy_python" ]]; then
  echo "cosyvoice venv 없음. ./scripts/setup_cosyvoice.sh 후 재실행"
  exit 1
fi

# The transcript is optional; an empty prompt text is passed when it is absent.
transcript=""
transcript_file="$SAMPLES_DIR/my_voice_ref.txt"
if [[ -f "$transcript_file" ]]; then
  transcript="$(<"$transcript_file")"
fi

for recording in "${recordings[@]}"; do
  stem="$(basename "$recording" .wav)"
  echo "생성: $stem"
  worker_args=(
    --ref-audio "$recording"
    --gen-text "$test_sentence"
    --prompt-text "$transcript"
    --out "$compare_dir/${stem}_test.wav"
  )
  # A failed synthesis must not stop the remaining recordings.
  "$cosy_python" "$cosy_worker" "${worker_args[@]}" || true
done

echo "결과: $compare_dir"

38
scripts/prepare_reference.sh Executable file
View File

@@ -0,0 +1,38 @@
#!/usr/bin/env bash
# Normalize a reference WAV to mono, 24 kHz, 16-bit PCM.
#
# Usage: prepare_reference.sh input.wav [output.wav]
# The output defaults to the input path with ".wav" replaced by
# "_24k_mono.wav". Uses the API venv's Python when present, else python3.
set -euo pipefail

if [[ $# -lt 1 ]]; then
  echo "Usage: $0 input.wav [output.wav]"
  exit 1
fi

IN="$1"
OUT="${2:-${IN%.wav}_24k_mono.wav}"

if [[ ! -f "$IN" ]]; then
  echo "input not found: $IN" >&2
  exit 1
fi

ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
PY="${ROOT}/.venvs/api/bin/python"
if [[ ! -x "$PY" ]]; then
  PY=python3
fi

# The paths travel as argv and the heredoc delimiter is quoted, so the shell
# expands nothing inside the Python source. File names containing quotes,
# backslashes or `$` can therefore neither break nor inject into the program.
"$PY" - "$IN" "$OUT" <<'PY'
import sys

try:
    import soundfile as sf
except ImportError:
    print("soundfile 필요: pip install soundfile")
    sys.exit(1)

src, dst = sys.argv[1], sys.argv[2]

data, sr = sf.read(src, always_2d=False)
if data.ndim > 1:
    # Down-mix multi-channel audio by averaging the channels.
    data = data.mean(axis=1)

target_sr = 24000
if sr != target_sr:
    # librosa is only needed when the sample rate actually has to change.
    try:
        import librosa
    except ImportError:
        print("librosa 필요: pip install librosa")
        sys.exit(1)
    data = librosa.resample(data.astype(float), orig_sr=sr, target_sr=target_sr)
    sr = target_sr

sf.write(dst, data, sr, subtype="PCM_16")
print(f"Saved: {dst} ({sr} Hz mono)")
PY

149
scripts/run_ab_compare.py Normal file
View File

@@ -0,0 +1,149 @@
#!/usr/bin/env python3
"""
F5-TTS vs CosyVoice3 A/B 비교.
각 모델 전용 venv의 worker를 subprocess로 호출합니다.
"""
from __future__ import annotations
import argparse
import json
import subprocess
import sys
from pathlib import Path
# Project root: two directory levels above this file.
ROOT = Path(__file__).resolve().parents[1]
# Directory holding test_sentences.json (read by load_sentences).
CONFIG = ROOT / "config"
# Interpreters of the per-model virtualenvs; each worker runs under its own.
F5_PY = ROOT / ".venvs" / "f5tts" / "bin" / "python"
COSY_PY = ROOT / ".venvs" / "cosyvoice" / "bin" / "python"
# Worker scripts that are launched as subprocesses with those interpreters.
F5_WORKER = ROOT / "scripts" / "workers" / "f5_infer.py"
COSY_WORKER = ROOT / "scripts" / "workers" / "cosy_infer.py"
def load_sentences() -> list[dict]:
    """Return the ``cases`` list stored in ``config/test_sentences.json``."""
    raw = (CONFIG / "test_sentences.json").read_text(encoding="utf-8")
    return json.loads(raw)["cases"]
def resolve_ref_audio(ref_arg: str) -> tuple[Path, str]:
    """Resolve the reference audio file and its transcript.

    Args:
        ref_arg: Path to a WAV file, or ``"auto"`` to choose one automatically.
            ``"auto"`` prefers the English example bundled with ``f5_tts``
            (when that package is importable and the file exists) and
            otherwise falls back to the alphabetically first
            ``samples/*.wav``.

    Returns:
        ``(wav_path, ref_text)``. The transcript is read from a ``.txt`` file
        next to the WAV, else from ``samples/my_voice_ref.txt``. When neither
        exists, an explicit path gets a placeholder sentence and ``"auto"``
        gets an empty string.

    Raises:
        SystemExit: If no reference audio can be found.
    """
    if ref_arg == "auto":
        bundled = None
        try:
            from importlib.resources import files

            bundled = Path(
                str(files("f5_tts").joinpath("infer/examples/basic/basic_ref_en.wav"))
            )
        except Exception:
            # Best-effort probe: any failure simply means "use samples/".
            bundled = None
        # Only trust the bundled example if the file is really there;
        # otherwise the workers would be handed a non-existent path.
        if bundled is not None and bundled.is_file():
            return bundled, "some call me nature, others call me mother nature."

        # Sorted so the chosen reference does not depend on directory order.
        samples = sorted((ROOT / "samples").glob("*.wav"))
        if not samples:
            raise SystemExit(
                "reference 없음: samples/*.wav 녹음하거나 f5-tts venv 설치 후 --ref-audio auto"
            )
        ref_path = samples[0]
    else:
        ref_path = Path(ref_arg)
        if not ref_path.is_file():
            raise SystemExit(f"ref audio not found: {ref_path}")

    ref_text = ""
    txt_candidates = (
        ref_path.with_suffix(".txt"),
        ROOT / "samples" / "my_voice_ref.txt",
    )
    for candidate in txt_candidates:
        if candidate.is_file():
            ref_text = candidate.read_text(encoding="utf-8").strip()
            break
    if not ref_text and ref_arg != "auto":
        ref_text = "참조 음성의 대본을 여기에 입력하세요."
    return ref_path, ref_text
def run_worker(python: Path, worker: Path, cmd: list[str]) -> bool:
    """Run *worker* with the given venv interpreter and report success.

    Args:
        python: Interpreter inside the model's virtualenv.
        worker: Worker script to execute.
        cmd: Command-line arguments passed to the worker.

    Returns:
        True when the worker exits with status 0. False when it fails or when
        the interpreter is missing (a SKIP notice is printed to stderr).
    """
    if python.is_file():
        argv = [str(python), str(worker)] + list(cmd)
        completed = subprocess.run(argv, cwd=str(ROOT))
        return completed.returncode == 0

    # <venv>/bin/python -> the venv directory name identifies the model.
    venv_name = python.parent.parent.name
    print(f"SKIP: venv missing ({venv_name})", file=sys.stderr)
    return False
def main() -> int:
    """Run every test sentence through the selected model workers.

    Parses ``--ref-audio``, ``--models`` and ``--out-dir``, synthesizes each
    case returned by ``load_sentences()`` with the requested workers and then
    writes ``manifest.json`` into the output directory.

    Returns:
        0 when no worker run failed or was skipped, otherwise 1.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--ref-audio", default="auto", help="WAV path or 'auto'")
    parser.add_argument("--models", default="both", choices=("both", "f5_tts", "cosyvoice"))
    parser.add_argument("--out-dir", default=str(ROOT / "outputs" / "ab_compare"))
    options = parser.parse_args()

    ref_path, ref_text = resolve_ref_audio(options.ref_audio)
    out_base = Path(options.out_dir)
    out_base.mkdir(parents=True, exist_ok=True)
    cases = load_sentences()

    print(f"Reference: {ref_path}")
    print(f"Cases: {len(cases)}")
    print(f"Output: {out_base}\n")

    # argparse restricts --models to the three choices, so each flag is just
    # "not the other model alone".
    want_f5 = options.models != "cosyvoice"
    want_cosy = options.models != "f5_tts"

    # Outcome counters keyed by run_worker's boolean result.
    tally = {True: 0, False: 0}
    for case in cases:
        cid = case["id"]
        text = case["text"]
        print(f"=== {cid}: {case['label']} ===")

        if want_f5:
            f5_args = [
                "--ref-audio", str(ref_path),
                "--ref-text", ref_text,
                "--gen-text", text,
                "--out", str(out_base / "f5_tts" / f"{cid}.wav"),
            ]
            tally[run_worker(F5_PY, F5_WORKER, f5_args)] += 1

        if want_cosy:
            cosy_args = [
                "--ref-audio", str(ref_path),
                "--gen-text", text,
                "--prompt-text", ref_text,
                "--out", str(out_base / "cosyvoice" / f"{cid}.wav"),
            ]
            tally[run_worker(COSY_PY, COSY_WORKER, cosy_args)] += 1

    manifest_path = out_base / "manifest.json"
    manifest = {
        "ref_audio": str(ref_path),
        "ref_text": ref_text,
        "cases": cases,
        "output_dir": str(out_base),
    }
    manifest_path.write_text(
        json.dumps(manifest, ensure_ascii=False, indent=2), encoding="utf-8"
    )

    ok, fail = tally[True], tally[False]
    print(f"\n완료: success={ok} fail={fail}")
    print(f"manifest: {out_base / 'manifest.json'}")
    return 1 if fail else 0


if __name__ == "__main__":
    raise SystemExit(main())

19
scripts/run_server.sh Executable file
View File

@@ -0,0 +1,19 @@
#!/usr/bin/env bash
# Start the FastAPI backend with uvicorn from the API venv.
#
# Optional settings are read from PROJECT/.env. TTS_HOST and TTS_PORT select
# the bind address (defaults: 0.0.0.0 and 8000).
set -euo pipefail

ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$ROOT"

if [[ ! -x "$ROOT/.venvs/api/bin/uvicorn" ]]; then
  echo "API venv 없음. ./scripts/setup_api.sh 실행"
  exit 1
fi

export PYTHONPATH="$ROOT"

if [[ -f "$ROOT/.env" ]]; then
  # allexport: plain KEY=value lines (no `export` keyword) would otherwise be
  # visible only to this shell and never reach the uvicorn process.
  set -a
  # shellcheck disable=SC1091
  source "$ROOT/.env"
  set +a
fi

exec "$ROOT/.venvs/api/bin/uvicorn" backend.app.main:app \
  --host "${TTS_HOST:-0.0.0.0}" \
  --port "${TTS_PORT:-8000}" \
  --reload

34
scripts/select_model.sh Executable file
View File

@@ -0,0 +1,34 @@
#!/usr/bin/env bash
# Record the final model choice in .env (TTS_MODEL) and in
# config/model_choice.json (selected_model).
#
# Usage: select_model.sh cosyvoice|f5_tts
set -euo pipefail

MODEL="${1:-}"
if [[ ! "$MODEL" =~ ^(cosyvoice|f5_tts)$ ]]; then
  echo "Usage: $0 cosyvoice|f5_tts"
  exit 1
fi

ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
ENV_FILE="$ROOT/.env"
CHOICE_FILE="$ROOT/config/model_choice.json"

# Check the JSON target before touching .env so the two files cannot end up
# out of sync when the config file is missing.
if [[ ! -f "$CHOICE_FILE" ]]; then
  echo "config not found: $CHOICE_FILE" >&2
  exit 1
fi

# Seed .env from the template on first use; start empty if there is none.
if [[ ! -f "$ENV_FILE" ]]; then
  if [[ -f "$ROOT/.env.example" ]]; then
    cp "$ROOT/.env.example" "$ENV_FILE"
  else
    : > "$ENV_FILE"
  fi
fi

# Replace an existing TTS_MODEL line, otherwise append one. Applying the same
# rule to a freshly copied template avoids a duplicate TTS_MODEL entry.
if grep -q '^TTS_MODEL=' "$ENV_FILE"; then
  # MODEL is validated above, so it is safe inside the sed expression.
  sed -i.bak "s/^TTS_MODEL=.*/TTS_MODEL=$MODEL/" "$ENV_FILE"
  rm -f "$ENV_FILE.bak"
else
  # If the file does not end with a newline, add one so the new entry is not
  # glued onto the previous line.
  if [[ -s "$ENV_FILE" && -n "$(tail -c 1 "$ENV_FILE")" ]]; then
    echo >> "$ENV_FILE"
  fi
  echo "TTS_MODEL=$MODEL" >> "$ENV_FILE"
fi

# Values travel as argv and the heredoc delimiter is quoted, so nothing from
# the shell is spliced into the Python source.
python3 - "$CHOICE_FILE" "$MODEL" <<'PY'
import json
import sys
from pathlib import Path

path, model = Path(sys.argv[1]), sys.argv[2]
data = json.loads(path.read_text(encoding="utf-8"))
data["selected_model"] = model
path.write_text(json.dumps(data, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
print(f"selected_model={model}")
PY

13
scripts/setup_api.sh Executable file
View File

@@ -0,0 +1,13 @@
#!/usr/bin/env bash
# Create the lightweight virtualenv used by the FastAPI server and install the
# backend requirements into it.
set -euo pipefail

PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
API_ENV="${PROJECT_DIR}/.venvs/api"
PIP="${API_ENV}/bin/pip"

python3 -m venv "$API_ENV"
"$PIP" install -U pip wheel
"$PIP" install -r "${PROJECT_DIR}/backend/requirements.txt"

printf '%s\n' "API venv 준비 완료: ${API_ENV}"
printf '%s\n' "실행: ${API_ENV}/bin/uvicorn backend.app.main:app --host 0.0.0.0 --port 8000"

38
scripts/setup_cosyvoice.sh Executable file
View File

@@ -0,0 +1,38 @@
#!/usr/bin/env bash
# Set up CosyVoice3: clone the upstream repo, create a dedicated venv, install
# the dependencies and download the model weights from Hugging Face.
#
# TORCH_INDEX_URL overrides the PyTorch wheel index (default: the cu124
# index); set it to match the server's CUDA version.
set -euo pipefail

ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
VENV="$ROOT/.venvs/cosyvoice"
REPO="$ROOT/external/CosyVoice"
MODEL_DIR="$ROOT/models/Fun-CosyVoice3-0.5B"
TORCH_INDEX_URL="${TORCH_INDEX_URL:-https://download.pytorch.org/whl/cu124}"

mkdir -p "$ROOT/external" "$ROOT/models"

if [[ ! -d "$REPO/.git" ]]; then
  echo "CosyVoice 레포 클론..."
  git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git "$REPO"
else
  echo "CosyVoice 레포 이미 존재: $REPO"
fi
# Run for existing checkouts too, so a previously interrupted clone still gets
# its submodules. `git -C` leaves this script's working directory untouched.
git -C "$REPO" submodule update --init --recursive

python3 -m venv "$VENV"
"$VENV/bin/pip" install -U pip wheel
"$VENV/bin/pip" install torch torchaudio --index-url "$TORCH_INDEX_URL"
"$VENV/bin/pip" install -r "$REPO/requirements.txt"
"$VENV/bin/pip" install huggingface_hub modelscope

echo "CosyVoice3 모델 다운로드 (Hugging Face)..."
# The target directory is passed as argv and the heredoc delimiter is quoted,
# so a path containing quotes cannot break the Python source.
"$VENV/bin/python" - "$MODEL_DIR" <<'PY'
import sys

from huggingface_hub import snapshot_download

model_dir = sys.argv[1]
snapshot_download(
    'FunAudioLLM/Fun-CosyVoice3-0.5B-2512',
    local_dir=model_dir,
)
print(f'Model saved to {model_dir}')
PY

echo "CosyVoice venv 준비 완료: $VENV"
echo "테스트: $VENV/bin/python $ROOT/scripts/workers/cosy_infer.py --help"

16
scripts/setup_f5tts.sh Executable file
View File

@@ -0,0 +1,16 @@
#!/usr/bin/env bash
# Create the dedicated F5-TTS virtualenv (NVIDIA CUDA build of PyTorch).
#
# TORCH_INDEX_URL overrides the PyTorch wheel index so the CUDA build can be
# matched to the server without editing this script, e.g.
#   TORCH_INDEX_URL=https://download.pytorch.org/whl/cu128 ./scripts/setup_f5tts.sh
set -euo pipefail

ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
VENV="$ROOT/.venvs/f5tts"
# Default: the cu124 wheel index.
TORCH_INDEX_URL="${TORCH_INDEX_URL:-https://download.pytorch.org/whl/cu124}"

python3 -m venv "$VENV"
"$VENV/bin/pip" install -U pip wheel
"$VENV/bin/pip" install torch torchaudio --index-url "$TORCH_INDEX_URL"
"$VENV/bin/pip" install f5-tts

echo "F5-TTS venv 준비 완료: $VENV"
echo "테스트: $VENV/bin/python $ROOT/scripts/workers/f5_infer.py --help"

View File

@@ -0,0 +1,73 @@
#!/usr/bin/env python3
"""CosyVoice3 zero-shot 추론 워커 (cosyvoice venv에서 실행)."""
from __future__ import annotations
import argparse
import sys
from pathlib import Path
def main() -> int:
    """Synthesize ``--gen-text`` in the voice of ``--ref-audio`` with CosyVoice3.

    The prompt handed to the model is ``--prompt-prefix`` followed by
    ``--prompt-text``. The first result is written to ``--out``; any further
    results go to ``<stem>_<i>`` next to it.

    Returns:
        0 when at least one WAV was written; 1 when the CosyVoice repo, the
        model directory, the reference audio or the Python dependencies are
        missing, or when inference produced no audio at all.
    """
    parser = argparse.ArgumentParser(description="CosyVoice3 inference worker")
    parser.add_argument("--ref-audio", required=True)
    parser.add_argument("--prompt-text", default="", help="Text spoken in ref audio (with prefix)")
    parser.add_argument("--gen-text", required=True)
    parser.add_argument("--out", required=True)
    parser.add_argument(
        "--model-dir",
        default=None,
        help="Path to Fun-CosyVoice3-0.5B (default: PROJECT/models/Fun-CosyVoice3-0.5B)",
    )
    parser.add_argument(
        "--prompt-prefix",
        default="You are a helpful assistant.<|endofprompt|>",
    )
    args = parser.parse_args()

    # Project root: three directory levels above this file.
    root = Path(__file__).resolve().parents[2]
    repo = root / "external" / "CosyVoice"
    model_dir = Path(args.model_dir or root / "models" / "Fun-CosyVoice3-0.5B")
    ref = Path(args.ref_audio)
    out = Path(args.out)

    if not repo.is_dir():
        print(f"CosyVoice repo missing: {repo}. Run ./scripts/setup_cosyvoice.sh", file=sys.stderr)
        return 1
    if not model_dir.is_dir():
        print(f"Model dir missing: {model_dir}", file=sys.stderr)
        return 1
    if not ref.is_file():
        print(f"ref audio not found: {ref}", file=sys.stderr)
        return 1

    # CosyVoice is imported from the cloned checkout rather than an installed
    # package, so the checkout must be importable before the import below.
    sys.path.insert(0, str(repo))
    sys.path.append(str(repo / "third_party" / "Matcha-TTS"))
    try:
        import torchaudio
        from cosyvoice.cli.cosyvoice import AutoModel
    except ImportError as e:
        print("CosyVoice import failed. Run ./scripts/setup_cosyvoice.sh", file=sys.stderr)
        print(e, file=sys.stderr)
        return 1

    prompt = args.prompt_prefix + (args.prompt_text or "")
    out.parent.mkdir(parents=True, exist_ok=True)
    cosyvoice = AutoModel(model_dir=str(model_dir))

    saved = 0
    for i, result in enumerate(
        cosyvoice.inference_zero_shot(
            args.gen_text,
            prompt,
            str(ref),
            stream=False,
        )
    ):
        path = out if i == 0 else out.with_stem(f"{out.stem}_{i}")
        torchaudio.save(str(path), result["tts_speech"], cosyvoice.sample_rate)
        print(f"OK: {path}")
        saved += 1

    if saved == 0:
        # Callers treat exit status 0 as "output exists"; do not claim
        # success when nothing was written.
        print("CosyVoice produced no audio", file=sys.stderr)
        return 1
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

View File

@@ -0,0 +1,47 @@
#!/usr/bin/env python3
"""F5-TTS 추론 워커 (f5tts venv에서 실행)."""
from __future__ import annotations
import argparse
import sys
from pathlib import Path
def main() -> int:
    """Synthesize ``--gen-text`` with F5-TTS using a reference recording.

    Returns:
        0 after the inference call completes; 1 when the reference audio is
        missing or ``f5_tts`` cannot be imported.
    """
    parser = argparse.ArgumentParser(description="F5-TTS inference worker")
    required_options = (
        ("--ref-audio", "Reference WAV path"),
        ("--ref-text", "Transcript of reference audio"),
        ("--gen-text", "Text to synthesize"),
        ("--out", "Output WAV path"),
    )
    for flag, description in required_options:
        parser.add_argument(flag, required=True, help=description)
    parser.add_argument("--model", default="F5TTS_v1_Base")
    opts = parser.parse_args()

    reference = Path(opts.ref_audio)
    if not reference.is_file():
        print(f"ref audio not found: {reference}", file=sys.stderr)
        return 1

    target = Path(opts.out)
    target.parent.mkdir(parents=True, exist_ok=True)

    # Imported here so a missing package yields a setup hint, not a traceback.
    try:
        from f5_tts.api import F5TTS
    except ImportError as err:
        print("f5-tts not installed. Run: ./scripts/setup_f5tts.sh", file=sys.stderr)
        print(err, file=sys.stderr)
        return 1

    engine = F5TTS(model=opts.model)
    engine.infer(
        ref_file=str(reference),
        ref_text=opts.ref_text,
        gen_text=opts.gen_text,
        file_wave=str(target),
        remove_silence=True,
    )
    print(f"OK: {target}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())