Initial commit: Korean voice-cloning TTS prototype
FastAPI backend, web UI, CosyVoice3/F5-TTS setup scripts, and handoff docs for GPU PC continuation. Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
77
scripts/check_env.sh
Executable file
77
scripts/check_env.sh
Executable file
@@ -0,0 +1,77 @@
|
||||
#!/usr/bin/env bash
# Environment report for the TTS prototype: OS, Python, NVIDIA GPU, CUDA
# toolkit (nvcc), PyTorch inside the API venv, per-model venvs and the
# expected project directories.
#
# This is a diagnostic script, so every probe is guarded: a tool that exists
# but fails (for example nvidia-smi with a broken driver) is reported and the
# remaining sections still run instead of `set -e` aborting the report.
set -euo pipefail

ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$ROOT"

echo "=== TTS 환경 점검 ==="
echo "프로젝트: $ROOT"
echo

echo "--- OS / CPU ---"
uname -a
echo

echo "--- Python ---"
if command -v python3 &>/dev/null; then
  python3 --version || echo "python3: 실행 실패"
  # `command -v` is the portable replacement for `which`.
  command -v python3
else
  echo "python3: 없음"
fi
echo

echo "--- NVIDIA GPU ---"
if command -v nvidia-smi &>/dev/null; then
  # nvidia-smi can be installed yet exit non-zero; report it and keep going.
  if nvidia-smi; then
    echo
    nvidia-smi --query-gpu=name,memory.total,driver_version --format=csv \
      || echo "nvidia-smi: GPU 정보 조회 실패"
  else
    echo "nvidia-smi: 실행 실패 (NVIDIA 드라이버 상태를 확인하세요)"
  fi
else
  echo "nvidia-smi: 사용 불가 (NVIDIA GPU 서버에서 실행하세요)"
fi
echo

echo "--- CUDA (nvcc) ---"
if command -v nvcc &>/dev/null; then
  nvcc --version | head -4 || echo "nvcc: 실행 실패"
else
  echo "nvcc: 없음 (PyTorch CUDA 빌드로도 동작 가능)"
fi
echo

echo "--- PyTorch (API venv) ---"
API_VENV="$ROOT/.venvs/api"
if [[ -x "$API_VENV/bin/python" ]]; then
  # stderr is dropped to hide import-time warnings; failures are caught in the
  # snippet and printed on stdout, and any remaining non-zero exit is reported
  # by the fallback echo instead of being silently ignored.
  "$API_VENV/bin/python" - 2>/dev/null <<'PY' || echo "torch: 점검 실패"
try:
    import torch
except ImportError:
    print('torch: 미설치 (API만 사용 시 정상)')
except Exception as exc:  # e.g. broken shared libraries at import time
    print('torch: import 실패:', exc)
else:
    print('torch:', torch.__version__)
    print('cuda available:', torch.cuda.is_available())
    if torch.cuda.is_available():
        print('device:', torch.cuda.get_device_name(0))
PY
else
  echo "API venv 없음 → ./scripts/setup_api.sh 실행"
fi
echo

echo "--- 모델 venv ---"
for name in f5tts cosyvoice; do
  V="$ROOT/.venvs/$name"
  if [[ -x "$V/bin/python" ]]; then
    echo "[$name] OK: $V"
  else
    echo "[$name] 없음 → setup_${name}.sh (f5tts는 setup_f5tts.sh)"
  fi
done
echo

echo "--- 디렉터리 ---"
for d in samples outputs models config backend web; do
  if [[ -d "$ROOT/$d" ]]; then
    echo " $d: OK"
  else
    echo " $d: MISSING"
  fi
done
echo "점검 완료."
|
||||
45
scripts/compare_voice_lengths.sh
Executable file
45
scripts/compare_voice_lengths.sh
Executable file
@@ -0,0 +1,45 @@
|
||||
#!/usr/bin/env bash
# Print the "record my own voice" guide, then synthesize one fixed test
# sentence with CosyVoice for every samples/my_voice_*.wav so reference
# recordings of different lengths can be compared by ear.
#
# Exit status: 0 when there is nothing to do or every sample succeeded,
# 1 when the cosyvoice venv is missing or at least one synthesis failed.
set -euo pipefail

ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
echo "=== 내 목소리 녹음 가이드 ==="
echo "자세한 내용: $ROOT/samples/README.md"
echo
echo "1) 30초 / 1분 / 3분 WAV를 samples/ 에 저장"
echo "2) my_voice_ref.txt 에 녹음 대본 작성"
echo "3) ./scripts/prepare_reference.sh samples/my_voice_30s.wav"
echo

# nullglob: an unmatched pattern expands to an empty array, not the literal.
shopt -s nullglob
WAVS=("$ROOT"/samples/my_voice_*.wav)
if [[ ${#WAVS[@]} -eq 0 ]]; then
  echo "아직 my_voice_*.wav 없음. 녹음 후 다시 실행하세요."
  exit 0
fi

OUT="$ROOT/outputs/voice_length_compare"
mkdir -p "$OUT"
PY="$ROOT/.venvs/cosyvoice/bin/python"
WORKER="$ROOT/scripts/workers/cosy_infer.py"
TEXT="안녕하세요. 이 문장은 reference 길이별 품질 비교를 위한 테스트입니다."

if [[ ! -x "$PY" ]]; then
  echo "cosyvoice venv 없음. ./scripts/setup_cosyvoice.sh 후 재실행"
  exit 1
fi

# Optional transcript of the reference recording, passed as the prompt text.
REF_TXT=""
if [[ -f "$ROOT/samples/my_voice_ref.txt" ]]; then
  REF_TXT=$(<"$ROOT/samples/my_voice_ref.txt")
fi

# Keep going after a failed sample so the other lengths are still produced,
# but remember the failure instead of discarding it.
failed=0
for wav in "${WAVS[@]}"; do
  name=$(basename "$wav" .wav)
  echo "생성: $name"
  if ! "$PY" "$WORKER" \
    --ref-audio "$wav" \
    --gen-text "$TEXT" \
    --prompt-text "$REF_TXT" \
    --out "$OUT/${name}_test.wav"; then
    echo "실패: $name" >&2
    failed=$((failed + 1))
  fi
done

echo "결과: $OUT"
if ((failed > 0)); then
  echo "실패한 샘플 수: $failed" >&2
  exit 1
fi
|
||||
38
scripts/prepare_reference.sh
Executable file
38
scripts/prepare_reference.sh
Executable file
@@ -0,0 +1,38 @@
|
||||
#!/usr/bin/env bash
# Normalize a reference WAV to mono, 24 kHz, 16-bit PCM.
#
# Usage: prepare_reference.sh input.wav [output.wav]
#   output defaults to <input without .wav>_24k_mono.wav
#
# Uses the API venv's Python when it exists, otherwise python3 from PATH.
set -euo pipefail

if [[ $# -lt 1 ]]; then
  echo "Usage: $0 input.wav [output.wav]" >&2
  exit 1
fi

IN="$1"
OUT="${2:-${IN%.wav}_24k_mono.wav}"

if [[ ! -f "$IN" ]]; then
  echo "입력 파일 없음: $IN" >&2
  exit 1
fi

ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
PY="${ROOT}/.venvs/api/bin/python"
if [[ ! -x "$PY" ]]; then
  PY=python3
fi

# The paths are handed over as argv and the heredoc delimiter is quoted, so
# the shell never splices file names into the Python source. File names with
# quotes, backslashes or `$` therefore cannot break or alter the program.
"$PY" - "$IN" "$OUT" <<'PY'
import sys

try:
    import soundfile as sf
    import numpy  # noqa: F401  (early dependency check)
except ImportError:
    print("soundfile 필요: pip install soundfile")
    sys.exit(1)

src, dst = sys.argv[1], sys.argv[2]

data, sr = sf.read(src, always_2d=False)
if data.ndim > 1:
    # Down-mix multi-channel audio by averaging the channels.
    data = data.mean(axis=1)
target_sr = 24000
if sr != target_sr:
    try:
        import librosa
    except ImportError:
        print("librosa 필요: pip install librosa", file=sys.stderr)
        sys.exit(1)
    data = librosa.resample(data.astype(float), orig_sr=sr, target_sr=target_sr)
    sr = target_sr
sf.write(dst, data, sr, subtype="PCM_16")
print(f"Saved: {dst} ({sr} Hz mono)")
PY
|
||||
149
scripts/run_ab_compare.py
Normal file
149
scripts/run_ab_compare.py
Normal file
@@ -0,0 +1,149 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
F5-TTS vs CosyVoice3 A/B 비교.
|
||||
각 모델 전용 venv의 worker를 subprocess로 호출합니다.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Project root is one level above the scripts/ directory holding this file.
ROOT = Path(__file__).resolve().parents[1]
CONFIG = ROOT.joinpath("config")

# Each model runs in its own virtualenv through a dedicated worker script.
F5_PY = ROOT.joinpath(".venvs", "f5tts", "bin", "python")
COSY_PY = ROOT.joinpath(".venvs", "cosyvoice", "bin", "python")
F5_WORKER = ROOT.joinpath("scripts", "workers", "f5_infer.py")
COSY_WORKER = ROOT.joinpath("scripts", "workers", "cosy_infer.py")
|
||||
|
||||
|
||||
def load_sentences() -> list[dict]:
    """Return the list stored under "cases" in config/test_sentences.json."""
    raw = (CONFIG / "test_sentences.json").read_text(encoding="utf-8")
    return json.loads(raw)["cases"]
|
||||
|
||||
|
||||
def resolve_ref_audio(ref_arg: str) -> tuple[Path, str]:
    """Resolve the reference audio and its transcript.

    Args:
        ref_arg: Path to a WAV file, or the literal "auto".

    Returns:
        (wav_path, ref_text). ref_text is the transcript handed to the
        workers; it may be empty in "auto" mode when no transcript file is
        found.

    Raises:
        SystemExit: when an explicit path does not exist, or when "auto" finds
            neither the packaged F5-TTS example nor any samples/*.wav.

    "auto" first tries the example clip shipped inside the f5_tts package (if
    that package is importable from this interpreter) and otherwise falls back
    to the first WAV, in sorted order, under samples/.
    """
    if ref_arg == "auto":
        packaged: Path | None = None
        try:
            from importlib.resources import files

            wav = files("f5_tts").joinpath("infer/examples/basic/basic_ref_en.wav")
            packaged = Path(str(wav))
        except Exception:
            # Deliberately broad: importing the package can fail in many ways
            # (not installed, broken native deps); any failure means "use the
            # samples/ fallback".
            packaged = None
        # Only trust the packaged clip if the file is really there.
        if packaged is not None and packaged.is_file():
            return packaged, "some call me nature, others call me mother nature."

        # sorted() makes the pick deterministic; glob order is not guaranteed.
        samples = sorted((ROOT / "samples").glob("*.wav"))
        if not samples:
            raise SystemExit(
                "reference 없음: samples/*.wav 녹음하거나 f5-tts venv 설치 후 --ref-audio auto"
            )
        ref_path = samples[0]
    else:
        ref_path = Path(ref_arg)
        if not ref_path.is_file():
            raise SystemExit(f"ref audio not found: {ref_path}")

    # Transcript lookup: sidecar .txt next to the WAV first, then the shared
    # samples/my_voice_ref.txt. The first existing file wins.
    ref_text = ""
    txt_candidates = [
        ref_path.with_suffix(".txt"),
        ROOT / "samples" / "my_voice_ref.txt",
    ]
    for t in txt_candidates:
        if t.is_file():
            ref_text = t.read_text(encoding="utf-8").strip()
            break
    if not ref_text and ref_arg != "auto":
        # Placeholder used only for explicitly supplied audio without a
        # transcript; "auto" keeps the empty string.
        ref_text = "참조 음성의 대본을 여기에 입력하세요."
    return ref_path, ref_text
|
||||
|
||||
|
||||
def run_worker(python: Path, worker: Path, cmd: list[str]) -> bool:
    """Run `worker` with the given venv interpreter and report success.

    Returns False, after printing a SKIP note to stderr, when the interpreter
    file does not exist. Otherwise returns True exactly when the subprocess
    exits with status 0. The subprocess runs with the project root as its
    working directory.
    """
    if python.is_file():
        argv = [str(python), str(worker)] + list(cmd)
        completed = subprocess.run(argv, cwd=str(ROOT))
        return completed.returncode == 0

    venv_name = python.parent.parent.name
    print(f"SKIP: venv missing ({venv_name})", file=sys.stderr)
    return False
|
||||
|
||||
|
||||
def main() -> int:
    """Synthesize every test case with the selected model(s) and write a manifest.

    Returns 0 when every worker run succeeded, otherwise 1. A manifest.json
    describing the reference, the cases and the output directory is written
    into the output directory either way.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--ref-audio", default="auto", help="WAV path or 'auto'")
    cli.add_argument("--models", default="both", choices=("both", "f5_tts", "cosyvoice"))
    cli.add_argument("--out-dir", default=str(ROOT / "outputs" / "ab_compare"))
    opts = cli.parse_args()

    ref_path, ref_text = resolve_ref_audio(opts.ref_audio)
    out_base = Path(opts.out_dir)
    out_base.mkdir(parents=True, exist_ok=True)

    cases = load_sentences()
    print(f"Reference: {ref_path}")
    print(f"Cases: {len(cases)}")
    print(f"Output: {out_base}\n")

    # `choices` limits --models to three values, so "not the other model"
    # is the same as "both or this model".
    use_f5 = opts.models != "cosyvoice"
    use_cosy = opts.models != "f5_tts"
    ref_wav = str(ref_path)

    outcomes: list[bool] = []
    for case in cases:
        cid, text = case["id"], case["text"]
        print(f"=== {cid}: {case['label']} ===")

        if use_f5:
            f5_args = [
                "--ref-audio", ref_wav,
                "--ref-text", ref_text,
                "--gen-text", text,
                "--out", str(out_base / "f5_tts" / f"{cid}.wav"),
            ]
            outcomes.append(run_worker(F5_PY, F5_WORKER, f5_args))

        if use_cosy:
            cosy_args = [
                "--ref-audio", ref_wav,
                "--gen-text", text,
                "--prompt-text", ref_text,
                "--out", str(out_base / "cosyvoice" / f"{cid}.wav"),
            ]
            outcomes.append(run_worker(COSY_PY, COSY_WORKER, cosy_args))

    ok = sum(1 for passed in outcomes if passed)
    fail = len(outcomes) - ok

    manifest_path = out_base / "manifest.json"
    manifest = {
        "ref_audio": str(ref_path),
        "ref_text": ref_text,
        "cases": cases,
        "output_dir": str(out_base),
    }
    manifest_path.write_text(
        json.dumps(manifest, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    print(f"\n완료: success={ok} fail={fail}")
    print(f"manifest: {out_base / 'manifest.json'}")
    return 1 if fail else 0


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
19
scripts/run_server.sh
Executable file
19
scripts/run_server.sh
Executable file
@@ -0,0 +1,19 @@
|
||||
#!/usr/bin/env bash
# Start the FastAPI backend with uvicorn from the API venv (auto-reload on).
#
# Optional settings come from $ROOT/.env:
#   TTS_HOST  bind address (default 0.0.0.0)
#   TTS_PORT  port         (default 8000)
set -euo pipefail

ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$ROOT"

if [[ ! -x "$ROOT/.venvs/api/bin/uvicorn" ]]; then
  echo "API venv 없음. ./scripts/setup_api.sh 실행"
  exit 1
fi

export PYTHONPATH="$ROOT"

if [[ -f "$ROOT/.env" ]]; then
  # allexport: plain KEY=value lines in .env (for example the TTS_MODEL line
  # written by select_model.sh) become environment variables of the server
  # process, not just shell variables that vanish at `exec`.
  set -a
  # shellcheck disable=SC1091
  source "$ROOT/.env"
  set +a
fi

exec "$ROOT/.venvs/api/bin/uvicorn" backend.app.main:app \
  --host "${TTS_HOST:-0.0.0.0}" \
  --port "${TTS_PORT:-8000}" \
  --reload
|
||||
34
scripts/select_model.sh
Executable file
34
scripts/select_model.sh
Executable file
@@ -0,0 +1,34 @@
|
||||
#!/usr/bin/env bash
# Record the final model choice in both .env (TTS_MODEL=...) and
# config/model_choice.json ("selected_model").
#
# Usage: select_model.sh cosyvoice|f5_tts
set -euo pipefail

MODEL="${1:-}"
if [[ ! "$MODEL" =~ ^(cosyvoice|f5_tts)$ ]]; then
  echo "Usage: $0 cosyvoice|f5_tts" >&2
  exit 1
fi

ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
ENV_FILE="$ROOT/.env"
CHOICE_FILE="$ROOT/config/model_choice.json"

# Check the JSON target first so a missing file cannot leave .env updated
# while model_choice.json is not.
if [[ ! -f "$CHOICE_FILE" ]]; then
  echo "설정 파일 없음: $CHOICE_FILE" >&2
  exit 1
fi

# Start from the example file when .env does not exist yet.
if [[ ! -f "$ENV_FILE" ]]; then
  cp "$ROOT/.env.example" "$ENV_FILE"
fi

# Replace-or-append runs for a freshly copied .env too, so an example file
# that already defines TTS_MODEL does not end up with two entries.
if grep -q '^TTS_MODEL=' "$ENV_FILE"; then
  # MODEL was validated above, so it is safe inside the sed expression.
  sed -i.bak "s/^TTS_MODEL=.*/TTS_MODEL=$MODEL/" "$ENV_FILE"
  rm -f "$ENV_FILE.bak"
else
  # Command substitution strips a trailing newline, so a non-empty result
  # means the file does not end with one; add it before appending.
  if [[ -s "$ENV_FILE" && -n "$(tail -c 1 "$ENV_FILE")" ]]; then
    echo >> "$ENV_FILE"
  fi
  echo "TTS_MODEL=$MODEL" >> "$ENV_FILE"
fi

# Values are passed as argv and the heredoc is quoted, so the project path is
# never spliced into the Python source.
python3 - "$CHOICE_FILE" "$MODEL" <<'PY'
import json
import sys
from pathlib import Path

p = Path(sys.argv[1])
model = sys.argv[2]
data = json.loads(p.read_text(encoding="utf-8"))
data["selected_model"] = model
p.write_text(json.dumps(data, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
print(f"selected_model={model}")
PY
|
||||
13
scripts/setup_api.sh
Executable file
13
scripts/setup_api.sh
Executable file
@@ -0,0 +1,13 @@
|
||||
#!/usr/bin/env bash
# Create the lightweight virtualenv used by the FastAPI server and install
# backend/requirements.txt into it.
set -euo pipefail

script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(dirname "$script_dir")"
VENV="$ROOT/.venvs/api"
pip_install=("$VENV/bin/pip" install)

python3 -m venv "$VENV"
"${pip_install[@]}" -U pip wheel
"${pip_install[@]}" -r "$ROOT/backend/requirements.txt"

printf '%s\n' \
  "API venv 준비 완료: $VENV" \
  "실행: $VENV/bin/uvicorn backend.app.main:app --host 0.0.0.0 --port 8000"
|
||||
38
scripts/setup_cosyvoice.sh
Executable file
38
scripts/setup_cosyvoice.sh
Executable file
@@ -0,0 +1,38 @@
|
||||
#!/usr/bin/env bash
# CosyVoice3 setup: clone the upstream repo (with submodules), create a
# dedicated venv, install dependencies and download the model snapshot from
# Hugging Face into models/Fun-CosyVoice3-0.5B.
set -euo pipefail

ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
VENV="$ROOT/.venvs/cosyvoice"
REPO="$ROOT/external/CosyVoice"
MODEL_DIR="$ROOT/models/Fun-CosyVoice3-0.5B"

mkdir -p "$ROOT/external" "$ROOT/models"

if [[ ! -d "$REPO/.git" ]]; then
  echo "CosyVoice 레포 클론..."
  git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git "$REPO"
else
  echo "CosyVoice 레포 이미 존재: $REPO"
fi
# Run for existing checkouts as well: a previously interrupted clone can leave
# submodules (the worker adds third_party/Matcha-TTS to sys.path) missing.
# `git -C` avoids changing this script's working directory.
git -C "$REPO" submodule update --init --recursive

python3 -m venv "$VENV"
"$VENV/bin/pip" install -U pip wheel
"$VENV/bin/pip" install torch torchaudio --index-url https://download.pytorch.org/whl/cu124
"$VENV/bin/pip" install -r "$REPO/requirements.txt"
"$VENV/bin/pip" install huggingface_hub modelscope

echo "CosyVoice3 모델 다운로드 (Hugging Face)..."
# The target directory is passed as argv and the heredoc is quoted, so the
# path is never spliced into the Python source.
"$VENV/bin/python" - "$MODEL_DIR" <<'PY'
import sys

from huggingface_hub import snapshot_download

model_dir = sys.argv[1]
snapshot_download(
    'FunAudioLLM/Fun-CosyVoice3-0.5B-2512',
    local_dir=model_dir,
)
print(f'Model saved to {model_dir}')
PY

echo "CosyVoice venv 준비 완료: $VENV"
echo "테스트: $VENV/bin/python $ROOT/scripts/workers/cosy_infer.py --help"
|
||||
16
scripts/setup_f5tts.sh
Executable file
16
scripts/setup_f5tts.sh
Executable file
@@ -0,0 +1,16 @@
|
||||
#!/usr/bin/env bash
# Create the dedicated F5-TTS virtualenv (NVIDIA CUDA build of PyTorch).
set -euo pipefail

script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(dirname "$script_dir")"
VENV="$ROOT/.venvs/f5tts"
pip_install=("$VENV/bin/pip" install)

python3 -m venv "$VENV"
"${pip_install[@]}" -U pip wheel

# CUDA 12.x PyTorch wheels; adjust cu124/cu128 to match the server's CUDA version.
"${pip_install[@]}" torch torchaudio --index-url https://download.pytorch.org/whl/cu124
"${pip_install[@]}" f5-tts

printf '%s\n' \
  "F5-TTS venv 준비 완료: $VENV" \
  "테스트: $VENV/bin/python $ROOT/scripts/workers/f5_infer.py --help"
|
||||
73
scripts/workers/cosy_infer.py
Normal file
73
scripts/workers/cosy_infer.py
Normal file
@@ -0,0 +1,73 @@
|
||||
#!/usr/bin/env python3
|
||||
"""CosyVoice3 zero-shot 추론 워커 (cosyvoice venv에서 실행)."""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def main() -> int:
    """Run one CosyVoice3 zero-shot synthesis described by the command line.

    The prompt passed to the model is --prompt-prefix followed by
    --prompt-text. The first generated segment is written to --out; any
    further segments go to sibling files named <stem>_<index><suffix>.

    Returns:
        0 when at least one audio file was written. 1 when the CosyVoice
        checkout, the model directory or the reference audio is missing, when
        the CosyVoice imports fail, or when inference yields no segment.
    """
    parser = argparse.ArgumentParser(description="CosyVoice3 inference worker")
    parser.add_argument("--ref-audio", required=True)
    parser.add_argument("--prompt-text", default="", help="Text spoken in ref audio (with prefix)")
    parser.add_argument("--gen-text", required=True)
    parser.add_argument("--out", required=True)
    parser.add_argument(
        "--model-dir",
        default=None,
        help="Path to Fun-CosyVoice3-0.5B (default: PROJECT/models/Fun-CosyVoice3-0.5B)",
    )
    parser.add_argument(
        "--prompt-prefix",
        default="You are a helpful assistant.<|endofprompt|>",
    )
    args = parser.parse_args()

    # scripts/workers/<this file> -> project root is two levels up.
    root = Path(__file__).resolve().parents[2]
    repo = root / "external" / "CosyVoice"
    model_dir = Path(args.model_dir or root / "models" / "Fun-CosyVoice3-0.5B")
    ref = Path(args.ref_audio)
    out = Path(args.out)

    if not repo.is_dir():
        print(f"CosyVoice repo missing: {repo}. Run ./scripts/setup_cosyvoice.sh", file=sys.stderr)
        return 1
    if not model_dir.is_dir():
        print(f"Model dir missing: {model_dir}", file=sys.stderr)
        return 1
    if not ref.is_file():
        print(f"ref audio not found: {ref}", file=sys.stderr)
        return 1

    # CosyVoice is used from its source checkout, so the checkout and its
    # Matcha-TTS submodule are added to the import path.
    sys.path.insert(0, str(repo))
    sys.path.append(str(repo / "third_party" / "Matcha-TTS"))

    try:
        import torchaudio
        from cosyvoice.cli.cosyvoice import AutoModel
    except ImportError as e:
        print("CosyVoice import failed. Run ./scripts/setup_cosyvoice.sh", file=sys.stderr)
        print(e, file=sys.stderr)
        return 1

    prompt = args.prompt_prefix + (args.prompt_text or "")
    out.parent.mkdir(parents=True, exist_ok=True)

    cosyvoice = AutoModel(model_dir=str(model_dir))
    segments = cosyvoice.inference_zero_shot(
        args.gen_text,
        prompt,
        str(ref),
        stream=False,
    )
    written = 0
    for i, result in enumerate(segments):
        path = out if i == 0 else out.with_stem(f"{out.stem}_{i}")
        torchaudio.save(str(path), result["tts_speech"], cosyvoice.sample_rate)
        print(f"OK: {path}")
        written += 1

    if written == 0:
        # Without this check the caller would see success although no output
        # file exists.
        print("CosyVoice produced no audio segments", file=sys.stderr)
        return 1
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
47
scripts/workers/f5_infer.py
Normal file
47
scripts/workers/f5_infer.py
Normal file
@@ -0,0 +1,47 @@
|
||||
#!/usr/bin/env python3
|
||||
"""F5-TTS 추론 워커 (f5tts venv에서 실행)."""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def main() -> int:
    """Synthesize --gen-text with F5-TTS, cloning the voice in --ref-audio.

    Returns 0 after the output WAV path has been reported, or 1 when the
    reference audio is missing or the f5_tts package cannot be imported.
    """
    cli = argparse.ArgumentParser(description="F5-TTS inference worker")
    cli.add_argument("--ref-audio", required=True, help="Reference WAV path")
    cli.add_argument("--ref-text", required=True, help="Transcript of reference audio")
    cli.add_argument("--gen-text", required=True, help="Text to synthesize")
    cli.add_argument("--out", required=True, help="Output WAV path")
    cli.add_argument("--model", default="F5TTS_v1_Base")
    opts = cli.parse_args()

    ref_wav = Path(opts.ref_audio)
    if not ref_wav.is_file():
        print(f"ref audio not found: {ref_wav}", file=sys.stderr)
        return 1

    target = Path(opts.out)
    target.parent.mkdir(parents=True, exist_ok=True)

    # Imported late so the argument and file checks above work without the
    # f5_tts package being importable.
    try:
        from f5_tts.api import F5TTS
    except ImportError as err:
        print("f5-tts not installed. Run: ./scripts/setup_f5tts.sh", file=sys.stderr)
        print(err, file=sys.stderr)
        return 1

    engine = F5TTS(model=opts.model)
    infer_kwargs = {
        "ref_file": str(ref_wav),
        "ref_text": opts.ref_text,
        "gen_text": opts.gen_text,
        "file_wave": str(target),
        "remove_silence": True,
    }
    engine.infer(**infer_kwargs)
    print(f"OK: {target}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user