FastAPI backend, web UI, CosyVoice3/F5-TTS setup scripts, and handoff docs for GPU PC continuation. Co-authored-by: Cursor <cursoragent@cursor.com>
74 lines
2.3 KiB
Python
74 lines
2.3 KiB
Python
#!/usr/bin/env python3
|
|
"""CosyVoice3 zero-shot 추론 워커 (cosyvoice venv에서 실행)."""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
|
|
def _build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the CosyVoice3 inference worker."""
    parser = argparse.ArgumentParser(description="CosyVoice3 inference worker")
    parser.add_argument("--ref-audio", required=True)
    parser.add_argument("--prompt-text", default="", help="Text spoken in ref audio (with prefix)")
    parser.add_argument("--gen-text", required=True)
    parser.add_argument("--out", required=True)
    parser.add_argument(
        "--model-dir",
        default=None,
        help="Path to Fun-CosyVoice3-0.5B (default: PROJECT/models/Fun-CosyVoice3-0.5B)",
    )
    parser.add_argument(
        "--prompt-prefix",
        default="You are a helpful assistant.<|endofprompt|>",
    )
    return parser


def main() -> int:
    """Run one CosyVoice3 zero-shot synthesis driven by command-line arguments.

    Parses ``sys.argv``, verifies that the CosyVoice checkout, the model
    directory and the reference audio exist, then synthesises ``--gen-text``
    and saves every result yielded by the model with ``torchaudio.save``.
    The first result is written to ``--out``; each later result gets
    ``_<index>`` appended to the output file stem.

    Returns:
        0 when at least one audio file was written; 1 when a required path is
        missing, the CosyVoice imports fail, or the model yields no audio.
    """
    args = _build_parser().parse_args()

    # The project root is taken to be two directories above the one holding
    # this script.
    root = Path(__file__).resolve().parents[2]
    repo = root / "external" / "CosyVoice"
    model_dir = Path(args.model_dir or root / "models" / "Fun-CosyVoice3-0.5B")
    ref = Path(args.ref_audio)
    out = Path(args.out)

    # Fail fast with a readable message before any heavy import or model load.
    if not repo.is_dir():
        print(f"CosyVoice repo missing: {repo}. Run ./scripts/setup_cosyvoice.sh", file=sys.stderr)
        return 1
    if not model_dir.is_dir():
        print(f"Model dir missing: {model_dir}", file=sys.stderr)
        return 1
    if not ref.is_file():
        print(f"ref audio not found: {ref}", file=sys.stderr)
        return 1

    # The checkout goes first on sys.path so the imports below resolve from it;
    # the bundled Matcha-TTS directory is appended as a fallback location.
    sys.path.insert(0, str(repo))
    sys.path.append(str(repo / "third_party" / "Matcha-TTS"))

    try:
        import torchaudio
        from cosyvoice.cli.cosyvoice import AutoModel
    except ImportError as e:
        print("CosyVoice import failed. Run ./scripts/setup_cosyvoice.sh", file=sys.stderr)
        print(e, file=sys.stderr)
        return 1

    prompt = args.prompt_prefix + (args.prompt_text or "")
    out.parent.mkdir(parents=True, exist_ok=True)

    cosyvoice = AutoModel(model_dir=str(model_dir))
    results = cosyvoice.inference_zero_shot(
        args.gen_text,
        prompt,
        str(ref),
        stream=False,
    )

    written = 0
    for i, result in enumerate(results):
        # Path.with_stem needs Python 3.9+.
        path = out if i == 0 else out.with_stem(f"{out.stem}_{i}")
        torchaudio.save(str(path), result["tts_speech"], cosyvoice.sample_rate)
        print(f"OK: {path}")
        written += 1

    if written == 0:
        # Without this check the worker would exit 0 although no file exists
        # at --out, which a caller checking only the exit code cannot detect.
        print("CosyVoice produced no audio output", file=sys.stderr)
        return 1
    return 0
|
|
|
|
|
|
# Script entry point: hand main()'s return value to the interpreter as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())
|