Initial commit: Korean voice-cloning TTS prototype

FastAPI backend, web UI, CosyVoice3/F5-TTS setup scripts, and handoff docs for GPU PC continuation.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
2026-06-04 13:36:37 +09:00
commit 7101fdcd65
36 changed files with 1937 additions and 0 deletions

1
backend/app/__init__.py Normal file
View File

@@ -0,0 +1 @@
"""Korean voice-cloning TTS API."""

65
backend/app/config.py Normal file
View File

@@ -0,0 +1,65 @@
from __future__ import annotations
from functools import lru_cache
from pathlib import Path
import yaml
from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict
# Directory three levels above this file: parents[0] is the folder holding
# this module, parents[1] its parent, parents[2] the one above that.
# For a module stored at backend/app/config.py this is the repository root.
ROOT = Path(__file__).resolve().parents[2]
# Application settings loaded from constructor arguments, environment
# variables and the project-level ".env" file. Unknown keys are ignored.
# (Documented with comments rather than a docstring so the generated
# pydantic schema description stays unchanged.)
class AppSettings(BaseSettings):
    model_config = SettingsConfigDict(
        extra="ignore",
        env_file=str(ROOT / ".env"),
        env_file_encoding="utf-8",
    )

    # Engine selection and network binding; each has an explicit alias.
    tts_model: str = Field("cosyvoice", validation_alias="TTS_MODEL")
    host: str = Field("0.0.0.0", validation_alias="TTS_HOST")
    port: int = Field(8000, validation_alias="TTS_PORT")

    # Working directories, all anchored at ROOT by default.
    samples_dir: Path = ROOT / "samples"
    outputs_dir: Path = ROOT / "outputs" / "api"
    uploads_dir: Path = ROOT / "backend" / "data" / "uploads"

    # Optional default reference voice and its transcript.
    default_ref_audio: str | None = Field(None, validation_alias="TTS_REF_AUDIO")
    default_ref_text: str | None = Field(None, validation_alias="TTS_REF_TEXT")

    # CosyVoice-specific options.
    cosyvoice_model_dir: Path = ROOT / "models" / "Fun-CosyVoice3-0.5B"
    cosyvoice_prompt_prefix: str = "You are a helpful assistant.<|endofprompt|>"

    # Upper bound on characters per synthesis chunk.
    chunk_max_chars: int = 120
def _init_key(field_name: str) -> str:
    """Return the keyword under which AppSettings accepts *field_name*.

    Fields declared with a string ``validation_alias`` are populated by that
    alias, so a value passed under the plain field name would be treated as
    an unknown extra and dropped (the model uses ``extra="ignore"``).
    """
    alias = AppSettings.model_fields[field_name].validation_alias
    return alias if isinstance(alias, str) else field_name


def _yaml_section(raw: dict, name: str) -> dict:
    """Return the mapping stored under *name*, or an empty dict if absent/invalid."""
    section = raw.get(name)
    return section if isinstance(section, dict) else {}


@lru_cache
def get_settings() -> AppSettings:
    """Build the cached application settings.

    Values found in ``config/settings.yaml`` (relative to ROOT) are passed to
    AppSettings as constructor arguments. Keys that the YAML file does not
    define are left out entirely, so AppSettings can still fill them from the
    environment, the ``.env`` file, or its field defaults.

    Returns:
        The AppSettings instance; repeated calls return the same object.

    Raises:
        ValueError: if the YAML document is not a mapping at the top level.
    """
    overrides: dict[str, object] = {}
    yaml_path = ROOT / "config" / "settings.yaml"

    if yaml_path.is_file():
        with open(yaml_path, encoding="utf-8") as f:
            raw = yaml.safe_load(f) or {}
        if not isinstance(raw, dict):
            raise ValueError(f"{yaml_path}: top-level YAML value must be a mapping")

        # Absent keys yield None here and are filtered out below.
        overrides["tts_model"] = raw.get("default_model")
        overrides["chunk_max_chars"] = _yaml_section(raw, "generation").get(
            "chunk_max_chars"
        )

        cosy = _yaml_section(raw, "cosyvoice")
        if cosy.get("model_dir"):
            overrides["cosyvoice_model_dir"] = ROOT / cosy["model_dir"]
        if cosy.get("prompt_prefix"):
            overrides["cosyvoice_prompt_prefix"] = cosy["prompt_prefix"]

        server = _yaml_section(raw, "server")
        overrides["host"] = server.get("host")
        overrides["port"] = server.get("port")

        paths = _yaml_section(raw, "paths")
        if paths.get("samples_dir"):
            overrides["samples_dir"] = ROOT / paths["samples_dir"]
        if paths.get("outputs_dir"):
            # API jobs are written to an "api" sub-folder of the outputs dir.
            overrides["outputs_dir"] = ROOT / paths["outputs_dir"] / "api"
        if paths.get("uploads_dir"):
            overrides["uploads_dir"] = ROOT / paths["uploads_dir"]

    # Translate field names to their aliases so aliased fields are not dropped.
    return AppSettings(
        **{_init_key(k): v for k, v in overrides.items() if v is not None}
    )
def project_root() -> Path:
    """Return the module-level ROOT path."""
    root: Path = ROOT
    return root

170
backend/app/main.py Normal file
View File

@@ -0,0 +1,170 @@
from __future__ import annotations
import shutil
import uuid
from pathlib import Path
from fastapi import FastAPI, File, Form, HTTPException, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse
from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel, Field
from backend.app.config import get_settings, project_root
from backend.app.text_preprocess import preprocess_korean
from backend.app.tts.service import TTSService
# Project root and the folder holding the static web UI.
ROOT = project_root()
WEB_DIR = ROOT.joinpath("web")

# The FastAPI application object on which all routes below are registered.
app = FastAPI(
    version="0.1.0",
    title="Korean Voice Cloning TTS",
    description="CosyVoice / F5-TTS 기반 한국어 보이스 클로닝 API",
)

# CORS: every origin, method and header is allowed.
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*"],
    allow_methods=["*"],
    allow_origins=["*"],
)
# Lazily created service instance shared by all requests.
_tts: TTSService | None = None


def get_tts() -> TTSService:
    """Return the shared TTSService, constructing it on the first call."""
    global _tts
    service = _tts
    if service is None:
        service = _tts = TTSService()
    return service
# Request body for POST /api/tts.
class TTSRequest(BaseModel):
    # Text to synthesize; 1 to 5000 characters.
    text: str = Field(..., min_length=1, max_length=5000)
    # Optional reference audio path (see the field description).
    ref_audio: str | None = Field(
        None,
        description="samples/ 또는 uploads/ 기준 상대/절대 경로",
    )
    # Optional transcript of the reference audio.
    ref_text: str | None = None
    # When true, the text is normalized before synthesis.
    preprocess: bool = True
# Response body for POST /api/tts; all four fields are required.
class TTSResponse(BaseModel):
    job_id: str = Field(...)
    audio_url: str = Field(...)
    model: str = Field(...)
    text_preview: str = Field(...)
# Response body for GET /api/health; all three fields are required.
class HealthResponse(BaseModel):
    status: str = Field(...)
    model: str = Field(...)
    samples_count: int = Field(...)
# Health probe: reports the configured model and how many *.wav files sit
# directly inside the samples directory.
@app.get("/api/health", response_model=HealthResponse)
def health() -> HealthResponse:
    settings = get_settings()
    wav_total = sum(1 for _ in settings.samples_dir.glob("*.wav"))
    return HealthResponse(
        status="ok",
        model=settings.tts_model,
        samples_count=wav_total,
    )
# Synthesize speech for the request text and return where to fetch the audio.
#
# Responses:
#   400 - the text is empty after normalization/stripping
#   404 - the reference audio (explicit or default) cannot be found
#   503 - the TTS engine raised RuntimeError
#
# NOTE(review): ref_audio is not confined to the samples/uploads directories;
# an absolute path anywhere on disk is accepted. Confirm this is acceptable
# before exposing the API beyond a trusted network.
@app.post("/api/tts", response_model=TTSResponse)
def create_tts(body: TTSRequest) -> TTSResponse:
    settings = get_settings()

    text = preprocess_korean(body.text) if body.preprocess else body.text.strip()
    if not text:
        raise HTTPException(400, "text is empty")

    ref_path: Path | None = None
    if body.ref_audio:
        candidate = Path(body.ref_audio)
        if not candidate.is_absolute():
            # Relative paths are looked up in samples/ first, then uploads/.
            for base in (settings.samples_dir, settings.uploads_dir):
                if (base / candidate).is_file():
                    candidate = base / candidate
                    break
        if not candidate.is_file():
            raise HTTPException(404, f"ref_audio not found: {body.ref_audio}")
        ref_path = candidate

    try:
        job_id, _ = get_tts().synthesize_to_file(
            text, ref_audio=ref_path, ref_text=body.ref_text
        )
    except FileNotFoundError as e:
        raise HTTPException(404, str(e)) from e
    except RuntimeError as e:
        raise HTTPException(503, str(e)) from e

    # Mark truncated previews with an ellipsis; previously both branches of
    # the conditional produced "" so truncation was invisible to the client.
    preview = text[:80] + ("…" if len(text) > 80 else "")
    return TTSResponse(
        job_id=job_id,
        audio_url=f"/api/audio/{job_id}",
        model=settings.tts_model,
        text_preview=preview,
    )
# Return the WAV produced for a job. Prefers <job>/output.wav and falls back
# to <job>/part_000.wav; responds 404 when neither exists or when job_id is
# not a plain directory name.
@app.get("/api/audio/{job_id}")
def get_audio(job_id: str) -> FileResponse:
    # job_id comes straight from the URL: refuse anything that could step
    # outside the outputs directory once joined into a filesystem path.
    if job_id in {".", ".."} or "/" in job_id or "\\" in job_id:
        raise HTTPException(404, "audio not found")

    job_dir = get_settings().outputs_dir / job_id
    for candidate in (job_dir / "output.wav", job_dir / "part_000.wav"):
        if candidate.is_file():
            return FileResponse(
                candidate, media_type="audio/wav", filename=f"{job_id}.wav"
            )
    raise HTTPException(404, "audio not found")
# List every *.wav found directly in the samples and uploads directories,
# flagging whether a same-named .txt transcript sits next to it.
@app.get("/api/voice-samples")
def list_voice_samples() -> dict:
    settings = get_settings()
    sources = {"samples": settings.samples_dir, "uploads": settings.uploads_dir}
    entries = [
        {
            "id": wav_path.stem,
            "path": str(wav_path),
            "label": origin,
            "has_transcript": wav_path.with_suffix(".txt").is_file(),
        }
        for origin, folder in sources.items()
        for wav_path in sorted(folder.glob("*.wav"))
    ]
    return {"samples": entries, "default_model": settings.tts_model}
# Store an uploaded reference WAV (and optional transcript) under a fresh
# random id in the uploads directory. Responds 400 for non-.wav filenames.
@app.post("/api/voice-sample")
async def upload_voice_sample(
    file: UploadFile = File(...),
    ref_text: str = Form(""),
) -> dict:
    from fastapi.concurrency import run_in_threadpool

    if not file.filename or not file.filename.lower().endswith(".wav"):
        raise HTTPException(400, "WAV 파일만 업로드 가능합니다")

    uploads_dir = get_settings().uploads_dir
    # Do not rely on the startup hook having created the directory.
    uploads_dir.mkdir(parents=True, exist_ok=True)

    sample_id = uuid.uuid4().hex[:10]
    dest = uploads_dir / f"{sample_id}.wav"
    transcript = ref_text.strip()

    def _save() -> None:
        with open(dest, "wb") as f:
            shutil.copyfileobj(file.file, f)
        if transcript:
            dest.with_suffix(".txt").write_text(transcript, encoding="utf-8")

    # The copy is blocking disk I/O; run it off the event loop so other
    # requests keep being served while a large file is written.
    await run_in_threadpool(_save)

    return {
        "id": sample_id,
        "path": str(dest),
        "message": "업로드 완료. TTS 요청 시 ref_audio에 이 path를 사용하세요.",
    }
# Serve the static web UI at "/" when the web directory exists
# (html=True is passed to StaticFiles).
# NOTE(review): presumably this must stay below the /api route definitions so
# the catch-all mount does not shadow them - confirm before reordering.
if WEB_DIR.is_dir():
    app.mount("/", StaticFiles(directory=str(WEB_DIR), html=True), name="web")
@app.on_event("startup")
def startup() -> None:
    """Create the outputs and uploads directories if they are missing."""
    settings = get_settings()
    for folder in (settings.outputs_dir, settings.uploads_dir):
        folder.mkdir(parents=True, exist_ok=True)

View File

@@ -0,0 +1,95 @@
"""Simple text normalization for Korean TTS."""
from __future__ import annotations
import re
# One or more whitespace characters (collapsed to a single space by callers).
_RE_MULTI_SPACE = re.compile(r"\s+")
# A loose e-mail shape: local part, "@", domain, a dot and a word suffix.
_RE_EMAIL = re.compile(r"[\w.+-]+@[\w.-]+\.\w+")
# An http:// or https:// URL running up to the next whitespace.
_RE_URL = re.compile(r"https?://\S+")
def _digits_to_korean(num_str: str) -> str:
    """Convert a non-negative integer string to its Sino-Korean reading.

    Thousands separators (",") are ignored. The value is split into groups
    of four digits, each read with the place words 천/백/십 and followed by
    the group unit 만/억/조.

    Args:
        num_str: Decimal digits, optionally containing commas.

    Returns:
        The Korean reading, e.g. "2024" -> "이천이십사". The input is
        returned unchanged when it is not a plain decimal number or when it
        is too large for the supported units (10**16 and above).
    """
    digits = num_str.replace(",", "")
    # isdecimal() only accepts characters that int() can parse.
    if not digits.isdecimal():
        return num_str
    n = int(digits)
    if n == 0:
        return "영"

    units = ["", "만", "억", "조"]
    small = ["", "일", "이", "삼", "사", "오", "육", "칠", "팔", "구"]
    ten = ["", "십", "백", "천"]

    if n >= 10000 ** len(units):
        # No unit word is available for the leading groups; leaving the
        # digits untouched is better than silently dropping them.
        return num_str

    def chunk_to_korean(x: int) -> str:
        """Read a value in 1..9999 using the place words 천, 백 and 십."""
        parts: list[str] = []
        for i, d in enumerate(f"{x:04d}"):
            di = int(d)
            if di == 0:
                continue
            place = ten[3 - i]
            if di == 1 and place:
                # 10/100/1000 are read 십/백/천 without a leading "일".
                parts.append(place)
            else:
                # The ones position has no place word, so a 1 there must
                # still be spoken as "일".
                parts.append(small[di] + place)
        return "".join(parts)

    groups: list[str] = []
    u = 0
    while n:
        n, part = divmod(n, 10000)
        if part:
            reading = chunk_to_korean(part)
            if part == 1 and units[u] == "만":
                # 10,000 is conventionally read "만", not "일만".
                reading = ""
            groups.append(reading + units[u])
        u += 1
    return "".join(reversed(groups))
# A number written with thousands separators ("1,234,567") or a plain run of
# digits. Commas that are not followed by exactly three digits are left in
# the text, so list punctuation such as "1, 2, 3" or "1,2,3" survives.
_RE_NUMBER = re.compile(r"\d{1,3}(?:,\d{3})+(?!\d)|\d+")


def _replace_numbers(text: str) -> str:
    """Replace every integer in *text* with its Korean reading.

    Args:
        text: Arbitrary text that may contain digit runs.

    Returns:
        The text with each matched number passed through
        ``_digits_to_korean`` (thousands separators removed first).
    """

    def repl(m: re.Match[str]) -> str:
        return _digits_to_korean(m.group(0).replace(",", ""))

    return _RE_NUMBER.sub(repl, text)
def preprocess_korean(text: str) -> str:
    """Normalize *text* for synthesis.

    URLs and e-mail addresses are replaced by spoken placeholders, "&" and
    "%" by their spoken words, numbers by their reading, and finally every
    whitespace run is collapsed to one space and the result is stripped.
    """
    normalized = _RE_URL.sub(" 링크 ", text.strip())
    normalized = _RE_EMAIL.sub(" 이메일 ", normalized)
    for symbol, spoken in (("&", " 앤드 "), ("%", " 퍼센트 ")):
        normalized = normalized.replace(symbol, spoken)
    normalized = _replace_numbers(normalized)
    return _RE_MULTI_SPACE.sub(" ", normalized).strip()
def split_sentences(text: str, max_chars: int = 120) -> list[str]:
    """Split long text into sentence-based chunks.

    The text is normalized with ``preprocess_korean`` and split after
    sentence-ending punctuation (or at newlines). Consecutive sentences are
    packed into one chunk while they fit in *max_chars*; a single sentence
    longer than *max_chars* is cut into fixed-size slices.

    Returns:
        The list of chunks, or ``[text]`` (the original argument) when no
        chunk was produced.
    """
    normalized = preprocess_korean(text)
    sentences = (
        piece.strip() for piece in re.split(r"(?<=[.!?…])\s+|\n+", normalized)
    )

    chunks: list[str] = []
    pending = ""
    for sentence in filter(None, sentences):
        # The +1 accounts for the joining space.
        if len(pending) + len(sentence) + 1 <= max_chars:
            pending = f"{pending} {sentence}" if pending else sentence
            continue

        # The sentence does not fit: flush what has been collected so far.
        if pending:
            chunks.append(pending)
        pending = ""

        if len(sentence) <= max_chars:
            pending = sentence
        else:
            chunks.extend(
                sentence[start : start + max_chars]
                for start in range(0, len(sentence), max_chars)
            )

    if pending:
        chunks.append(pending)
    return chunks if chunks else [text]

View File

@@ -0,0 +1,3 @@
from backend.app.tts.service import TTSService
# Names exported by a star-import of this module.
__all__ = ["TTSService"]

18
backend/app/tts/base.py Normal file
View File

@@ -0,0 +1,18 @@
from __future__ import annotations
from abc import ABC, abstractmethod
from pathlib import Path
class TTSEngine(ABC):
    """Abstract interface implemented by every TTS backend."""

    # Engine identifier; declared here, assigned by concrete subclasses.
    name: str

    @abstractmethod
    def synthesize(
        self, text: str, ref_audio: Path, ref_text: str, out_path: Path
    ) -> Path:
        """Generate a WAV file for a single text chunk."""

View File

@@ -0,0 +1,101 @@
from __future__ import annotations
import subprocess
import sys
from pathlib import Path
from backend.app.config import project_root
from backend.app.tts.base import TTSEngine
# Project root; the virtualenv and worker-script paths below are built from it.
ROOT = project_root()
class SubprocessEngine(TTSEngine):
    """Base class for engines that run a worker script in a dedicated venv.

    The interpreter is taken from ``ROOT/.venvs/<venv_name>`` and the worker
    from ``ROOT/scripts/workers/<worker_name>``.
    """

    def __init__(self, venv_name: str, worker_name: str) -> None:
        """Record the interpreter and worker script paths.

        Args:
            venv_name: Directory name of the virtualenv under ``.venvs``.
            worker_name: File name of the worker under ``scripts/workers``.
        """
        venv_dir = ROOT / ".venvs" / venv_name
        # venv puts the interpreter in Scripts\python.exe on Windows and in
        # bin/python everywhere else.
        if sys.platform == "win32":
            self._python = venv_dir / "Scripts" / "python.exe"
        else:
            self._python = venv_dir / "bin" / "python"
        self._worker = ROOT / "scripts" / "workers" / worker_name

    def _run(self, args: list[str]) -> None:
        """Run the worker with *args* from the project root.

        Raises:
            RuntimeError: if the venv interpreter is missing, or the worker
                exits with a non-zero status (its stderr, or stdout when
                stderr is empty, is included in the message).
        """
        if not self._python.is_file():
            venv = self._python.parent.parent.name
            raise RuntimeError(
                f"{venv} venv 없음. "
                f"scripts/setup_{venv}.sh 실행"
            )
        cmd = [str(self._python), str(self._worker), *args]
        # errors="replace": undecodable worker output must not raise
        # UnicodeDecodeError and hide the real failure.
        proc = subprocess.run(
            cmd,
            cwd=str(ROOT),
            capture_output=True,
            text=True,
            errors="replace",
        )
        if proc.returncode != 0:
            raise RuntimeError(
                f"{self.name} inference failed:\n{proc.stderr or proc.stdout}"
            )
class F5TTSEngine(SubprocessEngine):
    """Engine that runs the f5_infer.py worker inside the "f5tts" venv."""

    name = "f5_tts"

    def __init__(self) -> None:
        super().__init__(venv_name="f5tts", worker_name="f5_infer.py")

    def synthesize(
        self, text: str, ref_audio: Path, ref_text: str, out_path: Path
    ) -> Path:
        """Run the worker for one text chunk and return *out_path*.

        An empty *ref_text* is replaced by a fixed placeholder transcript.
        The parent directory of *out_path* is created if needed.
        """
        out_path.parent.mkdir(parents=True, exist_ok=True)
        transcript = ref_text if ref_text else "reference audio transcript"
        options = (
            ("--ref-audio", str(ref_audio)),
            ("--ref-text", transcript),
            ("--gen-text", text),
            ("--out", str(out_path)),
        )
        self._run([token for pair in options for token in pair])
        return out_path
class CosyVoiceEngine(SubprocessEngine):
    """Engine that runs the cosy_infer.py worker inside the "cosyvoice" venv."""

    name = "cosyvoice"

    def __init__(self, model_dir: Path, prompt_prefix: str) -> None:
        """Remember the model directory and prompt prefix passed to the worker."""
        super().__init__(venv_name="cosyvoice", worker_name="cosy_infer.py")
        self._model_dir = model_dir
        self._prompt_prefix = prompt_prefix

    def synthesize(
        self, text: str, ref_audio: Path, ref_text: str, out_path: Path
    ) -> Path:
        """Run the worker for one text chunk and return *out_path*.

        A falsy *ref_text* is passed to the worker as an empty string. The
        parent directory of *out_path* is created if needed.
        """
        out_path.parent.mkdir(parents=True, exist_ok=True)
        prompt_text = ref_text if ref_text else ""
        options = (
            ("--ref-audio", str(ref_audio)),
            ("--gen-text", text),
            ("--prompt-text", prompt_text),
            ("--out", str(out_path)),
            ("--model-dir", str(self._model_dir)),
            ("--prompt-prefix", self._prompt_prefix),
        )
        self._run([token for pair in options for token in pair])
        return out_path
def create_engine(model: str, model_dir: Path, prompt_prefix: str) -> TTSEngine:
    """Instantiate the engine selected by *model*.

    *model_dir* and *prompt_prefix* are only used by the CosyVoice engine.

    Raises:
        ValueError: if *model* is neither "cosyvoice" nor "f5_tts".
    """
    if model == "cosyvoice":
        return CosyVoiceEngine(model_dir, prompt_prefix)
    if model == "f5_tts":
        return F5TTSEngine()
    raise ValueError(f"Unknown model: {model}. Use cosyvoice or f5_tts.")

View File

@@ -0,0 +1,97 @@
from __future__ import annotations
import uuid
import wave
from pathlib import Path
from backend.app.config import AppSettings, get_settings, project_root
from backend.app.text_preprocess import split_sentences
from backend.app.tts.engines_subprocess import create_engine
# Project root as reported by the config module.
ROOT = project_root()
class TTSService:
    """Turns text into a WAV file using the configured engine."""

    def __init__(self, settings: AppSettings | None = None) -> None:
        """Create the engine and make sure the working directories exist.

        Args:
            settings: Settings to use; ``get_settings()`` is used when omitted.
        """
        self.settings = settings or get_settings()
        self.engine = create_engine(
            self.settings.tts_model,
            self.settings.cosyvoice_model_dir,
            self.settings.cosyvoice_prompt_prefix,
        )
        for folder in (self.settings.outputs_dir, self.settings.uploads_dir):
            folder.mkdir(parents=True, exist_ok=True)

    def resolve_reference(
        self,
        ref_audio: Path | None = None,
        ref_text: str | None = None,
    ) -> tuple[Path, str]:
        """Pick the reference audio and its transcript.

        Audio: *ref_audio* when it is an existing file, else the configured
        default, else the first ``*.wav`` (sorted) in the samples directory.
        Transcript: *ref_text*, else the configured default, else the first
        existing file among ``<audio>.txt`` and ``samples/my_voice_ref.txt``.
        For the "f5_tts" model an empty transcript is replaced by a fixed
        hint sentence.

        Raises:
            FileNotFoundError: if no reference audio can be determined.
        """
        cfg = self.settings

        if ref_audio is not None and ref_audio.is_file():
            audio = ref_audio
        elif cfg.default_ref_audio:
            audio = Path(cfg.default_ref_audio)
        else:
            available = sorted(cfg.samples_dir.glob("*.wav"))
            if not available:
                raise FileNotFoundError(
                    "reference WAV 없음. samples/에 녹음하거나 TTS_REF_AUDIO 설정"
                )
            audio = available[0]

        transcript = ref_text or cfg.default_ref_text or ""
        if not transcript:
            sidecars = (
                audio.with_suffix(".txt"),
                cfg.samples_dir / "my_voice_ref.txt",
            )
            # Only the first existing file is consulted, even if it is blank.
            found = next((c for c in sidecars if c.is_file()), None)
            if found is not None:
                transcript = found.read_text(encoding="utf-8").strip()

        if cfg.tts_model == "f5_tts" and not transcript:
            transcript = "참조 음성의 대본을 samples/my_voice_ref.txt 에 저장하세요."
        return audio, transcript

    def synthesize_to_file(
        self,
        text: str,
        ref_audio: Path | None = None,
        ref_text: str | None = None,
        job_id: str | None = None,
    ) -> tuple[str, Path]:
        """Synthesize *text* chunk by chunk into ``<outputs>/<job_id>/output.wav``.

        Each chunk is written to ``part_NNN.wav``. A single part is renamed
        to ``output.wav``; several parts are concatenated into it.

        Returns:
            The job id (generated when not supplied) and the final WAV path.
        """
        voice_path, voice_text = self.resolve_reference(ref_audio, ref_text)
        pieces = split_sentences(text, self.settings.chunk_max_chars)

        if not job_id:
            job_id = uuid.uuid4().hex[:12]
        workdir = self.settings.outputs_dir / job_id
        workdir.mkdir(parents=True, exist_ok=True)

        rendered: list[Path] = []
        for index, piece in enumerate(pieces):
            target = workdir / f"part_{index:03d}.wav"
            self.engine.synthesize(piece, voice_path, voice_text, target)
            rendered.append(target)

        merged = workdir / "output.wav"
        if len(rendered) != 1:
            _concat_wav(rendered, merged)
        else:
            rendered[0].replace(merged)
        return job_id, merged
def _concat_wav(paths: list[Path], out: Path) -> None:
    """Concatenate WAV files that share one audio format into *out*.

    Args:
        paths: Input files, joined in the given order.
        out: Destination file; its parent directory is created if needed.

    Raises:
        ValueError: if *paths* is empty, or a file's channel count, sample
            width, frame rate or compression type differs from the first.
    """
    if not paths:
        raise ValueError("no WAV files to concatenate")

    expected = None
    frames: list[bytes] = []
    for p in paths:
        with wave.open(str(p), "rb") as w:
            params = w.getparams()
            # Compare only the format fields: getparams() also carries
            # nframes, which legitimately differs between chunks.
            fmt = (
                params.nchannels,
                params.sampwidth,
                params.framerate,
                params.comptype,
            )
            if expected is None:
                expected = fmt
            elif fmt != expected:
                raise ValueError(f"WAV format mismatch: {p}")
            frames.append(w.readframes(w.getnframes()))

    out.parent.mkdir(parents=True, exist_ok=True)
    with wave.open(str(out), "wb") as wo:
        wo.setnchannels(expected[0])
        wo.setsampwidth(expected[1])
        wo.setframerate(expected[2])
        # writeframes() records the actual frame count in the header.
        wo.writeframes(b"".join(frames))

View File

9
backend/requirements.txt Normal file
View File

@@ -0,0 +1,9 @@
fastapi>=0.115.0
uvicorn[standard]>=0.32.0
python-multipart>=0.0.12
pydantic>=2.9.0
pydantic-settings>=2.6.0
pyyaml>=6.0.2
aiofiles>=24.1.0
soundfile>=0.12.1
librosa>=0.10.2