Initial commit: Korean voice-cloning TTS prototype
FastAPI backend, web UI, CosyVoice3/F5-TTS setup scripts, and handoff docs for GPU PC continuation. Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
1
backend/app/__init__.py
Normal file
1
backend/app/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
"""Korean voice-cloning TTS API."""
|
||||
65
backend/app/config.py
Normal file
65
backend/app/config.py
Normal file
@@ -0,0 +1,65 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
from pydantic import Field
|
||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||
|
||||
# Repository root: parents[2] of this file, i.e. the directory that contains backend/.
ROOT = Path(__file__).resolve().parents[2]
|
||||
|
||||
|
||||
class AppSettings(BaseSettings):
    """Runtime configuration for the TTS API.

    Values come from keyword arguments (see ``get_settings``), environment
    variables, and the ``.env`` file at the repository root. Unknown keys are
    ignored.
    """

    model_config = SettingsConfigDict(
        env_file=str(ROOT / ".env"),
        env_file_encoding="utf-8",
        extra="ignore",
        # Fields that declare a validation_alias are only accepted under that
        # alias by default; combined with extra="ignore", a keyword such as
        # tts_model=... would be dropped silently. Allow the field name too so
        # values passed by name (e.g. from settings.yaml) take effect.
        populate_by_name=True,
    )

    # Engine selector; create_engine() accepts "cosyvoice" or "f5_tts".
    tts_model: str = Field(default="cosyvoice", validation_alias="TTS_MODEL")
    # Bind address and port for the HTTP server.
    host: str = Field(default="0.0.0.0", validation_alias="TTS_HOST")
    port: int = Field(default=8000, validation_alias="TTS_PORT")
    # Directories for bundled reference samples, generated audio, and uploads.
    samples_dir: Path = Field(default=ROOT / "samples")
    outputs_dir: Path = Field(default=ROOT / "outputs" / "api")
    uploads_dir: Path = Field(default=ROOT / "backend" / "data" / "uploads")
    # Optional default reference audio path and its transcript.
    default_ref_audio: str | None = Field(default=None, validation_alias="TTS_REF_AUDIO")
    default_ref_text: str | None = Field(default=None, validation_alias="TTS_REF_TEXT")
    # CosyVoice model directory and the prefix passed to its worker as --prompt-prefix.
    cosyvoice_model_dir: Path = Field(default=ROOT / "models" / "Fun-CosyVoice3-0.5B")
    cosyvoice_prompt_prefix: str = (
        "You are a helpful assistant.<|endofprompt|>"
    )
    # Upper bound on characters per synthesized chunk.
    chunk_max_chars: int = 120
|
||||
|
||||
|
||||
@lru_cache
def get_settings() -> AppSettings:
    """Build the application settings once and cache them.

    If ``config/settings.yaml`` exists under the repository root, the values it
    actually contains are passed to ``AppSettings`` as keyword overrides.
    Keys missing from the file are not passed at all, so the environment and
    the field defaults still apply to them.

    Returns:
        The cached ``AppSettings`` instance.
    """
    yaml_path = ROOT / "config" / "settings.yaml"
    overrides: dict = {}
    if yaml_path.is_file():
        with open(yaml_path, encoding="utf-8") as f:
            raw = yaml.safe_load(f)
        # An empty file parses to None, and a scalar or list document has no
        # keys; treat both as "nothing configured" instead of failing on .get.
        if not isinstance(raw, dict):
            raw = {}

        def section(name: str) -> dict:
            """Return the named sub-mapping, or an empty dict if absent or not a mapping."""
            value = raw.get(name)
            return value if isinstance(value, dict) else {}

        generation = section("generation")
        cosyvoice = section("cosyvoice")
        server = section("server")
        paths = section("paths")

        # Absent keys become None here and are filtered out before
        # constructing AppSettings.
        overrides["tts_model"] = raw.get("default_model")
        overrides["chunk_max_chars"] = generation.get("chunk_max_chars")
        overrides["host"] = server.get("host")
        overrides["port"] = server.get("port")

        # Paths in the YAML are joined onto the repository root.
        if cosyvoice.get("model_dir"):
            overrides["cosyvoice_model_dir"] = ROOT / cosyvoice["model_dir"]
        if cosyvoice.get("prompt_prefix"):
            overrides["cosyvoice_prompt_prefix"] = cosyvoice["prompt_prefix"]
        if paths.get("samples_dir"):
            overrides["samples_dir"] = ROOT / paths["samples_dir"]
        if paths.get("outputs_dir"):
            # API output lives in an "api" subdirectory of the configured outputs dir.
            overrides["outputs_dir"] = ROOT / paths["outputs_dir"] / "api"
        if paths.get("uploads_dir"):
            overrides["uploads_dir"] = ROOT / paths["uploads_dir"]

    return AppSettings(**{k: v for k, v in overrides.items() if v is not None})
|
||||
|
||||
|
||||
def project_root() -> Path:
    """Return the module-level ``ROOT`` path."""
    root_dir = ROOT
    return root_dir
|
||||
170
backend/app/main.py
Normal file
170
backend/app/main.py
Normal file
@@ -0,0 +1,170 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import shutil
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi import FastAPI, File, Form, HTTPException, UploadFile
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.responses import FileResponse
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from backend.app.config import get_settings, project_root
|
||||
from backend.app.text_preprocess import preprocess_korean
|
||||
from backend.app.tts.service import TTSService
|
||||
|
||||
# Filesystem anchors: the repository root and the directory holding the static web UI.
ROOT = project_root()
WEB_DIR = ROOT / "web"

# The ASGI application; title/description/version are passed to FastAPI as API metadata.
app = FastAPI(
    title="Korean Voice Cloning TTS",
    description="CosyVoice / F5-TTS 기반 한국어 보이스 클로닝 API",
    version="0.1.0",
)

# NOTE(review): CORS is fully open (any origin, method and header). Presumably
# acceptable for a local prototype; confirm before exposing the service publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
|
||||
|
||||
# Shared TTS service instance; stays None until get_tts() is first called.
_tts: TTSService | None = None


def get_tts() -> TTSService:
    """Return the shared ``TTSService``, constructing it on the first call."""
    global _tts
    service = _tts
    if service is None:
        service = _tts = TTSService()
    return service
|
||||
|
||||
|
||||
class TTSRequest(BaseModel):
    # Request body for POST /api/tts.

    # Text to synthesize; validated to be 1-5000 characters long.
    text: str = Field(..., min_length=1, max_length=5000)
    # Optional reference audio path, relative to the sample/upload dirs or absolute.
    ref_audio: str | None = Field(
        default=None,
        description="samples/ 또는 uploads/ 기준 상대/절대 경로",
    )
    # Optional transcript of the reference audio.
    ref_text: str | None = None
    # When true, the text is passed through preprocess_korean before synthesis.
    preprocess: bool = True
|
||||
|
||||
|
||||
class TTSResponse(BaseModel):
    # Response body for POST /api/tts.

    job_id: str  # identifier of the synthesis job
    audio_url: str  # URL under /api/audio/ where the result can be fetched
    model: str  # configured TTS model name
    text_preview: str  # first 80 characters of the synthesized text
|
||||
|
||||
|
||||
class HealthResponse(BaseModel):
    # Response body for GET /api/health.

    status: str  # "ok" when the endpoint responds
    model: str  # configured TTS model name
    samples_count: int  # number of *.wav files in the samples directory
|
||||
|
||||
|
||||
@app.get("/api/health", response_model=HealthResponse)
def health() -> HealthResponse:
    # Report the configured model and how many *.wav files sit in the samples directory.
    settings = get_settings()
    wav_count = sum(1 for _ in settings.samples_dir.glob("*.wav"))
    return HealthResponse(status="ok", model=settings.tts_model, samples_count=wav_count)
|
||||
|
||||
|
||||
@app.post("/api/tts", response_model=TTSResponse)
def create_tts(body: TTSRequest) -> TTSResponse:
    """Synthesize speech for the request text and return where to fetch it.

    The optional ``ref_audio`` is supplied by the client, so it is only
    accepted when it resolves to a file inside the samples or uploads
    directory. Absolute paths elsewhere, ``..`` traversal and files relative
    to the working directory are rejected with 404.

    Raises:
        HTTPException: 400 when the text is empty after normalization, 404
            when the reference audio cannot be found, 503 when the engine
            reports a runtime failure.
    """
    text = preprocess_korean(body.text) if body.preprocess else body.text.strip()
    if not text:
        raise HTTPException(400, "text is empty")

    settings = get_settings()
    ref_path: Path | None = None
    if body.ref_audio:
        requested = Path(body.ref_audio)
        allowed_dirs = [d.resolve() for d in (settings.samples_dir, settings.uploads_dir)]
        # An absolute path is checked as given; a relative one is tried under
        # each allowed directory, samples first.
        if requested.is_absolute():
            candidates = [requested]
        else:
            candidates = [base / requested for base in allowed_dirs]
        for candidate in candidates:
            # resolve() collapses ".." and follows symlinks, so the containment
            # check below sees the real target location.
            resolved = candidate.resolve()
            inside = any(resolved.is_relative_to(base) for base in allowed_dirs)
            if inside and resolved.is_file():
                ref_path = resolved
                break
        if ref_path is None:
            raise HTTPException(404, f"ref_audio not found: {body.ref_audio}")

    try:
        job_id, _ = get_tts().synthesize_to_file(
            text, ref_audio=ref_path, ref_text=body.ref_text
        )
    except FileNotFoundError as e:
        raise HTTPException(404, str(e)) from e
    except RuntimeError as e:
        raise HTTPException(503, str(e)) from e

    return TTSResponse(
        job_id=job_id,
        audio_url=f"/api/audio/{job_id}",
        model=settings.tts_model,
        text_preview=text[:80] + ("…" if len(text) > 80 else ""),
    )
|
||||
|
||||
|
||||
@app.get("/api/audio/{job_id}")
def get_audio(job_id: str) -> FileResponse:
    """Return the WAV produced for ``job_id``.

    ``output.wav`` is preferred; ``part_000.wav`` is served as a fallback.

    Raises:
        HTTPException: 404 when the job directory is not a direct child of the
            outputs directory or contains neither file.
    """
    outputs_dir = get_settings().outputs_dir.resolve()
    job_dir = (outputs_dir / job_id).resolve()
    # job_id comes from the URL; values such as "." or ".." must not let the
    # lookup leave the outputs directory.
    if job_dir.parent != outputs_dir:
        raise HTTPException(404, "audio not found")
    for name in ("output.wav", "part_000.wav"):
        candidate = job_dir / name
        if candidate.is_file():
            return FileResponse(candidate, media_type="audio/wav", filename=f"{job_id}.wav")
    raise HTTPException(404, "audio not found")
|
||||
|
||||
|
||||
@app.get("/api/voice-samples")
def list_voice_samples() -> dict:
    # List every *.wav in the samples and uploads directories (samples first,
    # each directory sorted), noting whether a same-named .txt file exists.
    settings = get_settings()
    sources = ((settings.samples_dir, "samples"), (settings.uploads_dir, "uploads"))
    found = [
        {
            "id": wav.stem,
            "path": str(wav),
            "label": label,
            "has_transcript": wav.with_suffix(".txt").is_file(),
        }
        for directory, label in sources
        for wav in sorted(directory.glob("*.wav"))
    ]
    return {"samples": found, "default_model": settings.tts_model}
|
||||
|
||||
|
||||
@app.post("/api/voice-sample")
async def upload_voice_sample(
    file: UploadFile = File(...),
    ref_text: str = Form(""),
) -> dict:
    # Store an uploaded .wav under a random id in the uploads directory and,
    # when a non-blank transcript is given, write it next to it as <id>.txt.
    uploaded_name = file.filename or ""
    if not uploaded_name.lower().endswith(".wav"):
        raise HTTPException(400, "WAV 파일만 업로드 가능합니다")

    sample_id = uuid.uuid4().hex[:10]
    dest = get_settings().uploads_dir / f"{sample_id}.wav"
    with open(dest, "wb") as sink:
        shutil.copyfileobj(file.file, sink)

    transcript = ref_text.strip()
    if transcript:
        dest.with_suffix(".txt").write_text(transcript, encoding="utf-8")

    return {
        "id": sample_id,
        "path": str(dest),
        "message": "업로드 완료. TTS 요청 시 ref_audio에 이 path를 사용하세요.",
    }
|
||||
|
||||
|
||||
# Serve the static web UI at the site root when the web/ directory exists.
# NOTE(review): presumably mounted after the /api routes so those are matched
# first — confirm against Starlette's route ordering.
if WEB_DIR.is_dir():
    app.mount(
        "/",
        StaticFiles(directory=str(WEB_DIR), html=True),
        name="web",
    )
|
||||
|
||||
|
||||
@app.on_event("startup")
def startup() -> None:
    # Make sure the output and upload directories exist before requests arrive.
    settings = get_settings()
    for directory in (settings.outputs_dir, settings.uploads_dir):
        directory.mkdir(parents=True, exist_ok=True)
|
||||
95
backend/app/text_preprocess.py
Normal file
95
backend/app/text_preprocess.py
Normal file
@@ -0,0 +1,95 @@
|
||||
"""한국어 TTS용 간단한 텍스트 정규화."""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
|
||||
# One or more whitespace characters of any kind (spaces, tabs, newlines).
_RE_MULTI_SPACE = re.compile(r"\s+")
# A loose e-mail shape: local part, "@", domain, dot, final label.
_RE_EMAIL = re.compile(r"[\w.+-]+@[\w.-]+\.\w+")
# An http/https URL running up to the next whitespace.
_RE_URL = re.compile(r"https?://\S+")
|
||||
|
||||
|
||||
def _digits_to_korean(num_str: str) -> str:
    """Convert a string of decimal digits to its Sino-Korean reading.

    Args:
        num_str: Text expected to contain only decimal digits.

    Returns:
        The Korean reading (e.g. ``"1234"`` -> ``"천이백삼십사"``). The input
        is returned unchanged when it is not purely decimal digits or when it
        has more significant digits than the unit table covers.
    """
    if not num_str.isdecimal():
        return num_str
    # Leading zeros carry no value; an all-zero string reads as "영".
    digits = num_str.lstrip("0")
    if not digits:
        return "영"

    units = ["", "만", "억", "조", "경"]  # one unit name per group of four digits
    small = ["", "일", "이", "삼", "사", "오", "육", "칠", "팔", "구"]
    places = ["천", "백", "십", ""]  # place names inside one four-digit group

    # Leave oversized numbers untouched rather than silently dropping their
    # high-order digits; this also keeps int() away from very long inputs.
    if len(digits) > 4 * len(units):
        return num_str
    n = int(digits)

    def chunk_to_korean(x: int) -> str:
        """Read one group of up to four digits (1..9999)."""
        parts: list[str] = []
        for place, d in zip(places, f"{x:04d}"):
            di = int(d)
            if di == 0:
                continue
            # "일" is dropped in front of 십/백/천 but is spoken in the ones place.
            if di == 1 and place:
                parts.append(place)
            else:
                parts.append(small[di] + place)
        return "".join(parts)

    # Split into four-digit groups, least significant first.
    groups: list[int] = []
    while n > 0:
        n, part = divmod(n, 10000)
        groups.append(part)

    result: list[str] = []
    for u, part in enumerate(groups):
        if not part:
            continue
        if part == 1 and u == 1:
            # 10000 is read "만" rather than "일만"; larger units keep "일"
            # (for example "일억").
            result.append(units[u])
        else:
            result.append(chunk_to_korean(part) + units[u])
    return "".join(reversed(result))
|
||||
|
||||
|
||||
def _replace_numbers(text: str) -> str:
    """Replace each run of digits in ``text`` with its Korean reading.

    Commas are treated as part of a number only when they sit between digits
    (``"1,000"``). A comma that merely follows a number (``"1, 2"``) is
    punctuation and is left in place.
    """

    def repl(m: re.Match[str]) -> str:
        # Remove grouping commas before converting the digits.
        raw = m.group(0).replace(",", "")
        return _digits_to_korean(raw)

    return re.sub(r"\d+(?:,\d+)*", repl, text)
|
||||
|
||||
|
||||
def preprocess_korean(text: str) -> str:
    """Normalize text for Korean TTS.

    URLs and e-mail addresses become spoken placeholders, ``&`` and ``%``
    become words, digit runs become Korean readings, and whitespace is
    collapsed to single spaces.
    """
    normalized = _RE_URL.sub(" 링크 ", text.strip())
    normalized = _RE_EMAIL.sub(" 이메일 ", normalized)
    for symbol, spoken in (("&", " 앤드 "), ("%", " 퍼센트 ")):
        normalized = normalized.replace(symbol, spoken)
    normalized = _replace_numbers(normalized)
    return _RE_MULTI_SPACE.sub(" ", normalized).strip()
|
||||
|
||||
|
||||
def split_sentences(text: str, max_chars: int = 120) -> list[str]:
    """Split long text into sentence-based chunks of at most ``max_chars``.

    The text is normalized with ``preprocess_korean`` first. Sentences are
    packed into a chunk while they fit; a sentence longer than ``max_chars``
    is cut into fixed-size slices. If nothing is produced, the original text
    is returned as the only chunk.
    """
    # NOTE(review): preprocess_korean collapses all whitespace to single
    # spaces, so the "\n+" alternative presumably never matches here.
    pieces = re.split(r"(?<=[.!?…])\s+|\n+", preprocess_korean(text))
    chunks: list[str] = []
    pending = ""
    for sentence in filter(None, (piece.strip() for piece in pieces)):
        # The +1 accounts for the space that would join the two parts.
        if len(pending) + len(sentence) + 1 <= max_chars:
            pending = f"{pending} {sentence}".strip() if pending else sentence
            continue
        if pending:
            chunks.append(pending)
        if len(sentence) <= max_chars:
            pending = sentence
            continue
        chunks.extend(
            sentence[start : start + max_chars]
            for start in range(0, len(sentence), max_chars)
        )
        pending = ""
    if pending:
        chunks.append(pending)
    return chunks if chunks else [text]
|
||||
3
backend/app/tts/__init__.py
Normal file
3
backend/app/tts/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
from backend.app.tts.service import TTSService
|
||||
|
||||
# Public names of the tts package.
__all__ = ["TTSService"]
|
||||
18
backend/app/tts/base.py
Normal file
18
backend/app/tts/base.py
Normal file
@@ -0,0 +1,18 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
class TTSEngine(ABC):
    """Abstract interface implemented by every TTS engine."""

    # Identifier of the engine; declared here, assigned by subclasses.
    name: str

    @abstractmethod
    def synthesize(self, text: str, ref_audio: Path, ref_text: str, out_path: Path) -> Path:
        """Generate a WAV file for a single chunk of text."""
|
||||
101
backend/app/tts/engines_subprocess.py
Normal file
101
backend/app/tts/engines_subprocess.py
Normal file
@@ -0,0 +1,101 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
from backend.app.config import project_root
|
||||
from backend.app.tts.base import TTSEngine
|
||||
|
||||
# Repository root; base for the per-engine venvs and worker scripts used below.
ROOT = project_root()
|
||||
|
||||
|
||||
class SubprocessEngine(TTSEngine):
    """Base for engines that run a worker script with a dedicated venv's Python."""

    def __init__(self, venv_name: str, worker_name: str) -> None:
        """Record the interpreter under ``.venvs/<venv_name>`` and the worker script path."""
        venv_dir = ROOT / ".venvs" / venv_name
        self._python = venv_dir / "bin" / "python"
        self._worker = ROOT / "scripts" / "workers" / worker_name

    def _run(self, args: list[str]) -> None:
        """Run the worker with ``args`` from the repository root.

        Raises:
            RuntimeError: when the venv interpreter is missing or the worker
                exits with a non-zero status (stderr, else stdout, is included).
        """
        if not self._python.is_file():
            # Two levels above <venv>/bin/python is the venv directory itself.
            venv = self._python.parent.parent.name
            raise RuntimeError(
                f"{venv} venv 없음. "
                f"scripts/setup_{venv}.sh 실행"
            )
        completed = subprocess.run(
            [str(self._python), str(self._worker), *args],
            cwd=str(ROOT),
            capture_output=True,
            text=True,
        )
        if completed.returncode == 0:
            return
        raise RuntimeError(
            f"{self.name} inference failed:\n{completed.stderr or completed.stdout}"
        )
|
||||
|
||||
|
||||
class F5TTSEngine(SubprocessEngine):
    """Engine that runs ``f5_infer.py`` inside the ``f5tts`` venv."""

    name = "f5_tts"

    def __init__(self) -> None:
        super().__init__("f5tts", "f5_infer.py")

    def synthesize(self, text: str, ref_audio: Path, ref_text: str, out_path: Path) -> Path:
        """Run the worker for one text chunk and return ``out_path``."""
        out_path.parent.mkdir(parents=True, exist_ok=True)
        # An empty transcript is replaced by a fixed placeholder string.
        transcript = ref_text or "reference audio transcript"
        cli_args = [
            "--ref-audio", str(ref_audio),
            "--ref-text", transcript,
            "--gen-text", text,
            "--out", str(out_path),
        ]
        self._run(cli_args)
        return out_path
|
||||
|
||||
|
||||
class CosyVoiceEngine(SubprocessEngine):
    """Engine that runs ``cosy_infer.py`` inside the ``cosyvoice`` venv."""

    name = "cosyvoice"

    def __init__(self, model_dir: Path, prompt_prefix: str) -> None:
        """Keep the model directory and prompt prefix to pass to the worker."""
        super().__init__("cosyvoice", "cosy_infer.py")
        self._model_dir = model_dir
        self._prompt_prefix = prompt_prefix

    def synthesize(self, text: str, ref_audio: Path, ref_text: str, out_path: Path) -> Path:
        """Run the worker for one text chunk and return ``out_path``."""
        out_path.parent.mkdir(parents=True, exist_ok=True)
        cli_args = [
            "--ref-audio", str(ref_audio),
            "--gen-text", text,
            "--prompt-text", ref_text or "",
            "--out", str(out_path),
            "--model-dir", str(self._model_dir),
            "--prompt-prefix", self._prompt_prefix,
        ]
        self._run(cli_args)
        return out_path
|
||||
|
||||
|
||||
def create_engine(model: str, model_dir: Path, prompt_prefix: str) -> TTSEngine:
    """Build the engine selected by ``model``.

    ``model_dir`` and ``prompt_prefix`` are only used for ``"cosyvoice"``.

    Raises:
        ValueError: when ``model`` is neither ``"cosyvoice"`` nor ``"f5_tts"``.
    """
    if model == "cosyvoice":
        return CosyVoiceEngine(model_dir, prompt_prefix)
    if model == "f5_tts":
        return F5TTSEngine()
    raise ValueError(f"Unknown model: {model}. Use cosyvoice or f5_tts.")
|
||||
97
backend/app/tts/service.py
Normal file
97
backend/app/tts/service.py
Normal file
@@ -0,0 +1,97 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid
|
||||
import wave
|
||||
from pathlib import Path
|
||||
|
||||
from backend.app.config import AppSettings, get_settings, project_root
|
||||
from backend.app.text_preprocess import split_sentences
|
||||
from backend.app.tts.engines_subprocess import create_engine
|
||||
|
||||
# Repository root, as returned by the config module.
ROOT = project_root()
|
||||
|
||||
|
||||
class TTSService:
    """Resolve the reference voice, split text into chunks, and synthesize a WAV."""

    def __init__(self, settings: AppSettings | None = None) -> None:
        """Create the configured engine and make sure the work directories exist.

        Args:
            settings: Settings to use; defaults to ``get_settings()``.
        """
        self.settings = settings or get_settings()
        self.engine = create_engine(
            self.settings.tts_model,
            self.settings.cosyvoice_model_dir,
            self.settings.cosyvoice_prompt_prefix,
        )
        self.settings.outputs_dir.mkdir(parents=True, exist_ok=True)
        self.settings.uploads_dir.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _resolve_default_audio(configured: str) -> Path:
        """Locate the configured default reference audio.

        The path is tried as given, then (if relative) under the project root.

        Raises:
            FileNotFoundError: when neither location holds a file.
        """
        path = Path(configured)
        if path.is_file():
            return path
        if not path.is_absolute():
            anchored = ROOT / path
            if anchored.is_file():
                return anchored
        raise FileNotFoundError(f"TTS_REF_AUDIO not found: {configured}")

    def resolve_reference(
        self,
        ref_audio: Path | None = None,
        ref_text: str | None = None,
    ) -> tuple[Path, str]:
        """Choose the reference audio and transcript for a synthesis call.

        Audio: the given ``ref_audio`` if it is a file, else the configured
        default, else the first ``*.wav`` (sorted) in the samples directory.
        Text: ``ref_text``, else the configured default text, else a ``.txt``
        next to the audio, else ``my_voice_ref.txt`` in the samples directory.

        Returns:
            ``(audio_path, transcript)``; the transcript may be empty.

        Raises:
            FileNotFoundError: when no reference audio can be found.
        """
        if ref_audio and ref_audio.is_file():
            audio = ref_audio
        elif self.settings.default_ref_audio:
            # Fail here with a clear error instead of handing a missing file
            # to the worker process.
            audio = self._resolve_default_audio(self.settings.default_ref_audio)
        else:
            samples = sorted(self.settings.samples_dir.glob("*.wav"))
            if not samples:
                raise FileNotFoundError(
                    "reference WAV 없음. samples/에 녹음하거나 TTS_REF_AUDIO 설정"
                )
            audio = samples[0]

        text = ref_text or self.settings.default_ref_text or ""
        if not text:
            for candidate in (
                audio.with_suffix(".txt"),
                self.settings.samples_dir / "my_voice_ref.txt",
            ):
                if candidate.is_file():
                    text = candidate.read_text(encoding="utf-8").strip()
                    break
        if not text and self.settings.tts_model == "f5_tts":
            # For f5_tts a placeholder string stands in when no transcript exists.
            text = "참조 음성의 대본을 samples/my_voice_ref.txt 에 저장하세요."
        return audio, text

    def synthesize_to_file(
        self,
        text: str,
        ref_audio: Path | None = None,
        ref_text: str | None = None,
        job_id: str | None = None,
    ) -> tuple[str, Path]:
        """Synthesize ``text`` into ``<outputs_dir>/<job_id>/output.wav``.

        The text is split with ``split_sentences``; each chunk is rendered to
        ``part_NNN.wav`` and the parts are then merged.

        Args:
            text: Text to synthesize.
            ref_audio: Optional reference audio (see ``resolve_reference``).
            ref_text: Optional transcript of the reference audio.
            job_id: Optional job identifier; a random 12-hex-digit id otherwise.

        Returns:
            ``(job_id, path_to_output_wav)``.
        """
        ref_path, ref_txt = self.resolve_reference(ref_audio, ref_text)
        chunks = split_sentences(text, self.settings.chunk_max_chars)
        job_id = job_id or uuid.uuid4().hex[:12]
        job_dir = self.settings.outputs_dir / job_id
        job_dir.mkdir(parents=True, exist_ok=True)

        chunk_paths: list[Path] = []
        for i, chunk in enumerate(chunks):
            out = job_dir / f"part_{i:03d}.wav"
            self.engine.synthesize(chunk, ref_path, ref_txt, out)
            chunk_paths.append(out)

        final = job_dir / "output.wav"
        if len(chunk_paths) == 1:
            # A single part simply becomes the final file.
            chunk_paths[0].replace(final)
        else:
            _concat_wav(chunk_paths, final)

        return job_id, final
|
||||
|
||||
|
||||
def _concat_wav(paths: list[Path], out: Path) -> None:
    """Concatenate WAV files that share one audio format into ``out``.

    Args:
        paths: Input files, in playback order.
        out: Destination file; its parent directory is created if needed.

    Raises:
        ValueError: when ``paths`` is empty or a file's channel count, sample
            width, frame rate or compression type differs from the first file.
    """
    if not paths:
        raise ValueError("no WAV files to concatenate")

    frames: list[bytes] = []
    expected = None
    for path in paths:
        with wave.open(str(path), "rb") as reader:
            p = reader.getparams()
            # Compare only the format fields: getparams() also carries the
            # frame count, which legitimately differs from chunk to chunk.
            fmt = (p.nchannels, p.sampwidth, p.framerate, p.comptype)
            if expected is None:
                expected = fmt
            elif fmt != expected:
                raise ValueError(f"WAV format mismatch: {path}")
            frames.append(reader.readframes(reader.getnframes()))

    out.parent.mkdir(parents=True, exist_ok=True)
    nchannels, sampwidth, framerate, _comptype = expected
    with wave.open(str(out), "wb") as writer:
        writer.setnchannels(nchannels)
        writer.setsampwidth(sampwidth)
        writer.setframerate(framerate)
        writer.writeframes(b"".join(frames))
|
||||
0
backend/data/uploads/.gitkeep
Normal file
0
backend/data/uploads/.gitkeep
Normal file
9
backend/requirements.txt
Normal file
9
backend/requirements.txt
Normal file
@@ -0,0 +1,9 @@
|
||||
fastapi>=0.115.0
|
||||
uvicorn[standard]>=0.32.0
|
||||
python-multipart>=0.0.12
|
||||
pydantic>=2.9.0
|
||||
pydantic-settings>=2.6.0
|
||||
pyyaml>=6.0.2
|
||||
aiofiles>=24.1.0
|
||||
soundfile>=0.12.1
|
||||
librosa>=0.10.2
|
||||
Reference in New Issue
Block a user