"""Simple text normalization for Korean TTS."""

from __future__ import annotations

import re

_RE_MULTI_SPACE = re.compile(r"\s+")
_RE_EMAIL = re.compile(r"[\w.+-]+@[\w.-]+\.\w+")
_RE_URL = re.compile(r"https?://\S+")
# A run of digits, optionally followed by well-formed thousands groups
# (",ddd").  A comma that is not a thousands separator (e.g. the one in
# "1, 2") is left in the text instead of being swallowed by the number.
_RE_NUMBER = re.compile(r"\d+(?:,\d{3}(?!\d))*")
_RE_SENTENCE_END = re.compile(r"(?<=[.!?…])\s+")
_RE_NEWLINES = re.compile(r"\n+")

_DIGITS = ("", "일", "이", "삼", "사", "오", "육", "칠", "팔", "구")
# Place names inside one four-digit group, most significant digit first.
_PLACES = ("천", "백", "십", "")
# Names of successive four-digit groups, least significant group first.
_BIG_UNITS = ("", "만", "억", "조")


def _group_to_korean(value: int) -> str:
    """Return the Korean reading of one four-digit group (0..9999).

    Zero yields an empty string so that the caller can skip empty groups.
    """
    parts: list[str] = []
    for place, ch in zip(_PLACES, f"{value:04d}"):
        digit = int(ch)
        if digit == 0:
            continue
        if digit == 1 and place:
            # A leading 1 is silent before 십/백/천 ("십", not "일십") ...
            parts.append(place)
        else:
            # ... but a 1 in the units place must still be read as "일".
            parts.append(_DIGITS[digit] + place)
    return "".join(parts)


def _digits_to_korean(num_str: str) -> str:
    """Convert a string of decimal digits to its Korean reading.

    Args:
        num_str: Digits only, without separators.

    Returns:
        The Korean reading, e.g. "21" -> "이십일" and "10000" -> "만".
        The input is returned unchanged when it is not purely decimal
        digits, or when the value is too large for the supported units
        (10**16 and above).
    """
    # isdecimal() only accepts characters that int() can parse, unlike
    # isdigit(), which also accepts e.g. superscript digits.
    if not num_str.isdecimal():
        return num_str
    n = int(num_str)
    if n == 0:
        return "영"
    if n >= 10 ** (4 * len(_BIG_UNITS)):
        # No unit name available; do not silently drop the high digits.
        return num_str

    groups: list[str] = []
    unit_index = 0
    while n:
        n, group = divmod(n, 10000)
        if group:
            reading = _group_to_korean(group)
            if group == 1 and unit_index == 1 and n == 0:
                # 10000..19999 is read "만...", not "일만...".
                reading = ""
            groups.append(reading + _BIG_UNITS[unit_index])
        unit_index += 1
    return "".join(reversed(groups))


def _replace_numbers(text: str) -> str:
    """Replace every integer in *text* with its Korean reading.

    Thousands separators inside a number ("1,000") are removed before
    conversion; other commas are preserved.
    """

    def repl(m: re.Match[str]) -> str:
        return _digits_to_korean(m.group(0).replace(",", ""))

    return _RE_NUMBER.sub(repl, text)


def preprocess_korean(text: str) -> str:
    """Normalize *text* for Korean speech synthesis.

    URLs and e-mail addresses are replaced by the spoken placeholders
    "링크" and "이메일", "&" and "%" are spelled out, integers are converted
    to Korean readings, and every run of whitespace is collapsed to a
    single space.
    """
    t = text.strip()
    # URLs first: a URL may contain an "@" that the e-mail pattern would
    # otherwise match.
    t = _RE_URL.sub(" 링크 ", t)
    t = _RE_EMAIL.sub(" 이메일 ", t)
    t = t.replace("&", " 앤드 ")
    t = t.replace("%", " 퍼센트 ")
    t = _replace_numbers(t)
    t = _RE_MULTI_SPACE.sub(" ", t)
    return t.strip()


def split_sentences(text: str, max_chars: int = 120) -> list[str]:
    """Split long text into normalized chunks of at most *max_chars*.

    The text is cut at line breaks and after sentence-ending punctuation
    (. ! ? …).  Consecutive pieces are packed greedily into one chunk,
    joined by a single space, while they fit.  A single piece longer than
    *max_chars* is hard-split into fixed-size slices.

    Args:
        text: Raw input text.
        max_chars: Maximum length of each returned chunk.

    Returns:
        The list of chunks, or ``[text]`` when no chunk was produced
        (for example for empty input).
    """
    chunks: list[str] = []
    buf = ""
    # Split on newlines BEFORE normalizing: preprocess_korean() collapses
    # all whitespace, which would otherwise erase the line boundaries.
    for line in _RE_NEWLINES.split(text):
        for sentence in _RE_SENTENCE_END.split(preprocess_korean(line)):
            sentence = sentence.strip()
            if not sentence:
                continue
            # +1 accounts for the joining space.
            if len(buf) + len(sentence) + 1 <= max_chars:
                buf = f"{buf} {sentence}" if buf else sentence
                continue
            if buf:
                chunks.append(buf)
                buf = ""
            if len(sentence) <= max_chars:
                buf = sentence
            else:
                chunks.extend(
                    sentence[i : i + max_chars]
                    for i in range(0, len(sentence), max_chars)
                )
    if buf:
        chunks.append(buf)
    return chunks or [text]