pnnbao-ump committed on
Commit
79d5798
·
verified ·
1 Parent(s): 41e7f51

Upload 44 files

Browse files
Files changed (45) hide show
  1. .gitattributes +11 -0
  2. app.py +339 -0
  3. config.yaml +77 -0
  4. packages.txt +3 -0
  5. requirements.txt +9 -0
  6. sample/Bình (nam miền Bắc).pt +3 -0
  7. sample/Bình (nam miền Bắc).txt +1 -0
  8. sample/Bình (nam miền Bắc).wav +3 -0
  9. sample/Dung (nữ miền Nam).pt +3 -0
  10. sample/Dung (nữ miền Nam).txt +1 -0
  11. sample/Dung (nữ miền Nam).wav +3 -0
  12. sample/Hương (nữ miền Bắc).pt +3 -0
  13. sample/Hương (nữ miền Bắc).txt +1 -0
  14. sample/Hương (nữ miền Bắc).wav +3 -0
  15. sample/Ly (nữ miền Bắc).pt +3 -0
  16. sample/Ly (nữ miền Bắc).txt +1 -0
  17. sample/Ly (nữ miền Bắc).wav +3 -0
  18. sample/Nguyên (nam miền Nam).pt +3 -0
  19. sample/Nguyên (nam miền Nam).txt +1 -0
  20. sample/Nguyên (nam miền Nam).wav +3 -0
  21. sample/Ngọc (nữ miền Bắc).pt +3 -0
  22. sample/Ngọc (nữ miền Bắc).txt +1 -0
  23. sample/Ngọc (nữ miền Bắc).wav +3 -0
  24. sample/Sơn (nam miền Nam).pt +3 -0
  25. sample/Sơn (nam miền Nam).txt +1 -0
  26. sample/Sơn (nam miền Nam).wav +3 -0
  27. sample/Tuyên (nam miền Bắc).pt +3 -0
  28. sample/Tuyên (nam miền Bắc).txt +1 -0
  29. sample/Tuyên (nam miền Bắc).wav +3 -0
  30. sample/Vĩnh (nam miền Nam).pt +3 -0
  31. sample/Vĩnh (nam miền Nam).txt +1 -0
  32. sample/Vĩnh (nam miền Nam).wav +3 -0
  33. sample/Đoan (nữ miền Nam).pt +3 -0
  34. sample/Đoan (nữ miền Nam).txt +1 -0
  35. sample/Đoan (nữ miền Nam).wav +3 -0
  36. utils/__init__.py +0 -0
  37. utils/__pycache__/__init__.cpython-312.pyc +0 -0
  38. utils/__pycache__/core_utils.cpython-312.pyc +0 -0
  39. utils/__pycache__/normalize_text.cpython-312.pyc +0 -0
  40. utils/__pycache__/phonemize_text.cpython-312.pyc +0 -0
  41. utils/core_utils.py +53 -0
  42. utils/normalize_text.py +407 -0
  43. utils/phoneme_dict.json +3 -0
  44. utils/phonemize_text.py +346 -0
  45. vieneu_tts.py +859 -0
.gitattributes CHANGED
@@ -33,3 +33,14 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ sample/Bình[[:space:]](nam[[:space:]]miền[[:space:]]Bắc).wav filter=lfs diff=lfs merge=lfs -text
37
+ sample/Dung[[:space:]](nữ[[:space:]]miền[[:space:]]Nam).wav filter=lfs diff=lfs merge=lfs -text
38
+ sample/Đoan[[:space:]](nữ[[:space:]]miền[[:space:]]Nam).wav filter=lfs diff=lfs merge=lfs -text
39
+ sample/Hương[[:space:]](nữ[[:space:]]miền[[:space:]]Bắc).wav filter=lfs diff=lfs merge=lfs -text
40
+ sample/Ly[[:space:]](nữ[[:space:]]miền[[:space:]]Bắc).wav filter=lfs diff=lfs merge=lfs -text
41
+ sample/Ngọc[[:space:]](nữ[[:space:]]miền[[:space:]]Bắc).wav filter=lfs diff=lfs merge=lfs -text
42
+ sample/Nguyên[[:space:]](nam[[:space:]]miền[[:space:]]Nam).wav filter=lfs diff=lfs merge=lfs -text
43
+ sample/Sơn[[:space:]](nam[[:space:]]miền[[:space:]]Nam).wav filter=lfs diff=lfs merge=lfs -text
44
+ sample/Tuyên[[:space:]](nam[[:space:]]miền[[:space:]]Bắc).wav filter=lfs diff=lfs merge=lfs -text
45
+ sample/Vĩnh[[:space:]](nam[[:space:]]miền[[:space:]]Nam).wav filter=lfs diff=lfs merge=lfs -text
46
+ utils/phoneme_dict.json filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,339 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import spaces  # MUST be imported BEFORE everything else on HF Spaces ZeroGPU
import os
os.environ['SPACES_ZERO_GPU'] = '1'  # Set environment variable explicitly

import gradio as gr
import soundfile as sf
import tempfile
import torch
from vieneu_tts import VieNeuTTS
import time

print("⏳ Đang khởi động VieNeu-TTS...")

# --- 1. SETUP MODEL ---
print("📦 Đang tải model...")
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"🖥️ Sử dụng thiết bị: {device.upper()}")

try:
    # Load the TTS backbone and the neural codec onto the selected device.
    tts = VieNeuTTS(
        backbone_repo="pnnbao-ump/VieNeu-TTS-0.3B",
        backbone_device=device,
        codec_repo="neuphonic/distill-neucodec",
        codec_device=device
    )
    print("✅ Model đã tải xong!")
except Exception as e:
    # Fall back to a mock object so the UI can still be demoed when the
    # real model cannot be downloaded/loaded.
    print(f"⚠️ Không thể tải model (Chế độ UI Demo): {e}")
    class MockTTS:
        # Mirrors the VieNeuTTS surface used below: encode_reference() + infer().
        def encode_reference(self, path): return None
        def infer(self, text, ref, ref_text):
            import numpy as np
            # Simulated latency so the timing feature can be exercised.
            time.sleep(1.5)
            return np.random.uniform(-0.5, 0.5, 24000*3)
    tts = MockTTS()
37
+
38
+ # --- 2. DATA ---
39
+ VOICE_SAMPLES = {
40
+ "Tuyên (nam miền Bắc)": {"audio": "./sample/Tuyên (nam miền Bắc).wav", "text": "./sample/Tuyên (nam miền Bắc).txt"},
41
+ "Vĩnh (nam miền Nam)": {"audio": "./sample/Vĩnh (nam miền Nam).wav", "text": "./sample/Vĩnh (nam miền Nam).txt"},
42
+ "Bình (nam miền Bắc)": {"audio": "./sample/Bình (nam miền Bắc).wav", "text": "./sample/Bình (nam miền Bắc).txt"},
43
+ "Nguyên (nam miền Nam)": {"audio": "./sample/Nguyên (nam miền Nam).wav", "text": "./sample/Nguyên (nam miền Nam).txt"},
44
+ "Sơn (nam miền Nam)": {"audio": "./sample/Sơn (nam miền Nam).wav", "text": "./sample/Sơn (nam miền Nam).txt"},
45
+ "Đoan (nữ miền Nam)": {"audio": "./sample/Đoan (nữ miền Nam).wav", "text": "./sample/Đoan (nữ miền Nam).txt"},
46
+ "Ngọc (nữ miền Bắc)": {"audio": "./sample/Ngọc (nữ miền Bắc).wav", "text": "./sample/Ngọc (nữ miền Bắc).txt"},
47
+ "Ly (nữ miền Bắc)": {"audio": "./sample/Ly (nữ miền Bắc).wav", "text": "./sample/Ly (nữ miền Bắc).txt"},
48
+ "Dung (nữ miền Nam)": {"audio": "./sample/Dung (nữ miền Nam).wav", "text": "./sample/Dung (nữ miền Nam).txt"}
49
+ }
50
+
51
+ # --- 3. HELPER FUNCTIONS ---
52
+ def load_reference_info(voice_choice):
53
+ if voice_choice in VOICE_SAMPLES:
54
+ audio_path = VOICE_SAMPLES[voice_choice]["audio"]
55
+ text_path = VOICE_SAMPLES[voice_choice]["text"]
56
+ try:
57
+ if os.path.exists(text_path):
58
+ with open(text_path, "r", encoding="utf-8") as f:
59
+ ref_text = f.read()
60
+ return audio_path, ref_text
61
+ else:
62
+ return audio_path, "⚠️ Không tìm thấy file text mẫu."
63
+ except Exception as e:
64
+ return None, f"❌ Lỗi: {str(e)}"
65
+ return None, ""
66
+
67
@spaces.GPU(duration=120)  # request a ZeroGPU slot for at most 120 s per call
def synthesize_speech(text, voice_choice, custom_audio, custom_text, mode_tab):
    """Synthesize Vietnamese speech from `text` using a cloned reference voice.

    Parameters:
        text: text to synthesize (hard-capped at 250 characters below).
        voice_choice: key into VOICE_SAMPLES (used unless mode_tab is "custom_mode").
        custom_audio: filepath of a user-uploaded reference WAV (custom mode).
        custom_text: transcript of the uploaded reference audio (custom mode).
        mode_tab: "custom_mode" or "preset_mode", tracked by the active UI tab.

    Returns:
        (output_wav_path, status_message); output_wav_path is None on any error.
    """
    try:
        if not text or text.strip() == "":
            return None, "⚠️ Vui lòng nhập văn bản cần tổng hợp!"

        # --- LOGIC CHECK LIMIT 250 ---
        # The cap keeps output quality acceptable on the shared ZeroGPU backend.
        if len(text) > 250:
            return None, f"❌ Văn bản quá dài ({len(text)}/250 ký tự)! Vui lòng cắt ngắn lại để đảm bảo chất lượng."

        # Resolve the reference audio + transcript used for voice cloning.
        if mode_tab == "custom_mode":
            if custom_audio is None or not custom_text:
                return None, "⚠️ Vui lòng tải lên Audio và nhập nội dung Audio đó."
            ref_audio_path = custom_audio
            ref_text_raw = custom_text
            print("🎨 Mode: Custom Voice")
        else:  # Preset
            if voice_choice not in VOICE_SAMPLES:
                return None, "⚠️ Vui lòng chọn một giọng mẫu."
            ref_audio_path = VOICE_SAMPLES[voice_choice]["audio"]
            ref_text_path = VOICE_SAMPLES[voice_choice]["text"]

            if not os.path.exists(ref_audio_path):
                return None, f"❌ Không tìm thấy file audio: {ref_audio_path}"

            with open(ref_text_path, "r", encoding="utf-8") as f:
                ref_text_raw = f.read()
            print(f"🎤 Mode: Preset Voice ({voice_choice})")

        # Inference, timed so the elapsed seconds can be reported to the user.
        print(f"📝 Text: {text[:50]}...")

        start_time = time.time()

        ref_codes = tts.encode_reference(ref_audio_path)
        wav = tts.infer(text, ref_codes, ref_text_raw)

        end_time = time.time()
        process_time = end_time - start_time

        # Persist to a temp WAV at 24000 Hz; delete=False because Gradio
        # serves the file after this function returns.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            sf.write(tmp_file.name, wav, 24000)
            output_path = tmp_file.name

        return output_path, f"✅ Thành công! (Thời gian: {process_time:.2f}s)"

    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, f"❌ Lỗi hệ thống: {str(e)}"
119
+
120
# --- 4. UI SETUP ---
# Gradio Soft theme with an indigo→cyan gradient accent on primary buttons.
theme = gr.themes.Soft(
    primary_hue="indigo",
    secondary_hue="cyan",
    neutral_hue="slate",
    font=[gr.themes.GoogleFont('Inter'), 'ui-sans-serif', 'system-ui'],
).set(
    button_primary_background_fill="linear-gradient(90deg, #6366f1 0%, #0ea5e9 100%)",
    button_primary_background_fill_hover="linear-gradient(90deg, #4f46e5 0%, #0284c7 100%)",
    block_shadow="0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06)",
)

# Custom CSS for the header card, external links, status box and the
# performance-warning banner rendered via gr.HTML below.
css = """
.container { max-width: 1200px; margin: auto; }
.header-box {
    text-align: center;
    margin-bottom: 25px;
    padding: 25px;
    background: linear-gradient(135deg, #0f172a 0%, #1e293b 100%);
    border-radius: 12px;
    border: 1px solid #334155;
    box-shadow: 0 10px 15px -3px rgba(0, 0, 0, 0.3);
}
.header-title {
    font-size: 2.5rem;
    font-weight: 800;
    color: white;
    background: -webkit-linear-gradient(45deg, #60A5FA, #22D3EE);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    margin-bottom: 10px;
}
.header-desc {
    font-size: 1.1rem;
    color: #cbd5e1;
    margin-bottom: 15px;
}
.link-group a {
    text-decoration: none;
    margin: 0 10px;
    font-weight: 600;
    color: #94a3b8;
    transition: color 0.2s;
}
.link-group a:hover { color: #38bdf8; text-shadow: 0 0 5px rgba(56, 189, 248, 0.5); }
.status-box { font-weight: bold; text-align: center; border: none; background: transparent; }
.warning-banner {
    background: linear-gradient(135deg, #fef3c7 0%, #fde68a 100%);
    border: 2px solid #f59e0b;
    border-radius: 8px;
    padding: 15px 20px;
    margin: 15px 0;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
.warning-banner-title {
    font-size: 1.1rem;
    font-weight: 700;
    color: #92400e;
    margin-bottom: 8px;
    display: flex;
    align-items: center;
    gap: 8px;
}
.warning-banner-content {
    color: #78350f;
    font-size: 0.95rem;
    line-height: 1.6;
}
.warning-banner-content strong {
    color: #92400e;
    font-weight: 600;
}
.warning-banner-content code {
    background: #fef3c7;
    padding: 2px 6px;
    border-radius: 3px;
    font-family: monospace;
    color: #92400e;
    font-weight: 500;
}
"""
201
+
202
# Quick-start examples: [input text, preset voice name] pairs fed to gr.Examples.
EXAMPLES_LIST = [
    ["Về miền Tây không chỉ để ngắm nhìn sông nước hữu tình, mà còn để cảm nhận tấm chân tình của người dân nơi đây. Cùng ngồi xuồng ba lá len lỏi qua rặng dừa nước, nghe câu vọng cổ ngọt ngào thì còn gì bằng.", "Vĩnh (nam miền Nam)"],
    ["Hà Nội những ngày vào thu mang một vẻ đẹp trầm mặc và cổ kính đến lạ thường. Đi dạo quanh Hồ Gươm vào sáng sớm, hít hà mùi hoa sữa nồng nàn và thưởng thức chút cốm làng Vòng là trải nghiệm khó quên.", "Bình (nam miền Bắc)"],
    ["Sự bùng nổ của trí tuệ nhân tạo đang định hình lại cách chúng ta làm việc và sinh sống. Từ xe tự lái đến trợ lý ảo thông minh, công nghệ đang dần xóa nhòa ranh giới giữa thực tại và những bộ phim viễn tưởng.", "Tuyên (nam miền Bắc)"],
    ["Sài Gòn hối hả là thế, nhưng chỉ cần tấp vào một quán cà phê ven đường, gọi ly bạc xỉu đá và ngắm nhìn dòng người qua lại, bạn sẽ thấy thành phố này cũng có những khoảng lặng thật bình yên và đáng yêu.", "Nguyên (nam miền Nam)"],
    ["Ngày xửa ngày xưa, ở một ngôi làng nọ có cô Tấm xinh đẹp, nết na nhưng sớm mồ côi mẹ. Dù bị mẹ kế và Cám hãm hại đủ đường, Tấm vẫn giữ được tấm lòng lương thiện và cuối cùng tìm được hạnh phúc xứng đáng.", "Đoan (nữ miền Nam)"],
    ["Dạ em chào anh chị, hiện tại bên em đang có chương trình ưu đãi đặc biệt cho căn hộ hướng sông này. Với thiết kế hiện đại và không gian xanh mát, đây chắc chắn là tổ ấm lý tưởng mà gia đình mình đang tìm kiếm.", "Ly (nữ miền Bắc)"],
]
210
+
211
with gr.Blocks(theme=theme, css=css, title="VieNeu-TTS Studio") as demo:

    with gr.Column(elem_classes="container"):
        # Header
        gr.HTML("""
        <div class="header-box">
            <div class="header-title">🦜 VieNeu-TTS Studio</div>
            <div class="header-desc">
                Phiên bản: VieNeu-TTS (model mới nhất, train trên 1000 giờ dữ liệu)
            </div>
            <div class="link-group">
                <a href="https://huggingface.co/pnnbao-ump/VieNeu-TTS" target="_blank">🤗 Model Card</a> •
                <a href="https://huggingface.co/datasets/pnnbao-ump/VieNeu-TTS-1000h" target="_blank">📖 Dataset 1000h</a> •
                <a href="https://github.com/pnnbao97/VieNeu-TTS" target="_blank">🦜 GitHub</a>
            </div>
        </div>
        """)

        # Performance Warning Banner
        gr.HTML("""
        <div class="warning-banner">
            <div class="warning-banner-title">
                ⚠️ Lưu ý về hiệu năng
            </div>
            <div class="warning-banner-content">
                <strong>Demo này chạy trên HF Spaces với ZeroGPU (shared GPU)</strong> nên tốc độ sẽ <strong>chậm hơn</strong> và <strong>bị giới hạn 250 ký tự</strong> vì không thể triển khai lmdeploy trên HF space.<br><br>

                💡 <strong>Muốn tốc độ cực nhanh và không giới hạn ký tự?</strong> Hãy clone mã nguồn từ <a href="https://github.com/pnnbao97/VieNeu-TTS" target="_blank" style="color: #92400e; text-decoration: underline;">GitHub</a> và cài <code>lmdeploy</code> để chạy trên GPU của bạn:<br>

                🚀 Với LMDeploy + GPU local, tốc độ sẽ <strong>nhanh hơn 5-10 lần</strong> so với demo này!
            </div>
        </div>
        """)

    with gr.Row(elem_classes="container", equal_height=False):

        # --- LEFT: INPUT ---
        with gr.Column(scale=3, variant="panel"):
            gr.Markdown("### 📝 Văn bản đầu vào")
            text_input = gr.Textbox(
                label="Nhập văn bản",
                placeholder="Nhập nội dung tiếng Việt cần chuyển thành giọng nói...",
                lines=4,
                value="Sự bùng nổ của trí tuệ nhân tạo đang định hình lại cách chúng ta làm việc và sinh sống. Từ xe tự lái đến trợ lý ảo thông minh, công nghệ đang dần xóa nhòa ranh giới giữa thực tại và những bộ phim viễn tưởng.",
                show_label=False
            )

            # Counter: live character count, refreshed by update_count() below.
            with gr.Row():
                char_count = gr.HTML("<div style='text-align: right; color: #64748B; font-size: 0.8rem;'>0 / 250 ký tự</div>")

            gr.Markdown("### 🗣️ Chọn giọng đọc")
            with gr.Tabs() as tabs:
                with gr.TabItem("👤 Giọng có sẵn (Preset)", id="preset_mode"):
                    voice_select = gr.Dropdown(
                        choices=list(VOICE_SAMPLES.keys()),
                        value="Tuyên (nam miền Bắc)",
                        label="Danh sách giọng",
                        interactive=True
                    )
                    with gr.Accordion("Thông tin giọng mẫu", open=False):
                        ref_audio_preview = gr.Audio(label="Audio mẫu", interactive=False, type="filepath")
                        ref_text_preview = gr.Markdown("...")

                with gr.TabItem("🎙️ Giọng tùy chỉnh (Custom)", id="custom_mode"):
                    gr.Markdown("Tải lên giọng của bạn (Zero-shot Cloning)")
                    custom_audio = gr.Audio(label="File ghi âm (.wav)", type="filepath")
                    custom_text = gr.Textbox(label="Nội dung ghi âm", placeholder="Nhập chính xác lời thoại...")

            # Tracks which tab is active; consumed by synthesize_speech as mode_tab.
            current_mode = gr.State(value="preset_mode")
            btn_generate = gr.Button("Tổng hợp giọng nói", variant="primary", size="lg")

        # --- RIGHT: OUTPUT ---
        with gr.Column(scale=2):
            gr.Markdown("### 🎧 Kết quả")
            with gr.Group():
                audio_output = gr.Audio(label="Audio đầu ra", type="filepath", autoplay=True)
                status_output = gr.Textbox(label="Trạng thái", show_label=False, elem_classes="status-box", placeholder="Sẵn sàng...")

    # --- EXAMPLES ---
    with gr.Row(elem_classes="container"):
        with gr.Column():
            gr.Markdown("### 📚 Ví dụ mẫu")
            gr.Examples(examples=EXAMPLES_LIST, inputs=[text_input, voice_select], label="Thử nghiệm nhanh")

    # --- LOGIC ---
    def update_count(text):
        # Render the counter HTML, color-coded by proximity to the 250-char cap.
        l = len(text)
        if l > 250:
            color = "#dc2626"
            msg = f"⚠️ <b>{l} / 250</b> - Quá giới hạn!"
        elif l > 200:
            color = "#ea580c"
            msg = f"{l} / 250"
        else:
            color = "#64748B"
            msg = f"{l} / 250 ký tự"
        return f"<div style='text-align: right; color: {color}; font-size: 0.8rem; font-weight: bold'>{msg}</div>"

    text_input.change(update_count, text_input, char_count)

    def update_ref_preview(voice):
        # Show the selected preset's sample audio and its transcript as a quote.
        audio, text = load_reference_info(voice)
        return audio, f"> *\"{text}\"*"

    voice_select.change(update_ref_preview, voice_select, [ref_audio_preview, ref_text_preview])
    demo.load(update_ref_preview, voice_select, [ref_audio_preview, ref_text_preview])

    # Tab handling: keep current_mode in sync with the selected tab.
    def set_preset_mode():
        return "preset_mode"

    def set_custom_mode():
        return "custom_mode"

    # NOTE(review): relies on Tabs.children ordering matching the declaration
    # order above — confirm this holds across Gradio versions.
    tabs.children[0].select(fn=set_preset_mode, outputs=current_mode)
    tabs.children[1].select(fn=set_custom_mode, outputs=current_mode)

    btn_generate.click(
        fn=synthesize_speech,
        inputs=[text_input, voice_select, custom_audio, custom_text, current_mode],
        outputs=[audio_output, status_output]
    )
334
+
335
if __name__ == "__main__":
    # queue() enables request queuing, which ZeroGPU scheduling requires;
    # bind on all interfaces at the HF Spaces default port.
    demo.queue().launch(
        server_name="0.0.0.0",
        server_port=7860
    )
config.yaml ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ text_settings:
2
+ max_chars_per_chunk: 256
3
+ max_total_chars_streaming: 3000
4
+
5
+ backbone_configs:
6
+ "VieNeu-TTS (GPU)":
7
+ repo: pnnbao-ump/VieNeu-TTS
8
+ supports_streaming: false
9
+ description: Chất lượng cao nhất, yêu cầu GPU
10
+ "VieNeu-TTS-0.3B (GPU)":
11
+ repo: pnnbao-ump/VieNeu-TTS-0.3B
12
+ supports_streaming: false
13
+ description: Phiên bản nhẹ cho GPU, tốc độ nhanh x2 so với phiên bản gốc
14
+ "VieNeu-TTS-q8-gguf":
15
+ repo: pnnbao-ump/VieNeu-TTS-q8-gguf
16
+ supports_streaming: true
17
+ description: Phiên bản GGUF có chất lượng cao nhất
18
+ "VieNeu-TTS-q4-gguf":
19
+ repo: pnnbao-ump/VieNeu-TTS-q4-gguf
20
+ supports_streaming: true
21
+ description: Cân bằng giữa chất lượng và tốc độ
22
+ "VieNeu-TTS-0.3B-q4-gguf":
23
+ repo: pnnbao-ump/VieNeu-TTS-0.3B-q4-gguf
24
+ supports_streaming: true
25
+ description: Phiên bản cực nhẹ, chạy mượt trên CPU
26
+
27
+ codec_configs:
28
+ "NeuCodec (Standard)":
29
+ repo: neuphonic/neucodec
30
+ description: Codec chuẩn, tốc độ trung bình
31
+ use_preencoded: false
32
+ "NeuCodec (Distill)":
33
+ repo: neuphonic/distill-neucodec
34
+ description: Codec tối ưu, tốc độ cao
35
+ use_preencoded: false
36
+ "NeuCodec ONNX (Fast CPU)":
37
+ repo: neuphonic/neucodec-onnx-decoder-int8
38
+ description: Tối ưu cho CPU, cần pre-encoded codes
39
+ use_preencoded: true
40
+
41
+ voice_samples:
42
+ "Tuyên (nam miền Bắc)":
43
+ audio: ./sample/Tuyên (nam miền Bắc).wav
44
+ text: ./sample/Tuyên (nam miền Bắc).txt
45
+ codes: ./sample/Tuyên (nam miền Bắc).pt
46
+ "Vĩnh (nam miền Nam)":
47
+ audio: ./sample/Vĩnh (nam miền Nam).wav
48
+ text: ./sample/Vĩnh (nam miền Nam).txt
49
+ codes: ./sample/Vĩnh (nam miền Nam).pt
50
+ "Bình (nam miền Bắc)":
51
+ audio: ./sample/Bình (nam miền Bắc).wav
52
+ text: ./sample/Bình (nam miền Bắc).txt
53
+ codes: ./sample/Bình (nam miền Bắc).pt
54
+ "Nguyên (nam miền Nam)":
55
+ audio: ./sample/Nguyên (nam miền Nam).wav
56
+ text: ./sample/Nguyên (nam miền Nam).txt
57
+ codes: ./sample/Nguyên (nam miền Nam).pt
58
+ "Sơn (nam miền Nam)":
59
+ audio: ./sample/Sơn (nam miền Nam).wav
60
+ text: ./sample/Sơn (nam miền Nam).txt
61
+ codes: ./sample/Sơn (nam miền Nam).pt
62
+ "Đoan (nữ miền Nam)":
63
+ audio: ./sample/Đoan (nữ miền Nam).wav
64
+ text: ./sample/Đoan (nữ miền Nam).txt
65
+ codes: ./sample/Đoan (nữ miền Nam).pt
66
+ "Ngọc (nữ miền Bắc)":
67
+ audio: ./sample/Ngọc (nữ miền Bắc).wav
68
+ text: ./sample/Ngọc (nữ miền Bắc).txt
69
+ codes: ./sample/Ngọc (nữ miền Bắc).pt
70
+ "Ly (nữ miền Bắc)":
71
+ audio: ./sample/Ly (nữ miền Bắc).wav
72
+ text: ./sample/Ly (nữ miền Bắc).txt
73
+ codes: ./sample/Ly (nữ miền Bắc).pt
74
+ "Dung (nữ miền Nam)":
75
+ audio: ./sample/Dung (nữ miền Nam).wav
76
+ text: ./sample/Dung (nữ miền Nam).txt
77
+ codes: ./sample/Dung (nữ miền Nam).pt
packages.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ espeak-ng
2
+ libespeak-ng1
3
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ spaces
3
+ torchaudio
4
+ transformers
5
+ librosa
6
+ soundfile
7
+ numpy
8
+ phonemizer
9
+ neucodec
sample/Bình (nam miền Bắc).pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f896d618fc46c3e131eda7b4168e25e9c2fb2d7ea0e864bedff2577fbd0bd30
3
+ size 2089
sample/Bình (nam miền Bắc).txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Anh chỉ muốn được nhìn nhận như là một huấn luyện viên.
sample/Bình (nam miền Bắc).wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:135f087ced48606c4d406b770a11e344d4d9aa6bd7adfb3e5c26f69cd9cc6df1
3
+ size 127054
sample/Dung (nữ miền Nam).pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc4d65b6504470cb00e46763915060590595fbe4d47912eeacecd2bf1bade262
3
+ size 2153
sample/Dung (nữ miền Nam).txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Tục ngữ có câu, sai một li, đi một dặm.
sample/Dung (nữ miền Nam).wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:56e42039d0c96ad19e9f78ecb7218853202022b2a8460010d34ffb7879b17409
3
+ size 143438
sample/Hương (nữ miền Bắc).pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:919035b7c762956a7d568cebc6e69fea22eb9be02bf906c1d32c1db1d8c7b9ff
3
+ size 2217
sample/Hương (nữ miền Bắc).txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Tuy nhiên, lúc này có một vấn đề khó khăn nảy sinh.
sample/Hương (nữ miền Bắc).wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c064b4ec64df44ea1306e87b25b84905e540d0fe29885629d2fe8bc8a5e53bc
3
+ size 155756
sample/Ly (nữ miền Bắc).pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69b6bc9bb1062122dc3755be907d87f232fa8be5129b54f6994dead35f4935c6
3
+ size 2153
sample/Ly (nữ miền Bắc).txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Chúng ta có thể áp dụng logic tương tự với người khác.
sample/Ly (nữ miền Bắc).wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d4e47cfa5ed0b753c2bed07c58e26da89ee2977ca5e941244a6bbafd8869d5e
3
+ size 147534
sample/Nguyên (nam miền Nam).pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e6ebaa0b2977589afa7e7f811b0553151bd8312c96a70b1b666bd9d0fd50edf
3
+ size 2345
sample/Nguyên (nam miền Nam).txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Hiểu biết về bản thân và người khác bắt đầu từ chính cơ thể mình.
sample/Nguyên (nam miền Nam).wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4fc9655e24a7048c3c908494f2cbaf4c42d3d139d68c21d4f50c60e03aa19727
3
+ size 196124
sample/Ngọc (nữ miền Bắc).pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78ab670f177092dc8586e45536faea20fdb84471dc8d8a8b1b95dd76a4ed3d0d
3
+ size 2281
sample/Ngọc (nữ miền Bắc).txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Trong phòng rất tù mù, nên có thể dễ dàng che dấu nó.
sample/Ngọc (nữ miền Bắc).wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:475a73298fbe86e5d92e7fb95c6c26e897e5a2ffbdc3fa9e062df4025767af93
3
+ size 174956
sample/Sơn (nam miền Nam).pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:114cb04ee2357d06de2f038853bbeb0dc57fc8ed30e085118a9e0bf5a70f7857
3
+ size 2281
sample/Sơn (nam miền Nam).txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Trên thực tế, các nghi ngờ đã bắt đầu xuất hiện.
sample/Sơn (nam miền Nam).wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f6733df04e5f3477a00136c6baeaf7a196c93df0ee13b9bfa3d8ba61034f063
3
+ size 174044
sample/Tuyên (nam miền Bắc).pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e79eb6ee9cc7cd35cb4fbbef107249ed3209608b59644c52f55a34941a531873
3
+ size 2473
sample/Tuyên (nam miền Bắc).txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Bạn cầm khúc cây, và ném vào bãi cỏ xanh tươi rậm rạp ở đằng xa.
sample/Tuyên (nam miền Bắc).wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6b7ac2605db0a2cf634ce0f3a55a87a89f4c2e3bc06f83433e6af583c1f3692
3
+ size 217166
sample/Vĩnh (nam miền Nam).pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c87342d3a6a8cbaaf2139c21e7554eea19aba6aa03248e4426238a1c2507e447
3
+ size 2217
sample/Vĩnh (nam miền Nam).txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Đến cuối thế kỷ 19, ngành đánh bắt cá được thương mại hóa.
sample/Vĩnh (nam miền Nam).wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:632a5c8fa34fe03001cc3c44427b5e0ee70f767377bc788b59a5dc9afa9fba49
3
+ size 164492
sample/Đoan (nữ miền Nam).pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28b48dbae193adc88aa26243086ba3ce862def7035d9793613c2967df29f9afe
3
+ size 2793
sample/Đoan (nữ miền Nam).txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Nuôi con theo phong cách Do Thái, không chỉ tốt cho đứa trẻ, mà còn tốt cho cả các bậc cha mẹ.
sample/Đoan (nữ miền Nam).wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e319ed45dd2a1458a52edfe43a83a36eff813f19399ac2e59ee3f93cace74be
3
+ size 294830
utils/__init__.py ADDED
File without changes
utils/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (133 Bytes). View file
 
utils/__pycache__/core_utils.cpython-312.pyc ADDED
Binary file (2.22 kB). View file
 
utils/__pycache__/normalize_text.cpython-312.pyc ADDED
Binary file (24.6 kB). View file
 
utils/__pycache__/phonemize_text.cpython-312.pyc ADDED
Binary file (13 kB). View file
 
utils/core_utils.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import os
3
+ from typing import List
4
+
5
def split_text_into_chunks(text: str, max_chars: int = 256) -> List[str]:
    """Split raw text into chunks no longer than max_chars.

    Sentences (delimited by . ! ? … or newline) are packed greedily into
    chunks; a sentence longer than max_chars is itself packed word by word.
    A single word longer than max_chars is kept whole.
    """
    pieces = re.split(r"(?<=[\.\!\?\…\n])\s+|(?<=\n)", text.strip())
    out: List[str] = []
    pending = ""  # sentences accumulated for the current chunk

    for piece in pieces:
        piece = piece.strip()
        if not piece:
            continue

        if len(piece) <= max_chars:
            # Try to append the sentence to the pending chunk.
            merged = f"{pending} {piece}".strip() if pending else piece
            if len(merged) <= max_chars:
                pending = merged
            else:
                if pending:
                    out.append(pending.strip())
                pending = piece
            continue

        # Oversize sentence: emit what is pending, then pack word by word.
        if pending:
            out.append(pending.strip())
            pending = ""
        run = ""
        for word in piece.split():
            candidate = f"{run} {word}".strip() if run else word
            if len(candidate) > max_chars and run:
                out.append(run.strip())
                run = word
            else:
                run = candidate
        if run:
            out.append(run.strip())

    if pending:
        out.append(pending.strip())
    return [chunk for chunk in out if chunk]
48
+
49
def env_bool(name: str, default: bool = False) -> bool:
    """Interpret environment variable *name* as a boolean flag.

    Returns *default* when the variable is unset; otherwise True iff the
    (trimmed, lowercased) value is one of the usual truthy spellings.
    """
    raw = os.getenv(name)
    if raw is None:
        return default
    return raw.strip().lower() in {"1", "true", "yes", "y", "on"}
utils/normalize_text.py ADDED
@@ -0,0 +1,407 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
+ class VietnameseTTSNormalizer:
4
+ """
5
+ A text normalizer for Vietnamese Text-to-Speech systems.
6
+ Converts numbers, dates, units, and special characters into readable Vietnamese text.
7
+ """
8
+
9
    def __init__(self):
        """Build the lookup tables used by the normalization passes."""
        # Unit abbreviation -> Vietnamese pronunciation. Keys are lowercase
        # because normalize() lowercases text before unit expansion.
        # NOTE(review): after lowercasing, milli- vs mega- prefixes collide
        # (e.g. 'mv' = 'mi li vôn' but 'mw' = 'mê ga oát') — confirm these
        # single mappings are the intended defaults.
        self.units = {
            'km': 'ki lô mét', 'dm': 'đê xi mét', 'cm': 'xen ti mét',
            'mm': 'mi li mét', 'nm': 'na nô mét', 'µm': 'mic rô mét',
            'μm': 'mic rô mét', 'm': 'mét',

            'kg': 'ki lô gam', 'g': 'gam', 'mg': 'mi li gam',

            'km²': 'ki lô mét vuông', 'km2': 'ki lô mét vuông',
            'm²': 'mét vuông', 'm2': 'mét vuông',
            'cm²': 'xen ti mét vuông', 'cm2': 'xen ti mét vuông',
            'mm²': 'mi li mét vuông', 'mm2': 'mi li mét vuông',
            'ha': 'héc ta',

            'km³': 'ki lô mét khối', 'km3': 'ki lô mét khối',
            'm³': 'mét khối', 'm3': 'mét khối',
            'cm³': 'xen ti mét khối', 'cm3': 'xen ti mét khối',
            'mm³': 'mi li mét khối', 'mm3': 'mi li mét khối',
            'l': 'lít', 'dl': 'đê xi lít', 'ml': 'mi li lít', 'hl': 'héc tô lít',

            'v': 'vôn', 'kv': 'ki lô vôn', 'mv': 'mi li vôn',
            'a': 'am pe', 'ma': 'mi li am pe', 'ka': 'ki lô am pe',
            'w': 'oát', 'kw': 'ki lô oát', 'mw': 'mê ga oát', 'gw': 'gi ga oát',
            'kwh': 'ki lô oát giờ', 'mwh': 'mê ga oát giờ', 'wh': 'oát giờ',
            'ω': 'ôm', 'ohm': 'ôm', 'kω': 'ki lô ôm', 'mω': 'mê ga ôm',

            'hz': 'héc', 'khz': 'ki lô héc', 'mhz': 'mê ga héc', 'ghz': 'gi ga héc',

            'pa': 'pát cal', 'kpa': 'ki lô pát cal', 'mpa': 'mê ga pát cal',
            'bar': 'ba', 'mbar': 'mi li ba', 'atm': 'át mốt phia', 'psi': 'pi ét xai',

            'j': 'giun', 'kj': 'ki lô giun',
            'cal': 'ca lo', 'kcal': 'ki lô ca lo',
        }

        # Vietnamese digit names, indexed by digit value 0-9.
        self.digits = ['không', 'một', 'hai', 'ba', 'bốn',
                       'năm', 'sáu', 'bảy', 'tám', 'chín']
46
+
47
    def normalize(self, text):
        """Main normalization pipeline with EN tag protection.

        <en>...</en> spans are swapped out for placeholders before the
        Vietnamese passes run, then restored verbatim afterwards.
        """
        # Step 1: Extract and protect EN tags
        en_contents = []
        placeholder_pattern = "___EN_PLACEHOLDER_{}___ "

        def extract_en(match):
            # Keep the full tagged span (including <en></en>) for restoration.
            en_contents.append(match.group(0))
            return placeholder_pattern.format(len(en_contents) - 1)

        text = re.sub(r'<en>.*?</en>', extract_en, text, flags=re.IGNORECASE)

        # Step 2: Normal normalization pipeline. Order matters: specific
        # patterns (temperature, currency, percent, units, time, date,
        # phone) run before the generic number-to-words pass.
        text = text.lower()
        text = self._normalize_temperature(text)
        text = self._normalize_currency(text)
        text = self._normalize_percentage(text)
        text = self._normalize_units(text)
        text = self._normalize_time(text)
        text = self._normalize_date(text)
        text = self._normalize_phone(text)
        text = self._normalize_numbers(text)
        text = self._number_to_words(text)
        text = self._normalize_special_chars(text)
        text = self._normalize_whitespace(text)

        # Step 3: Restore EN tags.
        # NOTE(review): the numeric passes above may rewrite the index digit
        # inside "___EN_PLACEHOLDER_0___ "; if so, the replace() below would
        # silently miss and leave the placeholder in the output — confirm
        # placeholders survive _normalize_numbers/_number_to_words.
        for idx, en_content in enumerate(en_contents):
            text = text.replace(placeholder_pattern.format(idx).lower(), en_content + ' ')

        # Final whitespace cleanup
        text = self._normalize_whitespace(text)

        return text
81
+
82
+ def _normalize_temperature(self, text):
83
+ """Convert temperature notation to words."""
84
+ text = re.sub(r'-(\d+(?:[.,]\d+)?)\s*°\s*c\b', r'âm \1 độ xê', text, flags=re.IGNORECASE)
85
+ text = re.sub(r'-(\d+(?:[.,]\d+)?)\s*°\s*f\b', r'âm \1 độ ép', text, flags=re.IGNORECASE)
86
+ text = re.sub(r'(\d+(?:[.,]\d+)?)\s*°\s*c\b', r'\1 độ xê', text, flags=re.IGNORECASE)
87
+ text = re.sub(r'(\d+(?:[.,]\d+)?)\s*°\s*f\b', r'\1 độ ép', text, flags=re.IGNORECASE)
88
+ text = re.sub(r'°', ' độ ', text)
89
+ return text
90
+
91
+ def _normalize_currency(self, text):
92
+ """Convert currency notation to words."""
93
+ def decimal_currency(match):
94
+ whole = match.group(1)
95
+ decimal = match.group(2)
96
+ unit = match.group(3)
97
+ decimal_words = ' '.join([self.digits[int(d)] for d in decimal])
98
+ unit_map = {'k': 'nghìn', 'm': 'triệu', 'b': 'tỷ'}
99
+ unit_word = unit_map.get(unit.lower(), unit)
100
+ return f"{whole} phẩy {decimal_words} {unit_word}"
101
+
102
+ text = re.sub(r'(\d+)[.,](\d+)\s*([kmb])\b', decimal_currency, text, flags=re.IGNORECASE)
103
+ text = re.sub(r'(\d+)\s*k\b', r'\1 nghìn', text, flags=re.IGNORECASE)
104
+ text = re.sub(r'(\d+)\s*m\b', r'\1 triệu', text, flags=re.IGNORECASE)
105
+ text = re.sub(r'(\d+)\s*b\b', r'\1 tỷ', text, flags=re.IGNORECASE)
106
+ text = re.sub(r'(\d+(?:[.,]\d+)?)\s*đ\b', r'\1 đồng', text)
107
+ text = re.sub(r'(\d+(?:[.,]\d+)?)\s*vnd\b', r'\1 đồng', text, flags=re.IGNORECASE)
108
+ text = re.sub(r'\$\s*(\d+(?:[.,]\d+)?)', r'\1 đô la', text)
109
+ text = re.sub(r'(\d+(?:[.,]\d+)?)\s*\$', r'\1 đô la', text)
110
+ return text
111
+
112
+ def _normalize_percentage(self, text):
113
+ """Convert percentage to words."""
114
+ text = re.sub(r'(\d+(?:[.,]\d+)?)\s*%', r'\1 phần trăm', text)
115
+ return text
116
+
117
+ def _normalize_units(self, text):
118
+ """Convert measurement units to words."""
119
+ def expand_compound_with_number(match):
120
+ number = match.group(1)
121
+ unit1 = match.group(2).lower()
122
+ unit2 = match.group(3).lower()
123
+ full_unit1 = self.units.get(unit1, unit1)
124
+ full_unit2 = self.units.get(unit2, unit2)
125
+ return f"{number} {full_unit1} trên {full_unit2}"
126
+
127
+ def expand_compound_without_number(match):
128
+ unit1 = match.group(1).lower()
129
+ unit2 = match.group(2).lower()
130
+ full_unit1 = self.units.get(unit1, unit1)
131
+ full_unit2 = self.units.get(unit2, unit2)
132
+ return f"{full_unit1} trên {full_unit2}"
133
+
134
+ text = re.sub(r'(\d+(?:[.,]\d+)?)\s*([a-zA-Zμµ²³°]+)/([a-zA-Zμµ²³°0-9]+)\b',
135
+ expand_compound_with_number, text)
136
+ text = re.sub(r'\b([a-zA-Zμµ²³°]+)/([a-zA-Zμµ²³°0-9]+)\b',
137
+ expand_compound_without_number, text)
138
+
139
+ sorted_units = sorted(self.units.items(), key=lambda x: len(x[0]), reverse=True)
140
+ for unit, full_name in sorted_units:
141
+ pattern = r'(\d+(?:[.,]\d+)?)\s*' + re.escape(unit) + r'\b'
142
+ text = re.sub(pattern, rf'\1 {full_name}', text, flags=re.IGNORECASE)
143
+
144
+ for unit, full_name in sorted_units:
145
+ if any(c in unit for c in '²³°'):
146
+ pattern = r'\b' + re.escape(unit) + r'\b'
147
+ text = re.sub(pattern, full_name, text, flags=re.IGNORECASE)
148
+
149
+ return text
150
+
151
+ def _normalize_time(self, text):
152
+ """Convert time notation to words with validation."""
153
+
154
+ def validate_and_convert_time(match):
155
+ """Validate time components before converting."""
156
+ groups = match.groups()
157
+
158
+ # HH:MM:SS format
159
+ if len(groups) == 3:
160
+ hour, minute, second = groups
161
+ hour_int, minute_int, second_int = int(hour), int(minute), int(second)
162
+
163
+ if not (0 <= hour_int <= 23):
164
+ return match.group(0)
165
+ if not (0 <= minute_int <= 59):
166
+ return match.group(0)
167
+ if not (0 <= second_int <= 59):
168
+ return match.group(0)
169
+
170
+ return f"{hour} giờ {minute} phút {second} giây"
171
+
172
+ # HH:MM or HHhMM format
173
+ elif len(groups) == 2:
174
+ hour, minute = groups
175
+ hour_int, minute_int = int(hour), int(minute)
176
+
177
+ if not (0 <= hour_int <= 23):
178
+ return match.group(0)
179
+ if not (0 <= minute_int <= 59):
180
+ return match.group(0)
181
+
182
+ return f"{hour} giờ {minute} phút"
183
+
184
+ # HHh format
185
+ else:
186
+ hour = groups[0]
187
+ hour_int = int(hour)
188
+
189
+ if not (0 <= hour_int <= 23):
190
+ return match.group(0)
191
+
192
+ return f"{hour} giờ"
193
+
194
+ text = re.sub(r'(\d{1,2}):(\d{2}):(\d{2})', validate_and_convert_time, text)
195
+ text = re.sub(r'(\d{1,2}):(\d{2})', validate_and_convert_time, text)
196
+ text = re.sub(r'(\d{1,2})h(\d{2})', validate_and_convert_time, text)
197
+ text = re.sub(r'(\d{1,2})h\b', validate_and_convert_time, text)
198
+
199
+ return text
200
+
201
+ def _normalize_date(self, text):
202
+ """Convert date notation to words with validation."""
203
+
204
+ def is_valid_date(day, month, year):
205
+ """Check if date components are valid."""
206
+ day, month, year = int(day), int(month), int(year)
207
+
208
+ if not (1 <= day <= 31):
209
+ return False
210
+ if not (1 <= month <= 12):
211
+ return False
212
+
213
+ return True
214
+
215
+ def date_to_text(match):
216
+ day, month, year = match.groups()
217
+ if is_valid_date(day, month, year):
218
+ return f"ngày {day} tháng {month} năm {year}"
219
+ return match.group(0)
220
+
221
+ def date_iso_to_text(match):
222
+ year, month, day = match.groups()
223
+ if is_valid_date(day, month, year):
224
+ return f"ngày {day} tháng {month} năm {year}"
225
+ return match.group(0)
226
+
227
+ def date_short_year(match):
228
+ day, month, year = match.groups()
229
+ full_year = f"20{year}" if int(year) < 50 else f"19{year}"
230
+ if is_valid_date(day, month, full_year):
231
+ return f"ngày {day} tháng {month} năm {full_year}"
232
+ return match.group(0)
233
+
234
+ text = re.sub(r'\bngày\s+(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})\b',
235
+ lambda m: date_to_text(m).replace('ngày ngày', 'ngày'), text)
236
+ text = re.sub(r'\bngày\s+(\d{1,2})[/\-](\d{1,2})[/\-](\d{2})\b',
237
+ lambda m: date_short_year(m).replace('ngày ngày', 'ngày'), text)
238
+ text = re.sub(r'\b(\d{4})-(\d{1,2})-(\d{1,2})\b', date_iso_to_text, text)
239
+ text = re.sub(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})\b', date_to_text, text)
240
+ text = re.sub(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{2})\b', date_short_year, text)
241
+
242
+ return text
243
+
244
+ def _normalize_phone(self, text):
245
+ """Convert phone numbers to digit-by-digit reading."""
246
+ def phone_to_text(match):
247
+ phone = match.group(0)
248
+ phone = re.sub(r'[^\d]', '', phone)
249
+
250
+ if phone.startswith('84') and len(phone) >= 10:
251
+ phone = '0' + phone[2:]
252
+
253
+ if 10 <= len(phone) <= 11:
254
+ words = [self.digits[int(d)] for d in phone]
255
+ return ' '.join(words) + ' '
256
+
257
+ return match.group(0)
258
+
259
+ text = re.sub(r'(\+84|84)[\s\-\.]?\d[\d\s\-\.]{7,}', phone_to_text, text)
260
+ text = re.sub(r'\b0\d[\d\s\-\.]{8,}', phone_to_text, text)
261
+ return text
262
+
263
+ def _normalize_numbers(self, text):
264
+ text = re.sub(r'(\d+(?:[,.]\d+)?)%', lambda m: f'{m.group(1)} phần trăm', text)
265
+ text = re.sub(r'(\d{1,3})(?:\.(\d{3}))+', lambda m: m.group(0).replace('.', ''), text)
266
+
267
+ def decimal_to_words(match):
268
+ whole = match.group(1)
269
+ decimal = match.group(2)
270
+ decimal_words = ' '.join([self.digits[int(d)] for d in decimal])
271
+ separator = 'phẩy' if ',' in match.group(0) else 'chấm'
272
+ return f"{whole} {separator} {decimal_words}"
273
+
274
+ text = re.sub(r'(\d+),(\d+)', decimal_to_words, text)
275
+ text = re.sub(r'(\d+)\.(\d{1,2})\b', decimal_to_words, text)
276
+
277
+ return text
278
+
279
+ def _read_two_digits(self, n):
280
+ """Read two-digit numbers in Vietnamese."""
281
+ if n < 10:
282
+ return self.digits[n]
283
+ elif n == 10:
284
+ return "mười"
285
+ elif n < 20:
286
+ if n == 15:
287
+ return "mười lăm"
288
+ return f"mười {self.digits[n % 10]}"
289
+ else:
290
+ tens = n // 10
291
+ ones = n % 10
292
+ if ones == 0:
293
+ return f"{self.digits[tens]} mươi"
294
+ elif ones == 1:
295
+ return f"{self.digits[tens]} mươi mốt"
296
+ elif ones == 5:
297
+ return f"{self.digits[tens]} mươi lăm"
298
+ else:
299
+ return f"{self.digits[tens]} mươi {self.digits[ones]}"
300
+
301
+ def _read_three_digits(self, n):
302
+ """Read three-digit numbers in Vietnamese."""
303
+ if n < 100:
304
+ return self._read_two_digits(n)
305
+
306
+ hundreds = n // 100
307
+ remainder = n % 100
308
+ result = f"{self.digits[hundreds]} trăm"
309
+
310
+ if remainder == 0:
311
+ return result
312
+ elif remainder < 10:
313
+ result += f" lẻ {self.digits[remainder]}"
314
+ else:
315
+ result += f" {self._read_two_digits(remainder)}"
316
+
317
+ return result
318
+
319
+ def _convert_number_to_words(self, num):
320
+ """Convert a number to Vietnamese words."""
321
+ if num == 0:
322
+ return "không"
323
+
324
+ if num < 0:
325
+ return f"âm {self._convert_number_to_words(-num)}"
326
+
327
+ if num >= 1000000000:
328
+ billion = num // 1000000000
329
+ remainder = num % 1000000000
330
+ result = f"{self._read_three_digits(billion)} tỷ"
331
+ if remainder > 0:
332
+ result += f" {self._convert_number_to_words(remainder)}"
333
+ return result
334
+
335
+ elif num >= 1000000:
336
+ million = num // 1000000
337
+ remainder = num % 1000000
338
+ result = f"{self._read_three_digits(million)} triệu"
339
+ if remainder > 0:
340
+ result += f" {self._convert_number_to_words(remainder)}"
341
+ return result
342
+
343
+ elif num >= 1000:
344
+ thousand = num // 1000
345
+ remainder = num % 1000
346
+ result = f"{self._read_three_digits(thousand)} nghìn"
347
+ if remainder > 0:
348
+ if remainder < 10:
349
+ result += f" không trăm lẻ {self.digits[remainder]}"
350
+ elif remainder < 100:
351
+ result += f" không trăm {self._read_two_digits(remainder)}"
352
+ else:
353
+ result += f" {self._read_three_digits(remainder)}"
354
+ return result
355
+
356
+ else:
357
+ return self._read_three_digits(num)
358
+
359
+ def _number_to_words(self, text):
360
+ """Convert all remaining numbers to words."""
361
+ def convert_number(match):
362
+ num = int(match.group(0))
363
+ return self._convert_number_to_words(num)
364
+
365
+ text = re.sub(r'\b\d+\b', convert_number, text)
366
+ return text
367
+
368
+ def _normalize_special_chars(self, text):
369
+ """Handle special characters."""
370
+ text = text.replace('&', ' và ')
371
+ text = text.replace('+', ' cộng ')
372
+ text = text.replace('=', ' bằng ')
373
+ text = text.replace('#', ' thăng ')
374
+ text = re.sub(r'[\[\]\(\)\{\}]', ' ', text)
375
+ text = re.sub(r'\s+[-–—]+\s+', ' ', text)
376
+ text = re.sub(r'\.{2,}', ' ', text)
377
+ text = re.sub(r'\s+\.\s+', ' ', text)
378
+ text = re.sub(r'[^\w\sàáảãạăắằẳẵặâấầẩẫậèéẻẽẹêếềểễệìíỉĩịòóỏõọôốồổỗộơớờởỡợùúủũụưứừửữựỳýỷỹỵđ.,!?;:@%_]', ' ', text)
379
+ return text
380
+
381
+ def _normalize_whitespace(self, text):
382
+ """Normalize whitespace."""
383
+ text = re.sub(r'\s+', ' ', text)
384
+ text = text.strip()
385
+ return text
386
+
387
+
388
if __name__ == "__main__":
    # Manual smoke test for the normalizer, including <en> spans.
    normalizer = VietnameseTTSNormalizer()

    test_texts = [
        "Chào mừng <en>hello world</en> đến với AI",
        "Công nghệ <en>machine learning</en> và <en>deep learning</en>",
        "Giá 2.500.000đ với <en>discount</en> 50%",
        "Nhiệt độ 25°C, <en>temperature</en> cao",
        "Hệ thống <en>text-to-speech</en> tiếng Việt",
    ]

    banner = "=" * 80
    print(banner)
    print("VIETNAMESE TTS NORMALIZATION TEST (WITH EN TAG)")
    print(banner)

    for sample in test_texts:
        print(f"\n📝 Input: {sample}")
        print(f"🎵 Output: {normalizer.normalize(sample)}")
        print("-" * 80)
utils/phoneme_dict.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:331f9583a0ac0c795000b569e141e0c9d50c3005d02c49b631fb30edb4b407d4
3
+ size 18078190
utils/phonemize_text.py ADDED
@@ -0,0 +1,346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import platform
4
+ import glob
5
+ import re
6
+ from phonemizer import phonemize
7
+ from phonemizer.backend.espeak.espeak import EspeakWrapper
8
+ from utils.normalize_text import VietnameseTTSNormalizer
9
+
10
+ # Configuration
11
+ PHONEME_DICT_PATH = os.getenv(
12
+ 'PHONEME_DICT_PATH',
13
+ os.path.join(os.path.dirname(__file__), "phoneme_dict.json")
14
+ )
15
+
16
+ def load_phoneme_dict(path=PHONEME_DICT_PATH):
17
+ """Load phoneme dictionary from JSON file."""
18
+ try:
19
+ with open(path, "r", encoding="utf-8") as f:
20
+ return json.load(f)
21
+ except FileNotFoundError:
22
+ raise FileNotFoundError(
23
+ f"Phoneme dictionary not found at {path}. "
24
+ "Please create it or set PHONEME_DICT_PATH environment variable."
25
+ )
26
+
27
def setup_espeak_library():
    """Point phonemizer at the eSpeak NG shared library for this OS."""
    handlers = {
        "Windows": _setup_windows_espeak,
        "Linux": _setup_linux_espeak,
        "Darwin": _setup_macos_espeak,
    }
    system = platform.system()
    handler = handlers.get(system)
    if handler is None:
        raise OSError(
            f"Unsupported OS: {system}. "
            "Only Windows, Linux, and macOS are supported."
        )
    handler()
42
+
43
+ def _setup_windows_espeak():
44
+ """Setup eSpeak for Windows."""
45
+ default_path = r"C:\Program Files\eSpeak NG\libespeak-ng.dll"
46
+ if os.path.exists(default_path):
47
+ EspeakWrapper.set_library(default_path)
48
+ else:
49
+ raise FileNotFoundError(
50
+ f"eSpeak library not found at {default_path}. "
51
+ "Please install eSpeak NG from: https://github.com/espeak-ng/espeak-ng/releases"
52
+ )
53
+
54
def _setup_linux_espeak():
    """Find and register libespeak-ng on common Linux library paths."""
    search_patterns = [
        "/usr/lib/x86_64-linux-gnu/libespeak-ng.so*",
        "/usr/lib/x86_64-linux-gnu/libespeak.so*",
        "/usr/lib/libespeak-ng.so*",
        "/usr/lib64/libespeak-ng.so*",
        "/usr/local/lib/libespeak-ng.so*",
    ]

    for pattern in search_patterns:
        found = glob.glob(pattern)
        if found:
            # Shortest match first: prefer the bare .so over versioned names.
            EspeakWrapper.set_library(sorted(found, key=len)[0])
            return

    raise RuntimeError(
        "eSpeak NG library not found. Install with:\n"
        "  Ubuntu/Debian: sudo apt-get install espeak-ng\n"
        "  Fedora: sudo dnf install espeak-ng\n"
        "  Arch: sudo pacman -S espeak-ng\n"
        "See: https://github.com/pnnbao97/VieNeu-TTS/issues/5"
    )
77
+
78
+ def _setup_macos_espeak():
79
+ """Setup eSpeak for macOS."""
80
+ espeak_lib = os.environ.get('PHONEMIZER_ESPEAK_LIBRARY')
81
+
82
+ paths_to_check = [
83
+ espeak_lib,
84
+ "/opt/homebrew/lib/libespeak-ng.dylib", # Apple Silicon
85
+ "/usr/local/lib/libespeak-ng.dylib", # Intel
86
+ "/opt/local/lib/libespeak-ng.dylib", # MacPorts
87
+ ]
88
+
89
+ for path in paths_to_check:
90
+ if path and os.path.exists(path):
91
+ EspeakWrapper.set_library(path)
92
+ return
93
+
94
+ raise FileNotFoundError(
95
+ "eSpeak library not found. Install with:\n"
96
+ " brew install espeak-ng\n"
97
+ "Or set: export PHONEMIZER_ESPEAK_LIBRARY=/path/to/libespeak-ng.dylib"
98
+ )
99
+
100
# Initialize the module-level singletons at import time: the eSpeak
# backend, the cached phoneme dictionary and the text normalizer.
try:
    setup_espeak_library()
    phoneme_dict = load_phoneme_dict()
    normalizer = VietnameseTTSNormalizer()
except Exception as err:
    print(f"Initialization error: {err}")
    raise
108
+
109
def phonemize_text(text: str) -> str:
    """
    Convert text to phonemes (simple version without dict, without EN tag).
    Kept for backward compatibility.
    """
    normalized = normalizer.normalize(text)
    espeak_options = dict(
        language="vi",
        backend="espeak",
        preserve_punctuation=True,
        with_stress=True,
        language_switch="remove-flags",
    )
    return phonemize(normalized, **espeak_options)
123
+
124
+
125
def phonemize_with_dict(text: str, phoneme_dict=phoneme_dict) -> str:
    """
    Phonemize single text with dictionary lookup and EN tag support.

    The text is normalized, split on ``<en>...</en>`` spans, and each span
    is phonemized with the matching eSpeak voice: English spans with
    ``en-us``, Vietnamese word-by-word with ``vi``. Vietnamese words found
    in *phoneme_dict* reuse cached phonemes; misses are phonemized and
    written back into the dict. NOTE: *phoneme_dict* defaults to the shared
    module-level cache and is mutated in place across calls.

    Args:
        text: Raw input text, possibly containing ``<en>`` spans.
        phoneme_dict: word -> phoneme cache (defaults to the module cache).

    Returns:
        str: The phonemized string with punctuation re-attached.
    """
    text = normalizer.normalize(text)

    # Split by EN tags; the capturing group keeps the tags as list items.
    parts = re.split(r'(<en>.*?</en>)', text, flags=re.IGNORECASE)

    en_texts = []       # raw English span contents
    en_indices = []     # their positions in `parts`
    vi_texts = []       # Vietnamese words missing from the dict
    vi_indices = []     # part index per miss (collected but not read below)
    vi_word_maps = []   # (part_idx, word_idx) slot to fill for each miss

    # Per part: a string (EN), a list of words (VI), or None placeholder.
    processed_parts = []

    for part_idx, part in enumerate(parts):
        if re.match(r'<en>.*</en>', part, re.IGNORECASE):
            # English part
            en_content = re.sub(r'</?en>', '', part, flags=re.IGNORECASE).strip()
            en_texts.append(en_content)
            en_indices.append(part_idx)
            processed_parts.append(None)
        else:
            # Vietnamese part
            words = part.split()
            processed_words = []

            for word_idx, word in enumerate(words):
                # Split leading/trailing punctuation from the word core.
                match = re.match(r'^(\W*)(.*?)(\W*)$', word)
                pre, core, suf = match.groups() if match else ("", word, "")

                if not core:
                    processed_words.append(word)
                elif core in phoneme_dict:
                    # Cache hit: reuse phonemes, keep surrounding punctuation.
                    processed_words.append(f"{pre}{phoneme_dict[core]}{suf}")
                else:
                    # Cache miss: queue the (still punctuated) word for batch
                    # phonemization and leave a None slot to fill later.
                    vi_texts.append(word)
                    vi_indices.append(part_idx)
                    vi_word_maps.append((part_idx, len(processed_words)))
                    processed_words.append(None)

            processed_parts.append(processed_words)

    if en_texts:
        try:
            en_phonemes = phonemize(
                en_texts,
                language='en-us',
                backend='espeak',
                preserve_punctuation=True,
                with_stress=True,
                language_switch="remove-flags"
            )

            # phonemize() may return a bare string for a single input.
            if isinstance(en_phonemes, str):
                en_phonemes = [en_phonemes]

            for idx, (part_idx, phoneme) in enumerate(zip(en_indices, en_phonemes)):
                processed_parts[part_idx] = phoneme.strip()
        except Exception as e:
            # Fallback: keep the raw English text for the failed spans.
            print(f"Warning: Could not phonemize EN texts: {e}")
            for part_idx in en_indices:
                processed_parts[part_idx] = en_texts[en_indices.index(part_idx)]

    if vi_texts:
        try:
            vi_phonemes = phonemize(
                vi_texts,
                language='vi',
                backend='espeak',
                preserve_punctuation=True,
                with_stress=True,
                language_switch='remove-flags'
            )

            if isinstance(vi_phonemes, str):
                vi_phonemes = [vi_phonemes]

            for idx, (part_idx, word_idx) in enumerate(vi_word_maps):
                phoneme = vi_phonemes[idx].strip()

                # Words starting with 'r' get a forced /ɹ/ onset —
                # presumably to override eSpeak's Vietnamese rendering;
                # TODO confirm the intent.
                original_word = vi_texts[idx]
                if original_word.lower().startswith('r'):
                    phoneme = 'ɹ' + phoneme[1:] if len(phoneme) > 0 else phoneme

                # Cache write-back. NOTE(review): keyed by the punctuated
                # word, while lookups above use the stripped core — these
                # entries can never be hit again; verify this is intended.
                phoneme_dict[original_word] = phoneme

                if processed_parts[part_idx] is not None:
                    processed_parts[part_idx][word_idx] = phoneme
        except Exception as e:
            # Fallback: splice the raw words back into their slots.
            print(f"Warning: Could not phonemize VI texts: {e}")
            for idx, (part_idx, word_idx) in enumerate(vi_word_maps):
                if processed_parts[part_idx] is not None:
                    processed_parts[part_idx][word_idx] = vi_texts[idx]

    # Flatten: join word lists, keep EN strings, drop unfilled None slots.
    final_parts = []
    for part in processed_parts:
        if isinstance(part, list):
            final_parts.append(' '.join(str(w) for w in part if w is not None))
        elif part is not None:
            final_parts.append(part)

    result = ' '.join(final_parts)

    # Re-attach punctuation that ended up after a space.
    result = re.sub(r'\s+([.,!?;:])', r'\1', result)

    return result
234
+
235
+
236
def phonemize_batch(texts: list, phoneme_dict=phoneme_dict) -> list:
    """
    Phonemize multiple texts with optimal batching.

    All English spans and all dictionary-miss Vietnamese words across the
    whole batch are gathered first, so eSpeak is invoked at most once per
    language. Misses are written back into *phoneme_dict* (the shared
    module-level cache by default — mutated in place).

    Args:
        texts: List of text strings to phonemize
        phoneme_dict: Phoneme dictionary for lookup

    Returns:
        List of phonemized texts
    """
    normalized_texts = [normalizer.normalize(text) for text in texts]

    all_en_texts = []   # English span contents across the whole batch
    all_en_maps = []    # (text_idx, part_idx) per English span

    all_vi_texts = []   # Vietnamese dictionary misses across the batch
    all_vi_maps = []    # (text_idx, part_idx, word_idx) slot per miss

    # Per text: list of parts, each a word list (VI) or None (EN pending).
    results = []

    for text_idx, text in enumerate(normalized_texts):
        # Capturing group keeps the <en> tags as list items.
        parts = re.split(r'(<en>.*?</en>)', text, flags=re.IGNORECASE)
        processed_parts = []

        for part_idx, part in enumerate(parts):
            if re.match(r'<en>.*</en>', part, re.IGNORECASE):
                en_content = re.sub(r'</?en>', '', part, flags=re.IGNORECASE).strip()
                all_en_texts.append(en_content)
                all_en_maps.append((text_idx, part_idx))
                processed_parts.append(None)
            else:
                words = part.split()
                processed_words = []

                for word in words:
                    # Split leading/trailing punctuation from the word core.
                    match = re.match(r'^(\W*)(.*?)(\W*)$', word)
                    pre, core, suf = match.groups() if match else ("", word, "")

                    if not core:
                        processed_words.append(word)
                    elif core in phoneme_dict:
                        # Cache hit: splice cached phonemes between the
                        # original punctuation.
                        processed_words.append(f"{pre}{phoneme_dict[core]}{suf}")
                    else:
                        # Cache miss: queue for the batched eSpeak call and
                        # leave a None slot to fill in afterwards.
                        all_vi_texts.append(word)
                        all_vi_maps.append((text_idx, part_idx, len(processed_words)))
                        processed_words.append(None)

                processed_parts.append(processed_words)

        results.append(processed_parts)

    if all_en_texts:
        try:
            en_phonemes = phonemize(
                all_en_texts,
                language='en-us',
                backend='espeak',
                preserve_punctuation=True,
                with_stress=True,
                language_switch="remove-flags"
            )

            # phonemize() may return a bare string for a single input.
            if isinstance(en_phonemes, str):
                en_phonemes = [en_phonemes]

            for (text_idx, part_idx), phoneme in zip(all_en_maps, en_phonemes):
                results[text_idx][part_idx] = phoneme.strip()
        except Exception as e:
            # NOTE(review): unlike phonemize_with_dict, failed EN spans are
            # not restored here — their parts stay None and are dropped.
            print(f"Warning: Batch EN phonemization failed: {e}")

    if all_vi_texts:
        try:
            vi_phonemes = phonemize(
                all_vi_texts,
                language='vi',
                backend='espeak',
                preserve_punctuation=True,
                with_stress=True,
                language_switch='remove-flags'
            )

            if isinstance(vi_phonemes, str):
                vi_phonemes = [vi_phonemes]

            for idx, (text_idx, part_idx, word_idx) in enumerate(all_vi_maps):
                phoneme = vi_phonemes[idx].strip()

                # Words starting with 'r' get a forced /ɹ/ onset —
                # presumably to override eSpeak's Vietnamese rendering;
                # TODO confirm the intent.
                original_word = all_vi_texts[idx]
                if original_word.lower().startswith('r'):
                    phoneme = 'ɹ' + phoneme[1:] if len(phoneme) > 0 else phoneme

                # Cache write-back keyed by the punctuated word (same
                # caveat as phonemize_with_dict: lookups use the core).
                phoneme_dict[original_word] = phoneme
                results[text_idx][part_idx][word_idx] = phoneme
        except Exception as e:
            # On failure the None slots remain and are dropped below.
            print(f"Warning: Batch VI phonemization failed: {e}")

    # Flatten each text: join word lists, keep EN strings, skip None slots.
    final_results = []
    for processed_parts in results:
        final_parts = []
        for part in processed_parts:
            if isinstance(part, list):
                final_parts.append(' '.join(str(w) for w in part if w is not None))
            elif part is not None:
                final_parts.append(part)

        result = ' '.join(final_parts)
        # Re-attach punctuation that ended up after a space.
        result = re.sub(r'\s+([.,!?;:])', r'\1', result)
        final_results.append(result)

    return final_results
vieneu_tts.py ADDED
@@ -0,0 +1,859 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from typing import Generator
3
+ import librosa
4
+ import numpy as np
5
+ import torch
6
+ from neucodec import NeuCodec, DistillNeuCodec
7
+ from utils.phonemize_text import phonemize_with_dict
8
+ from collections import defaultdict
9
+ from concurrent.futures import ThreadPoolExecutor
10
+ import re
11
+ import gc
12
+
13
+ # ============================================================================
14
+ # Shared Utilities
15
+ # ============================================================================
16
+
17
+ def _linear_overlap_add(frames: list[np.ndarray], stride: int) -> np.ndarray:
18
+ """Linear overlap-add for smooth audio concatenation"""
19
+ assert len(frames)
20
+ dtype = frames[0].dtype
21
+ shape = frames[0].shape[:-1]
22
+
23
+ total_size = 0
24
+ for i, frame in enumerate(frames):
25
+ frame_end = stride * i + frame.shape[-1]
26
+ total_size = max(total_size, frame_end)
27
+
28
+ sum_weight = np.zeros(total_size, dtype=dtype)
29
+ out = np.zeros(*shape, total_size, dtype=dtype)
30
+
31
+ offset: int = 0
32
+ for frame in frames:
33
+ frame_length = frame.shape[-1]
34
+ t = np.linspace(0, 1, frame_length + 2, dtype=dtype)[1:-1]
35
+ weight = np.abs(0.5 - (t - 0.5))
36
+
37
+ out[..., offset : offset + frame_length] += weight * frame
38
+ sum_weight[offset : offset + frame_length] += weight
39
+ offset += stride
40
+ assert sum_weight.min() > 0
41
+ return out / sum_weight
42
+
43
+
44
+ def _compile_codec_with_triton(codec):
45
+ """Compile codec with Triton for faster decoding (Windows/Linux compatible)"""
46
+ try:
47
+ import triton
48
+
49
+ if hasattr(codec, 'dec') and hasattr(codec.dec, 'resblocks'):
50
+ if len(codec.dec.resblocks) > 2:
51
+ codec.dec.resblocks[2].forward = torch.compile(
52
+ codec.dec.resblocks[2].forward,
53
+ mode="reduce-overhead",
54
+ dynamic=True
55
+ )
56
+ print(" ✅ Triton compilation enabled for codec")
57
+ return True
58
+
59
+ except ImportError:
60
+ print(" ⚠️ Triton not found. Install for faster speed:")
61
+ print(" • Linux: pip install triton")
62
+ print(" • Windows: pip install triton-windows")
63
+ print(" (Optional but recommended)")
64
+ return False
65
+
66
+
67
+ # ============================================================================
68
+ # VieNeuTTS - Standard implementation (CPU/GPU compatible)
69
+ # Supports: PyTorch Transformers, GGUF/GGML quantized models
70
+ # ============================================================================
71
+
72
+ class VieNeuTTS:
73
+ """
74
+ Standard VieNeu-TTS implementation.
75
+
76
+ Supports:
77
+ - PyTorch + Transformers backend (CPU/GPU)
78
+ - GGUF quantized models via llama-cpp-python (CPU optimized)
79
+
80
+ Use this for:
81
+ - CPU-only environments
82
+ - Standard PyTorch workflows
83
+ - GGUF quantized models
84
+ """
85
+
86
+ def __init__(
87
+ self,
88
+ backbone_repo="pnnbao-ump/VieNeu-TTS",
89
+ backbone_device="cpu",
90
+ codec_repo="neuphonic/neucodec",
91
+ codec_device="cpu",
92
+ ):
93
+ """
94
+ Initialize VieNeu-TTS.
95
+
96
+ Args:
97
+ backbone_repo: Model repository or path to GGUF file
98
+ backbone_device: Device for backbone ('cpu', 'cuda', 'gpu')
99
+ codec_repo: Codec repository
100
+ codec_device: Device for codec
101
+ """
102
+
103
+ # Constants
104
+ self.sample_rate = 24_000
105
+ self.max_context = 2048
106
+ self.hop_length = 480
107
+ self.streaming_overlap_frames = 1
108
+ self.streaming_frames_per_chunk = 25
109
+ self.streaming_lookforward = 5
110
+ self.streaming_lookback = 50
111
+ self.streaming_stride_samples = self.streaming_frames_per_chunk * self.hop_length
112
+
113
+ # Flags
114
+ self._is_quantized_model = False
115
+ self._is_onnx_codec = False
116
+
117
+ # HF tokenizer
118
+ self.tokenizer = None
119
+
120
+ # Load models
121
+ self._load_backbone(backbone_repo, backbone_device)
122
+ self._load_codec(codec_repo, codec_device)
123
+
124
    def _load_backbone(self, backbone_repo, backbone_device):
        """Load the LM backbone: GGUF via llama.cpp, otherwise transformers.

        Args:
            backbone_repo: HF repo id or path; any value containing "gguf"
                selects the llama.cpp quantized path.
            backbone_device: 'cpu' / 'gpu' for llama.cpp; any torch device
                string (incl. 'mps') for the transformers path.
        """
        # MPS device validation
        if backbone_device == "mps":
            if not torch.backends.mps.is_available():
                print("Warning: MPS not available, falling back to CPU")
                backbone_device = "cpu"

        print(f"Loading backbone from: {backbone_repo} on {backbone_device} ...")

        if backbone_repo.lower().endswith("gguf") or "gguf" in backbone_repo.lower():
            try:
                from llama_cpp import Llama
            except ImportError as e:
                raise ImportError(
                    "Failed to import `llama_cpp`. "
                    "Xem hướng dẫn cài đặt llama_cpp_python phiên bản tối thiểu 0.3.16 tại: https://llama-cpp-python.readthedocs.io/en/latest/"
                ) from e
            # llama.cpp backend: offload all layers to GPU only when the
            # caller asked for 'gpu'; flash attention likewise.
            self.backbone = Llama.from_pretrained(
                repo_id=backbone_repo,
                filename="*.gguf",
                verbose=False,
                n_gpu_layers=-1 if backbone_device == "gpu" else 0,
                n_ctx=self.max_context,
                mlock=True,
                flash_attn=True if backbone_device == "gpu" else False,
            )
            self._is_quantized_model = True

        else:
            # Transformers backend: the tokenizer is only needed here,
            # the GGUF path tokenizes via llama.cpp itself.
            from transformers import AutoTokenizer, AutoModelForCausalLM
            self.tokenizer = AutoTokenizer.from_pretrained(backbone_repo)
            self.backbone = AutoModelForCausalLM.from_pretrained(backbone_repo).to(
                torch.device(backbone_device)
            )
158
+
159
    def _load_codec(self, codec_repo, codec_device):
        """Load the neural audio codec used to encode/decode speech tokens.

        Args:
            codec_repo: one of the supported neucodec repos (full,
                distilled, or int8 ONNX decoder).
            codec_device: torch device string; the ONNX decoder is CPU-only.

        Raises:
            ValueError: unsupported repo, or ONNX decoder on a non-CPU device.
            ImportError: ONNX decoder requested without onnxruntime /
                neucodec >= 0.0.4.
        """
        # MPS device validation
        if codec_device == "mps":
            if not torch.backends.mps.is_available():
                print("Warning: MPS not available for codec, falling back to CPU")
                codec_device = "cpu"

        print(f"Loading codec from: {codec_repo} on {codec_device} ...")
        match codec_repo:
            case "neuphonic/neucodec":
                self.codec = NeuCodec.from_pretrained(codec_repo)
                self.codec.eval().to(codec_device)
            case "neuphonic/distill-neucodec":
                self.codec = DistillNeuCodec.from_pretrained(codec_repo)
                self.codec.eval().to(codec_device)
            case "neuphonic/neucodec-onnx-decoder-int8":
                if codec_device != "cpu":
                    raise ValueError("Onnx decoder only currently runs on CPU.")
                try:
                    from neucodec import NeuCodecOnnxDecoder
                except ImportError as e:
                    # NOTE(review): the two message fragments below
                    # concatenate without a separating space.
                    raise ImportError(
                        "Failed to import the onnx decoder."
                        "Ensure you have onnxruntime installed as well as neucodec >= 0.0.4."
                    ) from e
                self.codec = NeuCodecOnnxDecoder.from_pretrained(codec_repo)
                # Flag consumed by the decode path (ONNX API differs).
                self._is_onnx_codec = True
            case _:
                raise ValueError(f"Unsupported codec repository: {codec_repo}")
188
+
189
+ def encode_reference(self, ref_audio_path: str | Path):
190
+ """Encode reference audio to codes"""
191
+ wav, _ = librosa.load(ref_audio_path, sr=16000, mono=True)
192
+ wav_tensor = torch.from_numpy(wav).float().unsqueeze(0).unsqueeze(0) # [1, 1, T]
193
+ with torch.no_grad():
194
+ ref_codes = self.codec.encode_code(audio_or_path=wav_tensor).squeeze(0).squeeze(0)
195
+ return ref_codes
196
+
197
+ def infer(self, text: str, ref_codes: np.ndarray | torch.Tensor, ref_text: str) -> np.ndarray:
198
+ """
199
+ Perform inference to generate speech from text using the TTS model and reference audio.
200
+
201
+ Args:
202
+ text (str): Input text to be converted to speech.
203
+ ref_codes (np.ndarray | torch.tensor): Encoded reference.
204
+ ref_text (str): Reference text for reference audio.
205
+ Returns:
206
+ np.ndarray: Generated speech waveform.
207
+ """
208
+
209
+ # Generate tokens
210
+ if self._is_quantized_model:
211
+ output_str = self._infer_ggml(ref_codes, ref_text, text)
212
+ else:
213
+ prompt_ids = self._apply_chat_template(ref_codes, ref_text, text)
214
+ output_str = self._infer_torch(prompt_ids)
215
+
216
+ # Decode
217
+ wav = self._decode(output_str)
218
+
219
+ return wav
220
+
221
+ def infer_stream(self, text: str, ref_codes: np.ndarray | torch.Tensor, ref_text: str) -> Generator[np.ndarray, None, None]:
222
+ """
223
+ Perform streaming inference to generate speech from text using the TTS model and reference audio.
224
+
225
+ Args:
226
+ text (str): Input text to be converted to speech.
227
+ ref_codes (np.ndarray | torch.tensor): Encoded reference.
228
+ ref_text (str): Reference text for reference audio.
229
+ Yields:
230
+ np.ndarray: Generated speech waveform.
231
+ """
232
+
233
+ if self._is_quantized_model:
234
+ return self._infer_stream_ggml(ref_codes, ref_text, text)
235
+ else:
236
+ raise NotImplementedError("Streaming is not implemented for the torch backend!")
237
+
238
+ def _decode(self, codes: str):
239
+ """Decode speech tokens to audio waveform."""
240
+ # Extract speech token IDs using regex
241
+ speech_ids = [int(num) for num in re.findall(r"<\|speech_(\d+)\|>", codes)]
242
+
243
+ if len(speech_ids) == 0:
244
+ raise ValueError(
245
+ "No valid speech tokens found in the output. "
246
+ "The model may not have generated proper speech tokens."
247
+ )
248
+
249
+ # Onnx decode
250
+ if self._is_onnx_codec:
251
+ codes = np.array(speech_ids, dtype=np.int32)[np.newaxis, np.newaxis, :]
252
+ recon = self.codec.decode_code(codes)
253
+ # Torch decode
254
+ else:
255
+ with torch.no_grad():
256
+ codes = torch.tensor(speech_ids, dtype=torch.long)[None, None, :].to(
257
+ self.codec.device
258
+ )
259
+ recon = self.codec.decode_code(codes).cpu().numpy()
260
+
261
+ return recon[0, 0, :]
262
+
263
+ def _apply_chat_template(self, ref_codes: list[int], ref_text: str, input_text: str) -> list[int]:
264
+ input_text = phonemize_with_dict(ref_text) + " " + phonemize_with_dict(input_text)
265
+
266
+ speech_replace = self.tokenizer.convert_tokens_to_ids("<|SPEECH_REPLACE|>")
267
+ speech_gen_start = self.tokenizer.convert_tokens_to_ids("<|SPEECH_GENERATION_START|>")
268
+ text_replace = self.tokenizer.convert_tokens_to_ids("<|TEXT_REPLACE|>")
269
+ text_prompt_start = self.tokenizer.convert_tokens_to_ids("<|TEXT_PROMPT_START|>")
270
+ text_prompt_end = self.tokenizer.convert_tokens_to_ids("<|TEXT_PROMPT_END|>")
271
+
272
+ input_ids = self.tokenizer.encode(input_text, add_special_tokens=False)
273
+ chat = """user: Convert the text to speech:<|TEXT_REPLACE|>\nassistant:<|SPEECH_REPLACE|>"""
274
+ ids = self.tokenizer.encode(chat)
275
+
276
+ text_replace_idx = ids.index(text_replace)
277
+ ids = (
278
+ ids[:text_replace_idx]
279
+ + [text_prompt_start]
280
+ + input_ids
281
+ + [text_prompt_end]
282
+ + ids[text_replace_idx + 1 :] # noqa
283
+ )
284
+
285
+ speech_replace_idx = ids.index(speech_replace)
286
+ codes_str = "".join([f"<|speech_{i}|>" for i in ref_codes])
287
+ codes = self.tokenizer.encode(codes_str, add_special_tokens=False)
288
+ ids = ids[:speech_replace_idx] + [speech_gen_start] + list(codes)
289
+
290
+ return ids
291
+
292
+ def _infer_torch(self, prompt_ids: list[int]) -> str:
293
+ prompt_tensor = torch.tensor(prompt_ids).unsqueeze(0).to(self.backbone.device)
294
+ speech_end_id = self.tokenizer.convert_tokens_to_ids("<|SPEECH_GENERATION_END|>")
295
+ with torch.no_grad():
296
+ output_tokens = self.backbone.generate(
297
+ prompt_tensor,
298
+ max_length=self.max_context,
299
+ eos_token_id=speech_end_id,
300
+ do_sample=True,
301
+ temperature=0.7,
302
+ top_k=50,
303
+ use_cache=True,
304
+ min_new_tokens=50,
305
+ )
306
+ input_length = prompt_tensor.shape[-1]
307
+ output_str = self.tokenizer.decode(
308
+ output_tokens[0, input_length:].cpu().numpy().tolist(), add_special_tokens=False
309
+ )
310
+ return output_str
311
+
312
+ def _infer_ggml(self, ref_codes: list[int], ref_text: str, input_text: str) -> str:
313
+ ref_text = phonemize_with_dict(ref_text)
314
+ input_text = phonemize_with_dict(input_text)
315
+
316
+ codes_str = "".join([f"<|speech_{idx}|>" for idx in ref_codes])
317
+ prompt = (
318
+ f"user: Convert the text to speech:<|TEXT_PROMPT_START|>{ref_text} {input_text}"
319
+ f"<|TEXT_PROMPT_END|>\nassistant:<|SPEECH_GENERATION_START|>{codes_str}"
320
+ )
321
+ output = self.backbone(
322
+ prompt,
323
+ max_tokens=self.max_context,
324
+ temperature=0.7,
325
+ top_k=50,
326
+ stop=["<|SPEECH_GENERATION_END|>"],
327
+ )
328
+ output_str = output["choices"][0]["text"]
329
+ return output_str
330
+
331
    def _infer_stream_ggml(self, ref_codes: torch.Tensor, ref_text: str, input_text: str) -> Generator[np.ndarray, None, None]:
        """Stream audio chunks from the GGUF backbone.

        Tokens are accumulated as they arrive; once a full chunk plus
        lookforward context is available, a window that also includes
        lookback and overlap frames is decoded, the chunk is cut out of it,
        and overlapping windows are cross-faded via linear overlap-add.
        """
        ref_text = phonemize_with_dict(ref_text)
        input_text = phonemize_with_dict(input_text)

        codes_str = "".join([f"<|speech_{idx}|>" for idx in ref_codes])
        prompt = (
            f"user: Convert the text to speech:<|TEXT_PROMPT_START|>{ref_text} {input_text}"
            f"<|TEXT_PROMPT_END|>\nassistant:<|SPEECH_GENERATION_START|>{codes_str}"
        )

        audio_cache: list[np.ndarray] = []  # decoded (overlapping) audio windows
        token_cache: list[str] = [f"<|speech_{idx}|>" for idx in ref_codes]
        n_decoded_samples: int = 0          # samples already yielded to the caller
        n_decoded_tokens: int = len(ref_codes)  # tokens already rendered to audio

        # NOTE(review): each streamed item is appended as one list entry and the
        # chunking below counts list entries as codec frames — this assumes every
        # streamed piece is exactly one <|speech_N|> token; confirm for the
        # tokenizer/model in use.
        for item in self.backbone(
            prompt,
            max_tokens=self.max_context,
            temperature=0.7,
            top_k=50,
            stop=["<|SPEECH_GENERATION_END|>"],
            stream=True
        ):
            output_str = item["choices"][0]["text"]
            token_cache.append(output_str)

            if len(token_cache[n_decoded_tokens:]) >= self.streaming_frames_per_chunk + self.streaming_lookforward:

                # decode chunk: include lookback/overlap context before the chunk
                # and lookforward/overlap context after it
                tokens_start = max(
                    n_decoded_tokens
                    - self.streaming_lookback
                    - self.streaming_overlap_frames,
                    0
                )
                tokens_end = (
                    n_decoded_tokens
                    + self.streaming_frames_per_chunk
                    + self.streaming_lookforward
                    + self.streaming_overlap_frames
                )
                # sample offsets of the chunk inside the decoded window
                sample_start = (
                    n_decoded_tokens - tokens_start
                ) * self.hop_length
                sample_end = (
                    sample_start
                    + (self.streaming_frames_per_chunk + 2 * self.streaming_overlap_frames) * self.hop_length
                )
                curr_codes = token_cache[tokens_start:tokens_end]
                recon = self._decode("".join(curr_codes))
                recon = recon[sample_start:sample_end]
                audio_cache.append(recon)

                # postprocess: cross-fade all windows, emit only the new samples
                processed_recon = _linear_overlap_add(
                    audio_cache, stride=self.streaming_stride_samples
                )
                new_samples_end = len(audio_cache) * self.streaming_stride_samples
                processed_recon = processed_recon[
                    n_decoded_samples:new_samples_end
                ]
                n_decoded_samples = new_samples_end
                n_decoded_tokens += self.streaming_frames_per_chunk
                yield processed_recon

        # final decoding handled separately as non-constant chunk size
        remaining_tokens = len(token_cache) - n_decoded_tokens
        if len(token_cache) > n_decoded_tokens:
            tokens_start = max(
                len(token_cache)
                - (self.streaming_lookback + self.streaming_overlap_frames + remaining_tokens),
                0
            )
            sample_start = (
                len(token_cache)
                - tokens_start
                - remaining_tokens
                - self.streaming_overlap_frames
            ) * self.hop_length
            curr_codes = token_cache[tokens_start:]
            recon = self._decode("".join(curr_codes))
            recon = recon[sample_start:]
            audio_cache.append(recon)

            processed_recon = _linear_overlap_add(audio_cache, stride=self.streaming_stride_samples)
            processed_recon = processed_recon[n_decoded_samples:]
            yield processed_recon
+
419
+
420
+ # ============================================================================
421
+ # FastVieNeuTTS - GPU-optimized implementation
422
+ # Requires: LMDeploy with CUDA
423
+ # ============================================================================
424
+
425
+ class FastVieNeuTTS:
426
+ """
427
+ GPU-optimized VieNeu-TTS using LMDeploy TurbomindEngine.
428
+ """
429
+
430
+ def __init__(
431
+ self,
432
+ backbone_repo="pnnbao-ump/VieNeu-TTS",
433
+ backbone_device="cuda",
434
+ codec_repo="neuphonic/neucodec",
435
+ codec_device="cuda",
436
+ memory_util=0.3,
437
+ tp=1,
438
+ enable_prefix_caching=True,
439
+ quant_policy=0,
440
+ enable_triton=True,
441
+ max_batch_size=8,
442
+ ):
443
+ """
444
+ Initialize FastVieNeuTTS with LMDeploy backend and optimizations.
445
+
446
+ Args:
447
+ backbone_repo: Model repository
448
+ backbone_device: Device for backbone (must be CUDA)
449
+ codec_repo: Codec repository
450
+ codec_device: Device for codec
451
+ memory_util: GPU memory utilization (0.0-1.0)
452
+ tp: Tensor parallel size for multi-GPU
453
+ enable_prefix_caching: Enable prefix caching for faster batch processing
454
+ quant_policy: KV cache quantization (0=off, 8=int8, 4=int4)
455
+ enable_triton: Enable Triton compilation for codec
456
+ max_batch_size: Maximum batch size for inference (prevent GPU overload)
457
+ """
458
+
459
+ if backbone_device != "cuda" and not backbone_device.startswith("cuda:"):
460
+ raise ValueError("LMDeploy backend requires CUDA device")
461
+
462
+ # Constants
463
+ self.sample_rate = 24_000
464
+ self.max_context = 2048
465
+ self.hop_length = 480
466
+ self.streaming_overlap_frames = 1
467
+ self.streaming_frames_per_chunk = 50
468
+ self.streaming_lookforward = 5
469
+ self.streaming_lookback = 50
470
+ self.streaming_stride_samples = self.streaming_frames_per_chunk * self.hop_length
471
+
472
+ self.max_batch_size = max_batch_size
473
+
474
+ self._ref_cache = {}
475
+
476
+ self.stored_dict = defaultdict(dict)
477
+
478
+ # Flags
479
+ self._is_onnx_codec = False
480
+ self._triton_enabled = False
481
+
482
+ # Load models
483
+ self._load_backbone_lmdeploy(backbone_repo, memory_util, tp, enable_prefix_caching, quant_policy)
484
+ self._load_codec(codec_repo, codec_device, enable_triton)
485
+
486
+ self._warmup_model()
487
+
488
+ print("✅ FastVieNeuTTS with optimizations loaded successfully!")
489
+ print(f" Max batch size: {self.max_batch_size} (adjustable to prevent GPU overload)")
490
+
491
+ def _load_backbone_lmdeploy(self, repo, memory_util, tp, enable_prefix_caching, quant_policy):
492
+ """Load backbone using LMDeploy's TurbomindEngine"""
493
+ print(f"Loading backbone with LMDeploy from: {repo}")
494
+
495
+ try:
496
+ from lmdeploy import pipeline, TurbomindEngineConfig, GenerationConfig
497
+ except ImportError as e:
498
+ raise ImportError(
499
+ "Failed to import `lmdeploy`. "
500
+ "Xem hướng dẫn cài đặt lmdeploy để tối ưu hiệu suất GPU tại: https://github.com/pnnbao97/VieNeu-TTS"
501
+ ) from e
502
+
503
+ backend_config = TurbomindEngineConfig(
504
+ cache_max_entry_count=memory_util,
505
+ tp=tp,
506
+ enable_prefix_caching=enable_prefix_caching,
507
+ dtype='bfloat16',
508
+ quant_policy=quant_policy
509
+ )
510
+
511
+ self.backbone = pipeline(repo, backend_config=backend_config)
512
+
513
+ self.gen_config = GenerationConfig(
514
+ top_p=0.95,
515
+ top_k=50,
516
+ temperature=0.7,
517
+ max_new_tokens=2048,
518
+ do_sample=True,
519
+ min_new_tokens=40,
520
+ )
521
+
522
+ print(f" LMDeploy TurbomindEngine initialized")
523
+ print(f" - Memory util: {memory_util}")
524
+ print(f" - Tensor Parallel: {tp}")
525
+ print(f" - Prefix caching: {enable_prefix_caching}")
526
+ print(f" - KV quant: {quant_policy} ({'Enabled' if quant_policy > 0 else 'Disabled'})")
527
+
528
+ def _load_codec(self, codec_repo, codec_device, enable_triton):
529
+ """Load codec with optional Triton compilation"""
530
+ print(f"Loading codec from: {codec_repo} on {codec_device}")
531
+
532
+ match codec_repo:
533
+ case "neuphonic/neucodec":
534
+ self.codec = NeuCodec.from_pretrained(codec_repo)
535
+ self.codec.eval().to(codec_device)
536
+ case "neuphonic/distill-neucodec":
537
+ self.codec = DistillNeuCodec.from_pretrained(codec_repo)
538
+ self.codec.eval().to(codec_device)
539
+ case "neuphonic/neucodec-onnx-decoder-int8":
540
+ if codec_device != "cpu":
541
+ raise ValueError("ONNX decoder only runs on CPU")
542
+ try:
543
+ from neucodec import NeuCodecOnnxDecoder
544
+ except ImportError as e:
545
+ raise ImportError(
546
+ "Failed to import ONNX decoder. "
547
+ "Ensure onnxruntime and neucodec >= 0.0.4 are installed."
548
+ ) from e
549
+ self.codec = NeuCodecOnnxDecoder.from_pretrained(codec_repo)
550
+ self._is_onnx_codec = True
551
+ case _:
552
+ raise ValueError(f"Unsupported codec repository: {codec_repo}")
553
+
554
+ if enable_triton and not self._is_onnx_codec and codec_device != "cpu":
555
+ self._triton_enabled = _compile_codec_with_triton(self.codec)
556
+
557
+ def _warmup_model(self):
558
+ """Warmup inference pipeline to reduce first-token latency"""
559
+ print("🔥 Warming up model...")
560
+ try:
561
+ dummy_codes = list(range(10))
562
+ dummy_prompt = self._format_prompt(dummy_codes, "warmup", "test")
563
+ _ = self.backbone([dummy_prompt], gen_config=self.gen_config, do_preprocess=False)
564
+ print(" ✅ Warmup complete")
565
+ except Exception as e:
566
+ print(f" ⚠️ Warmup failed (non-critical): {e}")
567
+
568
+ def encode_reference(self, ref_audio_path: str | Path):
569
+ """Encode reference audio to codes"""
570
+ wav, _ = librosa.load(ref_audio_path, sr=16000, mono=True)
571
+ wav_tensor = torch.from_numpy(wav).float().unsqueeze(0).unsqueeze(0)
572
+ with torch.no_grad():
573
+ ref_codes = self.codec.encode_code(audio_or_path=wav_tensor).squeeze(0).squeeze(0)
574
+ return ref_codes
575
+
576
+ def get_cached_reference(self, voice_name: str, audio_path: str, ref_text: str = None):
577
+ """
578
+ Get or create cached reference codes.
579
+
580
+ Args:
581
+ voice_name: Unique identifier for this voice
582
+ audio_path: Path to reference audio
583
+ ref_text: Optional reference text (stored with codes)
584
+
585
+ Returns:
586
+ ref_codes: Encoded reference codes
587
+ """
588
+ cache_key = f"{voice_name}_{audio_path}"
589
+
590
+ if cache_key not in self._ref_cache:
591
+ ref_codes = self.encode_reference(audio_path)
592
+ self._ref_cache[cache_key] = {
593
+ 'codes': ref_codes,
594
+ 'ref_text': ref_text
595
+ }
596
+
597
+ return self._ref_cache[cache_key]['codes']
598
+
599
+ def add_speaker(self, user_id: int, audio_file: str, ref_text: str):
600
+ """
601
+ Add a speaker to the stored dictionary for easy access.
602
+
603
+ Args:
604
+ user_id: Unique user ID
605
+ audio_file: Reference audio file path
606
+ ref_text: Reference text
607
+
608
+ Returns:
609
+ user_id: The user ID for use in streaming
610
+ """
611
+ codes = self.encode_reference(audio_file)
612
+
613
+ if isinstance(codes, torch.Tensor):
614
+ codes = codes.cpu().numpy()
615
+ if isinstance(codes, np.ndarray):
616
+ codes = codes.flatten().tolist()
617
+
618
+ self.stored_dict[f"{user_id}"]['codes'] = codes
619
+ self.stored_dict[f"{user_id}"]['ref_text'] = ref_text
620
+
621
+ return user_id
622
+
623
+ def _decode(self, codes: str):
624
+ """Decode speech tokens to audio waveform"""
625
+ speech_ids = [int(num) for num in re.findall(r"<\|speech_(\d+)\|>", codes)]
626
+
627
+ if len(speech_ids) == 0:
628
+ raise ValueError("No valid speech tokens found in output")
629
+
630
+ if self._is_onnx_codec:
631
+ codes = np.array(speech_ids, dtype=np.int32)[np.newaxis, np.newaxis, :]
632
+ recon = self.codec.decode_code(codes)
633
+ else:
634
+ with torch.no_grad():
635
+ codes = torch.tensor(speech_ids, dtype=torch.long)[None, None, :].to(
636
+ self.codec.device
637
+ )
638
+ recon = self.codec.decode_code(codes).cpu().numpy()
639
+
640
+ return recon[0, 0, :]
641
+
642
+ def _decode_batch(self, codes_list: list[str], max_workers: int = None):
643
+ """
644
+ Decode multiple code strings in parallel.
645
+
646
+ Args:
647
+ codes_list: List of code strings to decode
648
+ max_workers: Number of parallel workers (auto-tuned if None)
649
+
650
+ Returns:
651
+ List of decoded audio arrays
652
+ """
653
+ # Auto-tune workers based on GPU memory and batch size
654
+ if max_workers is None:
655
+ if torch.cuda.is_available():
656
+ gpu_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
657
+ # 1 worker per 4GB VRAM, max 4 workers
658
+ max_workers = min(max(1, int(gpu_mem_gb / 4)), 4)
659
+ else:
660
+ max_workers = 2
661
+
662
+ # For small batches, use sequential to avoid overhead
663
+ if len(codes_list) <= 2:
664
+ return [self._decode(codes) for codes in codes_list]
665
+
666
+ # Parallel decoding with controlled workers
667
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
668
+ futures = [executor.submit(self._decode, codes) for codes in codes_list]
669
+ results = [f.result() for f in futures]
670
+ return results
671
+
672
+ def _format_prompt(self, ref_codes: list[int], ref_text: str, input_text: str) -> str:
673
+ """Format prompt for LMDeploy"""
674
+ ref_text_phones = phonemize_with_dict(ref_text)
675
+ input_text_phones = phonemize_with_dict(input_text)
676
+
677
+ codes_str = "".join([f"<|speech_{idx}|>" for idx in ref_codes])
678
+
679
+ prompt = (
680
+ f"user: Convert the text to speech:<|TEXT_PROMPT_START|>{ref_text_phones} {input_text_phones}"
681
+ f"<|TEXT_PROMPT_END|>\nassistant:<|SPEECH_GENERATION_START|>{codes_str}"
682
+ )
683
+
684
+ return prompt
685
+
686
+ def infer(self, text: str, ref_codes: np.ndarray | torch.Tensor, ref_text: str) -> np.ndarray:
687
+ """
688
+ Single inference.
689
+
690
+ Args:
691
+ text: Input text to synthesize
692
+ ref_codes: Encoded reference audio codes
693
+ ref_text: Reference text for reference audio
694
+
695
+ Returns:
696
+ Generated speech waveform as numpy array
697
+ """
698
+ if isinstance(ref_codes, torch.Tensor):
699
+ ref_codes = ref_codes.cpu().numpy()
700
+ if isinstance(ref_codes, np.ndarray):
701
+ ref_codes = ref_codes.flatten().tolist()
702
+
703
+ prompt = self._format_prompt(ref_codes, ref_text, text)
704
+
705
+ # Use LMDeploy pipeline for generation
706
+ responses = self.backbone([prompt], gen_config=self.gen_config, do_preprocess=False)
707
+ output_str = responses[0].text
708
+
709
+ # Decode to audio
710
+ wav = self._decode(output_str)
711
+
712
+ return wav
713
+
714
+ def infer_batch(self, texts: list[str], ref_codes: np.ndarray | torch.Tensor, ref_text: str, max_batch_size: int = None) -> list[np.ndarray]:
715
+ """
716
+ Batch inference for multiple texts.
717
+ """
718
+ if max_batch_size is None:
719
+ max_batch_size = self.max_batch_size
720
+
721
+ if not isinstance(texts, list):
722
+ texts = [texts]
723
+
724
+ if isinstance(ref_codes, torch.Tensor):
725
+ ref_codes = ref_codes.cpu().numpy()
726
+ if isinstance(ref_codes, np.ndarray):
727
+ ref_codes = ref_codes.flatten().tolist()
728
+
729
+ all_wavs = []
730
+
731
+ for i in range(0, len(texts), max_batch_size):
732
+ batch_texts = texts[i:i+max_batch_size]
733
+ prompts = [self._format_prompt(ref_codes, ref_text, text) for text in batch_texts]
734
+ responses = self.backbone(prompts, gen_config=self.gen_config, do_preprocess=False)
735
+ batch_codes = [response.text for response in responses]
736
+
737
+ if len(batch_codes) > 3:
738
+ batch_wavs = self._decode_batch(batch_codes)
739
+ else:
740
+ batch_wavs = [self._decode(codes) for codes in batch_codes]
741
+ all_wavs.extend(batch_wavs)
742
+
743
+ if i + max_batch_size < len(texts):
744
+ if torch.cuda.is_available():
745
+ torch.cuda.empty_cache()
746
+
747
+ return all_wavs
748
+
749
    def infer_stream(self, text: str, ref_codes: np.ndarray | torch.Tensor, ref_text: str) -> Generator[np.ndarray, None, None]:
        """
        Streaming inference with low latency.

        Args:
            text: Input text to synthesize
            ref_codes: Encoded reference audio codes
            ref_text: Reference text for reference audio

        Yields:
            Audio chunks as numpy arrays
        """
        # Normalize the reference codes to a flat list of ints.
        if isinstance(ref_codes, torch.Tensor):
            ref_codes = ref_codes.cpu().numpy()
        if isinstance(ref_codes, np.ndarray):
            ref_codes = ref_codes.flatten().tolist()

        prompt = self._format_prompt(ref_codes, ref_text, text)

        audio_cache = []  # decoded (overlapping) audio windows
        token_cache = [f"<|speech_{idx}|>" for idx in ref_codes]
        n_decoded_samples = 0            # samples already yielded
        n_decoded_tokens = len(ref_codes)  # tokens already rendered to audio

        for response in self.backbone.stream_infer([prompt], gen_config=self.gen_config, do_preprocess=False):
            output_str = response.text

            # Extract new tokens
            # NOTE(review): this slicing assumes `response.text` is the
            # cumulative generation so far (the already-seen suffix of
            # token_cache is stripped off). If LMDeploy yields per-step
            # deltas instead, this would drop text — confirm against the
            # lmdeploy version in use. Also note the chunking below counts
            # token_cache entries as codec frames, which assumes each
            # appended piece is one speech token.
            new_tokens = output_str[len("".join(token_cache[len(ref_codes):])):] if len(token_cache) > len(ref_codes) else output_str

            if new_tokens:
                token_cache.append(new_tokens)

            # Check if we have enough tokens to decode a chunk
            if len(token_cache[n_decoded_tokens:]) >= self.streaming_frames_per_chunk + self.streaming_lookforward:

                # Decode chunk with context (lookback before, lookforward after)
                tokens_start = max(
                    n_decoded_tokens - self.streaming_lookback - self.streaming_overlap_frames,
                    0
                )
                tokens_end = (
                    n_decoded_tokens
                    + self.streaming_frames_per_chunk
                    + self.streaming_lookforward
                    + self.streaming_overlap_frames
                )
                # Sample offsets of the chunk inside the decoded window.
                sample_start = (n_decoded_tokens - tokens_start) * self.hop_length
                sample_end = (
                    sample_start
                    + (self.streaming_frames_per_chunk + 2 * self.streaming_overlap_frames) * self.hop_length
                )

                curr_codes = token_cache[tokens_start:tokens_end]
                recon = self._decode("".join(curr_codes))
                recon = recon[sample_start:sample_end]
                audio_cache.append(recon)

                # Overlap-add processing: cross-fade windows, emit only new samples.
                processed_recon = _linear_overlap_add(
                    audio_cache, stride=self.streaming_stride_samples
                )
                new_samples_end = len(audio_cache) * self.streaming_stride_samples
                processed_recon = processed_recon[n_decoded_samples:new_samples_end]
                n_decoded_samples = new_samples_end
                n_decoded_tokens += self.streaming_frames_per_chunk

                yield processed_recon

        # Final chunk — variable size, so handled outside the loop.
        remaining_tokens = len(token_cache) - n_decoded_tokens
        if remaining_tokens > 0:
            tokens_start = max(
                len(token_cache) - (self.streaming_lookback + self.streaming_overlap_frames + remaining_tokens),
                0
            )
            sample_start = (
                len(token_cache) - tokens_start - remaining_tokens - self.streaming_overlap_frames
            ) * self.hop_length

            curr_codes = token_cache[tokens_start:]
            recon = self._decode("".join(curr_codes))
            recon = recon[sample_start:]
            audio_cache.append(recon)

            processed_recon = _linear_overlap_add(audio_cache, stride=self.streaming_stride_samples)
            processed_recon = processed_recon[n_decoded_samples:]
            yield processed_recon
838
+ def cleanup_memory(self):
839
+ """Clean up GPU memory"""
840
+ if torch.cuda.is_available():
841
+ torch.cuda.empty_cache()
842
+ gc.collect()
843
+ print("🧹 Memory cleaned up")
844
+
845
+ def get_optimization_stats(self) -> dict:
846
+ """
847
+ Get current optimization statistics.
848
+
849
+ Returns:
850
+ Dictionary with optimization info
851
+ """
852
+ return {
853
+ 'triton_enabled': self._triton_enabled,
854
+ 'max_batch_size': self.max_batch_size,
855
+ 'cached_references': len(self._ref_cache),
856
+ 'active_sessions': len(self.stored_dict),
857
+ 'kv_quant': self.gen_config.__dict__.get('quant_policy', 0),
858
+ 'prefix_caching': True, # Always enabled in our config
859
+ }