import concurrent.futures
import io
import json
import math
import os
import re
import threading
import wave

import gradio as gr
import pysrt
from huggingface_hub import hf_hub_download, list_repo_files
from piper.voice import PiperVoice
from pydub import AudioSegment, effects
from pydub.silence import split_on_silence
from vinorm import TTSnorm

# --- 1. MODEL MANAGEMENT (HUGGING FACE) ---
REPO_ID = "hoanglinhn0/Model"
voice_cache = {}
# Guards voice_cache: segments are synthesized from a thread pool, and without
# the lock every worker would download/load the same model on a cold cache.
_voice_cache_lock = threading.Lock()
CONFIG_FILE = "config.json"


def load_config():
    """Read the saved UI settings from CONFIG_FILE.

    Returns:
        The parsed JSON object when the file exists and holds a JSON object;
        otherwise an empty dict (missing file, unreadable file, invalid JSON,
        or a JSON value that is not an object).
    """
    if not os.path.exists(CONFIG_FILE):
        return {}
    try:
        with open(CONFIG_FILE, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError.
        print(f"Lỗi đọc config: {e}")
        return {}
    # Callers use .get(), so anything other than a dict is treated as "no config".
    return data if isinstance(data, dict) else {}


def save_config_to_file(voice, clean_opts, vol, speed, smart, overlap, ns, nw):
    """Persist the current UI parameters to CONFIG_FILE as JSON.

    Best-effort: a failure is printed and otherwise ignored so that a
    settings change in the UI never raises.
    """
    data = {
        "voice": voice,
        "clean_opts": clean_opts,
        "vol": vol,
        "speed": speed,
        "smart": smart,
        "overlap": overlap,
        "ns": ns,
        "nw": nw,
    }
    try:
        with open(CONFIG_FILE, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=4)
    except (OSError, TypeError, ValueError) as e:
        print(f"Lỗi lưu config: {e}")


def get_voice_list():
    """Return the sorted names of all ``.onnx`` files in the REPO_ID repository.

    Returns an empty list (after printing the error) when the repository
    listing fails.
    """
    try:
        files = list_repo_files(repo_id=REPO_ID)
    except Exception as e:
        print(f"Lỗi lấy danh sách model: {e}")
        return []
    return sorted(f for f in files if f.endswith('.onnx'))


def load_piper_voice(model_name):
    """Return the PiperVoice for ``model_name``, downloading it on first use.

    The model file and its ``<model_name>.json`` config are fetched from
    REPO_ID and the loaded voice is memoized in ``voice_cache``.

    Raises:
        RuntimeError: when the download or the model load fails (the original
            error is chained as ``__cause__``).
    """
    with _voice_cache_lock:
        if model_name not in voice_cache:
            print(f"Đang tải model: {model_name}...")
            try:
                onnx_path = hf_hub_download(repo_id=REPO_ID, filename=model_name)
                config_path = hf_hub_download(repo_id=REPO_ID, filename=model_name + ".json")
                voice_cache[model_name] = PiperVoice.load(onnx_path, config_path=config_path)
            except Exception as e:
                raise RuntimeError(f"Không thể tải model {model_name}: {str(e)}") from e
        return voice_cache[model_name]


# --- 2. TEXT CLEANING ---
# Ordered (option label, regex, replacement) rules. A rule runs only when its
# label is present in the caller's option list; order matters because later
# rules see the output of earlier ones.
_OPTIONAL_CLEANING_RULES = (
    # Markup tags become a space so neighbouring words do not fuse together.
    ("Ignore html tags", r'<[^>]+>', ' '),
    # Leftover 6-digit hex colour codes.
    ("Ignore html tags", r'#[0-9a-fA-F]{6}', ''),
    ("Ignore text between parentheses ()", r'\([^)]*\)', ''),
    ("Ignore text between curly brackets {}", r'\{[^}]*\}', ''),
    ("Ignore text between square brackets []", r'\[[^\]]*\]', ''),
    ("Ignore text between asterisks * *", r'\*[^*]*\*', ''),
    # Removes everything BETWEEN a pair of note symbols, lyrics included.
    ("Ignore text between notes ♪ ♪", r'♪[^♪]*♪', ''),
    # The label lists "!" but it is deliberately kept so the exclamation
    # still reaches the voice model; only the other symbols are removed.
    ("Ignore characters: * # ~ ♪ + _ !", r'[*#~♪+_]', ' '),
    ("Ignore repetitions of non-alphabetic characters", r'(\W)\1+', r'\1'),
    ("Ignore periods and commas ( . , )", r'[.,]', ' '),
)


def advanced_text_cleaning(text, options=None):
    """Clean subtitle/plain text and normalize it for speech synthesis.

    Args:
        text: Raw text of one subtitle entry or one input line.
        options: Iterable of cleaning option labels (the strings used as keys
            in ``_OPTIONAL_CLEANING_RULES``). ``None`` means no optional rule.

    Returns:
        The cleaned text with punctuation and whitespace normalized; may be
        an empty string when nothing speakable remains.
    """
    if options is None:
        options = []

    # --- A. Optional removal of markup / bracketed text / symbols ---
    for label, pattern, replacement in _OPTIONAL_CLEANING_RULES:
        if label in options:
            text = re.sub(pattern, replacement, text)

    # --- B. Whitespace and leading dialogue dashes (common in subtitles) ---
    text = text.strip()
    text = re.sub(r'^[\-\–]\s+', '', text)
    text = re.sub(r'\n[\-\–]\s+', '\n', text)

    # --- C. Text normalization via TTSnorm (best-effort) ---
    try:
        text = TTSnorm(text)
    except Exception as e:
        # Keep going with the un-normalized text rather than losing the line.
        print(f"Lỗi TTSnorm: {e}")

    # --- D. Punctuation shaping (drives the reading rhythm) ---
    # Line breaks become sentence ends.
    text = re.sub(r'[\r\n]+', '. ', text)
    # Semicolons / colons become commas.
    text = re.sub(r'[;:]', ',', text)
    # Drop double and single quotes.
    text = re.sub(r'[\"\']', '', text)
    # Normalize runs of dots to a three-dot ellipsis.
    text = re.sub(r'\.{2,}', '... ', text)

    # --- E. Final whitespace pass ---
    # Insert a space after punctuation glued to the next character, unless
    # that character is a digit (keeps decimals such as "1.5" intact).
    text = re.sub(r'(?<=[.,?!])(?=[^\s\d])', r' ', text)
    # Remove whitespace before punctuation.
    text = re.sub(r'\s+([.,?!])', r'\1', text)
    # Collapse whitespace runs.
    text = re.sub(r'\s+', ' ', text)

    return text.strip()


# --- 3. AUDIO PROCESSING ---
# Silence appended after a chunk, chosen by the punctuation that ended it.
# NOTE(review): the comma pause is longer than the sentence-end pause; values
# are kept exactly as found — confirm this is intended.
_SENTENCE_PAUSE_MS = 150
_COMMA_PAUSE_MS = 200


def trim_silence(audio, silence_thresh=-40, min_silence_len=50):
    """Strip silence from ``audio`` using pydub's ``strip_silence`` effect.

    ``min_silence_len`` is accepted for interface compatibility but is not
    forwarded, so the effect runs with its own default silence length.
    Returns the input unchanged if the effect fails.
    """
    try:
        return audio.strip_silence(silence_thresh=silence_thresh)
    except Exception as e:
        print(f"Lỗi trim silence: {e}")
        return audio


def process_loudness(audio, boost_factor):
    """Boost perceived loudness by ``boost_factor`` (linear gain, > 1.0).

    Chain: 100 Hz high-pass -> gentle compression -> gain of
    ``20*log10(boost_factor)`` dB -> hard compression near 0 dB -> normalize.
    A factor <= 1.0 returns the audio untouched. On any failure the ORIGINAL
    input is returned, never a half-processed intermediate.
    """
    try:
        if boost_factor <= 1.0:
            return audio
        processed = audio.high_pass_filter(100)
        processed = effects.compress_dynamic_range(processed, threshold=-20.0, ratio=2.5)
        gain_db = 20 * math.log10(boost_factor)
        processed = processed + gain_db
        processed = effects.compress_dynamic_range(processed, threshold=-1.0, ratio=10.0)
        return effects.normalize(processed, headroom=0.1)
    except Exception as e:
        print(f"Lỗi xử lý âm lượng: {e}")
        return audio


def _synthesize_text(voice, text, speed, noise, noise_w):
    """Synthesize ``text`` chunk by chunk and join the chunks with pauses.

    The text is split on ``. , ! ?``; each non-empty chunk is rendered by
    ``voice.synthesize`` into an in-memory WAV and appended, followed by a
    pause chosen from the punctuation that terminated the chunk.
    """
    # With one capture group re.split yields text, punct, text, ..., text.
    parts = re.split(r'([.,!?])', text)
    chunks = parts[0::2]
    # Pad so the final chunk (no trailing punctuation) pairs with "".
    punctuation = parts[1::2] + [""]

    combined_audio = AudioSegment.silent(duration=0)
    for chunk, punc in zip(chunks, punctuation):
        part_text = chunk.strip()
        if not part_text:
            continue

        with io.BytesIO() as wav_io:
            # The WAV header is finalized when the wave writer closes, so the
            # buffer is only read back after the inner `with` exits.
            with wave.open(wav_io, "wb") as wav_file:
                voice.synthesize(
                    part_text,
                    wav_file,
                    length_scale=1.0 / speed,
                    noise_scale=noise,
                    noise_w=noise_w,
                )
            wav_io.seek(0)
            part_audio = AudioSegment.from_wav(wav_io)
        combined_audio += part_audio

        if punc in ('.', '!', '?'):
            combined_audio += AudioSegment.silent(duration=_SENTENCE_PAUSE_MS)
        elif punc == ',':
            combined_audio += AudioSegment.silent(duration=_COMMA_PAUSE_MS)

    return combined_audio


def synthesize_single_segment(args):
    """Synthesize one cleaned text segment (thread-pool worker).

    Args:
        args: Tuple ``(text, voice_name, speed, noise, noise_w, volume_boost,
            sub_info)``; ``sub_info`` is a dict that must contain ``'index'``.

    Returns:
        ``(audio, sub_info)``. On any failure the error is printed and a
        zero-length silent segment is returned so the batch keeps going.
    """
    text, voice_name, speed, noise, noise_w, volume_boost, sub_info = args
    try:
        voice = load_piper_voice(voice_name)
        audio = _synthesize_text(voice, text, speed, noise, noise_w)
        audio = trim_silence(audio)
        audio = process_loudness(audio, volume_boost)
        return (audio, sub_info)
    except Exception as e:
        print(f"Lỗi segment {sub_info['index']}: {e}")
        return (AudioSegment.silent(duration=0), sub_info)


# --- FEATURE: QUICK PREVIEW ---
def preview_speech(text, voice_name, speed, noise, noise_w, volume_boost, clean_opts):
    """Clean and synthesize a short sample, exporting it to ``preview_temp.wav``.

    Returns:
        ``(status_message, wav_path)``; ``wav_path`` is ``None`` when the
        input is empty, no model is selected, or synthesis fails.
    """
    try:
        if not text or not text.strip():
            return "⚠️ Hãy nhập nội dung để test!", None
        if not voice_name:
            return "⚠️ Chưa chọn model!", None

        clean_text = advanced_text_cleaning(text, clean_opts)
        print(f"[Preview Debug] Cleaned Text: {clean_text}")

        voice = load_piper_voice(voice_name)
        audio = _synthesize_text(voice, clean_text, speed, noise, noise_w)
        audio = trim_silence(audio)
        audio = process_loudness(audio, volume_boost)

        output_path = "preview_temp.wav"
        audio.export(output_path, format="wav")
        return "✅ Đã tạo mẫu thử!", output_path
    except Exception as e:
        return f"Lỗi Preview: {str(e)}", None
# --- 4. MAIN PROCESSING PIPELINE ---
def _parse_manual_text(manual_text):
    """Interpret pasted text as SRT when possible, else as plain lines.

    Returns:
        ``(items, is_srt)``: the parsed subtitle entries and ``True`` when
        the text parses as SRT with a first entry whose end time is > 0;
        otherwise the stripped text split on newlines and ``False``.
    """
    try:
        parsed = pysrt.from_string(manual_text)
        if len(parsed) > 0 and parsed[0].end.ordinal > 0:
            return parsed, True
    except Exception:
        # Not parseable as SRT — fall through to plain-text mode.
        pass
    return manual_text.strip().split('\n'), False


def _place_segment(timeline, segment, start_ms, allow_overlap):
    """Return ``timeline`` with ``segment`` placed at ``start_ms``.

    Overlap mode mixes the segment in at exactly ``start_ms``. Sequential
    mode pads with silence up to ``start_ms`` when the timeline is shorter
    and then appends, so a late-running previous segment pushes this one back.
    """
    if allow_overlap:
        # overlay() never lengthens the base segment, so the timeline must
        # first be extended to cover the whole segment or its tail is cut off.
        end_ms = start_ms + len(segment)
        if end_ms > len(timeline):
            timeline += AudioSegment.silent(duration=end_ms - len(timeline))
        return timeline.overlay(segment, position=start_ms)

    if start_ms > len(timeline):
        timeline += AudioSegment.silent(duration=start_ms - len(timeline))
    return timeline + segment


def process_pipeline(voice_name, srt_file, manual_text, base_speed, noise, noise_w,
                     clean_options, smart_speed, volume_boost, allow_overlap):
    """Synthesize an SRT file or pasted text into ``output_pro.wav``.

    Args:
        voice_name: Model file name to synthesize with.
        srt_file: Uploaded SRT file (an object with ``.name`` or a path
            string); takes priority over ``manual_text`` when provided.
        manual_text: Pasted text, SRT-formatted or one utterance per line.
        base_speed, noise, noise_w, volume_boost: Forwarded to each
            synthesis task.
        clean_options: Cleaning option labels for ``advanced_text_cleaning``.
        smart_speed: Currently unused; accepted so the UI wiring stays stable.
        allow_overlap: In SRT mode, mix segments at their timestamps instead
            of appending them one after another.

    Returns:
        ``(status_message, wav_path)``; ``wav_path`` is ``None`` on failure.
    """
    try:
        if not voice_name:
            return "⚠️ Chưa chọn model (đang tải danh sách...)", None

        manual_text = manual_text or ""
        if srt_file:
            # Accept both an upload object exposing .name and a bare path.
            srt_path = getattr(srt_file, "name", srt_file)
            try:
                subs = pysrt.open(srt_path, encoding='utf-8')
            except Exception:
                return "⚠️ Lỗi đọc file SRT!", None
            is_srt = True
        elif manual_text.strip():
            subs, is_srt = _parse_manual_text(manual_text)
        else:
            return "⚠️ Không có dữ liệu!", None

        tasks = []
        for i, item in enumerate(subs):
            text_raw = item.text if is_srt else item
            clean_text = advanced_text_cleaning(text_raw, clean_options)
            if not clean_text.strip():
                continue

            sub_info = {
                'start': item.start.ordinal if is_srt else i * 1000,
                'end': item.end.ordinal if is_srt else 0,
                'index': i,
            }
            tasks.append((clean_text, voice_name, base_speed, noise, noise_w, volume_boost, sub_info))

        if not tasks:
            return "⚠️ Không tìm thấy văn bản sau khi làm sạch! (Kiểm tra lại bộ lọc)", None

        workers = min(4, os.cpu_count() or 2)
        with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
            # Executor.map yields results in input order, so no re-sort is needed.
            results = list(executor.map(synthesize_single_segment, tasks))

        final_audio = AudioSegment.silent(duration=0)
        for segment, info in results:
            if len(segment) == 0:
                # Failed or empty synthesis — skip it.
                continue
            if is_srt:
                final_audio = _place_segment(final_audio, segment, info['start'], allow_overlap)
            else:
                # Plain-text mode: fixed 300 ms gap after every line.
                final_audio += segment + AudioSegment.silent(duration=300)

        output_path = "output_pro.wav"
        final_audio.export(output_path, format="wav")
        return "✅ Hoàn thành!", output_path
    except Exception as e:
        return f"Lỗi hệ thống: {str(e)}", None


# --- 5. GRADIO INTERFACE ---
custom_css = """
.gradio-container {background-color: #f0faff;}
#header_title {text-align: center; color: #1e40af;}
.action-btn {background: linear-gradient(135deg, #60a5fa, #2563eb) !important; color: white !important;}
.test-btn {background: linear-gradient(135deg, #34d399, #059669) !important; color: white !important;}
.group-box {background: white; border-radius: 15px; padding: 20px; box-shadow: 0 4px 6px -1px rgb(0 0 0 / 0.1);}
"""

# Labels double as the keys checked inside advanced_text_cleaning, so they
# must stay byte-identical to the strings used there.
CLEANING_OPTIONS = [
    "Ignore html tags",
    "Ignore text between parentheses ()",
    "Ignore text between curly brackets {}",
    "Ignore text between square brackets []",
    "Ignore text between asterisks * *",
    "Ignore text between notes ♪ ♪",
    "Ignore characters: * # ~ ♪ + _ !",
    "Ignore repetitions of non-alphabetic characters",
    "Ignore periods and commas ( . , )",
]

# Default selection: markup tags and bracketed text are filtered out.
# "Ignore text between notes ♪ ♪" is intentionally NOT checked by default,
# because it deletes the text between note symbols and would silence lyrics.
DEFAULT_CHECKED = [
    "Ignore html tags",
    "Ignore text between curly brackets {}",
    "Ignore text between square brackets []",
    "Ignore text between asterisks * *",
    "Ignore characters: * # ~ ♪ + _ !",
]

user_settings = load_config()

with gr.Blocks() as demo:
    gr.Markdown("# 🎙️ Piper High-Speed Mastering (Cleaned Text)", elem_id="header_title")

    with gr.Row():
        with gr.Column(scale=4):
            with gr.Group(elem_classes="group-box"):
                with gr.Row():
                    voice_choices = get_voice_list()
                    # A voice saved in the config may no longer exist in the
                    # repository; only reuse it when it is still a valid choice.
                    saved_voice = user_settings.get("voice")
                    if saved_voice in voice_choices:
                        default_voice = saved_voice
                    else:
                        default_voice = voice_choices[0] if voice_choices else None
                    v_select = gr.Dropdown(
                        choices=voice_choices,
                        label="Model (Từ Hugging Face)",
                        value=default_voice,
                        scale=4
                    )
                    refresh_btn = gr.Button("🔄 Tải DS", scale=1)

                with gr.Tabs():
                    with gr.TabItem("📝 Văn bản / SRT"):
                        manual_input = gr.Textbox(label="Input", lines=10, placeholder="Dán nội dung...")
                    with gr.TabItem("📁 Tải File"):
                        srt_input = gr.File(label="SRT File")

            with gr.Group(elem_classes="group-box"):
                gr.Markdown("### 🧹 Làm sạch văn bản")
                # Drop saved labels that are not (or no longer) valid options.
                saved_clean_opts = user_settings.get("clean_opts", DEFAULT_CHECKED)
                if isinstance(saved_clean_opts, list):
                    default_clean_opts = [opt for opt in saved_clean_opts if opt in CLEANING_OPTIONS]
                else:
                    default_clean_opts = DEFAULT_CHECKED
                clean_checkboxes = gr.CheckboxGroup(
                    choices=CLEANING_OPTIONS,
                    value=default_clean_opts,
                    label="Tùy chọn lọc bỏ",
                    interactive=True
                )

        with gr.Column(scale=3):
            with gr.Group(elem_classes="group-box"):
                gr.Markdown("### ⚙️ Cài đặt")
                vol_boost = gr.Slider(1.0, 5.0, value=user_settings.get("vol", 3.0), step=0.1, label="🔊 Độ to")
                sp = gr.Slider(0.5, 3.0, value=user_settings.get("speed", 1.3), step=0.1, label="⏩ Tốc độ")
                with gr.Row():
                    smart_spd = gr.Checkbox(label="Smart Speed", value=user_settings.get("smart", False))
                    allow_overlap = gr.Checkbox(label="Ghi đè (SRT)", value=user_settings.get("overlap", False))
                with gr.Row():
                    ns = gr.Slider(0.1, 1.0, value=user_settings.get("ns", 0.6), label="Noise Scale")
                    nw = gr.Slider(0.1, 1.0, value=user_settings.get("nw", 0.8), label="Noise W")

            with gr.Group(elem_classes="group-box"):
                gr.Markdown("### 🎧 Nghe thử nhanh (Test Audio)")
                test_text = gr.Textbox(label="Văn bản test", value="Xin chào, đây là bản thử nghiệm. 1.5 giây.", lines=2)
                with gr.Row():
                    test_btn = gr.Button("🔊 Nghe Thử", variant="secondary", elem_classes="test-btn")
                test_audio_out = gr.Audio(label="Audio Test", interactive=False, type="filepath")
                test_status = gr.Markdown("")

            gr.Markdown("---")
            btn = gr.Button("🚀 CHẠY TOÀN BỘ FILE", variant="primary", elem_classes="action-btn")

    with gr.Row():
        status = gr.Textbox(label="Trạng thái xử lý")
        audio_out = gr.Audio(label="Kết quả Final", interactive=False)

    # Re-query the model repository every 60 seconds.
    auto_updater = gr.Timer(value=60)

    def refresh_voices(current_voice):
        """Reload the model list, keeping the current selection when still valid."""
        new_choices = get_voice_list()
        if current_voice in new_choices:
            next_val = current_voice
        else:
            next_val = new_choices[0] if new_choices else None
        return gr.update(choices=new_choices, value=next_val)

    refresh_btn.click(refresh_voices, inputs=[v_select], outputs=[v_select])
    auto_updater.tick(refresh_voices, inputs=[v_select], outputs=[v_select])

    test_btn.click(
        preview_speech,
        inputs=[test_text, v_select, sp, ns, nw, vol_boost, clean_checkboxes],
        outputs=[test_status, test_audio_out]
    )

    btn.click(
        process_pipeline,
        [v_select, srt_input, manual_input, sp, ns, nw, clean_checkboxes, smart_spd, vol_boost, allow_overlap],
        [status, audio_out]
    )

    # Persist the full settings snapshot whenever any single setting changes.
    settings_inputs = [v_select, clean_checkboxes, vol_boost, sp, smart_spd, allow_overlap, ns, nw]
    for component in settings_inputs:
        component.change(fn=save_config_to_file, inputs=settings_inputs, outputs=None)

if __name__ == "__main__":
    # NOTE(review): theme/css are passed to launch() rather than gr.Blocks();
    # this needs a Gradio release whose launch() accepts them — confirm the
    # pinned version.
    demo.launch(
        server_name="0.0.0.0",
        theme=gr.themes.Soft(primary_hue="blue", secondary_hue="cyan"),
        css=custom_css
    )