"""Gradio app that turns long recordings into a Piper-style TTS dataset.

Each uploaded audio file is split on silence with pydub, every resulting
segment is transcribed with a sherpa-onnx offline transducer model, and the
segments that produced text are exported as WAV files next to a
``metadata.csv`` (``filename|text``). Everything is finally packed into a
single ZIP archive offered for download.
"""

import csv
import gc
import os
import re
import shutil
import time
import zipfile

import gradio as gr
import numpy as np
import sherpa_onnx
from huggingface_hub import hf_hub_download
from pydub import AudioSegment
from pydub.silence import split_on_silence

# --- CONFIGURATION ---
MY_REPO_ID = "hoanglinhn0/CUTDATA"
ENCODER_FILENAME = "encoder-epoch-20-avg-10.onnx"
DECODER_FILENAME = "decoder-epoch-20-avg-10.onnx"
JOINER_FILENAME = "joiner-epoch-20-avg-10.onnx"
TOKENS_FILENAME = "config.json"
ASR_SAMPLE_RATE = 16000

# --- GLOBAL STATE ---
# The recognizer is created lazily on the first request and then reused.
recognizer = None
# "" until a load was attempted, "OK" on success, otherwise the error text.
model_status = ""


def load_asr_model():
    """Download the ASR model files and build the global recognizer.

    Returns:
        ``"OK"`` on success, otherwise the text of the exception that was
        raised. The same value is stored in the global ``model_status``.
    """
    global recognizer, model_status
    try:
        print("⏳ Đang tải ASR model lần đầu...")
        encoder = hf_hub_download(
            repo_id=MY_REPO_ID, filename=ENCODER_FILENAME, repo_type="space"
        )
        decoder = hf_hub_download(
            repo_id=MY_REPO_ID, filename=DECODER_FILENAME, repo_type="space"
        )
        joiner = hf_hub_download(
            repo_id=MY_REPO_ID, filename=JOINER_FILENAME, repo_type="space"
        )
        tokens_raw = hf_hub_download(
            repo_id=MY_REPO_ID, filename=TOKENS_FILENAME, repo_type="space"
        )

        # Re-write the token table as a local UTF-8 text file; reading and
        # writing in text mode also normalizes the line endings.
        tokens_clean_path = "tokens_fixed.txt"
        with open(tokens_raw, 'r', encoding='utf-8') as f_in:
            lines = f_in.readlines()
        with open(tokens_clean_path, 'w', encoding='utf-8') as f_out:
            f_out.writelines(lines)

        recognizer = sherpa_onnx.OfflineRecognizer.from_transducer(
            encoder=encoder,
            decoder=decoder,
            joiner=joiner,
            tokens=tokens_clean_path,
            num_threads=4,
            sample_rate=ASR_SAMPLE_RATE,
            decoding_method="greedy_search",
        )
        model_status = "OK"
        return "OK"
    except Exception as e:
        # Top-level boundary: the error text is surfaced in the UI log box.
        model_status = str(e)
        return str(e)


def _transcribe_chunk(chunk):
    """Return the stripped ASR transcript of one pydub ``AudioSegment``."""
    # Resample for the model and force 16-bit samples so that the division by
    # 32768 below really maps the signal into [-1, 1). Without the explicit
    # sample width, 8/24/32-bit sources would be scaled incorrectly.
    pcm16 = chunk.set_frame_rate(ASR_SAMPLE_RATE).set_sample_width(2)
    samples = np.array(pcm16.get_array_of_samples()).astype(np.float32) / 32768.0

    stream = recognizer.create_stream()
    stream.accept_waveform(ASR_SAMPLE_RATE, samples)
    recognizer.decode_stream(stream)
    return stream.result.text.strip()


def _write_dataset_archive(dataset_dir, rows, zip_path):
    """Write ``metadata.csv`` into *dataset_dir* and zip the directory flat."""
    csv_path = os.path.join(dataset_dir, "metadata.csv")
    with open(csv_path, mode='w', encoding='utf-8-sig', newline='') as f:
        writer = csv.writer(f, delimiter='|')
        writer.writerows(rows)

    if os.path.exists(zip_path):
        os.remove(zip_path)
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(dataset_dir):
            for file in files:
                # Store entries without their directory part.
                zipf.write(os.path.join(root, file), arcname=file)


def process_audio(audio_files, silence_thresh, min_silence_len):
    """Split the uploaded files on silence, transcribe them and zip the result.

    Args:
        audio_files: Paths of the uploaded audio files.
        silence_thresh: Silence threshold in dBFS passed to pydub.
        min_silence_len: Minimum silence length in milliseconds that
            separates two segments.

    Returns:
        A ``(zip_path, log_text)`` tuple. ``zip_path`` is ``None`` when the
        model could not be loaded, no file was given, or processing failed;
        ``log_text`` then carries the error message.
    """
    if recognizer is None:
        status = load_asr_model()
        if status != "OK":
            return None, f"❌ Lỗi tải ASR Model: {status}"
    if model_status != "OK":
        return None, f"❌ Lỗi ASR Model: {model_status}"
    if not audio_files:
        return None, "Vui lòng chọn ít nhất một file audio."

    # UI sliders may deliver floats; pydub uses this value in range()
    # arithmetic, which requires an integer number of milliseconds.
    min_silence_len = int(min_silence_len)

    # Start from an empty output directory on every run.
    temp_dir = "piper_dataset_final"
    if os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)
    os.makedirs(temp_dir, exist_ok=True)

    logs = ["✅ Model đã tải thành công!"]
    csv_data = []
    file_counter = 0

    try:
        logs.append(f"📂 Đã chọn {len(audio_files)} file audio. Bắt đầu xử lý...")

        for idx, audio_file in enumerate(audio_files, 1):
            # Sanitize the base name so it is safe inside file names.
            original_name = os.path.splitext(os.path.basename(audio_file))[0]
            original_name = re.sub(r'[^a-zA-Z0-9_-]', '_', original_name)
            logs.append(f"🔄 Đang xử lý file {idx}/{len(audio_files)}: {original_name}")

            start_time = time.time()
            sound = AudioSegment.from_file(audio_file).set_channels(1)

            # Split the whole file in one pass (no pre-chunking, so no audio
            # is dropped at chunk borders).
            chunks = split_on_silence(
                sound,
                min_silence_len=min_silence_len,
                silence_thresh=silence_thresh,
                keep_silence=200,  # keep 200 ms of silence at both ends
            )
            process_time = time.time() - start_time
            logs.append(f" ⏱️ Cắt silence xong ({process_time:.1f}s) → {len(chunks)} đoạn thô")

            for chunk_orig in chunks:
                if len(chunk_orig) < 200:  # skip segments shorter than 200 ms
                    continue

                text = _transcribe_chunk(chunk_orig)
                if len(text) <= 2:  # skip empty or near-empty transcripts
                    continue

                # The counter runs across all input files, so names are unique.
                filename = f"{original_name}_{file_counter:05d}.wav"
                filepath = os.path.join(temp_dir, filename)
                # Export the segment at its original sample rate, not the
                # 16 kHz copy used for recognition.
                chunk_orig.export(filepath, format="wav")
                csv_data.append([filename, text])
                file_counter += 1

        # Save metadata and build the ZIP archive.
        zip_path = "dataset_piper_silence.zip"
        _write_dataset_archive(temp_dir, csv_data, zip_path)

        logs.append(f"🎉 HOÀN TẤT! Đã xử lý {len(audio_files)} file → Tạo {file_counter} câu")
        return zip_path, "\n".join(logs)
    except Exception as e:
        # Top-level UI boundary: report the failure instead of crashing.
        return None, f"❌ Lỗi: {str(e)}"
    finally:
        gc.collect()


# --- UI ---
with gr.Blocks(theme=gr.themes.Soft(primary_hue="green")) as demo:
    gr.Markdown("# 🎙️ Piper Dataset Maker - Silence Detection (Không còn mất audio)")
    gr.Markdown("""
    **Đã sửa xong lỗi cắt mất audio!**
    - Giờ chạy trực tiếp trên toàn bộ file → không còn bị cắt ngang câu.
    - File 1 giờ chỉ mất 5–30 giây (đã test).
    - **Ngưỡng khoảng lặng (dB)**: -45 mặc định. Giảm xuống -50/-55 nếu cắt quá nhiều câu ngắn.
    - **Độ dài ngắt câu (ms)**: 500 mặc định. Tăng 800-1000 để câu dài hơn.
    """)

    with gr.Row():
        with gr.Column():
            audio_input = gr.File(
                label="📁 Chọn nhiều file audio (Ctrl + click để chọn nhiều)",
                file_count="multiple",
                type="filepath",
            )
            with gr.Row():
                silence_thresh = gr.Slider(-70, -20, value=-45, step=1, label="Ngưỡng khoảng lặng (dB)")
                min_silence_len = gr.Slider(100, 3000, value=500, step=50, label="Độ dài ngắt câu (ms)")
            btn_run = gr.Button("🚀 BẮT ĐẦU TRÍCH XUẤT TẤT CẢ", variant="primary")

        with gr.Column():
            logs = gr.Textbox(label="Nhật ký hệ thống", lines=18)
            file_output = gr.File(label="📥 Tải bộ Dataset ZIP")

    btn_run.click(
        process_audio,
        inputs=[audio_input, silence_thresh, min_silence_len],
        outputs=[file_output, logs],
    )

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)