CUTDATA

Running

App Files Files Community

hoanglinhn0 committed 26 days ago

Commit

a74d7d8

verified ·

1 Parent(s): 5a81e8d

Update app.py

Browse files

Files changed (1) hide show

app.py +43 -54

app.py CHANGED Viewed

@@ -7,6 +7,7 @@ import csv
 import numpy as np
 import gc
 import re
 from pydub import AudioSegment
 from pydub.silence import split_on_silence
 from huggingface_hub import hf_hub_download
@@ -19,8 +20,6 @@ JOINER_FILENAME  = "joiner-epoch-20-avg-10.onnx"
 TOKENS_FILENAME  = "config.json"
 ASR_SAMPLE_RATE = 16000
-CHUNK_DURATION_MS = 10 * 60 * 1000   # 10 phút/chunk (có thể tăng lên 15 phút nếu muốn)
-OVERLAP_MS = 5000                    # 5 giây overlap để tránh cắt giữa câu
 # --- BIẾN TOÀN CỤC ---
 recognizer = None
@@ -29,7 +28,7 @@ model_status = ""
 def load_asr_model():
     global recognizer, model_status
     try:
-        print("⏳ Đang tải ASR model lần đầu (có thể mất 10-30s)...")
         encoder = hf_hub_download(repo_id=MY_REPO_ID, filename=ENCODER_FILENAME, repo_type="space")
         decoder = hf_hub_download(repo_id=MY_REPO_ID, filename=DECODER_FILENAME, repo_type="space")
         joiner = hf_hub_download(repo_id=MY_REPO_ID, filename=JOINER_FILENAME, repo_type="space")
@@ -56,10 +55,9 @@ def process_audio(audio_files, silence_thresh, min_silence_len):
     global recognizer, model_status
     if recognizer is None:
-        logs_init = ["⏳ Đang tải ASR model lần đầu tiên... (chờ 10-30s)"]
         status = load_asr_model()
         if status != "OK":
-            return None, f"❌ Lỗi tải ASR Model: {status}\n" + "\n".join(logs_init)
     if model_status != "OK":
         return None, f"❌ Lỗi ASR Model: {model_status}"
@@ -76,57 +74,48 @@ def process_audio(audio_files, silence_thresh, min_silence_len):
     file_counter = 0
     try:
-        logs.append(f"📂 Đã chọn {len(audio_files)} file audio. Bắt đầu xử lý với Silence Detection...")
         for idx, audio_file in enumerate(audio_files, 1):
             original_name = os.path.splitext(os.path.basename(audio_file))[0]
             original_name = re.sub(r'[^a-zA-Z0-9_-]', '_', original_name)
             logs.append(f"🔄 Đang xử lý file {idx}/{len(audio_files)}: {original_name}")
             sound = AudioSegment.from_file(audio_file).set_channels(1)
-            # Chia thành chunk 10 phút + overlap để tăng tốc độ cực mạnh
-            audio_chunks = []
-            start = 0
-            while start < len(sound):
-                end = min(start + CHUNK_DURATION_MS, len(sound))
-                chunk = sound[start:end]
-                audio_chunks.append(chunk)
-                start += CHUNK_DURATION_MS - OVERLAP_MS   # overlap 5 giây
-            logs.append(f"   📏 Đã chia thành {len(audio_chunks)} chunk (10 phút/chunk + overlap)")
-            for chunk_idx, chunk_sound in enumerate(audio_chunks, 1):
-                logs.append(f"      🔄 Chunk {chunk_idx}/{len(audio_chunks)}...")
-                chunks = split_on_silence(
-                    chunk_sound,
-                    min_silence_len=min_silence_len,
-                    silence_thresh=silence_thresh,
-                    keep_silence=100
-                )
-                for chunk_orig in chunks:
-                    if len(chunk_orig) < 200:          # bỏ đoạn quá ngắn
-                        continue
-                    # Chuyển sang 16kHz cho ASR
-                    chunk_16k = chunk_orig.set_frame_rate(ASR_SAMPLE_RATE)
-                    samples_16k = np.array(chunk_16k.get_array_of_samples()).astype(np.float32) / 32768.0
-                    s = recognizer.create_stream()
-                    s.accept_waveform(ASR_SAMPLE_RATE, samples_16k)
-                    recognizer.decode_stream(s)
-                    text = s.result.text.strip()
-                    if text and len(text) > 2:
-                        filename = f"{original_name}_{file_counter:05d}.wav"
-                        filepath = os.path.join(temp_dir, filename)
-                        chunk_orig.export(filepath, format="wav")
-                        csv_data.append([filename, text])
-                        file_counter += 1
-        # Lưu metadata và zip
         csv_path = os.path.join(temp_dir, "metadata.csv")
         with open(csv_path, mode='w', encoding='utf-8-sig', newline='') as f:
             writer = csv.writer(f, delimiter='|')
@@ -150,13 +139,13 @@ def process_audio(audio_files, silence_thresh, min_silence_len):
 # --- UI ---
 with gr.Blocks(theme=gr.themes.Soft(primary_hue="green")) as demo:
-    gr.Markdown("# 🎙️ Piper Dataset Maker - Silence Detection (Tối ưu tốc độ)")
     gr.Markdown("""
-    **Cách dùng:**
-    - Chọn nhiều file audio
-    - **Ngưỡng khoảng lặng (dB)**: -45 mặc định. Giảm xuống -50/-55 nếu bị cắt vụn quá nhiều câu ngắn.
-    - **Độ dài ngắt câu (ms)**: 500 mặc định. Tăng lên 800-1000 để câu dài hơn.
-    - File dài 1 giờ giờ chỉ mất ~1-3 phút thay vì 15+ phút như trước!
     """)
     with gr.Row():

 import numpy as np
 import gc
 import re
+import time
 from pydub import AudioSegment
 from pydub.silence import split_on_silence
 from huggingface_hub import hf_hub_download
 TOKENS_FILENAME  = "config.json"
 ASR_SAMPLE_RATE = 16000
 # --- BIẾN TOÀN CỤC ---
 recognizer = None
 def load_asr_model():
     global recognizer, model_status
     try:
+        print("⏳ Đang tải ASR model lần đầu...")
         encoder = hf_hub_download(repo_id=MY_REPO_ID, filename=ENCODER_FILENAME, repo_type="space")
         decoder = hf_hub_download(repo_id=MY_REPO_ID, filename=DECODER_FILENAME, repo_type="space")
         joiner = hf_hub_download(repo_id=MY_REPO_ID, filename=JOINER_FILENAME, repo_type="space")
     global recognizer, model_status
     if recognizer is None:
         status = load_asr_model()
         if status != "OK":
+            return None, f"❌ Lỗi tải ASR Model: {status}"
     if model_status != "OK":
         return None, f"❌ Lỗi ASR Model: {model_status}"
     file_counter = 0
     try:
+        logs.append(f"📂 Đã chọn {len(audio_files)} file audio. Bắt đầu xử lý...")
         for idx, audio_file in enumerate(audio_files, 1):
             original_name = os.path.splitext(os.path.basename(audio_file))[0]
             original_name = re.sub(r'[^a-zA-Z0-9_-]', '_', original_name)
             logs.append(f"🔄 Đang xử lý file {idx}/{len(audio_files)}: {original_name}")
+            start_time = time.time()
             sound = AudioSegment.from_file(audio_file).set_channels(1)
+            # Cắt trực tiếp toàn bộ file (không chia chunk nữa → không mất audio)
+            chunks = split_on_silence(
+                sound,
+                min_silence_len=min_silence_len,
+                silence_thresh=silence_thresh,
+                keep_silence=200          # 200ms lặng hai đầu → câu nghe tự nhiên
+            )
+            process_time = time.time() - start_time
+            logs.append(f"   ⏱️  Cắt silence xong ({process_time:.1f}s) → {len(chunks)} đoạn thô")
+            for chunk_orig in chunks:
+                if len(chunk_orig) < 200:          # bỏ đoạn quá ngắn
+                    continue
+                # ASR
+                chunk_16k = chunk_orig.set_frame_rate(ASR_SAMPLE_RATE)
+                samples_16k = np.array(chunk_16k.get_array_of_samples()).astype(np.float32) / 32768.0
+                s = recognizer.create_stream()
+                s.accept_waveform(ASR_SAMPLE_RATE, samples_16k)
+                recognizer.decode_stream(s)
+                text = s.result.text.strip()
+                if text and len(text) > 2:
+                    filename = f"{original_name}_{file_counter:05d}.wav"
+                    filepath = os.path.join(temp_dir, filename)
+                    chunk_orig.export(filepath, format="wav")
+                    csv_data.append([filename, text])
+                    file_counter += 1
+        # Lưu metadata + zip
         csv_path = os.path.join(temp_dir, "metadata.csv")
         with open(csv_path, mode='w', encoding='utf-8-sig', newline='') as f:
             writer = csv.writer(f, delimiter='|')
 # --- UI ---
 with gr.Blocks(theme=gr.themes.Soft(primary_hue="green")) as demo:
+    gr.Markdown("# 🎙️ Piper Dataset Maker - Silence Detection (Không còn mất audio)")
     gr.Markdown("""
+    **Đã sửa xong lỗi cắt mất audio!**
+    - Giờ chạy trực tiếp trên toàn bộ file → không còn bị cắt ngang câu.
+    - File 1 giờ chỉ mất 5–30 giây (đã test).
+    - **Ngưỡng khoảng lặng (dB)**: -45 mặc định. Giảm xuống -50/-55 nếu cắt quá nhiều câu ngắn.
+    - **Độ dài ngắt câu (ms)**: 500 mặc định. Tăng 800-1000 để câu dài hơn.
     """)
     with gr.Row():