hoanglinhn0 committed on
Commit
a74d7d8
·
verified ·
1 Parent(s): 5a81e8d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -54
app.py CHANGED
@@ -7,6 +7,7 @@ import csv
7
  import numpy as np
8
  import gc
9
  import re
 
10
  from pydub import AudioSegment
11
  from pydub.silence import split_on_silence
12
  from huggingface_hub import hf_hub_download
@@ -19,8 +20,6 @@ JOINER_FILENAME = "joiner-epoch-20-avg-10.onnx"
19
  TOKENS_FILENAME = "config.json"
20
 
21
  ASR_SAMPLE_RATE = 16000
22
- CHUNK_DURATION_MS = 10 * 60 * 1000 # 10 phút/chunk (có thể tăng lên 15 phút nếu muốn)
23
- OVERLAP_MS = 5000 # 5 giây overlap để tránh cắt giữa câu
24
 
25
  # --- BIẾN TOÀN CỤC ---
26
  recognizer = None
@@ -29,7 +28,7 @@ model_status = ""
29
  def load_asr_model():
30
  global recognizer, model_status
31
  try:
32
- print("⏳ Đang tải ASR model lần đầu (có thể mất 10-30s)...")
33
  encoder = hf_hub_download(repo_id=MY_REPO_ID, filename=ENCODER_FILENAME, repo_type="space")
34
  decoder = hf_hub_download(repo_id=MY_REPO_ID, filename=DECODER_FILENAME, repo_type="space")
35
  joiner = hf_hub_download(repo_id=MY_REPO_ID, filename=JOINER_FILENAME, repo_type="space")
@@ -56,10 +55,9 @@ def process_audio(audio_files, silence_thresh, min_silence_len):
56
  global recognizer, model_status
57
 
58
  if recognizer is None:
59
- logs_init = ["⏳ Đang tải ASR model lần đầu tiên... (chờ 10-30s)"]
60
  status = load_asr_model()
61
  if status != "OK":
62
- return None, f"❌ Lỗi tải ASR Model: {status}\n" + "\n".join(logs_init)
63
 
64
  if model_status != "OK":
65
  return None, f"❌ Lỗi ASR Model: {model_status}"
@@ -76,57 +74,48 @@ def process_audio(audio_files, silence_thresh, min_silence_len):
76
  file_counter = 0
77
 
78
  try:
79
- logs.append(f"📂 Đã chọn {len(audio_files)} file audio. Bắt đầu xử lý với Silence Detection...")
80
 
81
  for idx, audio_file in enumerate(audio_files, 1):
82
  original_name = os.path.splitext(os.path.basename(audio_file))[0]
83
  original_name = re.sub(r'[^a-zA-Z0-9_-]', '_', original_name)
84
  logs.append(f"🔄 Đang xử lý file {idx}/{len(audio_files)}: {original_name}")
85
 
 
86
  sound = AudioSegment.from_file(audio_file).set_channels(1)
87
 
88
- # Chia thành chunk 10 phút + overlap để tăng tốc độ cực mạnh
89
- audio_chunks = []
90
- start = 0
91
- while start < len(sound):
92
- end = min(start + CHUNK_DURATION_MS, len(sound))
93
- chunk = sound[start:end]
94
- audio_chunks.append(chunk)
95
- start += CHUNK_DURATION_MS - OVERLAP_MS # overlap 5 giây
96
-
97
- logs.append(f" 📏 Đã chia thành {len(audio_chunks)} chunk (10 phút/chunk + overlap)")
98
-
99
- for chunk_idx, chunk_sound in enumerate(audio_chunks, 1):
100
- logs.append(f" 🔄 Chunk {chunk_idx}/{len(audio_chunks)}...")
101
-
102
- chunks = split_on_silence(
103
- chunk_sound,
104
- min_silence_len=min_silence_len,
105
- silence_thresh=silence_thresh,
106
- keep_silence=100
107
- )
108
-
109
- for chunk_orig in chunks:
110
- if len(chunk_orig) < 200: # bỏ đoạn quá ngắn
111
- continue
112
-
113
- # Chuyển sang 16kHz cho ASR
114
- chunk_16k = chunk_orig.set_frame_rate(ASR_SAMPLE_RATE)
115
- samples_16k = np.array(chunk_16k.get_array_of_samples()).astype(np.float32) / 32768.0
116
-
117
- s = recognizer.create_stream()
118
- s.accept_waveform(ASR_SAMPLE_RATE, samples_16k)
119
- recognizer.decode_stream(s)
120
- text = s.result.text.strip()
121
-
122
- if text and len(text) > 2:
123
- filename = f"{original_name}_{file_counter:05d}.wav"
124
- filepath = os.path.join(temp_dir, filename)
125
- chunk_orig.export(filepath, format="wav")
126
- csv_data.append([filename, text])
127
- file_counter += 1
128
-
129
- # Lưu metadata và zip
130
  csv_path = os.path.join(temp_dir, "metadata.csv")
131
  with open(csv_path, mode='w', encoding='utf-8-sig', newline='') as f:
132
  writer = csv.writer(f, delimiter='|')
@@ -150,13 +139,13 @@ def process_audio(audio_files, silence_thresh, min_silence_len):
150
 
151
  # --- UI ---
152
  with gr.Blocks(theme=gr.themes.Soft(primary_hue="green")) as demo:
153
- gr.Markdown("# 🎙️ Piper Dataset Maker - Silence Detection (Tối ưu tốc độ)")
154
  gr.Markdown("""
155
- **Cách dùng:**
156
- - Chọn nhiều file audio
157
- - **Ngưỡng khoảng lặng (dB)**: -45 mặc định. Giảm xuống -50/-55 nếu bị cắt vụn quá nhiều câu ngắn.
158
- - **Độ dài ngắt câu (ms)**: 500 mặc định. Tăng lên 800-1000 để câu dài hơn.
159
- - File dài 1 giờ giờ chỉ mất ~1-3 phút thay 15+ phút như trước!
160
  """)
161
 
162
  with gr.Row():
 
7
  import numpy as np
8
  import gc
9
  import re
10
+ import time
11
  from pydub import AudioSegment
12
  from pydub.silence import split_on_silence
13
  from huggingface_hub import hf_hub_download
 
20
  TOKENS_FILENAME = "config.json"
21
 
22
  ASR_SAMPLE_RATE = 16000
 
 
23
 
24
  # --- BIẾN TOÀN CỤC ---
25
  recognizer = None
 
28
  def load_asr_model():
29
  global recognizer, model_status
30
  try:
31
+ print("⏳ Đang tải ASR model lần đầu...")
32
  encoder = hf_hub_download(repo_id=MY_REPO_ID, filename=ENCODER_FILENAME, repo_type="space")
33
  decoder = hf_hub_download(repo_id=MY_REPO_ID, filename=DECODER_FILENAME, repo_type="space")
34
  joiner = hf_hub_download(repo_id=MY_REPO_ID, filename=JOINER_FILENAME, repo_type="space")
 
55
  global recognizer, model_status
56
 
57
  if recognizer is None:
 
58
  status = load_asr_model()
59
  if status != "OK":
60
+ return None, f"❌ Lỗi tải ASR Model: {status}"
61
 
62
  if model_status != "OK":
63
  return None, f"❌ Lỗi ASR Model: {model_status}"
 
74
  file_counter = 0
75
 
76
  try:
77
+ logs.append(f"📂 Đã chọn {len(audio_files)} file audio. Bắt đầu xử lý...")
78
 
79
  for idx, audio_file in enumerate(audio_files, 1):
80
  original_name = os.path.splitext(os.path.basename(audio_file))[0]
81
  original_name = re.sub(r'[^a-zA-Z0-9_-]', '_', original_name)
82
  logs.append(f"🔄 Đang xử lý file {idx}/{len(audio_files)}: {original_name}")
83
 
84
+ start_time = time.time()
85
  sound = AudioSegment.from_file(audio_file).set_channels(1)
86
 
87
+ # Cắt trực tiếp toàn bộ file (không chia chunk nữa không mất audio)
88
+ chunks = split_on_silence(
89
+ sound,
90
+ min_silence_len=min_silence_len,
91
+ silence_thresh=silence_thresh,
92
+ keep_silence=200 # 200ms lặng hai đầu → câu nghe tự nhiên
93
+ )
94
+
95
+ process_time = time.time() - start_time
96
+ logs.append(f" ⏱️ Cắt silence xong ({process_time:.1f}s) {len(chunks)} đoạn thô")
97
+
98
+ for chunk_orig in chunks:
99
+ if len(chunk_orig) < 200: # bỏ đoạn quá ngắn
100
+ continue
101
+
102
+ # ASR
103
+ chunk_16k = chunk_orig.set_frame_rate(ASR_SAMPLE_RATE)
104
+ samples_16k = np.array(chunk_16k.get_array_of_samples()).astype(np.float32) / 32768.0
105
+
106
+ s = recognizer.create_stream()
107
+ s.accept_waveform(ASR_SAMPLE_RATE, samples_16k)
108
+ recognizer.decode_stream(s)
109
+ text = s.result.text.strip()
110
+
111
+ if text and len(text) > 2:
112
+ filename = f"{original_name}_{file_counter:05d}.wav"
113
+ filepath = os.path.join(temp_dir, filename)
114
+ chunk_orig.export(filepath, format="wav")
115
+ csv_data.append([filename, text])
116
+ file_counter += 1
117
+
118
+ # Lưu metadata + zip
 
 
 
 
 
 
 
 
 
 
119
  csv_path = os.path.join(temp_dir, "metadata.csv")
120
  with open(csv_path, mode='w', encoding='utf-8-sig', newline='') as f:
121
  writer = csv.writer(f, delimiter='|')
 
139
 
140
  # --- UI ---
141
  with gr.Blocks(theme=gr.themes.Soft(primary_hue="green")) as demo:
142
+ gr.Markdown("# 🎙️ Piper Dataset Maker - Silence Detection (Không còn mất audio)")
143
  gr.Markdown("""
144
+ **Đã sửa xong lỗi cắt mất audio!**
145
+ - Giờ chạy trực tiếp trên toàn bộ file → không còn bị cắt ngang câu.
146
+ - File 1 giờ chỉ mất 5–30 giây (đã test).
147
+ - **Ngưỡng khoảng lặng (dB)**: -45 mặc định. Giảm xuống -50/-55 nếu cắt quá nhiều câu ngắn.
148
+ - **Độ dài ngắt câu (ms)**: 500 mặc định. Tăng 800-1000 để câu dài hơn.
149
  """)
150
 
151
  with gr.Row():