Spaces:
Sleeping
Sleeping
| import os | |
| import sys | |
| import threading | |
| import numpy as np | |
| import soundfile as sf | |
| import shutil | |
| import librosa | |
| import gradio as gr | |
| # torch and NeuTTSAir imported lazily in get_tts() to avoid slow startup / OOM on Render | |
| # --------------------------- | |
| # eSpeak check (Windows + Linux) | |
| # --------------------------- | |
def check_espeak_installed():
    """Locate an eSpeak installation and report whether one was found.

    Search order:
      1. A ``PHONEMIZER_ESPEAK_LIBRARY`` environment variable that points at
         an existing path is accepted as-is.
      2. On non-Windows platforms, a few fixed library directories are probed
         for the shared object, then ``PATH`` is checked for the executable.
      3. On Windows, DLLs next to an executable found on ``PATH`` are tried
         first, then the usual install directories are walked.

    Side effects: may set ``PHONEMIZER_ESPEAK_LIBRARY`` and (Windows only)
    prepend an install directory to ``PATH``. Progress and errors are printed.

    Returns:
        bool: True when a usable installation was located, False otherwise.
    """
    env_key = "PHONEMIZER_ESPEAK_LIBRARY"

    # If already set (e.g. by Docker), trust it
    preset = os.environ.get(env_key)
    if preset and os.path.exists(preset):
        print(f"Using espeak library from env: {preset}")
        return True

    # Linux: look for libespeak-ng.so in common locations
    if sys.platform != "win32":
        lib_dirs = ("/usr/lib", "/usr/lib/x86_64-linux-gnu", "/usr/local/lib")
        lib_names = ("libespeak-ng.so", "libespeak-ng.so.1", "libespeak.so")
        for lib_dir in lib_dirs:
            if not os.path.isdir(lib_dir):
                continue
            for lib_name in lib_names:
                lib_path = os.path.join(lib_dir, lib_name)
                if os.path.exists(lib_path):
                    os.environ[env_key] = lib_path
                    print(f"Found espeak library at: {lib_path}")
                    return True
        # No library file found: accept an executable on PATH instead.
        if any(shutil.which(cmd) for cmd in ("espeak-ng", "espeak")):
            print("Found espeak-ng in PATH (phonemizer may use default library)")
            return True
        print("\nError: espeak-ng not found! On Linux install with: apt-get install espeak-ng libespeak-ng-dev")
        return False

    # Windows
    dll_names = ('libespeak-ng.dll', 'espeak-ng.dll', 'libespeak.dll', 'espeak.dll')
    install_dirs = (
        "C:\\Program Files\\eSpeak NG",
        "C:\\Program Files (x86)\\eSpeak NG",
        "C:\\Program Files\\eSpeak",
        "C:\\Program Files (x86)\\eSpeak",
    )

    # First choice: a DLL sitting beside an executable that is already on PATH.
    for exe_cmd in ('espeak-ng', 'espeak'):
        exe_path = shutil.which(exe_cmd)
        if not exe_path:
            continue
        print(f"Found {exe_cmd} in PATH at: {exe_path}")
        exe_dir = os.path.dirname(exe_path)
        for dll in dll_names:
            dll_path = os.path.join(exe_dir, dll)
            if os.path.exists(dll_path):
                os.environ[env_key] = dll_path
                print(f"Found espeak shared library at: {dll_path}")
                return True

    # Second choice: walk the standard install directories looking for a DLL.
    for path in install_dirs:
        if not os.path.exists(path):
            continue
        for root, _subdirs, _filenames in os.walk(path):
            for dll in dll_names:
                dll_path = os.path.join(root, dll)
                if os.path.exists(dll_path):
                    os.environ[env_key] = dll_path
                    os.environ['PATH'] = f"{path};{os.environ['PATH']}"
                    return True
        # NOTE(review): an executable without a DLL only extends PATH and then
        # stops the search, so the function still reports failure below —
        # confirm this is intended.
        if os.path.exists(os.path.join(path, 'espeak-ng.exe')):
            os.environ['PATH'] = f"{path};{os.environ['PATH']}"
            break

    print("\nError: espeak-ng not found!")
    print("Install from https://github.com/espeak-ng/espeak-ng/releases")
    return False
# Stop at import time with exit status 1 when no eSpeak installation is found.
if not check_espeak_installed():
    raise SystemExit(1)
| # --------------------------- | |
| # Model initialization (deferred so server can bind to PORT first for Render) | |
| # --------------------------- | |
# Process-wide model instance; stays None until get_tts() has loaded it.
tts = None
# Guards the lazy load so the model is only constructed once.
_tts_lock = threading.Lock()


def get_tts():
    """Return the shared NeuTTSAir instance, constructing it on the first call.

    Loading is deferred so the Gradio server can start and bind to PORT
    before the model is loaded. ``torch`` and ``NeuTTSAir`` are imported here
    rather than at module import time for the same reason.

    Returns:
        The cached ``NeuTTSAir`` object stored in the module-level ``tts``.
    """
    global tts
    with _tts_lock:
        # Fast path: a previous call already built the model.
        if tts is not None:
            return tts

        import torch
        from neuttsair.neutts import NeuTTSAir

        print("\nLoading TTS model (first use)...")
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.synchronize()
            total_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
            print(f"GPU Memory: {total_gb:.2f} GB total")

        project_root = os.path.abspath(os.path.dirname(__file__))
        local_backbone = os.path.join(project_root, "Models", "neutts-air")

        def _resolve_hf_snapshot(root_path: str) -> str:
            """Return the first ``models--*/snapshots/*`` dir holding config.json, else root_path."""
            try:
                for entry in os.listdir(root_path):
                    if not entry.startswith("models--"):
                        continue
                    snapshots_dir = os.path.join(root_path, entry, "snapshots")
                    if not os.path.isdir(snapshots_dir):
                        continue
                    for snap in os.listdir(snapshots_dir):
                        snap_path = os.path.join(snapshots_dir, snap)
                        if os.path.exists(os.path.join(snap_path, "config.json")):
                            print(f"Found model in snapshots: {snap_path}")
                            return snap_path
            except Exception as e:
                # Best effort: fall back to the directory itself on any error.
                print(f"Warning: Error resolving model path: {e}")
            return root_path

        # Use full transformers model (neuphonic/neutts-air) to avoid llama-cpp build on cloud
        if os.path.isdir(local_backbone):
            backbone_arg = _resolve_hf_snapshot(local_backbone)
        else:
            backbone_arg = "neuphonic/neutts-air"
        codec_repo = "neuphonic/neucodec"
        print(f"Using backbone: {backbone_arg}")
        print(f"Using codec: {codec_repo}")

        if torch.cuda.is_available():
            backbone_device = codec_device = "cuda"
            gpu_memory_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
            if gpu_memory_gb <= 4.5:
                # Small GPUs keep only the backbone on the device.
                print(f"Detected {gpu_memory_gb:.2f} GB GPU. Loading codec on CPU to save GPU memory.")
                codec_device = "cpu"
        else:
            backbone_device = codec_device = "cpu"
            print("No CUDA GPU detected. Using CPU for backbone and codec.")

        tts = NeuTTSAir(
            backbone_repo=backbone_arg,
            backbone_device=backbone_device,
            codec_repo=codec_repo,
            codec_device=codec_device,
        )
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        print("TTS model loaded.")
        return tts
| # --------------------------- | |
| # Voice loading logic | |
| # --------------------------- | |
# Registry of available voices: VOICES["samples"][name] -> (txt_path, audio_or_pt_path).
VOICES = {"samples": {}}
voice_dir = "samples"
os.makedirs(voice_dir, exist_ok=True)

# A voice is a "<name>.txt" transcript plus a "<name>.wav" or "<name>.pt"
# next to it; the .wav is preferred when both exist.
for name in os.listdir(voice_dir):
    if not name.endswith(".txt"):
        continue
    base = os.path.splitext(name)[0]
    txt_path = os.path.join(voice_dir, f"{base}.txt")
    wav_path = os.path.join(voice_dir, f"{base}.wav")
    pt_path = os.path.join(voice_dir, f"{base}.pt")
    if not os.path.exists(txt_path):
        continue
    if os.path.exists(wav_path):
        VOICES["samples"][base] = (txt_path, wav_path)
    elif os.path.exists(pt_path):
        VOICES["samples"][base] = (txt_path, pt_path)
def format_voice_choice(name):
    """Return the display label for a voice: ``"Voice: <name>"``."""
    return "Voice: {}".format(name)
| # --------------------------- | |
| # Core functions | |
| # --------------------------- | |
def load_reference(voice_name):
    """Load the reference transcript and reference codes for a registered voice.

    Args:
        voice_name: Key into ``VOICES["samples"]``.

    Returns:
        tuple: ``(ref_text, ref_codes)`` where ``ref_text`` is the stripped
        transcript and ``ref_codes`` is either the object stored in the
        voice's ``.pt`` file or the result of encoding its audio file with
        the TTS model.

    Raises:
        KeyError: if ``voice_name`` is not registered.
    """
    txt_path, audio_or_pt = VOICES["samples"][voice_name]

    # Context manager so the transcript handle is closed deterministically.
    with open(txt_path, "r") as ref_file:
        ref_text = ref_file.read().strip()

    if audio_or_pt.endswith(".pt"):
        # Only the cached-codes path needs torch.
        import torch

        # NOTE(review): torch.load unpickles the file; only load .pt files
        # produced by this app, never files from an untrusted source.
        ref_codes = torch.load(audio_or_pt)
    else:
        ref_codes = get_tts().encode_reference(audio_or_pt)
    return ref_text, ref_codes
def split_text_into_chunks(text, max_length=150):
    """Split text into chunks of roughly ``max_length`` characters.

    Text is first cut into sentences at ``.``, ``!`` or ``?`` followed by
    whitespace, so decimals such as ``3.14`` and leading punctuation stay
    intact. Sentences are then packed greedily into chunks. A sentence
    longer than ``max_length`` is broken at commas, and a comma-delimited
    part that is still too long is broken between words.

    Args:
        text: Input text; ``None`` or whitespace-only yields ``[]``.
        max_length: Target maximum chunk length in characters. A single
            word longer than this is kept whole, and a trailing comma may
            push a chunk one character over.

    Returns:
        list[str]: Non-empty, stripped chunks in reading order. Repeated
        text is preserved (no de-duplication).
    """
    import re

    text = (text or "").strip()
    if not text:
        return []

    # Sentence boundaries: terminal punctuation followed by whitespace.
    sentences = []
    for raw_sentence in re.split(r'(?<=[.!?])\s+', text):
        sentence = raw_sentence.strip()
        if not sentence:
            continue
        # Only the final fragment can lack terminal punctuation.
        if not sentence.endswith(('.', '!', '?')):
            sentence += '.'
        sentences.append(sentence)

    chunks = []
    current_chunk = ""

    def flush():
        """Emit the chunk being built, if any, and start a new one."""
        nonlocal current_chunk
        if current_chunk.strip():
            chunks.append(current_chunk.strip())
        current_chunk = ""

    def add(piece):
        """Append piece to the current chunk, starting a new chunk if it would overflow."""
        nonlocal current_chunk
        if not piece:
            return
        if current_chunk and len(current_chunk) + 1 + len(piece) > max_length:
            flush()
        current_chunk = f"{current_chunk} {piece}" if current_chunk else piece

    for sentence in sentences:
        if len(sentence) <= max_length:
            add(sentence)
            continue

        # Oversized sentence: break at commas, keeping each comma with the
        # text that precedes it. The capture group puts commas at odd indexes.
        comma_parts = re.split(r'(,)', sentence)
        for idx in range(0, len(comma_parts), 2):
            part = comma_parts[idx].strip()
            comma = comma_parts[idx + 1] if idx + 1 < len(comma_parts) else ''

            if len(part) <= max_length:
                add(part + comma)
                continue

            # Still too long: pack whole words into runs within the limit.
            run = []
            for word in part.split():
                if run and len(' '.join(run + [word])) > max_length:
                    flush()  # keep output in reading order
                    chunks.append(' '.join(run))
                    run = [word]
                else:
                    run.append(word)
            if run:
                add(' '.join(run) + comma)

    # Always emit whatever is left in the buffer.
    flush()
    return chunks
def process_chunk(chunk, ref_codes, ref_text, tts_model):
    """Synthesize one text chunk, returning ``None`` instead of raising.

    Args:
        chunk: Text to synthesize.
        ref_codes: Reference voice codes passed through to the model.
        ref_text: Reference transcript passed through to the model.
        tts_model: Object exposing ``infer(chunk, ref_codes, ref_text)``.

    Returns:
        Whatever ``tts_model.infer`` returns, or ``None`` if it raised.
    """
    try:
        return tts_model.infer(chunk, ref_codes, ref_text)
    except Exception as e:
        # Deliberately best-effort: one bad chunk must not abort the whole
        # request. Log it so the failure is visible in the server output.
        preview = str(chunk)[:60]
        print(f"Warning: failed to synthesize chunk {preview!r}: {e}")
        return None
def estimate_generation_time(num_chunks):
    """Return a rough generation time estimate in seconds for ``num_chunks`` chunks."""
    seconds_per_chunk = 3  # assumed average cost of one chunk
    fixed_overhead = 2  # constant added on top of the per-chunk cost
    return num_chunks * seconds_per_chunk + fixed_overhead
def format_time(seconds):
    """Format a duration in seconds as a human-readable string.

    Durations under a minute render as ``"12.3 seconds"``; longer ones as
    ``"2 minutes 5.0 seconds"``.

    Args:
        seconds: Non-negative duration in seconds (int or float).

    Returns:
        str: The formatted duration, with one decimal place for seconds.
    """
    # Round before splitting into minutes, otherwise 59.96 prints as
    # "60.0 seconds" and 119.97 as "1 minute 60.0 seconds".
    seconds = round(seconds, 1)
    if seconds < 60:
        return f"{seconds:.1f} seconds"
    minutes, remainder = divmod(seconds, 60)
    minutes = int(minutes)
    return f"{minutes} minute{'s' if minutes != 1 else ''} {remainder:.1f} seconds"
def generate_speech(text, voice_name, speed_control="1x"):
    """Synthesize ``text`` in the chosen voice, streaming progress updates.

    This is a generator. Every item is a 4-tuple
    ``(progress, audio_path, status, None)``: ``progress`` is 0-100,
    ``audio_path`` is ``None`` until the final item, and the last element is
    always ``None`` (it fills the fourth output the UI wires to this function).

    Args:
        text: Text to speak; empty or whitespace-only text is rejected.
        voice_name: Key into ``VOICES["samples"]``.
        speed_control: Speed label such as ``"1.2x"``. Unparseable or
            non-positive values fall back to 1x.

    Yields:
        tuple: Progress updates, ending with either the path of the written
        WAV file or an error status. Exceptions are reported as a status
        string rather than raised.
    """
    import tempfile
    import time

    sample_rate = 24000  # rate used for the inter-chunk silence and the WAV file

    try:
        # Input validations
        if not text or not text.strip():
            yield 0, None, "❌ Error: Input text cannot be empty.", None
            return
        if not voice_name:
            yield 0, None, "❌ Error: No voice selected. Please select a voice.", None
            return
        if voice_name not in VOICES["samples"]:
            yield 0, None, f"❌ Error: Voice '{voice_name}' not found.", None
            return

        # Convert speed control string to float, defaulting to 1x
        try:
            speed = float(speed_control.rstrip('x'))
        except (AttributeError, ValueError):
            speed = 1.0
        if not (0 < speed < float("inf")):
            # Rejects zero, negatives, inf and NaN: not valid stretch rates.
            speed = 1.0

        # Load TTS model on first use (deferred so server can bind to PORT first)
        yield 5, None, "Loading TTS model (first time may take a few minutes)...", None
        try:
            tts_instance = get_tts()
        except Exception as e:
            yield 0, None, f"❌ Failed to load TTS model: {str(e)}", None
            return

        start_time = time.time()
        yield 10, None, "Loading voice reference...", None
        ref_text, ref_codes = load_reference(voice_name)

        # Split text into smaller chunks for better processing
        chunks = split_text_into_chunks(text)
        total_chunks = len(chunks)
        if total_chunks == 0:
            raise ValueError("No text to process")

        estimated_time = estimate_generation_time(total_chunks)
        status = f"Estimated time to completion: {format_time(estimated_time)}\nProcessing {total_chunks} chunks..."
        yield 15, None, status, None

        # Timed separately so reference loading does not inflate the
        # per-chunk average behind the remaining-time estimate.
        synthesis_start = time.time()
        chunk_wavs = []
        failed_chunks = 0
        for i, chunk in enumerate(chunks, 1):
            progress = int(15 + (75 * i / total_chunks))
            status = f"Processing chunk {i}/{total_chunks}\nProgress: {progress}% complete"
            if i > 1:
                completed = i - 1
                avg_time_per_chunk = (time.time() - synthesis_start) / completed
                estimated_remaining = avg_time_per_chunk * (total_chunks - completed)
                status += f"\nEst. remaining: {format_time(estimated_remaining)}"
            yield progress, None, status, None

            chunk_wav = process_chunk(chunk, ref_codes, ref_text, tts_instance)
            if chunk_wav is None:
                failed_chunks += 1
            else:
                # Appended in input order, so no re-sorting is needed later.
                chunk_wavs.append(chunk_wav)

        if not chunk_wavs:
            raise ValueError("Failed to generate any audio")

        yield 90, None, "Finalizing audio...\nOrdering and combining chunks...", None

        # Interleave 0.25 s of silence between chunks and join in one pass.
        silence = np.zeros(int(sample_rate * 0.25))
        pieces = [chunk_wavs[0]]
        for chunk_wav in chunk_wavs[1:]:
            pieces.extend((silence, chunk_wav))
        all_wav = np.concatenate(pieces) if len(pieces) > 1 else pieces[0]

        # Apply speed adjustment if needed (pitch-preserving time-stretching)
        if speed != 1.0:
            # rate > 1 speeds up, rate < 1 slows down
            all_wav = librosa.effects.time_stretch(all_wav.astype(np.float32), rate=speed)

        # Unique file per request so concurrent generations cannot overwrite
        # each other's output. The file is left in the system temp directory.
        fd, temp_path = tempfile.mkstemp(prefix="tts_output_", suffix=".wav")
        os.close(fd)
        sf.write(temp_path, all_wav, sample_rate)

        total_time = time.time() - start_time
        final_status = f"✅ Generation complete!\nTotal time: {format_time(total_time)}"
        if failed_chunks:
            # Tell the user the audio is incomplete instead of hiding it.
            final_status += f"\n⚠️ {failed_chunks} of {total_chunks} chunks failed and were skipped."
        yield 100, temp_path, final_status, None
    except Exception as e:
        error_status = f"❌ Error generating speech: {str(e)}"
        yield 0, None, error_status, None
def delete_voice(voice_name):
    """Remove a registered voice: its files under ``samples/`` and its registry entry.

    Returns:
        tuple: ``(status_message, dropdown_update)``. On success the update
        carries the remaining voice names and selects the first one (or
        ``None`` when none are left); otherwise it is an empty update.
    """
    try:
        registry = VOICES["samples"]
        if voice_name not in registry:
            return f"❌ Voice '{voice_name}' not found!", gr.update()

        # Remove whichever of the three per-voice files are present.
        for extension in ("txt", "wav", "pt"):
            file_path = f"samples/{voice_name}.{extension}"
            if os.path.exists(file_path):
                os.remove(file_path)

        del registry[voice_name]

        remaining_voices = list(registry)
        if remaining_voices:
            new_selected = remaining_voices[0]
        else:
            new_selected = None
        return (
            f"✅ Voice '{voice_name}' deleted successfully!",
            gr.update(choices=remaining_voices, value=new_selected),
        )
    except Exception as e:
        return f"❌ Error deleting voice: {e}", gr.update()
def clone_voice(new_name, txt, audio_file):
    """Register a new voice from a reference recording and its transcript.

    Writes ``samples/<name>.txt`` and ``samples/<name>.wav``, encodes the
    audio with the TTS model, saves the codes to ``samples/<name>.pt`` and
    adds the voice to ``VOICES``.

    Args:
        new_name: Voice name from the UI. Surrounding whitespace is removed;
            names containing path separators are rejected because the name
            is used to build file paths.
        txt: Transcript of the reference audio.
        audio_file: Path to the uploaded reference audio.

    Returns:
        tuple: ``(status_message, dropdown_update)``. On success the update
        lists all voices with the new one selected; on failure it is an
        empty update.
    """
    created_paths = []  # files written by this call, removed again on failure
    try:
        # Input validations
        if not new_name or not new_name.strip():
            return "❌ Error: New Voice name cannot be empty.", gr.update()
        if not txt or not txt.strip():
            return "❌ Error: Reference text cannot be empty.", gr.update()
        if not audio_file:
            return "❌ Error: No reference audio file provided.", gr.update()

        # Store the same stripped name that was validated above.
        new_name = new_name.strip()
        # The name comes from the web UI and ends up in file paths: refuse
        # anything that could point outside the samples directory.
        if new_name in (".", "..") or any(sep in new_name for sep in ("/", "\\")):
            return "❌ Error: Voice name cannot contain path separators.", gr.update()
        if new_name in VOICES["samples"]:
            return f"❌ Error: Voice '{new_name}' already exists. Please choose a different name.", gr.update()

        try:
            tts_instance = get_tts()
        except Exception as e:
            return f"❌ Failed to load TTS model: {str(e)}", gr.update()

        os.makedirs("samples", exist_ok=True)
        txt_path = f"samples/{new_name}.txt"
        wav_path = f"samples/{new_name}.wav"
        pt_path = f"samples/{new_name}.pt"

        # Save reference text and audio
        created_paths.append(txt_path)
        with open(txt_path, "w") as f:
            f.write(txt.strip())
        created_paths.append(wav_path)
        shutil.copy(audio_file, wav_path)

        ref_codes = tts_instance.encode_reference(wav_path)

        import torch

        created_paths.append(pt_path)
        torch.save(ref_codes, pt_path)

        VOICES["samples"][new_name] = (txt_path, pt_path)
        return (
            f"✅ Voice '{new_name}' cloned and saved successfully!",
            gr.update(choices=list(VOICES["samples"].keys()), value=new_name),
        )
    except Exception as e:
        # A leftover .txt + .wav pair would be registered as a voice by the
        # startup scan even though cloning failed, so remove partial output.
        for leftover in created_paths:
            try:
                os.remove(leftover)
            except OSError:
                pass
        return f"❌ Error cloning voice: {e}", gr.update()
| # --------------------------- | |
| # UI | |
| # --------------------------- | |
# Custom CSS - consistent dark theme
# Stylesheet handed to app.launch(css=...). The class selectors below
# (.heading-container, .control-panel, .output-panel, .primary-button,
# .progress-container, .status-box, .audio-container, .info-text,
# .voice-controls, .tab-nav, .instructions-content) correspond to the
# elem_classes values and inline HTML used when the UI is built.
custom_css = """
footer {display: none !important;}
.footer {display: none !important;}
#api-docs-link {display: none !important;}
/* Dark theme palette */
:root {
--dark-bg: #1e1e2e;
--dark-card: #252530;
--dark-border: #3d3d4a;
--text-primary: #e4e4e7;
--text-muted: #a1a1aa;
--accent: #818cf8;
--accent-secondary: #a78bfa;
}
/* Modern header - dark gradient */
.heading-container {
text-align: center;
padding: 2rem 1rem;
background: linear-gradient(135deg, #4338ca 0%, #6d28d9 50%, #4c1d95 100%);
border-radius: 12px;
margin-bottom: 2rem;
border: 1px solid var(--dark-border);
color: white;
}
.heading-container h1 {
margin: 0;
font-size: 2.5rem;
font-weight: 700;
color: white;
}
.heading-container h3 {
margin: 0.5rem 0 0 0;
font-size: 1.1rem;
font-weight: 400;
color: rgba(255, 255, 255, 0.9);
}
/* Cards - dark, same as rest of app */
.control-panel {
background: var(--dark-card) !important;
padding: 1.5rem;
border-radius: 12px;
border: 1px solid var(--dark-border);
margin-bottom: 1rem;
color: var(--text-primary) !important;
}
.control-panel label, .control-panel .label-wrap, .control-panel p,
.control-panel h1, .control-panel h2, .control-panel h3, .control-panel h4,
.control-panel span, .control-panel div, .control-panel li, .control-panel small,
.control-panel .markdown, .control-panel [class*="markdown"], .control-panel * {
color: var(--text-primary) !important;
}
.output-panel {
background: var(--dark-card) !important;
padding: 1.5rem;
border-radius: 12px;
border: 1px solid var(--dark-border);
box-shadow: 0 2px 12px rgba(0,0,0,0.3);
color: var(--text-primary) !important;
}
.output-panel label, .output-panel .label-wrap, .output-panel p,
.output-panel h1, .output-panel h2, .output-panel h3, .output-panel h4,
.output-panel span, .output-panel div, .output-panel li, .output-panel small,
.output-panel .markdown, .output-panel [class*="markdown"], .output-panel * {
color: var(--text-primary) !important;
}
/* Button styling */
.primary-button {
width: 100%;
padding: 0.75rem;
font-size: 1.1rem;
font-weight: 600;
border-radius: 8px;
margin-top: 1rem;
}
/* Progress bar styling */
.progress-container {
margin: 1rem 0;
}
/* Status box - dark */
.status-box {
background: var(--dark-bg) !important;
border-radius: 8px;
padding: 1rem;
min-height: 80px;
border: 1px solid var(--dark-border);
color: var(--text-primary) !important;
}
/* Audio container - dark */
.audio-container {
margin-top: 1rem;
padding: 1rem;
background: var(--dark-card) !important;
border-radius: 8px;
border: 1px solid var(--dark-border);
color: var(--text-primary) !important;
}
.audio-container label, .audio-container .label-wrap,
.audio-container h1, .audio-container h2, .audio-container h3, .audio-container h4,
.audio-container span, .audio-container div, .audio-container .markdown,
.audio-container [class*="markdown"], .audio-container * {
color: var(--text-primary) !important;
}
/* Upload/drop zone text */
.control-panel [class*="upload"] span,
.control-panel [class*="drop"] span,
.output-panel [class*="upload"] span,
.output-panel [class*="drop"] span {
color: var(--text-muted) !important;
}
.info-text, .info-text * {
color: var(--text-muted) !important;
}
/* Voice selection styling */
.voice-controls {
display: flex;
gap: 0.5rem;
align-items: flex-end;
}
.tab-nav {
margin-bottom: 1.5rem;
}
/* Instructions - dark card, same as panels */
.instructions-content {
background: var(--dark-card) !important;
padding: 2rem;
border-radius: 12px;
border: 1px solid var(--dark-border);
line-height: 1.8;
max-width: 1200px;
margin: 0 auto;
color: var(--text-primary) !important;
}
.instructions-content p,
.instructions-content li,
.instructions-content span,
.instructions-content div {
color: var(--text-primary) !important;
}
.instructions-content *:not(h1):not(h2):not(h3) {
color: var(--text-primary) !important;
}
.instructions-content h1 {
color: var(--accent) !important;
border-bottom: 3px solid var(--accent);
padding-bottom: 0.5rem;
margin-bottom: 1.5rem;
}
.instructions-content h2 {
color: var(--accent-secondary) !important;
margin-top: 2rem;
margin-bottom: 1rem;
font-size: 1.5rem;
}
.instructions-content h3 {
color: #93c5fd !important;
margin-top: 1.5rem;
margin-bottom: 0.75rem;
font-size: 1.2rem;
}
.instructions-content ul, .instructions-content ol {
margin-left: 1.5rem;
margin-bottom: 1rem;
}
.instructions-content li {
margin-bottom: 0.5rem;
color: var(--text-primary) !important;
}
.instructions-content code {
background: var(--dark-bg);
color: var(--accent);
padding: 0.2rem 0.4rem;
border-radius: 4px;
font-family: 'Courier New', monospace;
font-size: 0.9em;
border: 1px solid var(--dark-border);
}
.instructions-content hr {
border: none;
border-top: 2px solid var(--dark-border);
margin: 2rem 0;
}
.instructions-content blockquote {
border-left: 4px solid var(--accent);
padding-left: 1rem;
margin-left: 0;
color: var(--text-muted) !important;
font-style: italic;
}
"""
# Builds the three-tab interface (Generate Speech, Clone New Voice,
# Instructions) and wires the buttons to generate_speech, delete_voice and
# clone_voice. The resulting Blocks object is bound to `app`.
# NOTE(review): the source this was recovered from had lost its indentation;
# the nesting of rows/columns below is reconstructed — confirm against the
# running layout.
with gr.Blocks(title="Virtual Lab Voice Cloning") as app:
    # Modern header with gradient
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown(
                """
<div class="heading-container">
<h1>🎙️ Virtual Lab Voice Cloning</h1>
<h3>High-Quality Text-to-Speech with Voice Cloning</h3>
</div>
""",
                elem_classes="heading"
            )
    with gr.Tab("🎯 Generate Speech", elem_classes="tab-nav"):
        with gr.Row(equal_height=True):
            # Left Column - Input Controls
            with gr.Column(scale=1, min_width=400):
                gr.Markdown("### 📝 Input Settings", elem_classes="control-panel")
                text_input = gr.Textbox(
                    label="📄 Text to Convert",
                    placeholder="Enter the text you want to convert to speech...",
                    lines=6,
                    elem_classes="text-input"
                )
                with gr.Row(elem_classes="voice-controls"):
                    # Choices come from the voices registered in VOICES at
                    # build time; the first one (if any) is preselected.
                    voice_select = gr.Dropdown(
                        label="🎤 Select Voice",
                        choices=list(VOICES["samples"].keys()),
                        value=list(VOICES["samples"].keys())[0] if VOICES["samples"] else None,
                        interactive=True,
                        scale=3
                    )
                    delete_btn = gr.Button(
                        "🗑️",
                        variant="secondary",
                        size="sm",
                        scale=1,
                        min_width=50
                    )
                # Labels are parsed by generate_speech, which strips the
                # trailing "x" and converts the rest to a float.
                speed_control = gr.Dropdown(
                    label="⚡ Speech Speed",
                    choices=["1x", "1.1x", "1.2x", "1.3x", "1.4x", "1.5x"],
                    value="1x",
                    info="Select playback speed (preserves pitch and voice characteristics)"
                )
                generate_btn = gr.Button(
                    "🎙️ Generate Speech",
                    variant="primary",
                    size="lg",
                    elem_classes="primary-button"
                )
            # Right Column - Output & Status
            with gr.Column(scale=1, min_width=400):
                gr.Markdown("### 📊 Generation Status", elem_classes="output-panel")
                # Read-only slider used as a progress indicator.
                progress_bar = gr.Slider(
                    label="Progress",
                    minimum=0,
                    maximum=100,
                    value=0,
                    interactive=False,
                    elem_classes="progress-container"
                )
                status_box = gr.Textbox(
                    label="Status Information",
                    value="Ready to generate speech. Enter text and select a voice.",
                    lines=4,
                    interactive=False,
                    elem_classes="status-box"
                )
                # Hidden textbox: receives delete_voice's message and the
                # fourth (always None) value yielded by generate_speech.
                delete_status = gr.Textbox(label="Status", visible=False)
                gr.Markdown("### 🎵 Audio Output", elem_classes="audio-container")
                audio_output = gr.Audio(
                    label="Generated Audio",
                    autoplay=True
                )
        # Event handlers
        # Output order matches the tuples generate_speech yields:
        # (progress, audio path, status text, None).
        generate_btn.click(
            fn=generate_speech,
            inputs=[text_input, voice_select, speed_control],
            outputs=[progress_bar, audio_output, status_box, delete_status]
        )
        # delete_voice returns (message, dropdown update).
        delete_btn.click(
            fn=delete_voice,
            inputs=[voice_select],
            outputs=[delete_status, voice_select]
        )
    with gr.Tab("🧬 Clone New Voice", elem_classes="tab-nav"):
        with gr.Row(equal_height=True):
            # Left Column - Voice Cloning Input
            with gr.Column(scale=1, min_width=400):
                gr.Markdown("### 🎤 Voice Cloning Setup", elem_classes="control-panel")
                new_voice_name = gr.Textbox(
                    label="📛 Voice Name",
                    placeholder="Enter a unique name for this voice...",
                    info="Choose a descriptive name for your cloned voice"
                )
                ref_text_input = gr.Textbox(
                    label="📝 Reference Text",
                    placeholder="Enter the exact text that is spoken in the audio sample...",
                    lines=4,
                    info="This should match the text spoken in your audio file"
                )
                # type="filepath" so clone_voice receives a path it can copy.
                ref_audio_input = gr.Audio(
                    label="🎵 Reference Audio File",
                    type="filepath"
                )
                gr.Markdown(
                    "<small>💡 Upload a WAV file containing the voice sample (recommended: 5-30 seconds)</small>",
                    elem_classes="info-text"
                )
                clone_btn = gr.Button(
                    "🧬 Clone Voice",
                    variant="primary",
                    size="lg",
                    elem_classes="primary-button"
                )
            # Right Column - Status
            with gr.Column(scale=1, min_width=400):
                gr.Markdown("### 📋 Cloning Status", elem_classes="output-panel")
                clone_status = gr.Textbox(
                    label="Status",
                    value="Ready to clone a new voice. Fill in the details on the left and upload an audio sample.",
                    lines=8,
                    interactive=False,
                    elem_classes="status-box"
                )
                gr.Markdown(
                    """
### 💡 Tips for Best Results
- Use clear, high-quality audio recordings
- Ensure the reference text matches what's spoken
- Audio length: 5-30 seconds works best
- Speak naturally and clearly in the sample
- Avoid background noise when possible
""",
                    elem_classes="control-panel"
                )
        # Event handler
        # clone_voice returns (message, dropdown update); the update targets
        # the voice selector on the Generate Speech tab.
        clone_btn.click(
            fn=clone_voice,
            inputs=[new_voice_name, ref_text_input, ref_audio_input],
            outputs=[clone_status, voice_select]
        )
    with gr.Tab("📖 Instructions", elem_classes="tab-nav"):
        with gr.Column():
            # Static user guide rendered as Markdown.
            gr.Markdown(
                """
# 🎙️ Virtual Lab Voice Cloning - User Guide
Welcome to the Virtual Lab Voice Cloning tool! This guide will help you get started with creating high-quality text-to-speech audio using voice cloning technology.
---
## 🎯 How to Generate Speech
### Step 1: Navigate to the "Generate Speech" Tab
Click on the **"🎯 Generate Speech"** tab at the top of the interface.
### Step 2: Enter Your Text
- Type or paste the text you want to convert to speech in the **"📄 Text to Convert"** text box
- You can enter multiple sentences or paragraphs
- The tool will automatically split long texts into manageable chunks
### Step 3: Select a Voice
- Choose a voice from the **"🎤 Select Voice"** dropdown menu
- Only voices that have been cloned and saved will appear in this list
- You can delete a voice by clicking the 🗑️ button next to the voice selector
### Step 4: Adjust Speech Speed (Optional)
- Use the **"⚡ Speech Speed"** dropdown to control playback speed
- Options range from 1x (normal) to 1.5x (faster)
- Speed adjustment preserves pitch and voice characteristics
### Step 5: Generate Audio
- Click the **"🎙️ Generate Speech"** button
- Monitor the progress bar and status messages
- The generated audio will appear automatically when complete
- You can play the audio directly in the browser or download it
---
## 🧬 How to Clone a New Voice
### Step 1: Navigate to the "Clone New Voice" Tab
Click on the **"🧬 Clone New Voice"** tab at the top of the interface.
### Step 2: Prepare Your Audio Sample
Before cloning, you'll need:
- A clear audio recording (WAV format recommended)
- 5-30 seconds of speech works best
- High-quality audio with minimal background noise
- Natural, clear speech
### Step 3: Enter Voice Details
- **Voice Name**: Enter a unique, descriptive name for your cloned voice
- **Reference Text**: Type the exact text that is spoken in your audio sample
- **Reference Audio**: Upload your WAV audio file using the file uploader
### Step 4: Clone the Voice
- Click the **"🧬 Clone Voice"** button
- Wait for the cloning process to complete
- Once successful, the new voice will be available in the voice selector
### Step 5: Use Your Cloned Voice
- Navigate back to the "Generate Speech" tab
- Your newly cloned voice will appear in the voice dropdown
- Select it and generate speech as usual
---
## 💡 Best Practices & Tips
### For Voice Cloning:
- ✅ Use high-quality, clear audio recordings
- ✅ Ensure the reference text exactly matches what's spoken in the audio
- ✅ Record in a quiet environment to minimize background noise
- ✅ Speak naturally and at a normal pace
- ✅ Use 5-30 seconds of audio for best results
- ❌ Avoid very short clips (less than 3 seconds)
- ❌ Avoid clips with heavy background noise or music
- ❌ Don't use text that doesn't match the audio content
### For Speech Generation:
- ✅ Use proper punctuation for better natural pauses
- ✅ Break long texts into paragraphs for better processing
- ✅ Review the generated audio and adjust speed if needed
- ✅ The tool automatically handles long texts by splitting them into chunks
- ✅ Generated audio is saved and can be downloaded
### Performance Tips:
- The tool processes text in chunks for better performance
- Longer texts will take more time to generate
- Progress updates show estimated completion time
- GPU acceleration is used when available for faster processing
---
## 🔧 Technical Information
### Supported Formats:
- **Input Audio**: WAV format (recommended)
- **Output Audio**: WAV format, 24kHz sample rate
- **Text**: Plain text (UTF-8)
### System Requirements:
- NVIDIA GPU recommended for best performance
- CUDA support for GPU acceleration
- eSpeak NG installed for phonemization
### Features:
- High-quality neural text-to-speech
- Voice cloning from short audio samples
- Pitch-preserving speed control
- Automatic text chunking for long inputs
- Real-time progress tracking
---
## ❓ Troubleshooting
### Common Issues:
**"No voice selected" error:**
- Make sure you have cloned at least one voice
- Check that the voice appears in the dropdown menu
**"Input text cannot be empty" error:**
- Ensure you've entered text in the text input box
- Check for whitespace-only text
**Audio generation fails:**
- Verify your GPU has enough memory
- Try generating shorter texts first
- Check that the voice files are not corrupted
**Voice cloning fails:**
- Ensure the audio file is in WAV format
- Verify the reference text matches the audio content
- Check that the audio quality is sufficient
- Make sure the voice name is unique
---
## 📝 Notes
- All cloned voices are saved locally in the `samples` folder
- Generated audio files are temporary and should be downloaded if you want to keep them
- The tool uses advanced neural networks for high-quality voice synthesis
- Processing time depends on text length and system performance
---
**Enjoy creating amazing voice clones! 🎉**
""",
                elem_classes="instructions-content"
            )
if __name__ == "__main__":
    # Always bind to 0.0.0.0 so Render can detect the port (default PORT=10000)
    server_name = "0.0.0.0"

    # PORT is read from the environment; 7860 is used when it is absent.
    env_port = os.environ.get("PORT")
    port = int("7860" if env_port is None else env_port)
    inbrowser = env_port is None  # open browser only when not in cloud

    print(f"\nLaunching on http://{server_name}:{port}")
    launch_options = {
        "server_name": server_name,
        "server_port": port,
        "share": False,
        "inbrowser": inbrowser,
        "show_error": True,
        "theme": gr.themes.Soft(primary_hue="purple", secondary_hue="blue"),
        "css": custom_css,
    }
    app.launch(**launch_options)