import { KokoroSession, loadConfig, loadVoicepack, type KokoroConfig, type KokoroVocab } from "./model";
import { phonemizeByWord } from "./phonemize";
// eslint-disable-next-line @typescript-eslint/ban-ts-comment
// @ts-ignore — sanscript ships JS with no types
import Sanscript from "@indic-transliteration/sanscript";
import { tokenizeByWord } from "./tokenize";
import { buildTimings, type WordTiming, type PhonemeTiming } from "./timestamps";
import type { PhonemizedWord } from "./phonemize";

// Voice list, per-voice default speeds, and gender-aware test phrases all
// come from public/voicepacks.json — single source of truth, edited there
// to add voices or update sample chips without touching code.

/** One voice entry as listed under `voices` in voicepacks.json. */
interface VoicepackEntry {
  id: string;
  name: string;
  name_native: string;
  gender: "male" | "female";
  source: string;
  default_speed: number;
  ship_status: string;
}

/** A sample phrase: a short label plus the text it stands for. */
interface ChipPhrase {
  label: string;
  text: string;
}

/** Top-level shape of voicepacks.json. */
interface VoicepackManifest {
  voices: VoicepackEntry[];
  // NOTE(review): value type reconstructed as ChipPhrase[] (the only consumer
  // of ChipPhrase visible here); key semantics are not shown — confirm against
  // voicepacks.json.
  test_phrases: Record<string, ChipPhrase[]>;
  default_textarea?: { female?: string; male?: string };
}

// ── DOM refs ────────────────────────────────────────────────────────────────

/**
 * Looks up an element by id and asserts its type. The assertion is unchecked:
 * a missing id yields `null` at runtime despite the non-null static type.
 */
const $ = <T extends HTMLElement = HTMLElement>(id: string) => document.getElementById(id) as T;

// NOTE(review): the element types below are inferred from the ids and variable
// names only — confirm each against the page markup. Refs without an explicit
// type argument fall back to HTMLElement.
const statusEl = $("status");
const textEl = $<HTMLTextAreaElement>("text");
const voiceEl = $<HTMLSelectElement>("voice");
const speedEl = $<HTMLInputElement>("speed");
const speedValEl = $("speed-val");
const synthBtn = $<HTMLButtonElement>("synth");
const transliterateBtn = $<HTMLButtonElement>("transliterate");
const outputSection = $("output-section");
const audioEl = $<HTMLAudioElement>("audio");
const transcriptEl = $("transcript");
const voiceRefDlEl = $<HTMLAnchorElement>("voice-ref-dl");
const audioDlEl = $<HTMLAnchorElement>("audio-dl");
const metricsCard = $("metrics-card");
const mTtfb = $("m-ttfb");
const mSynth = $("m-synth");
const mRtf = $("m-rtf");
const mTps = $("m-tps");
const mTokens = $("m-tokens");
const mFrames = $("m-frames");
const compactEl = $("transcript-compact");
const phonemesEl = $("phonemes");
const wordTimingsEl = $("word-timings");
const waveformCanvas = $<HTMLCanvasElement>("waveform");
const playPauseBtn = $<HTMLButtonElement>("playpause");
const timeCurEl = $("time-cur");
const timeTotalEl = $("time-total");
const loadModelBtn = $<HTMLButtonElement>("load-model");
const modelGateEl = $("model-gate");

// ── State ───────────────────────────────────────────────────────────────────
let session: KokoroSession | null = null;
let config: KokoroConfig | null = null;
let voicepackManifest: VoicepackManifest | null = null;
// NOTE(review): assumes the cache holds resolved loadVoicepack() results keyed
// by a string id — confirm against the code that fills it.
const voicepackCache: Record<string, Awaited<ReturnType<typeof loadVoicepack>>> = {};
let currentWords: WordTiming[] = [];
let currentAudio: Float32Array | null = null;
let currentAudioURL: string | null = null; // blob URL we must revoke before replacing
let highlightRaf: number | null = null;

/**
 * Fetches `voicepacks.json` (relative URL) and returns its parsed body.
 *
 * The JSON is cast to VoicepackManifest without runtime validation.
 *
 * @throws Error carrying the HTTP status when the response is not OK.
 */
async function loadVoicepackManifest(): Promise<VoicepackManifest> {
  const res = await fetch("voicepacks.json");
  if (!res.ok) throw new Error(`voicepacks.json: ${res.status}`);
  return (await res.json()) as VoicepackManifest;
}

// Extra silence to splice into synthesized audio AFTER each punctuation phoneme.
// The Marathi v0.2 predictor allocates very few frames to punctuation tokens —
// likely because training data was silence-trimmed, so the predictor never saw
// natural pauses around commas/periods to learn from. Until v0.6 retrains with
// preserved punctuation prosody, splice silence client-side.
//
// Values in seconds. `,` `;` `:` `—` get the "minor break" treatment; `.` `?`
// `!` `…` get the "major break" treatment.
const PUNC_PAUSE_SEC: Record<string, number> = {
  // Minor breaks.
  ",": 0.20,
  ";": 0.25,
  ":": 0.20,
  "—": 0.20,
  // Major breaks.
  ".": 0.30,
  "!": 0.30,
  "?": 0.30,
  "…": 0.30,
};

/**
 * Splices silence into `audio` after every phoneme whose `phone` has an entry
 * in PUNC_PAUSE_SEC, and moves the phoneme and word timings to match.
 *
 * @param audio      Synthesized samples; not mutated.
 * @param phonemes   Flat phoneme timings in seconds, in time order; not mutated.
 * @param words      Word timings, each carrying its own `phonemes` sub-array.
 * @param sampleRate Samples per second, used to convert between seconds and samples.
 * @returns The padded audio plus shifted copies of the timings. When no
 *          phoneme triggers a pause, the three inputs are returned as-is
 *          (same references).
 */
function injectPunctuationPauses(
  audio: Float32Array,
  phonemes: PhonemeTiming[],
  words: WordTiming[],
  sampleRate: number,
): { audio: Float32Array; phonemes: PhonemeTiming[]; words: WordTiming[] } {
  type Insertion = { atSample: number; padSamples: number; afterPhonemeIndex: number };

  const insertions: Insertion[] = [];
  for (let i = 0; i < phonemes.length; i++) {
    const extra = PUNC_PAUSE_SEC[phonemes[i].phone];
    if (!extra) continue;
    insertions.push({
      // Clamp so a phoneme ending past the buffer still inserts at the tail.
      atSample: Math.min(audio.length, Math.floor(phonemes[i].end * sampleRate)),
      padSamples: Math.floor(extra * sampleRate),
      afterPhonemeIndex: i,
    });
  }
  if (insertions.length === 0) return { audio, phonemes, words };

  // Build new audio buffer with zeros spliced in at insertion points.
  // Insertions are already in increasing atSample order (phonemes are ordered
  // by time and we walked forward), so we can stream a single pass.
  const totalPad = insertions.reduce((acc, x) => acc + x.padSamples, 0);
  const out = new Float32Array(audio.length + totalPad);
  let writeOff = 0;
  let readOff = 0;
  for (const ins of insertions) {
    const copyLen = ins.atSample - readOff;
    if (copyLen > 0) {
      out.set(audio.subarray(readOff, ins.atSample), writeOff);
      writeOff += copyLen;
      readOff = ins.atSample;
    }
    // Float32Array is already zero-initialized; just advance the write cursor.
    writeOff += ins.padSamples;
  }
  if (readOff < audio.length) {
    out.set(audio.subarray(readOff), writeOff);
  }

  // Cumulative time shift per phoneme index: each phoneme is pushed later by
  // every pad inserted after an EARLIER phoneme (a punctuation phoneme is not
  // shifted by its own pad). Insertions are sorted by afterPhonemeIndex, so a
  // single running sum replaces the nested loop while adding in the same order.
  const shiftSecPerPhoneme = new Array<number>(phonemes.length).fill(0);
  let runningShiftSec = 0;
  let nextInsertion = 0;
  for (let i = 0; i < phonemes.length; i++) {
    while (
      nextInsertion < insertions.length &&
      insertions[nextInsertion].afterPhonemeIndex < i
    ) {
      runningShiftSec += insertions[nextInsertion].padSamples / sampleRate;
      nextInsertion++;
    }
    shiftSecPerPhoneme[i] = runningShiftSec;
  }
  const newPhonemes: PhonemeTiming[] = phonemes.map((p, i) => ({
    ...p,
    start: p.start + shiftSecPerPhoneme[i],
    end: p.end + shiftSecPerPhoneme[i],
  }));

  // Words: each word has a phonemes[] sub-array. Pair every word phoneme with
  // the next flat phoneme carrying the same `phone`, walking a forward cursor
  // (phone strings repeat, order is preserved across the splice), then take
  // the word's start/end from its first/last shifted phoneme.
  let cursor = 0;
  const newWords: WordTiming[] = words.map((w) => {
    const wordPhonemes: PhonemeTiming[] = [];
    for (const wp of w.phonemes) {
      // Probe ahead without committing: a word phoneme that is absent from the
      // flat list must not drag the shared cursor to the end, which would
      // leave every later word unmatched and stuck on its pre-splice timings.
      let probe = cursor;
      while (probe < phonemes.length && phonemes[probe].phone !== wp.phone) {
        probe++;
      }
      if (probe >= phonemes.length) continue;
      wordPhonemes.push(newPhonemes[probe]);
      cursor = probe + 1;
    }
    // Nothing matched: leave the word exactly as it was.
    if (wordPhonemes.length === 0) return w;
    return {
      ...w,
      start: wordPhonemes[0].start,
      end: wordPhonemes[wordPhonemes.length - 1].end,
      phonemes: wordPhonemes,
    };
  });

  return { audio: out, phonemes: newPhonemes, words: newWords };
}

/**
 * Rebuilds the voice dropdown from the manifest: clears `voiceEl`, then adds
 * one option per voice, grouped into a female and a male optgroup.
 */
function populateVoiceSelector(manifest: VoicepackManifest): void {
  // Native <optgroup> by gender — keeps the dropdown a one-click affair on
  // mobile and accessible by default, while making the (now 25+) voice list
  // browsable. Order: Female first, Male second; within each group the
  // manifest order is preserved (Marathi-trained voices float to the top).
  while (voiceEl.firstChild) voiceEl.removeChild(voiceEl.firstChild);

  const femaleGroup = document.createElement("optgroup");
  femaleGroup.label = "आवाज (स्त्री) — Female voices";
  const maleGroup = document.createElement("optgroup");
  maleGroup.label = "आवाज (पुरुष) — Male voices";

  for (const v of manifest.voices) {
    const opt = document.createElement("option");
    opt.value = v.id;
    // Any status other than "stable" is shown as a bracketed tag, with
    // underscores turned into spaces.
    const tag = v.ship_status === "stable" ? "" : ` [${v.ship_status.replace(/_/g, " ")}]`;
    // Drop the trailing "— female"/"— male" since the optgroup label conveys
    // gender now; keeps option text short on narrow screens.
    opt.textContent = `${v.name} (${v.name_native})${tag}`;
    (v.gender === "female" ? femaleGroup : maleGroup).appendChild(opt);
  }

  // Append both groups even if one is empty — empty optgroups are invisible.
  voiceEl.appendChild(femaleGroup);
  voiceEl.appendChild(maleGroup);
}

// Build chip