import type { TokenizeResult } from "./tokenize";

// pred_dur is in PREDICTOR frames, not mel frames. Empirically:
//   audio.length / pred_dur.sum === 600 (not 300 as you'd guess from mel hop).
// The prosody predictor runs at half the mel-frame rate; the decoder
// upsamples 2× internally before iSTFT. So frame-to-seconds uses hop=600.
const HOP = 600;
const DEFAULT_SR = 24000;

/** Timing of a single phoneme token. */
export interface PhonemeTiming {
  /** Phoneme symbol, or "?" when the tokenizer supplied none for this index. */
  phone: string;
  /** Start time in seconds. */
  start: number;
  /** End time in seconds. */
  end: number;
  /** Raw duration in predictor frames, exactly as read from pred_dur. */
  durFrames: number;
}

/** Timing of a word together with the phonemes attributed to it. */
export interface WordTiming {
  word: string;
  ipa: string;
  /** Start time in seconds (start of the word's first in-range phoneme). */
  start: number;
  /** End time in seconds (end of the word's last in-range phoneme). */
  end: number;
  phonemes: PhonemeTiming[];
}

/**
 * Turn pred_dur (per-phoneme duration in frames) + the per-word index ranges
 * from tokenize.ts into phoneme + word level timestamps in seconds.
 *
 * Kokoro's prosody predictor outputs durations in *predictor frames* where
 * 1 frame = 600 audio samples (25 ms/frame at 24 kHz). We accumulate and
 * attribute to the originating word via the tokenStart/tokenEnd ranges that
 * tokenizeByWord wrote down.
 *
 * @param tokenizeResult - Tokenizer output; `inputIds`, `phonemes` and
 *   `words` (with half-open `[tokenStart, tokenEnd)` ranges) are read.
 * @param predDur - Per-token durations in predictor frames.
 * @param sampleRate - Audio sample rate in Hz used for the frame-to-seconds
 *   conversion. Defaults to 24000.
 * @returns Phoneme timings in token order, and word timings for every word
 *   that still has at least one phoneme in range.
 * @throws RangeError if `sampleRate` is not a finite, positive number
 *   (timestamps would otherwise silently become Infinity/NaN).
 */
export function buildTimings(
  tokenizeResult: TokenizeResult,
  predDur: Int32Array | number[],
  sampleRate: number = DEFAULT_SR,
): { phonemes: PhonemeTiming[]; words: WordTiming[] } {
  if (!Number.isFinite(sampleRate) || sampleRate <= 0) {
    throw new RangeError(
      `sampleRate must be a finite positive number, got ${sampleRate}`,
    );
  }

  // Both accepted input types are array-likes of numbers, so no assertion is needed.
  const durArr: number[] = Array.from(predDur);
  const idCount = tokenizeResult.inputIds.length;
  if (durArr.length !== idCount) {
    // Kokoro sometimes returns pred_dur aligned to internal sequence length
    // (post-padding, post-space-injection). We expect them to match; if they
    // don't, warn and continue with the shorter of the two so we don't crash.
    console.warn(
      `pred_dur length (${durArr.length}) != input_ids length (${idCount}) — truncating`,
    );
  }
  const n = Math.min(durArr.length, idCount);

  // Phoneme-level: accumulate in whole frames and convert at each boundary,
  // so floating-point error does not build up across the utterance.
  const phonemes: PhonemeTiming[] = [];
  let cumFrames = 0;
  for (const [i, df] of durArr.slice(0, n).entries()) {
    const start = (cumFrames * HOP) / sampleRate;
    cumFrames += df;
    const end = (cumFrames * HOP) / sampleRate;
    phonemes.push({
      phone: tokenizeResult.phonemes[i] ?? "?",
      start,
      end,
      durFrames: df,
    });
  }

  // Word-level: for each word, take start of first in-range phoneme and end of last.
  const words: WordTiming[] = [];
  for (const w of tokenizeResult.words) {
    // slice() clamps an end index past the array length, so words cut short
    // by truncation keep whatever phonemes are still in range.
    const slice = phonemes.slice(w.tokenStart, w.tokenEnd);
    const first = slice[0];
    const last = slice[slice.length - 1];
    // Words left with no phonemes in range are dropped.
    if (first === undefined || last === undefined) continue;
    words.push({
      word: w.word,
      ipa: w.ipa,
      start: first.start,
      end: last.end,
      phonemes: slice,
    });
  }

  return { phonemes, words };
}