Spaces:
Running
Running
| import type { TokenizeResult } from "./tokenize"; | |
// pred_dur is in PREDICTOR frames, not mel frames. Empirically:
// audio.length / pred_dur.sum === 600 (not 300 as you'd guess from mel hop).
// The prosody predictor runs at half the mel-frame rate; the decoder
// upsamples 2× internally before iSTFT. So frame-to-seconds uses hop=600.
const HOP = 600; // audio samples per predictor frame
const DEFAULT_SR = 24000; // output sample rate in Hz (24 kHz)
/** Timing of a single phoneme, in seconds from the start of the audio. */
export interface PhonemeTiming {
  phone: string; // phoneme symbol from the tokenize result; "?" when missing
  start: number; // seconds
  end: number; // seconds
  durFrames: number; // duration in predictor frames (see HOP)
}
/** Timing of a whole word: the span covered by its constituent phonemes. */
export interface WordTiming {
  word: string; // original word text (copied from the tokenize result)
  ipa: string; // the word's IPA phonemization (copied from the tokenize result)
  start: number; // seconds — start of the word's first phoneme
  end: number; // seconds — end of the word's last phoneme
  phonemes: PhonemeTiming[]; // per-phoneme timings inside this word
}
| /** | |
| * Turn pred_dur (per-phoneme duration in frames) + the per-word index ranges | |
| * from tokenize.ts into phoneme + word level timestamps in seconds. | |
| * | |
| * Kokoro's prosody predictor outputs durations in *predictor frames* where | |
| * 1 frame = 600 audio samples at 24 kHz (25 ms/frame). We accumulate and | |
| * attribute to the originating word via the tokenStart/tokenEnd ranges that | |
| * tokenizeByWord wrote down. | |
| */ | |
| export function buildTimings( | |
| tokenizeResult: TokenizeResult, | |
| predDur: Int32Array | number[], | |
| sampleRate: number = DEFAULT_SR, | |
| ): { phonemes: PhonemeTiming[]; words: WordTiming[] } { | |
| const durArr: number[] = Array.from(predDur as Iterable<number>); | |
| if (durArr.length !== tokenizeResult.inputIds.length) { | |
| // Kokoro sometimes returns pred_dur aligned to internal sequence length | |
| // (post-padding, post-space-injection). We expect them to match; if they | |
| // don't, warn and continue with the shorter of the two so we don't crash. | |
| console.warn( | |
| `pred_dur length (${durArr.length}) != input_ids length (${tokenizeResult.inputIds.length}) — truncating`, | |
| ); | |
| } | |
| const n = Math.min(durArr.length, tokenizeResult.inputIds.length); | |
| // Phoneme-level | |
| const phonemes: PhonemeTiming[] = []; | |
| let cumFrames = 0; | |
| for (let i = 0; i < n; i++) { | |
| const df = durArr[i]; | |
| const start = (cumFrames * HOP) / sampleRate; | |
| cumFrames += df; | |
| const end = (cumFrames * HOP) / sampleRate; | |
| phonemes.push({ | |
| phone: tokenizeResult.phonemes[i] ?? "?", | |
| start, | |
| end, | |
| durFrames: df, | |
| }); | |
| } | |
| // Word-level: for each word, take start of first in-range phoneme and end of last | |
| const words: WordTiming[] = []; | |
| for (const w of tokenizeResult.words) { | |
| const slice = phonemes.slice(w.tokenStart, Math.min(w.tokenEnd, phonemes.length)); | |
| if (slice.length === 0) continue; | |
| words.push({ | |
| word: w.word, | |
| ipa: w.ipa, | |
| start: slice[0].start, | |
| end: slice[slice.length - 1].end, | |
| phonemes: slice, | |
| }); | |
| } | |
| return { phonemes, words }; | |
| } | |