// bol-tts-marathi — src/timestamps.ts
// Phoneme/word timestamp computation for Kokoro TTS output (WebGPU demo source).
import type { TokenizeResult } from "./tokenize";
// pred_dur is in PREDICTOR frames, not mel frames. Empirically:
// audio.length / pred_dur.sum === 600 (not 300 as you'd guess from mel hop).
// The prosody predictor runs at half the mel-frame rate; the decoder
// upsamples 2× internally before iSTFT. So frame-to-seconds uses hop=600.
const HOP = 600;
// Default output sample rate in Hz; callers of buildTimings may override it.
const DEFAULT_SR = 24000;
/** Timing of one phoneme within the synthesized audio. */
export interface PhonemeTiming {
  phone: string; // phoneme symbol ("?" when the tokenizer gave none at this index)
  start: number; // seconds
  end: number; // seconds
  durFrames: number; // duration in predictor frames (1 frame = HOP samples)
}
/** Timing of one word: the span of its phonemes, plus the phonemes themselves. */
export interface WordTiming {
  word: string; // original word text
  ipa: string; // the word's IPA transcription from the tokenizer
  start: number; // seconds (start of first phoneme in the word)
  end: number; // seconds (end of last phoneme in the word)
  phonemes: PhonemeTiming[];
}
/**
 * Convert pred_dur (per-phoneme duration, in predictor frames) plus the
 * per-word token index ranges recorded by tokenize.ts into phoneme- and
 * word-level timestamps, expressed in seconds.
 *
 * Kokoro's prosody predictor emits durations in *predictor frames*, where
 * one frame = 600 audio samples at 24 kHz (25 ms/frame). A running frame
 * total gives each phoneme's boundaries; each word then spans its first
 * through last phoneme via the tokenStart/tokenEnd range that tokenizeByWord
 * wrote down.
 *
 * @param tokenizeResult output of tokenize.ts (inputIds, phonemes, word ranges)
 * @param predDur per-phoneme durations in predictor frames
 * @param sampleRate audio sample rate in Hz (defaults to DEFAULT_SR)
 * @returns phoneme-level and word-level timings in seconds
 */
export function buildTimings(
  tokenizeResult: TokenizeResult,
  predDur: Int32Array | number[],
  sampleRate: number = DEFAULT_SR,
): { phonemes: PhonemeTiming[]; words: WordTiming[] } {
  const frames: number[] = Array.from(predDur as Iterable<number>);

  // The two sequences should be aligned one-to-one. When Kokoro hands back a
  // differently-sized pred_dur (padding / space-injection quirks), warn and
  // process only the common prefix rather than crashing.
  if (frames.length !== tokenizeResult.inputIds.length) {
    console.warn(
      `pred_dur length (${frames.length}) != input_ids length (${tokenizeResult.inputIds.length}) — truncating`,
    );
  }
  const count = Math.min(frames.length, tokenizeResult.inputIds.length);

  // Phoneme level: accumulate frames; boundaries convert via HOP/sampleRate.
  // Keep the exact (frames * HOP) / sampleRate order so float results match.
  const phonemes: PhonemeTiming[] = [];
  let framesSoFar = 0;
  for (let idx = 0; idx < count; idx++) {
    const durFrames = frames[idx];
    const begin = (framesSoFar * HOP) / sampleRate;
    framesSoFar += durFrames;
    phonemes.push({
      phone: tokenizeResult.phonemes[idx] ?? "?",
      start: begin,
      end: (framesSoFar * HOP) / sampleRate,
      durFrames,
    });
  }

  // Word level: a word's interval runs from its first in-range phoneme's
  // start to its last one's end; words with no in-range phonemes are dropped.
  const words: WordTiming[] = tokenizeResult.words.flatMap((entry) => {
    const span = phonemes.slice(entry.tokenStart, Math.min(entry.tokenEnd, phonemes.length));
    if (span.length === 0) return [];
    return [
      {
        word: entry.word,
        ipa: entry.ipa,
        start: span[0].start,
        end: span[span.length - 1].end,
        phonemes: span,
      },
    ];
  });

  return { phonemes, words };
}