Spaces:
Running
Running
| import type { TokenizeResult } from "./tokenize"; | |
// pred_dur is in PREDICTOR frames, not mel frames. Empirically:
// audio.length / pred_dur.sum === 600 (not 300 as you'd guess from mel hop).
// The prosody predictor runs at half the mel-frame rate; the decoder
// upsamples 2× internally before iSTFT. So frame-to-seconds uses hop=600.
const HOP = 600; // audio samples per predictor frame
const DEFAULT_SR = 24000; // output sample rate in Hz (24 kHz)
/** Timing of a single phoneme, in seconds from the start of the audio. */
export interface PhonemeTiming {
  phone: string; // phoneme symbol from the tokenize result; "?" when missing
  start: number; // seconds
  end: number; // seconds
  durFrames: number; // duration in predictor frames (see HOP)
}
/** Timing of a whole word: the span covered by its constituent phonemes. */
export interface WordTiming {
  word: string; // original word text (copied from the tokenize result)
  ipa: string; // the word's IPA phonemization (copied from the tokenize result)
  start: number; // seconds — start of the word's first phoneme
  end: number; // seconds — end of the word's last phoneme
  phonemes: PhonemeTiming[]; // per-phoneme timings inside this word
}
| /** | |
| * Turn pred_dur (per-phoneme duration in frames) + the per-word index ranges | |
| * from tokenize.ts into phoneme + word level timestamps in seconds. | |
| * | |
| * Kokoro's prosody predictor outputs durations in *predictor frames* where | |
| * 1 frame = 600 audio samples at 24 kHz (25 ms/frame). We accumulate and | |
| * attribute to the originating word via the tokenStart/tokenEnd ranges that | |
| * tokenizeByWord wrote down. | |
| */ | |
| export function buildTimings( | |
| tokenizeResult: TokenizeResult, | |
| predDur: Int32Array | number[], | |
| sampleRate: number = DEFAULT_SR, | |
| ): { phonemes: PhonemeTiming[]; words: WordTiming[] } { | |
| const durArr: number[] = Array.from(predDur as Iterable<number>); | |
| if (durArr.length !== tokenizeResult.inputIds.length) { | |
| // Kokoro sometimes returns pred_dur aligned to internal sequence length | |
| // (post-padding, post-space-injection). We expect them to match; if they | |
| // don't, warn and continue with the shorter of the two so we don't crash. | |
| console.warn( | |
| `pred_dur length (${durArr.length}) != input_ids length (${tokenizeResult.inputIds.length}) — truncating`, | |
| ); | |
| } | |
| const n = Math.min(durArr.length, tokenizeResult.inputIds.length); | |
| // Phoneme-level | |
| const phonemes: PhonemeTiming[] = []; | |
| let cumFrames = 0; | |
| for (let i = 0; i < n; i++) { | |
| const df = durArr[i]; | |
| const start = (cumFrames * HOP) / sampleRate; | |
| cumFrames += df; | |
| const end = (cumFrames * HOP) / sampleRate; | |
| phonemes.push({ | |
| phone: tokenizeResult.phonemes[i] ?? "?", | |
| start, | |
| end, | |
| durFrames: df, | |
| }); | |
| } | |
| // Word-level: for each word, take start of first in-range phoneme and end of last | |
| const words: WordTiming[] = []; | |
| for (const w of tokenizeResult.words) { | |
| const slice = phonemes.slice(w.tokenStart, Math.min(w.tokenEnd, phonemes.length)); | |
| if (slice.length === 0) continue; | |
| words.push({ | |
| word: w.word, | |
| ipa: w.ipa, | |
| start: slice[0].start, | |
| end: slice[slice.length - 1].end, | |
| phonemes: slice, | |
| }); | |
| } | |
| return { phonemes, words }; | |
| } | |