File size: 2,689 Bytes
2bb8806
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import type { TokenizeResult } from "./tokenize";

// pred_dur is in PREDICTOR frames, not mel frames. Empirically:
//   audio.length / pred_dur.sum === 600 (not 300 as you'd guess from mel hop).
// The prosody predictor runs at half the mel-frame rate; the decoder
// upsamples 2× internally before iSTFT. So frame-to-seconds uses hop=600.
const HOP = 600;          // audio samples per predictor frame (25 ms at 24 kHz)
const DEFAULT_SR = 24000; // default output sample rate in Hz (see HOP note above)

/** Timing for one phoneme token, expressed in output-audio seconds. */
export interface PhonemeTiming {
  phone: string;      // phoneme symbol; "?" when no phoneme is recorded for the token
  start: number;      // seconds
  end: number;        // seconds; equals the next phoneme's start (durations accumulate)
  durFrames: number;  // duration in predictor frames (HOP audio samples each)
}

/** Timing for a whole word: the span covered by its constituent phonemes. */
export interface WordTiming {
  word: string;       // word text as recorded by tokenize.ts
  ipa: string;        // the word's IPA transcription, also from tokenize.ts
  start: number;      // seconds — start of the word's first phoneme
  end: number;        // seconds — end of the word's last phoneme
  phonemes: PhonemeTiming[];  // per-phoneme breakdown within this word
}

/**
 * Turn pred_dur (per-phoneme duration in frames) + the per-word index ranges
 * from tokenize.ts into phoneme + word level timestamps in seconds.
 *
 * Kokoro's prosody predictor outputs durations in *predictor frames* where
 * 1 frame = 600 audio samples at 24 kHz (25 ms/frame). We accumulate and
 * attribute to the originating word via the tokenStart/tokenEnd ranges that
 * tokenizeByWord wrote down.
 *
 * @param tokenizeResult token/phoneme/word alignment produced by tokenize.ts
 * @param predDur        per-token duration in predictor frames (HOP samples each)
 * @param sampleRate     output audio sample rate in Hz (defaults to 24 kHz)
 * @returns phoneme-level and word-level timing arrays. If pred_dur and
 *          input_ids disagree in length, the shorter length wins (with a
 *          console warning); a word whose token range is entirely truncated
 *          away is omitted from `words`.
 */
export function buildTimings(
  tokenizeResult: TokenizeResult,
  predDur: Int32Array | number[],
  sampleRate: number = DEFAULT_SR,
): { phonemes: PhonemeTiming[]; words: WordTiming[] } {
  // Int32Array and number[] are both ArrayLike<number>, so no cast is needed.
  const durArr = Array.from(predDur);

  if (durArr.length !== tokenizeResult.inputIds.length) {
    // Kokoro sometimes returns pred_dur aligned to internal sequence length
    // (post-padding, post-space-injection). We expect them to match; if they
    // don't, warn and continue with the shorter of the two so we don't crash.
    console.warn(
      `pred_dur length (${durArr.length}) != input_ids length (${tokenizeResult.inputIds.length}) — truncating`,
    );
  }
  const n = Math.min(durArr.length, tokenizeResult.inputIds.length);

  // Phoneme-level: accumulate frames; [start, end) per token in seconds.
  const phonemes: PhonemeTiming[] = [];
  let cumFrames = 0;
  for (let i = 0; i < n; i++) {
    const df = durArr[i];
    const start = (cumFrames * HOP) / sampleRate;
    cumFrames += df;
    const end = (cumFrames * HOP) / sampleRate;
    phonemes.push({
      phone: tokenizeResult.phonemes[i] ?? "?", // defensive: phonemes may be shorter than inputIds
      start,
      end,
      durFrames: df,
    });
  }

  // Word-level: span from the first in-range phoneme to the last.
  // Array.prototype.slice clamps its end argument to the array length,
  // so no explicit Math.min against phonemes.length is required.
  const words: WordTiming[] = [];
  for (const w of tokenizeResult.words) {
    const slice = phonemes.slice(w.tokenStart, w.tokenEnd);
    if (slice.length === 0) continue; // the word's tokens were truncated away
    words.push({
      word: w.word,
      ipa: w.ipa,
      start: slice[0].start,
      end: slice[slice.length - 1].end,
      phonemes: slice,
    });
  }

  return { phonemes, words };
}