import { AutoModel, Tensor, env } from "@huggingface/transformers";
| // Where the ONNX model lives. By default fetches from HF Hub: | |
| // onnx/model.onnx + config.json from `shreyask/bol-tts-marathi-onnx`. | |
| // For local testing of a re-export, set `?local` query param OR override | |
| // `LOCAL_MODEL_PATH` — loads from /public/models/bol-tts/{config.json,onnx/model.onnx}. | |
| const HF_MODEL_REPO = "shreyask/bol-tts-marathi-onnx"; | |
| const LOCAL_MODEL_PATH = "models/bol-tts"; | |
| // eslint-disable-next-line @typescript-eslint/no-explicit-any | |
| const USE_LOCAL_MODEL = typeof window !== "undefined" && new URLSearchParams(window.location.search).has("local"); | |
| export interface KokoroVocab { [phone: string]: number; } | |
| export interface KokoroConfig { vocab: KokoroVocab; n_token: number; } | |
| export async function loadConfig(): Promise<KokoroConfig> { | |
| const res = await fetch("/config.json"); | |
| if (!res.ok) throw new Error(`config.json fetch failed: ${res.status}`); | |
| return res.json(); | |
| } | |
| export async function loadVoicepack(voiceId: string): Promise<Float32Array> { | |
| const res = await fetch(`/voices/${voiceId}.bin`); | |
| if (!res.ok) throw new Error(`${voiceId}.bin fetch failed: ${res.status}`); | |
| const buf = await res.arrayBuffer(); | |
| const f32 = new Float32Array(buf); | |
| if (f32.length !== 510 * 1 * 256) { | |
| throw new Error(`voicepack ${voiceId}: expected ${510 * 1 * 256} floats, got ${f32.length}`); | |
| } | |
| return f32; | |
| } | |
/** Result of one `KokoroSession.synthesize()` call. */
export interface SynthesizeResult {
  // Mono PCM samples at `sampleRate`; a subarray view into the model's
  // output buffer with the BOS prefix and EOS suffix already trimmed off.
  audio: Float32Array;
  // Predicted duration per content token, in predictor frames
  // (1 frame = 600 samples at 24 kHz = 25 ms), aligned 1:1 with the
  // caller's original inputIds (BOS/EOS positions stripped).
  predDur: Int32Array;
  // Output sample rate in Hz (24000).
  sampleRate: number;
  // Time (seconds) the BOS token's audio occupies at the start of `audio`,
  // before the first content phoneme. Caller must add this to phoneme.start
  // / phoneme.end when computing timeline-aligned timings, since `predDur`
  // returned here covers ONLY content (BOS+EOS positions stripped).
  leadOffsetSec: number;
}
| export class KokoroSession { | |
| // eslint-disable-next-line @typescript-eslint/no-explicit-any | |
| private model: any; | |
| private isWebGpu: boolean; | |
| private constructor(model: any, isWebGpu: boolean) { this.model = model; this.isWebGpu = isWebGpu; } | |
| get backend(): string { return this.isWebGpu ? "webgpu" : "wasm"; } | |
| static async create(onProgress?: (msg: string) => void): Promise<KokoroSession> { | |
| if (USE_LOCAL_MODEL) { | |
| env.allowRemoteModels = false; | |
| env.allowLocalModels = true; | |
| // localModelPath is prepended to the repo string; "/" + "models/bol-tts" → "/models/bol-tts" | |
| env.localModelPath = "/"; | |
| } else { | |
| env.allowRemoteModels = true; | |
| env.allowLocalModels = false; | |
| } | |
| const hasWebGpu = typeof (navigator as any).gpu !== "undefined"; | |
| const device = hasWebGpu ? "webgpu" : "wasm"; | |
| // fp32 only — Kokoro's ISTFTNet decoder is sensitive to int8/int4 weight | |
| // quantization (produces NaN or clipped audio). kokoro.js and HeadTTS both | |
| // default to fp32 on WebGPU for this reason. Size is the cost of quality. | |
| const repoOrPath = USE_LOCAL_MODEL ? LOCAL_MODEL_PATH : HF_MODEL_REPO; | |
| onProgress?.(`loading model from ${USE_LOCAL_MODEL ? "local" : "HF"} (${device}, ~325 MB one-time)…`); | |
| const model = await AutoModel.from_pretrained(repoOrPath, { | |
| dtype: "fp32", | |
| device, | |
| progress_callback: (p: any) => { | |
| if (p.status === "progress" && p.file?.endsWith(".onnx")) { | |
| const mb = (p.loaded / 1e6).toFixed(1); | |
| const pct = p.progress ? `${p.progress.toFixed(1)}%` : ""; | |
| onProgress?.(`downloading ${p.file}: ${mb} MB ${pct}`); | |
| } else if (p.status === "done") { | |
| onProgress?.(`loaded ${p.file}`); | |
| } | |
| }, | |
| } as any); | |
| return new KokoroSession(model, hasWebGpu); | |
| } | |
| async synthesize(inputIds: number[], refS: Float32Array, speed: number = 1.0): Promise<SynthesizeResult> { | |
| // Kokoro's KModel.forward (Python) prepends BOS=0 + appends EOS=0 before | |
| // calling forward_with_tokens. Our ONNX export exposes forward_with_tokens | |
| // directly so the caller has to wrap. Without BOS, the iSTFTNet decoder's | |
| // window-startup transient (~30-50 ms unreliable output) lands on the | |
| // first content phoneme — leading consonants like /m/ in मुंबई get | |
| // perceptually eaten. With BOS, the predictor allocates real duration to | |
| // the boundary token and the decoder produces clean pre-content audio | |
| // before the first phoneme. | |
| const wrappedIds = [0, ...inputIds, 0]; | |
| const ids = BigInt64Array.from(wrappedIds.map(v => BigInt(v))); | |
| const idTensor = new Tensor("int64", ids, [1, ids.length]); | |
| // Input is named "style" to match kokoro-js + thewh1teagle/kokoro-onnx | |
| // ecosystem convention. The `refS` argument name is preserved on this | |
| // function for backward compatibility — it's the same [1, 256] voicepack | |
| // slice either way. | |
| const styleTensor = new Tensor("float32", refS, [1, 256]); | |
| const speedTensor = new Tensor("float32", new Float32Array([speed]), [1]); | |
| const out = await this.model({ input_ids: idTensor, style: styleTensor, speed: speedTensor }); | |
| const fullAudio = out.audio.data as Float32Array; | |
| const raw = out.pred_dur.data as BigInt64Array; | |
| // Strip BOS (index 0) + EOS (last index) from pred_dur so the returned | |
| // array aligns 1:1 with the caller's original inputIds. ALSO strip the | |
| // BOS audio prefix and EOS audio suffix from the buffer: | |
| // - we need BOS in the *input* so the predictor allocates real duration | |
| // to the boundary token (this is what fixes Marathi /m/ getting eaten | |
| // in 'मुंबई' — the predictor with BOS context gives the first content | |
| // phoneme proper duration); | |
| // - but we DON'T want the BOS *audio* in the output: Rasa-trained voices | |
| // learned BOS audio = soft breathy pre-roll (training data had natural | |
| // pre-speech sounds), which surfaces as an audible "umm" at the start | |
| // of every utterance. SpringLab voices learned BOS = silence so they | |
| // wouldn't show this either way. Stripping the BOS audio gives both | |
| // voice families the predictor benefit without the umm. | |
| // | |
| // 1 predictor frame = 600 audio samples at 24 kHz = 25 ms. | |
| const HOP = 600; | |
| const SR = 24000; | |
| const bosFrames = raw.length > 0 ? Number(raw[0]) : 0; | |
| const eosFrames = raw.length > 1 ? Number(raw[raw.length - 1]) : 0; | |
| const innerLen = Math.max(0, raw.length - 2); | |
| const predDur = new Int32Array(innerLen); | |
| for (let i = 0; i < innerLen; i++) predDur[i] = Number(raw[i + 1]); | |
| const startSample = Math.min(bosFrames * HOP, fullAudio.length); | |
| const endSample = Math.max(startSample, fullAudio.length - eosFrames * HOP); | |
| const audio = fullAudio.subarray(startSample, endSample); | |
| return { | |
| audio, | |
| predDur, | |
| sampleRate: SR, | |
| // BOS audio already stripped from `audio`, so no offset for the caller. | |
| leadOffsetSec: 0, | |
| }; | |
| } | |
| } | |