alexwengg committed on
Commit
e888982
·
verified ·
1 Parent(s): 3322761

Upload 54 files

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. BENCHMARK_RESULTS.md +41 -0
  2. README.md +216 -0
  3. models/.DS_Store +0 -0
  4. models/decoder.mlmodelc/analytics/coremldata.bin +3 -0
  5. models/decoder.mlmodelc/coremldata.bin +3 -0
  6. models/decoder.mlmodelc/metadata.json +120 -0
  7. models/decoder.mlmodelc/model.mil +57 -0
  8. models/decoder.mlmodelc/weights/weight.bin +3 -0
  9. models/decoder.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
  10. models/decoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
  11. models/decoder.mlpackage/Manifest.json +18 -0
  12. models/encoder/encoder_float32.mlmodelc/analytics/coremldata.bin +3 -0
  13. models/encoder/encoder_float32.mlmodelc/coremldata.bin +3 -0
  14. models/encoder/encoder_float32.mlmodelc/metadata.json +168 -0
  15. models/encoder/encoder_float32.mlmodelc/model.mil +0 -0
  16. models/encoder/encoder_float32.mlmodelc/weights/weight.bin +3 -0
  17. models/encoder/encoder_float32.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
  18. models/encoder/encoder_float32.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
  19. models/encoder/encoder_float32.mlpackage/Manifest.json +18 -0
  20. models/encoder/encoder_int8.mlmodelc/analytics/coremldata.bin +3 -0
  21. models/encoder/encoder_int8.mlmodelc/coremldata.bin +3 -0
  22. models/encoder/encoder_int8.mlmodelc/metadata.json +171 -0
  23. models/encoder/encoder_int8.mlmodelc/model.mil +0 -0
  24. models/encoder/encoder_int8.mlmodelc/weights/weight.bin +3 -0
  25. models/encoder/encoder_int8.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
  26. models/encoder/encoder_int8.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
  27. models/encoder/encoder_int8.mlpackage/Manifest.json +18 -0
  28. models/joint.mlmodelc/analytics/coremldata.bin +3 -0
  29. models/joint.mlmodelc/coremldata.bin +3 -0
  30. models/joint.mlmodelc/metadata.json +75 -0
  31. models/joint.mlmodelc/model.mil +25 -0
  32. models/joint.mlmodelc/weights/weight.bin +3 -0
  33. models/joint.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
  34. models/joint.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
  35. models/joint.mlpackage/Manifest.json +18 -0
  36. models/metadata.json +23 -0
  37. models/preprocessor.mlmodelc/analytics/coremldata.bin +3 -0
  38. models/preprocessor.mlmodelc/coremldata.bin +3 -0
  39. models/preprocessor.mlmodelc/metadata.json +106 -0
  40. models/preprocessor.mlmodelc/model.mil +110 -0
  41. models/preprocessor.mlmodelc/weights/weight.bin +3 -0
  42. models/preprocessor.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
  43. models/preprocessor.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
  44. models/preprocessor.mlpackage/Manifest.json +18 -0
  45. models/tokenizer.json +1026 -0
  46. pyproject.toml +14 -0
  47. scripts/benchmark_wer.py +171 -0
  48. scripts/convert_nemotron_streaming.py +229 -0
  49. scripts/individual_components.py +322 -0
  50. scripts/nemo_streaming_reference.py +110 -0
BENCHMARK_RESULTS.md ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Nemotron Streaming 0.6B - WER Benchmark Results
2
+
3
+ Model: `nvidia/nemotron-speech-streaming-en-0.6b`
4
+ Dataset: LibriSpeech test-clean
5
+ Chunk size: 1.12s
6
+
7
+ ## Results
8
+
9
+ ### 10 Files
10
+
11
+ | Mode | WER | Errors | Words |
12
+ |------|-----|--------|-------|
13
+ | `pad_and_drop_preencoded=False` | 1.79% | 3 | 168 |
14
+ | `pad_and_drop_preencoded=True` | 3.57% | 6 | 168 |
15
+
16
+ ### 100 Files
17
+
18
+ | Mode | WER | Errors | Words |
19
+ |------|-----|--------|-------|
20
+ | `pad_and_drop_preencoded=False` | 1.88% | - | - |
21
+
22
+ ### NVIDIA Claimed
23
+
24
+ | Dataset | WER |
25
+ |---------|-----|
26
+ | LibriSpeech test-clean (1.12s chunks) | 2.31% |
27
+
28
+ ## Notes
29
+
30
+ - `pad_and_drop_preencoded=False`: Better WER, but cannot be exported to ONNX/CoreML
31
+ - `pad_and_drop_preencoded=True`: Worse WER (~3%), but required for ONNX/CoreML export
32
+ - NVIDIA's 2.31% likely uses `pad_and_drop_preencoded=True` on full 2620 files
33
+ - Our implementation uses `conformer_stream_step` API with `CacheAwareStreamingAudioBuffer`
34
+
35
+ ## Run Benchmark
36
+
37
+ ```bash
38
+ cd nemotron-speech-streaming-0.6b/coreml
39
+ uv sync
40
+ uv run python benchmark_wer.py --num-files 100
41
+ ```
README.md ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Nemotron Speech Streaming 0.6B - CoreML Conversion
2
+
3
+ CoreML conversion of NVIDIA's `nvidia/nemotron-speech-streaming-en-0.6b` for real-time streaming ASR on Apple devices.
4
+
5
+ ## Model Overview
6
+
7
+ | Property | Value |
8
+ |----------|-------|
9
+ | Source Model | `nvidia/nemotron-speech-streaming-en-0.6b` |
10
+ | Architecture | FastConformer RNNT (Streaming) |
11
+ | Parameters | 0.6B |
12
+ | Chunk Size | 1.12 seconds (112 mel frames) |
13
+ | Sample Rate | 16kHz |
14
+ | Mel Features | 128 bins |
15
+
16
+ ## CoreML Models
17
+
18
+ 4 mlpackage files for the streaming RNNT pipeline:
19
+
20
+ | Model | Size | Function |
21
+ |-------|------|----------|
22
+ | `preprocessor.mlpackage` | 1.2M | audio → 128-dim mel spectrogram |
23
+ | `encoder.mlpackage` | 2.2G | mel + cache → encoded + new_cache |
24
+ | `decoder.mlpackage` | 28M | token + LSTM state → decoder_out + new_state |
25
+ | `joint.mlpackage` | 6.6M | encoder + decoder → logits |
26
+
27
+ Plus:
28
+ - `metadata.json` - Model configuration
29
+ - `tokenizer.json` - Vocabulary (1024 tokens)
30
+
31
+ ## Streaming Configuration
32
+
33
+ ```json
34
+ {
35
+ "sample_rate": 16000,
36
+ "mel_features": 128,
37
+ "chunk_mel_frames": 112,
38
+ "pre_encode_cache": 9,
39
+ "total_mel_frames": 121,
40
+ "vocab_size": 1024,
41
+ "blank_idx": 1024,
42
+ "encoder_dim": 1024,
43
+ "decoder_hidden": 640,
44
+ "decoder_layers": 2
45
+ }
46
+ ```
47
+
48
+ ### Chunk Timing
49
+
50
+ | Parameter | Value |
51
+ |-----------|-------|
52
+ | window_stride | 10ms |
53
+ | chunk_mel_frames | 112 |
54
+ | **chunk duration** | 112 × 10ms = **1.120s** |
55
+ | samples per chunk | 17,920 |
56
+
57
+ ### Cache Shapes
58
+
59
+ | Cache | Shape | Description |
60
+ |-------|-------|-------------|
61
+ | cache_channel | [1, 24, 70, 1024] | Attention context cache |
62
+ | cache_time | [1, 24, 1024, 8] | Convolution time cache |
63
+ | cache_len | [1] | Cache fill level |
64
+
65
+ ## Benchmark Results
66
+
67
+ ### WER on LibriSpeech test-clean
68
+
69
+ | Mode | Files | WER | Notes |
70
+ |------|-------|-----|-------|
71
+ | PyTorch `pad_and_drop=False` | 100 | 1.88% | Non-streaming (full context) |
72
+ | PyTorch `pad_and_drop=True` | 10 | 3.57% | True streaming |
73
+ | CoreML Non-streaming | 100 | 1.83% | Full audio preprocessed |
74
+ | CoreML Streaming | 100 | 1.79% | Audio chunked at 1.12s |
75
+ | NVIDIA Claimed | 2620 | 2.31% | Full test-clean |
76
+
77
+ ### Streaming Modes Explained
78
+
79
+ ```
80
+ NON-STREAMING (test_coreml_inference.py):
81
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
82
+ 1. Full audio → preprocessor → FULL mel (one continuous spectrogram)
83
+ 2. Slice mel into chunks for encoder
84
+ 3. Each slice has natural continuity (no chunk boundaries)
85
+
86
+ CHEAT: The mel was computed with full audio context
87
+ WER: ~1.83%
88
+ ```
89
+
90
+ ```
91
+ TRUE STREAMING (test_coreml_streaming.py):
92
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
93
+ 1. Audio chunk 1 → preprocessor → mel_1
94
+ 2. Audio chunk 2 → preprocessor → mel_2 (computed separately!)
95
+ 3. Prepend last 9 frames of mel_1 to mel_2 (mel_cache)
96
+
97
+ mel_cache = bridge between separately-computed mels (NOT cheating)
98
+ WER: ~1.79%
99
+ ```
100
+
101
+ ### What is mel_cache?
102
+
103
+ The encoder's subsampling layer needs 9 frames (~90ms) of look-back context:
104
+
105
+ ```
106
+ ENCODER INPUT (needs 121 frames = 9 cache + 112 new)
107
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
108
+ │9│ 112 frames │
109
+
110
+ mel_cache = last 9 frames from PREVIOUS chunk's mel
111
+
112
+ Chunk 1: [000000000][mel_chunk_1] ← pad with zeros (no previous)
113
+ Chunk 2: [mel_1_end][mel_chunk_2] ← 9 frames from chunk 1
114
+ Chunk 3: [mel_2_end][mel_chunk_3] ← 9 frames from chunk 2
115
+ ```
116
+
117
+ This is **NOT cheating** - in real-time streaming you DO have the previous 90ms of audio.
118
+
119
+ ## Inference Pipeline
120
+
121
+ ```
122
+ ┌─────────────────────────────────────────────────────────────────┐
123
+ │ STREAMING RNNT PIPELINE │
124
+ └─────────────────────────────────────────────────────────────────┘
125
+
126
+ 1. PREPROCESSOR (per 1.12s audio chunk)
127
+ audio [1, 17920] → mel [1, 128, 112]
128
+
129
+ 2. ENCODER (with cache)
130
+ mel [1, 128, 121] + cache → encoded [1, 1024, 14] + new_cache
131
+ (121 = 9 mel_cache + 112 new frames)
132
+ (14 output frames after 8x subsampling)
133
+
134
+ 3. DECODER + JOINT (greedy loop per encoder frame)
135
+ For each of 14 encoder frames:
136
+ ┌──────────────────────────────────────────┐
137
+ │ token → DECODER → decoder_out │
138
+ │ encoder_step + decoder_out → JOINT │
139
+ │ → logits → argmax → predicted token │
140
+ │ if token == BLANK: next encoder frame │
141
+ │ else: emit token, update decoder state │
142
+ └──────────────────────────────────────────┘
143
+ ```
144
+
145
+ ## Usage
146
+
147
+ ### Convert to CoreML
148
+
149
+ ```bash
150
+ cd conversion_scripts
151
+ uv sync
152
+ uv run python convert_nemotron_streaming.py --output-dir ../nemotron_coreml
153
+ ```
154
+
155
+ Options:
156
+ - `--encoder-cu`: Encoder compute units (default: CPU_AND_NE)
157
+ - `--precision`: FLOAT32 or FLOAT16
158
+
159
+ ### Run WER Benchmark (PyTorch)
160
+
161
+ ```bash
162
+ cd conversion_scripts
163
+ uv run python ../benchmark_wer.py --num-files 100
164
+ ```
165
+
166
+ ### Test CoreML Inference
167
+
168
+ Non-streaming (full audio preprocessing):
169
+ ```bash
170
+ uv run python ../test_coreml_inference.py --model-dir ../nemotron_coreml --num-files 10
171
+ ```
172
+
173
+ True streaming (audio chunked at 1.12s):
174
+ ```bash
175
+ uv run python ../test_coreml_streaming.py --model-dir ../nemotron_coreml --num-files 10
176
+ ```
177
+
178
+ ## Files
179
+
180
+ ```
181
+ nemotron-speech-streaming-0.6b/coreml/
182
+ ├── README.md # This file
183
+ ├── BENCHMARK_RESULTS.md # WER benchmark results
184
+ ├── benchmark_wer.py # PyTorch streaming WER benchmark
185
+ ├── nemo_streaming_reference.py # NeMo streaming reference implementation
186
+ ├── test_coreml_inference.py # CoreML non-streaming test
187
+ ├── test_coreml_streaming.py # CoreML true streaming test
188
+ ├── conversion_scripts/
189
+ │ ├── pyproject.toml # Python dependencies (uv)
190
+ │ ├── convert_nemotron_streaming.py # Main conversion script
191
+ │ └── individual_components.py # Wrapper classes for export
192
+ ├── nemotron_coreml/ # Exported CoreML models
193
+ │ ├── preprocessor.mlpackage
194
+ │ ├── encoder.mlpackage
195
+ │ ├── decoder.mlpackage
196
+ │ ├── joint.mlpackage
197
+ │ ├── metadata.json
198
+ │ └── tokenizer.json
199
+ └── datasets/
200
+ └── LibriSpeech/test-clean/ # 2620 test files
201
+ ```
202
+
203
+ ## Dependencies
204
+
205
+ - Python 3.10
206
+ - PyTorch 2.x
207
+ - NeMo Toolkit 2.x
208
+ - CoreMLTools 7.x
209
+ - soundfile, numpy, typer
210
+
211
+ ## Notes
212
+
213
+ - The encoder is the largest model (2.2GB) with 24 Conformer layers
214
+ - Model uses 128 mel bins (not the typical 80)
215
+ - RNNT blank token index is 1024 (vocab_size)
216
+ - Decoder uses 2-layer LSTM with 640 hidden units
models/.DS_Store ADDED
Binary file (8.2 kB). View file
 
models/decoder.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:496b8a25e0697cb5f15196251c89730c25574d1c9eed4111f70adc6457198b8b
3
+ size 243
models/decoder.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2483acce3793dcef37b4c27d99af51125c9c0f6f11641e3a76fab5518391203b
3
+ size 492
models/decoder.mlmodelc/metadata.json ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "metadataOutputVersion" : "3.0",
4
+ "storagePrecision" : "Float32",
5
+ "outputSchema" : [
6
+ {
7
+ "hasShapeFlexibility" : "0",
8
+ "isOptional" : "0",
9
+ "dataType" : "Float32",
10
+ "formattedType" : "MultiArray (Float32 1 × 640 × 1)",
11
+ "shortDescription" : "",
12
+ "shape" : "[1, 640, 1]",
13
+ "name" : "decoder_out",
14
+ "type" : "MultiArray"
15
+ },
16
+ {
17
+ "hasShapeFlexibility" : "0",
18
+ "isOptional" : "0",
19
+ "dataType" : "Float32",
20
+ "formattedType" : "MultiArray (Float32 2 × 1 × 640)",
21
+ "shortDescription" : "",
22
+ "shape" : "[2, 1, 640]",
23
+ "name" : "h_out",
24
+ "type" : "MultiArray"
25
+ },
26
+ {
27
+ "hasShapeFlexibility" : "0",
28
+ "isOptional" : "0",
29
+ "dataType" : "Float32",
30
+ "formattedType" : "MultiArray (Float32 2 × 1 × 640)",
31
+ "shortDescription" : "",
32
+ "shape" : "[2, 1, 640]",
33
+ "name" : "c_out",
34
+ "type" : "MultiArray"
35
+ }
36
+ ],
37
+ "modelParameters" : [
38
+
39
+ ],
40
+ "specificationVersion" : 8,
41
+ "mlProgramOperationTypeHistogram" : {
42
+ "Select" : 1,
43
+ "Ios17.squeeze" : 4,
44
+ "Ios17.gather" : 1,
45
+ "Ios17.lstm" : 2,
46
+ "Identity" : 1,
47
+ "Ios17.transpose" : 2,
48
+ "Split" : 2,
49
+ "Ios17.add" : 1,
50
+ "Ios17.greaterEqual" : 1,
51
+ "Stack" : 2
52
+ },
53
+ "computePrecision" : "Mixed (Float32, Int32)",
54
+ "isUpdatable" : "0",
55
+ "stateSchema" : [
56
+
57
+ ],
58
+ "availability" : {
59
+ "macOS" : "14.0",
60
+ "tvOS" : "17.0",
61
+ "visionOS" : "1.0",
62
+ "watchOS" : "10.0",
63
+ "iOS" : "17.0",
64
+ "macCatalyst" : "17.0"
65
+ },
66
+ "modelType" : {
67
+ "name" : "MLModelType_mlProgram"
68
+ },
69
+ "userDefinedMetadata" : {
70
+ "com.github.apple.coremltools.conversion_date" : "2026-01-11",
71
+ "com.github.apple.coremltools.source" : "torch==2.9.1",
72
+ "com.github.apple.coremltools.version" : "9.0",
73
+ "com.github.apple.coremltools.source_dialect" : "TorchScript"
74
+ },
75
+ "inputSchema" : [
76
+ {
77
+ "hasShapeFlexibility" : "0",
78
+ "isOptional" : "0",
79
+ "dataType" : "Int32",
80
+ "formattedType" : "MultiArray (Int32 1 × 1)",
81
+ "shortDescription" : "",
82
+ "shape" : "[1, 1]",
83
+ "name" : "token",
84
+ "type" : "MultiArray"
85
+ },
86
+ {
87
+ "hasShapeFlexibility" : "0",
88
+ "isOptional" : "0",
89
+ "dataType" : "Int32",
90
+ "formattedType" : "MultiArray (Int32 1)",
91
+ "shortDescription" : "",
92
+ "shape" : "[1]",
93
+ "name" : "token_length",
94
+ "type" : "MultiArray"
95
+ },
96
+ {
97
+ "hasShapeFlexibility" : "0",
98
+ "isOptional" : "0",
99
+ "dataType" : "Float32",
100
+ "formattedType" : "MultiArray (Float32 2 × 1 × 640)",
101
+ "shortDescription" : "",
102
+ "shape" : "[2, 1, 640]",
103
+ "name" : "h_in",
104
+ "type" : "MultiArray"
105
+ },
106
+ {
107
+ "hasShapeFlexibility" : "0",
108
+ "isOptional" : "0",
109
+ "dataType" : "Float32",
110
+ "formattedType" : "MultiArray (Float32 2 × 1 × 640)",
111
+ "shortDescription" : "",
112
+ "shape" : "[2, 1, 640]",
113
+ "name" : "c_in",
114
+ "type" : "MultiArray"
115
+ }
116
+ ],
117
+ "generatedClassName" : "decoder",
118
+ "method" : "predict"
119
+ }
120
+ ]
models/decoder.mlmodelc/model.mil ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ program(1.0)
2
+ [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.9.1"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "9.0"}})]
3
+ {
4
+ func main<ios17>(tensor<fp32, [2, 1, 640]> c_in, tensor<fp32, [2, 1, 640]> h_in, tensor<int32, [1, 1]> token, tensor<int32, [1]> token_length) {
5
+ tensor<fp32, [1025, 640]> module_prediction_embed_weight = const()[name = tensor<string, []>("module_prediction_embed_weight"), val = tensor<fp32, [1025, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
6
+ tensor<int32, []> y_batch_dims_0 = const()[name = tensor<string, []>("y_batch_dims_0"), val = tensor<int32, []>(0)];
7
+ tensor<bool, []> y_validate_indices_0 = const()[name = tensor<string, []>("y_validate_indices_0"), val = tensor<bool, []>(false)];
8
+ tensor<int32, []> greater_equal_0_y_0 = const()[name = tensor<string, []>("greater_equal_0_y_0"), val = tensor<int32, []>(0)];
9
+ tensor<bool, [1, 1]> greater_equal_0 = greater_equal(x = token, y = greater_equal_0_y_0)[name = tensor<string, []>("greater_equal_0")];
10
+ tensor<int32, []> slice_by_index_0 = const()[name = tensor<string, []>("slice_by_index_0"), val = tensor<int32, []>(1025)];
11
+ tensor<int32, [1, 1]> add_2 = add(x = token, y = slice_by_index_0)[name = tensor<string, []>("add_2")];
12
+ tensor<int32, [1, 1]> select_0 = select(a = token, b = add_2, cond = greater_equal_0)[name = tensor<string, []>("select_0")];
13
+ tensor<int32, []> y_axis_1 = const()[name = tensor<string, []>("y_axis_1"), val = tensor<int32, []>(0)];
14
+ tensor<fp32, [1, 1, 640]> y = gather(axis = y_axis_1, batch_dims = y_batch_dims_0, indices = select_0, validate_indices = y_validate_indices_0, x = module_prediction_embed_weight)[name = tensor<string, []>("y")];
15
+ tensor<int32, [3]> input_3_perm_0 = const()[name = tensor<string, []>("input_3_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
16
+ tensor<int32, []> split_0_num_splits_0 = const()[name = tensor<string, []>("split_0_num_splits_0"), val = tensor<int32, []>(2)];
17
+ tensor<int32, []> split_0_axis_0 = const()[name = tensor<string, []>("split_0_axis_0"), val = tensor<int32, []>(0)];
18
+ tensor<fp32, [1, 1, 640]> split_0_0, tensor<fp32, [1, 1, 640]> split_0_1 = split(axis = split_0_axis_0, num_splits = split_0_num_splits_0, x = h_in)[name = tensor<string, []>("split_0")];
19
+ tensor<int32, []> split_1_num_splits_0 = const()[name = tensor<string, []>("split_1_num_splits_0"), val = tensor<int32, []>(2)];
20
+ tensor<int32, []> split_1_axis_0 = const()[name = tensor<string, []>("split_1_axis_0"), val = tensor<int32, []>(0)];
21
+ tensor<fp32, [1, 1, 640]> split_1_0, tensor<fp32, [1, 1, 640]> split_1_1 = split(axis = split_1_axis_0, num_splits = split_1_num_splits_0, x = c_in)[name = tensor<string, []>("split_1")];
22
+ tensor<fp32, [2560]> concat_0 = const()[name = tensor<string, []>("concat_0"), val = tensor<fp32, [2560]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(2624128)))];
23
+ tensor<fp32, [2560, 640]> concat_1 = const()[name = tensor<string, []>("concat_1"), val = tensor<fp32, [2560, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(2634432)))];
24
+ tensor<fp32, [2560, 640]> concat_2 = const()[name = tensor<string, []>("concat_2"), val = tensor<fp32, [2560, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9188096)))];
25
+ tensor<int32, [1]> input_lstm_layer_0_lstm_h0_squeeze_axes_0 = const()[name = tensor<string, []>("input_lstm_layer_0_lstm_h0_squeeze_axes_0"), val = tensor<int32, [1]>([0])];
26
+ tensor<fp32, [1, 640]> input_lstm_layer_0_lstm_h0_squeeze = squeeze(axes = input_lstm_layer_0_lstm_h0_squeeze_axes_0, x = split_0_0)[name = tensor<string, []>("input_lstm_layer_0_lstm_h0_squeeze")];
27
+ tensor<int32, [1]> input_lstm_layer_0_lstm_c0_squeeze_axes_0 = const()[name = tensor<string, []>("input_lstm_layer_0_lstm_c0_squeeze_axes_0"), val = tensor<int32, [1]>([0])];
28
+ tensor<fp32, [1, 640]> input_lstm_layer_0_lstm_c0_squeeze = squeeze(axes = input_lstm_layer_0_lstm_c0_squeeze_axes_0, x = split_1_0)[name = tensor<string, []>("input_lstm_layer_0_lstm_c0_squeeze")];
29
+ tensor<string, []> input_lstm_layer_0_direction_0 = const()[name = tensor<string, []>("input_lstm_layer_0_direction_0"), val = tensor<string, []>("forward")];
30
+ tensor<bool, []> input_lstm_layer_0_output_sequence_0 = const()[name = tensor<string, []>("input_lstm_layer_0_output_sequence_0"), val = tensor<bool, []>(true)];
31
+ tensor<string, []> input_lstm_layer_0_recurrent_activation_0 = const()[name = tensor<string, []>("input_lstm_layer_0_recurrent_activation_0"), val = tensor<string, []>("sigmoid")];
32
+ tensor<string, []> input_lstm_layer_0_cell_activation_0 = const()[name = tensor<string, []>("input_lstm_layer_0_cell_activation_0"), val = tensor<string, []>("tanh")];
33
+ tensor<string, []> input_lstm_layer_0_activation_0 = const()[name = tensor<string, []>("input_lstm_layer_0_activation_0"), val = tensor<string, []>("tanh")];
34
+ tensor<fp32, [1, 1, 640]> input_3 = transpose(perm = input_3_perm_0, x = y)[name = tensor<string, []>("transpose_2")];
35
+ tensor<fp32, [1, 1, 640]> input_lstm_layer_0_0, tensor<fp32, [1, 640]> input_lstm_layer_0_1, tensor<fp32, [1, 640]> input_lstm_layer_0_2 = lstm(activation = input_lstm_layer_0_activation_0, bias = concat_0, cell_activation = input_lstm_layer_0_cell_activation_0, direction = input_lstm_layer_0_direction_0, initial_c = input_lstm_layer_0_lstm_c0_squeeze, initial_h = input_lstm_layer_0_lstm_h0_squeeze, output_sequence = input_lstm_layer_0_output_sequence_0, recurrent_activation = input_lstm_layer_0_recurrent_activation_0, weight_hh = concat_2, weight_ih = concat_1, x = input_3)[name = tensor<string, []>("input_lstm_layer_0")];
36
+ tensor<fp32, [2560]> concat_3 = const()[name = tensor<string, []>("concat_3"), val = tensor<fp32, [2560]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15741760)))];
37
+ tensor<fp32, [2560, 640]> concat_4 = const()[name = tensor<string, []>("concat_4"), val = tensor<fp32, [2560, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15752064)))];
38
+ tensor<fp32, [2560, 640]> concat_5 = const()[name = tensor<string, []>("concat_5"), val = tensor<fp32, [2560, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22305728)))];
39
+ tensor<int32, [1]> input_lstm_h0_squeeze_axes_0 = const()[name = tensor<string, []>("input_lstm_h0_squeeze_axes_0"), val = tensor<int32, [1]>([0])];
40
+ tensor<fp32, [1, 640]> input_lstm_h0_squeeze = squeeze(axes = input_lstm_h0_squeeze_axes_0, x = split_0_1)[name = tensor<string, []>("input_lstm_h0_squeeze")];
41
+ tensor<int32, [1]> input_lstm_c0_squeeze_axes_0 = const()[name = tensor<string, []>("input_lstm_c0_squeeze_axes_0"), val = tensor<int32, [1]>([0])];
42
+ tensor<fp32, [1, 640]> input_lstm_c0_squeeze = squeeze(axes = input_lstm_c0_squeeze_axes_0, x = split_1_1)[name = tensor<string, []>("input_lstm_c0_squeeze")];
43
+ tensor<string, []> input_direction_0 = const()[name = tensor<string, []>("input_direction_0"), val = tensor<string, []>("forward")];
44
+ tensor<bool, []> input_output_sequence_0 = const()[name = tensor<string, []>("input_output_sequence_0"), val = tensor<bool, []>(true)];
45
+ tensor<string, []> input_recurrent_activation_0 = const()[name = tensor<string, []>("input_recurrent_activation_0"), val = tensor<string, []>("sigmoid")];
46
+ tensor<string, []> input_cell_activation_0 = const()[name = tensor<string, []>("input_cell_activation_0"), val = tensor<string, []>("tanh")];
47
+ tensor<string, []> input_activation_0 = const()[name = tensor<string, []>("input_activation_0"), val = tensor<string, []>("tanh")];
48
+ tensor<fp32, [1, 1, 640]> input_0, tensor<fp32, [1, 640]> input_1, tensor<fp32, [1, 640]> input_2 = lstm(activation = input_activation_0, bias = concat_3, cell_activation = input_cell_activation_0, direction = input_direction_0, initial_c = input_lstm_c0_squeeze, initial_h = input_lstm_h0_squeeze, output_sequence = input_output_sequence_0, recurrent_activation = input_recurrent_activation_0, weight_hh = concat_5, weight_ih = concat_4, x = input_lstm_layer_0_0)[name = tensor<string, []>("input")];
49
+ tensor<int32, []> obj_3_axis_0 = const()[name = tensor<string, []>("obj_3_axis_0"), val = tensor<int32, []>(0)];
50
+ tensor<fp32, [2, 1, 640]> h_out = stack(axis = obj_3_axis_0, values = (input_lstm_layer_0_1, input_1))[name = tensor<string, []>("obj_3")];
51
+ tensor<int32, []> obj_axis_0 = const()[name = tensor<string, []>("obj_axis_0"), val = tensor<int32, []>(0)];
52
+ tensor<fp32, [2, 1, 640]> c_out = stack(axis = obj_axis_0, values = (input_lstm_layer_0_2, input_2))[name = tensor<string, []>("obj")];
53
+ tensor<int32, [3]> transpose_0_perm_0 = const()[name = tensor<string, []>("transpose_0_perm_0"), val = tensor<int32, [3]>([1, 2, 0])];
54
+ tensor<fp32, [1, 640, 1]> decoder_out = transpose(perm = transpose_0_perm_0, x = input_0)[name = tensor<string, []>("transpose_1")];
55
+ tensor<int32, [1]> token_length_tmp = identity(x = token_length)[name = tensor<string, []>("token_length_tmp")];
56
+ } -> (decoder_out, h_out, c_out);
57
+ }
models/decoder.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03add11563357087393b3ef162925bfb93fa5caf070aa4b91abd909cbbab1aed
3
+ size 28859392
models/decoder.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b766ca2ac7f121abd7ac1d8c4ebdee568912084c3f3fa356e0097ac155597834
3
+ size 8734
models/decoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03add11563357087393b3ef162925bfb93fa5caf070aa4b91abd909cbbab1aed
3
+ size 28859392
models/decoder.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "6493F213-8E94-4135-BA80-88CBEAF57D4F": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Weights",
7
+ "name": "weights",
8
+ "path": "com.apple.CoreML/weights"
9
+ },
10
+ "9ACB4EE7-C027-4AAA-B75C-BA0B33F7B714": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Specification",
13
+ "name": "model.mlmodel",
14
+ "path": "com.apple.CoreML/model.mlmodel"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "9ACB4EE7-C027-4AAA-B75C-BA0B33F7B714"
18
+ }
models/encoder/encoder_float32.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7353a9c695cce6d4431164021f0551e5a9dd8515ed1ee3a0945212fb5c3db961
3
+ size 243
models/encoder/encoder_float32.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28ba4f3d7a7dc602ee2fdd0f7869a9e81aaac4576601aa2f57633187a810aa90
3
+ size 607
models/encoder/encoder_float32.mlmodelc/metadata.json ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "metadataOutputVersion" : "3.0",
4
+ "storagePrecision" : "Float32",
5
+ "outputSchema" : [
6
+ {
7
+ "hasShapeFlexibility" : "0",
8
+ "isOptional" : "0",
9
+ "dataType" : "Float32",
10
+ "formattedType" : "MultiArray (Float32 1 × 1024 × 14)",
11
+ "shortDescription" : "",
12
+ "shape" : "[1, 1024, 14]",
13
+ "name" : "encoded",
14
+ "type" : "MultiArray"
15
+ },
16
+ {
17
+ "hasShapeFlexibility" : "0",
18
+ "isOptional" : "0",
19
+ "dataType" : "Int32",
20
+ "formattedType" : "MultiArray (Int32 1)",
21
+ "shortDescription" : "",
22
+ "shape" : "[1]",
23
+ "name" : "encoded_length",
24
+ "type" : "MultiArray"
25
+ },
26
+ {
27
+ "hasShapeFlexibility" : "0",
28
+ "isOptional" : "0",
29
+ "dataType" : "Float32",
30
+ "formattedType" : "MultiArray (Float32 1 × 24 × 70 × 1024)",
31
+ "shortDescription" : "",
32
+ "shape" : "[1, 24, 70, 1024]",
33
+ "name" : "cache_channel_out",
34
+ "type" : "MultiArray"
35
+ },
36
+ {
37
+ "hasShapeFlexibility" : "0",
38
+ "isOptional" : "0",
39
+ "dataType" : "Float32",
40
+ "formattedType" : "MultiArray (Float32 1 × 24 × 1024 × 8)",
41
+ "shortDescription" : "",
42
+ "shape" : "[1, 24, 1024, 8]",
43
+ "name" : "cache_time_out",
44
+ "type" : "MultiArray"
45
+ },
46
+ {
47
+ "hasShapeFlexibility" : "0",
48
+ "isOptional" : "0",
49
+ "dataType" : "Int32",
50
+ "formattedType" : "MultiArray (Int32 1)",
51
+ "shortDescription" : "",
52
+ "shape" : "[1]",
53
+ "name" : "cache_len_out",
54
+ "type" : "MultiArray"
55
+ }
56
+ ],
57
+ "modelParameters" : [
58
+
59
+ ],
60
+ "specificationVersion" : 8,
61
+ "mlProgramOperationTypeHistogram" : {
62
+ "Ios17.logicalAnd" : 3,
63
+ "Ios17.reshape" : 145,
64
+ "Ios16.softmax" : 24,
65
+ "Ios17.matmul" : 72,
66
+ "Ios17.transpose" : 224,
67
+ "Split" : 24,
68
+ "Ios17.expandDims" : 18,
69
+ "Select" : 72,
70
+ "Ios17.add" : 180,
71
+ "Tile" : 8,
72
+ "Ios17.sliceByIndex" : 147,
73
+ "Ios16.sigmoid" : 24,
74
+ "Pad" : 27,
75
+ "Ios17.logicalNot" : 2,
76
+ "Ios17.layerNorm" : 144,
77
+ "Ios17.less" : 5,
78
+ "Ios17.sub" : 4,
79
+ "Ios17.conv" : 77,
80
+ "Ios16.relu" : 3,
81
+ "Ios17.clip" : 2,
82
+ "Ios17.linear" : 193,
83
+ "Ios17.greaterEqual" : 1,
84
+ "Ios17.floorDiv" : 3,
85
+ "Ios17.cast" : 12,
86
+ "Ios16.silu" : 72,
87
+ "Ios17.concat" : 72,
88
+ "Stack" : 2,
89
+ "Ios17.mul" : 106
90
+ },
91
+ "computePrecision" : "Mixed (Float32, Int32)",
92
+ "isUpdatable" : "0",
93
+ "stateSchema" : [
94
+
95
+ ],
96
+ "availability" : {
97
+ "macOS" : "14.0",
98
+ "tvOS" : "17.0",
99
+ "visionOS" : "1.0",
100
+ "watchOS" : "10.0",
101
+ "iOS" : "17.0",
102
+ "macCatalyst" : "17.0"
103
+ },
104
+ "modelType" : {
105
+ "name" : "MLModelType_mlProgram"
106
+ },
107
+ "userDefinedMetadata" : {
108
+ "com.github.apple.coremltools.conversion_date" : "2026-01-11",
109
+ "com.github.apple.coremltools.source" : "torch==2.9.1",
110
+ "com.github.apple.coremltools.version" : "9.0",
111
+ "com.github.apple.coremltools.source_dialect" : "TorchScript"
112
+ },
113
+ "inputSchema" : [
114
+ {
115
+ "hasShapeFlexibility" : "0",
116
+ "isOptional" : "0",
117
+ "dataType" : "Float32",
118
+ "formattedType" : "MultiArray (Float32 1 × 128 × 121)",
119
+ "shortDescription" : "",
120
+ "shape" : "[1, 128, 121]",
121
+ "name" : "mel",
122
+ "type" : "MultiArray"
123
+ },
124
+ {
125
+ "hasShapeFlexibility" : "0",
126
+ "isOptional" : "0",
127
+ "dataType" : "Int32",
128
+ "formattedType" : "MultiArray (Int32 1)",
129
+ "shortDescription" : "",
130
+ "shape" : "[1]",
131
+ "name" : "mel_length",
132
+ "type" : "MultiArray"
133
+ },
134
+ {
135
+ "hasShapeFlexibility" : "0",
136
+ "isOptional" : "0",
137
+ "dataType" : "Float32",
138
+ "formattedType" : "MultiArray (Float32 1 × 24 × 70 × 1024)",
139
+ "shortDescription" : "",
140
+ "shape" : "[1, 24, 70, 1024]",
141
+ "name" : "cache_channel",
142
+ "type" : "MultiArray"
143
+ },
144
+ {
145
+ "hasShapeFlexibility" : "0",
146
+ "isOptional" : "0",
147
+ "dataType" : "Float32",
148
+ "formattedType" : "MultiArray (Float32 1 × 24 × 1024 × 8)",
149
+ "shortDescription" : "",
150
+ "shape" : "[1, 24, 1024, 8]",
151
+ "name" : "cache_time",
152
+ "type" : "MultiArray"
153
+ },
154
+ {
155
+ "hasShapeFlexibility" : "0",
156
+ "isOptional" : "0",
157
+ "dataType" : "Int32",
158
+ "formattedType" : "MultiArray (Int32 1)",
159
+ "shortDescription" : "",
160
+ "shape" : "[1]",
161
+ "name" : "cache_len",
162
+ "type" : "MultiArray"
163
+ }
164
+ ],
165
+ "generatedClassName" : "encoder_float32",
166
+ "method" : "predict"
167
+ }
168
+ ]
models/encoder/encoder_float32.mlmodelc/model.mil ADDED
The diff for this file is too large to render. See raw diff
 
models/encoder/encoder_float32.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97ea4cb0d81aded8be4a2fbc8cbfb0621b77bd4a87c2d286d7cdf63a0f1d3e71
3
+ size 2352382336
models/encoder/encoder_float32.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a055d64d270aa66f578b06e238a189ea0e4d299ae4cbc484c8e1ea9b6256914b
3
+ size 640913
models/encoder/encoder_float32.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97ea4cb0d81aded8be4a2fbc8cbfb0621b77bd4a87c2d286d7cdf63a0f1d3e71
3
+ size 2352382336
models/encoder/encoder_float32.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "0AA85F5B-F286-49A1-9DD7-1F815799BAC6": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Specification",
7
+ "name": "model.mlmodel",
8
+ "path": "com.apple.CoreML/model.mlmodel"
9
+ },
10
+ "E3E8C673-979B-4A80-9FB0-0C4D8418551A": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Weights",
13
+ "name": "weights",
14
+ "path": "com.apple.CoreML/weights"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "0AA85F5B-F286-49A1-9DD7-1F815799BAC6"
18
+ }
models/encoder/encoder_int8.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa63528a4d283b724b63a62c78e2a179e4dc16c61d819131b0a0ac26518d342f
3
+ size 243
models/encoder/encoder_int8.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d4b6aa7cb0c9c89c59ab979b7f1ce0688688db66eb117b53f2bd0a5caa61a53
3
+ size 669
models/encoder/encoder_int8.mlmodelc/metadata.json ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "metadataOutputVersion" : "3.0",
4
+ "shortDescription" : "Nemotron Streaming Encoder (int8 quantized)",
5
+ "outputSchema" : [
6
+ {
7
+ "hasShapeFlexibility" : "0",
8
+ "isOptional" : "0",
9
+ "dataType" : "Float32",
10
+ "formattedType" : "MultiArray (Float32 1 × 1024 × 14)",
11
+ "shortDescription" : "",
12
+ "shape" : "[1, 1024, 14]",
13
+ "name" : "encoded",
14
+ "type" : "MultiArray"
15
+ },
16
+ {
17
+ "hasShapeFlexibility" : "0",
18
+ "isOptional" : "0",
19
+ "dataType" : "Int32",
20
+ "formattedType" : "MultiArray (Int32 1)",
21
+ "shortDescription" : "",
22
+ "shape" : "[1]",
23
+ "name" : "encoded_length",
24
+ "type" : "MultiArray"
25
+ },
26
+ {
27
+ "hasShapeFlexibility" : "0",
28
+ "isOptional" : "0",
29
+ "dataType" : "Float32",
30
+ "formattedType" : "MultiArray (Float32 1 × 24 × 70 × 1024)",
31
+ "shortDescription" : "",
32
+ "shape" : "[1, 24, 70, 1024]",
33
+ "name" : "cache_channel_out",
34
+ "type" : "MultiArray"
35
+ },
36
+ {
37
+ "hasShapeFlexibility" : "0",
38
+ "isOptional" : "0",
39
+ "dataType" : "Float32",
40
+ "formattedType" : "MultiArray (Float32 1 × 24 × 1024 × 8)",
41
+ "shortDescription" : "",
42
+ "shape" : "[1, 24, 1024, 8]",
43
+ "name" : "cache_time_out",
44
+ "type" : "MultiArray"
45
+ },
46
+ {
47
+ "hasShapeFlexibility" : "0",
48
+ "isOptional" : "0",
49
+ "dataType" : "Int32",
50
+ "formattedType" : "MultiArray (Int32 1)",
51
+ "shortDescription" : "",
52
+ "shape" : "[1]",
53
+ "name" : "cache_len_out",
54
+ "type" : "MultiArray"
55
+ }
56
+ ],
57
+ "storagePrecision" : "Mixed (Float32, Int8)",
58
+ "modelParameters" : [
59
+
60
+ ],
61
+ "author" : "Fluid Inference",
62
+ "specificationVersion" : 8,
63
+ "mlProgramOperationTypeHistogram" : {
64
+ "Ios17.logicalAnd" : 3,
65
+ "Ios17.reshape" : 145,
66
+ "Ios16.softmax" : 24,
67
+ "Ios17.matmul" : 72,
68
+ "Ios17.transpose" : 224,
69
+ "Split" : 24,
70
+ "Ios17.expandDims" : 18,
71
+ "Select" : 72,
72
+ "Ios17.add" : 180,
73
+ "Tile" : 8,
74
+ "Ios17.sliceByIndex" : 147,
75
+ "Ios16.sigmoid" : 24,
76
+ "Pad" : 27,
77
+ "Ios17.logicalNot" : 2,
78
+ "Ios17.layerNorm" : 144,
79
+ "Ios16.constexprAffineDequantize" : 294,
80
+ "Ios17.less" : 5,
81
+ "Ios17.sub" : 4,
82
+ "Ios17.conv" : 77,
83
+ "Ios16.relu" : 3,
84
+ "Ios17.clip" : 2,
85
+ "Ios17.linear" : 193,
86
+ "Ios17.greaterEqual" : 1,
87
+ "Ios17.floorDiv" : 3,
88
+ "Ios17.cast" : 12,
89
+ "Ios16.silu" : 72,
90
+ "Ios17.concat" : 72,
91
+ "Stack" : 2,
92
+ "Ios17.mul" : 106
93
+ },
94
+ "computePrecision" : "Mixed (Float32, Int32)",
95
+ "isUpdatable" : "0",
96
+ "stateSchema" : [
97
+
98
+ ],
99
+ "availability" : {
100
+ "macOS" : "14.0",
101
+ "tvOS" : "17.0",
102
+ "visionOS" : "1.0",
103
+ "watchOS" : "10.0",
104
+ "iOS" : "17.0",
105
+ "macCatalyst" : "17.0"
106
+ },
107
+ "modelType" : {
108
+ "name" : "MLModelType_mlProgram"
109
+ },
110
+ "inputSchema" : [
111
+ {
112
+ "hasShapeFlexibility" : "0",
113
+ "isOptional" : "0",
114
+ "dataType" : "Float32",
115
+ "formattedType" : "MultiArray (Float32 1 × 128 × 121)",
116
+ "shortDescription" : "",
117
+ "shape" : "[1, 128, 121]",
118
+ "name" : "mel",
119
+ "type" : "MultiArray"
120
+ },
121
+ {
122
+ "hasShapeFlexibility" : "0",
123
+ "isOptional" : "0",
124
+ "dataType" : "Int32",
125
+ "formattedType" : "MultiArray (Int32 1)",
126
+ "shortDescription" : "",
127
+ "shape" : "[1]",
128
+ "name" : "mel_length",
129
+ "type" : "MultiArray"
130
+ },
131
+ {
132
+ "hasShapeFlexibility" : "0",
133
+ "isOptional" : "0",
134
+ "dataType" : "Float32",
135
+ "formattedType" : "MultiArray (Float32 1 × 24 × 70 × 1024)",
136
+ "shortDescription" : "",
137
+ "shape" : "[1, 24, 70, 1024]",
138
+ "name" : "cache_channel",
139
+ "type" : "MultiArray"
140
+ },
141
+ {
142
+ "hasShapeFlexibility" : "0",
143
+ "isOptional" : "0",
144
+ "dataType" : "Float32",
145
+ "formattedType" : "MultiArray (Float32 1 × 24 × 1024 × 8)",
146
+ "shortDescription" : "",
147
+ "shape" : "[1, 24, 1024, 8]",
148
+ "name" : "cache_time",
149
+ "type" : "MultiArray"
150
+ },
151
+ {
152
+ "hasShapeFlexibility" : "0",
153
+ "isOptional" : "0",
154
+ "dataType" : "Int32",
155
+ "formattedType" : "MultiArray (Int32 1)",
156
+ "shortDescription" : "",
157
+ "shape" : "[1]",
158
+ "name" : "cache_len",
159
+ "type" : "MultiArray"
160
+ }
161
+ ],
162
+ "userDefinedMetadata" : {
163
+ "com.github.apple.coremltools.conversion_date" : "2026-01-11",
164
+ "com.github.apple.coremltools.source" : "torch==2.9.1",
165
+ "com.github.apple.coremltools.version" : "9.0",
166
+ "com.github.apple.coremltools.source_dialect" : "TorchScript"
167
+ },
168
+ "generatedClassName" : "encoder_int8",
169
+ "method" : "predict"
170
+ }
171
+ ]
models/encoder/encoder_int8.mlmodelc/model.mil ADDED
The diff for this file is too large to render. See raw diff
 
models/encoder/encoder_int8.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3f944f82ea23f5642118b8a660d5969d6ebbb779bb2e9890c0eb46b5042d7e2
3
+ size 591463516
models/encoder/encoder_int8.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e52e534bd7638bdaa3ccf7cfb8625be1aa71de1337954a54f9bd8295b3f9b1d
3
+ size 707354
models/encoder/encoder_int8.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3f944f82ea23f5642118b8a660d5969d6ebbb779bb2e9890c0eb46b5042d7e2
3
+ size 591463516
models/encoder/encoder_int8.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "7F7741C1-C7D8-4BE3-B8EC-4E0951D7E1E7": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Weights",
7
+ "name": "weights",
8
+ "path": "com.apple.CoreML/weights"
9
+ },
10
+ "E4F8549F-5A90-413F-ABFB-09C4684D9BB5": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Specification",
13
+ "name": "model.mlmodel",
14
+ "path": "com.apple.CoreML/model.mlmodel"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "E4F8549F-5A90-413F-ABFB-09C4684D9BB5"
18
+ }
models/joint.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b0c121a00eec47e7b3c60afad3c8237c623ab3ddef1230360bf55615f82e3ca
3
+ size 243
models/joint.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c83c073a351da6afa49aa3c9b8e22d3f62951a01b92a67746c88d23500c24dd
3
+ size 400
models/joint.mlmodelc/metadata.json ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "metadataOutputVersion" : "3.0",
4
+ "storagePrecision" : "Float32",
5
+ "outputSchema" : [
6
+ {
7
+ "hasShapeFlexibility" : "0",
8
+ "isOptional" : "0",
9
+ "dataType" : "Float32",
10
+ "formattedType" : "MultiArray (Float32 1 × 1 × 1 × 1025)",
11
+ "shortDescription" : "",
12
+ "shape" : "[1, 1, 1, 1025]",
13
+ "name" : "logits",
14
+ "type" : "MultiArray"
15
+ }
16
+ ],
17
+ "modelParameters" : [
18
+
19
+ ],
20
+ "specificationVersion" : 8,
21
+ "mlProgramOperationTypeHistogram" : {
22
+ "Ios17.expandDims" : 2,
23
+ "Ios17.transpose" : 2,
24
+ "Ios17.linear" : 3,
25
+ "Ios17.add" : 1,
26
+ "Ios16.relu" : 1
27
+ },
28
+ "computePrecision" : "Mixed (Float32, Int32)",
29
+ "isUpdatable" : "0",
30
+ "stateSchema" : [
31
+
32
+ ],
33
+ "availability" : {
34
+ "macOS" : "14.0",
35
+ "tvOS" : "17.0",
36
+ "visionOS" : "1.0",
37
+ "watchOS" : "10.0",
38
+ "iOS" : "17.0",
39
+ "macCatalyst" : "17.0"
40
+ },
41
+ "modelType" : {
42
+ "name" : "MLModelType_mlProgram"
43
+ },
44
+ "userDefinedMetadata" : {
45
+ "com.github.apple.coremltools.conversion_date" : "2026-01-11",
46
+ "com.github.apple.coremltools.source" : "torch==2.9.1",
47
+ "com.github.apple.coremltools.version" : "9.0",
48
+ "com.github.apple.coremltools.source_dialect" : "TorchScript"
49
+ },
50
+ "inputSchema" : [
51
+ {
52
+ "hasShapeFlexibility" : "0",
53
+ "isOptional" : "0",
54
+ "dataType" : "Float32",
55
+ "formattedType" : "MultiArray (Float32 1 × 1024 × 1)",
56
+ "shortDescription" : "",
57
+ "shape" : "[1, 1024, 1]",
58
+ "name" : "encoder",
59
+ "type" : "MultiArray"
60
+ },
61
+ {
62
+ "hasShapeFlexibility" : "0",
63
+ "isOptional" : "0",
64
+ "dataType" : "Float32",
65
+ "formattedType" : "MultiArray (Float32 1 × 640 × 1)",
66
+ "shortDescription" : "",
67
+ "shape" : "[1, 640, 1]",
68
+ "name" : "decoder",
69
+ "type" : "MultiArray"
70
+ }
71
+ ],
72
+ "generatedClassName" : "joint",
73
+ "method" : "predict"
74
+ }
75
+ ]
models/joint.mlmodelc/model.mil ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ program(1.0)
2
+ [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.9.1"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "9.0"}})]
3
+ {
4
+ func main<ios17>(tensor<fp32, [1, 640, 1]> decoder, tensor<fp32, [1, 1024, 1]> encoder) {
5
+ tensor<fp32, [640]> module_enc_bias = const()[name = tensor<string, []>("module_enc_bias"), val = tensor<fp32, [640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
6
+ tensor<fp32, [640, 1024]> module_enc_weight = const()[name = tensor<string, []>("module_enc_weight"), val = tensor<fp32, [640, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(2688)))];
7
+ tensor<fp32, [640]> module_pred_bias = const()[name = tensor<string, []>("module_pred_bias"), val = tensor<fp32, [640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(2624192)))];
8
+ tensor<fp32, [640, 640]> module_pred_weight = const()[name = tensor<string, []>("module_pred_weight"), val = tensor<fp32, [640, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(2626816)))];
9
+ tensor<fp32, [1025]> module_joint_net_2_bias = const()[name = tensor<string, []>("module_joint_net_2_bias"), val = tensor<fp32, [1025]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4265280)))];
10
+ tensor<fp32, [1025, 640]> module_joint_net_2_weight = const()[name = tensor<string, []>("module_joint_net_2_weight"), val = tensor<fp32, [1025, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4269504)))];
11
+ tensor<int32, [3]> input_1_perm_0 = const()[name = tensor<string, []>("input_1_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
12
+ tensor<int32, [3]> input_3_perm_0 = const()[name = tensor<string, []>("input_3_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
13
+ tensor<fp32, [1, 1, 1024]> input_1 = transpose(perm = input_1_perm_0, x = encoder)[name = tensor<string, []>("transpose_1")];
14
+ tensor<fp32, [1, 1, 640]> enc_proj = linear(bias = module_enc_bias, weight = module_enc_weight, x = input_1)[name = tensor<string, []>("linear_0")];
15
+ tensor<fp32, [1, 1, 640]> input_3 = transpose(perm = input_3_perm_0, x = decoder)[name = tensor<string, []>("transpose_0")];
16
+ tensor<fp32, [1, 1, 640]> dec_proj = linear(bias = module_pred_bias, weight = module_pred_weight, x = input_3)[name = tensor<string, []>("linear_1")];
17
+ tensor<int32, [1]> var_23_axes_0 = const()[name = tensor<string, []>("op_23_axes_0"), val = tensor<int32, [1]>([2])];
18
+ tensor<fp32, [1, 1, 1, 640]> var_23 = expand_dims(axes = var_23_axes_0, x = enc_proj)[name = tensor<string, []>("op_23")];
19
+ tensor<int32, [1]> var_25_axes_0 = const()[name = tensor<string, []>("op_25_axes_0"), val = tensor<int32, [1]>([1])];
20
+ tensor<fp32, [1, 1, 1, 640]> var_25 = expand_dims(axes = var_25_axes_0, x = dec_proj)[name = tensor<string, []>("op_25")];
21
+ tensor<fp32, [1, 1, 1, 640]> input_5 = add(x = var_23, y = var_25)[name = tensor<string, []>("input_5")];
22
+ tensor<fp32, [1, 1, 1, 640]> input_7 = relu(x = input_5)[name = tensor<string, []>("input_7")];
23
+ tensor<fp32, [1, 1, 1, 1025]> logits = linear(bias = module_joint_net_2_bias, weight = module_joint_net_2_weight, x = input_7)[name = tensor<string, []>("linear_2")];
24
+ } -> (logits);
25
+ }
models/joint.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a578e03371ba424fc0c426b34857d4f8646020bf60d8a329c759fe36e430cf1
3
+ size 6893568
models/joint.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3d03b858b4ae47ce1b6afdc775be8bf8c649dccc7836d6a1db3dc4a606ff400
3
+ size 3326
models/joint.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a578e03371ba424fc0c426b34857d4f8646020bf60d8a329c759fe36e430cf1
3
+ size 6893568
models/joint.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "37CC94B4-E114-47FE-9862-7A63DD114FF2": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Specification",
7
+ "name": "model.mlmodel",
8
+ "path": "com.apple.CoreML/model.mlmodel"
9
+ },
10
+ "A81F6D51-BC94-457E-AA1C-CC93B9D57D96": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Weights",
13
+ "name": "weights",
14
+ "path": "com.apple.CoreML/weights"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "37CC94B4-E114-47FE-9862-7A63DD114FF2"
18
+ }
models/metadata.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "nvidia/nemotron-speech-streaming-en-0.6b",
3
+ "sample_rate": 16000,
4
+ "chunk_mel_frames": 112,
5
+ "pre_encode_cache": 9,
6
+ "total_mel_frames": 121,
7
+ "vocab_size": 1024,
8
+ "blank_idx": 1024,
9
+ "cache_channel_shape": [
10
+ 1,
11
+ 24,
12
+ 70,
13
+ 1024
14
+ ],
15
+ "cache_time_shape": [
16
+ 1,
17
+ 24,
18
+ 1024,
19
+ 8
20
+ ],
21
+ "decoder_hidden": 640,
22
+ "decoder_layers": 2
23
+ }
models/preprocessor.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8ab0fe255177fdc29f9a59582bfc1d328d26da8e40717f1e3fa14e90b814419
3
+ size 243
models/preprocessor.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c0a4b8a4452288e4dcc4a7d59eca80470641eae784a4f7ad2228785b53a07b7
3
+ size 430
models/preprocessor.mlmodelc/metadata.json ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "metadataOutputVersion" : "3.0",
4
+ "storagePrecision" : "Float32",
5
+ "outputSchema" : [
6
+ {
7
+ "hasShapeFlexibility" : "0",
8
+ "isOptional" : "0",
9
+ "dataType" : "Float32",
10
+ "formattedType" : "MultiArray (Float32)",
11
+ "shortDescription" : "",
12
+ "shape" : "[]",
13
+ "name" : "mel",
14
+ "type" : "MultiArray"
15
+ },
16
+ {
17
+ "hasShapeFlexibility" : "0",
18
+ "isOptional" : "0",
19
+ "dataType" : "Int32",
20
+ "formattedType" : "MultiArray (Int32 1)",
21
+ "shortDescription" : "",
22
+ "shape" : "[1]",
23
+ "name" : "mel_length",
24
+ "type" : "MultiArray"
25
+ }
26
+ ],
27
+ "modelParameters" : [
28
+
29
+ ],
30
+ "specificationVersion" : 8,
31
+ "mlProgramOperationTypeHistogram" : {
32
+ "Range1d" : 2,
33
+ "Ios17.equal" : 1,
34
+ "Ios17.reshape" : 2,
35
+ "Identity" : 1,
36
+ "Ios17.matmul" : 1,
37
+ "Select" : 3,
38
+ "Ios17.expandDims" : 7,
39
+ "Ios17.add" : 2,
40
+ "Ios17.sliceByIndex" : 3,
41
+ "Ios16.reduceSum" : 1,
42
+ "Shape" : 2,
43
+ "Ios17.gather" : 2,
44
+ "Ios17.logicalNot" : 1,
45
+ "Pad" : 1,
46
+ "Ios17.log" : 1,
47
+ "Ios17.less" : 1,
48
+ "Ios17.sub" : 2,
49
+ "Ios17.conv" : 2,
50
+ "Ios17.pow" : 1,
51
+ "Ios17.concat" : 1,
52
+ "Stack" : 1,
53
+ "Ios17.floorDiv" : 1,
54
+ "Ios17.greaterEqual" : 1,
55
+ "Ios17.mul" : 1
56
+ },
57
+ "computePrecision" : "Mixed (Float32, Int32)",
58
+ "isUpdatable" : "0",
59
+ "stateSchema" : [
60
+
61
+ ],
62
+ "availability" : {
63
+ "macOS" : "14.0",
64
+ "tvOS" : "17.0",
65
+ "visionOS" : "1.0",
66
+ "watchOS" : "10.0",
67
+ "iOS" : "17.0",
68
+ "macCatalyst" : "17.0"
69
+ },
70
+ "modelType" : {
71
+ "name" : "MLModelType_mlProgram"
72
+ },
73
+ "userDefinedMetadata" : {
74
+ "com.github.apple.coremltools.conversion_date" : "2026-01-11",
75
+ "com.github.apple.coremltools.source" : "torch==2.9.1",
76
+ "com.github.apple.coremltools.version" : "9.0",
77
+ "com.github.apple.coremltools.source_dialect" : "TorchScript"
78
+ },
79
+ "inputSchema" : [
80
+ {
81
+ "dataType" : "Float32",
82
+ "hasShapeFlexibility" : "1",
83
+ "isOptional" : "0",
84
+ "shapeFlexibility" : "1 × 1...480000",
85
+ "shapeRange" : "[[1, 1], [1, 480000]]",
86
+ "formattedType" : "MultiArray (Float32 1 × 1)",
87
+ "type" : "MultiArray",
88
+ "shape" : "[1, 1]",
89
+ "name" : "audio",
90
+ "shortDescription" : ""
91
+ },
92
+ {
93
+ "hasShapeFlexibility" : "0",
94
+ "isOptional" : "0",
95
+ "dataType" : "Int32",
96
+ "formattedType" : "MultiArray (Int32 1)",
97
+ "shortDescription" : "",
98
+ "shape" : "[1]",
99
+ "name" : "audio_length",
100
+ "type" : "MultiArray"
101
+ }
102
+ ],
103
+ "generatedClassName" : "preprocessor",
104
+ "method" : "predict"
105
+ }
106
+ ]
models/preprocessor.mlmodelc/model.mil ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ program(1.0)
2
+ [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.9.1"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "9.0"}})]
3
+ {
4
+ func main<ios17>(tensor<fp32, [1, ?]> audio, tensor<int32, [1]> audio_length) [FlexibleShapeInformation = tuple<tuple<tensor<string, []>, dict<tensor<string, []>, tensor<int32, [?]>>>, tuple<tensor<string, []>, dict<tensor<string, []>, list<tensor<int32, [2]>, ?>>>>((("DefaultShapes", {{"audio", [1, 1]}}), ("RangeDims", {{"audio", [[1, 1], [1, 480000]]}})))] {
5
+ tensor<int32, []> var_9 = const()[name = tensor<string, []>("op_9"), val = tensor<int32, []>(1)];
6
+ tensor<int32, []> var_10 = const()[name = tensor<string, []>("op_10"), val = tensor<int32, []>(160)];
7
+ tensor<int32, []> var_12 = const()[name = tensor<string, []>("op_12"), val = tensor<int32, []>(0)];
8
+ tensor<fp32, []> var_16 = const()[name = tensor<string, []>("op_16"), val = tensor<fp32, []>(0x0p+0)];
9
+ tensor<int32, []> var_33 = const()[name = tensor<string, []>("op_33"), val = tensor<int32, []>(512)];
10
+ tensor<int32, [1]> var_34 = add(x = audio_length, y = var_33)[name = tensor<string, []>("op_34")];
11
+ tensor<int32, []> var_35 = const()[name = tensor<string, []>("op_35"), val = tensor<int32, []>(512)];
12
+ tensor<int32, [1]> var_36 = sub(x = var_34, y = var_35)[name = tensor<string, []>("op_36")];
13
+ tensor<int32, [1]> floor_div_0 = floor_div(x = var_36, y = var_10)[name = tensor<string, []>("floor_div_0")];
14
+ tensor<bool, [1]> var_39 = equal(x = audio_length, y = var_12)[name = tensor<string, []>("op_39")];
15
+ tensor<int32, [1]> var_40 = const()[name = tensor<string, []>("op_40"), val = tensor<int32, [1]>([0])];
16
+ tensor<int32, [1]> mel_length = select(a = var_40, b = floor_div_0, cond = var_39)[name = tensor<string, []>("seq_len")];
17
+ tensor<int32, [2]> var_42_shape = shape(x = audio)[name = tensor<string, []>("op_42_shape")];
18
+ tensor<int32, []> gather_0_batch_dims_0 = const()[name = tensor<string, []>("gather_0_batch_dims_0"), val = tensor<int32, []>(0)];
19
+ tensor<bool, []> gather_0_validate_indices_0 = const()[name = tensor<string, []>("gather_0_validate_indices_0"), val = tensor<bool, []>(false)];
20
+ tensor<int32, []> select_0 = const()[name = tensor<string, []>("select_0"), val = tensor<int32, []>(1)];
21
+ tensor<int32, []> gather_0_axis_1 = const()[name = tensor<string, []>("gather_0_axis_1"), val = tensor<int32, []>(0)];
22
+ tensor<int32, []> gather_0 = gather(axis = gather_0_axis_1, batch_dims = gather_0_batch_dims_0, indices = select_0, validate_indices = gather_0_validate_indices_0, x = var_42_shape)[name = tensor<string, []>("gather_0")];
23
+ tensor<int32, []> const_0 = const()[name = tensor<string, []>("const_0"), val = tensor<int32, []>(0)];
24
+ tensor<int32, []> const_1 = const()[name = tensor<string, []>("const_1"), val = tensor<int32, []>(1)];
25
+ tensor<int32, [?]> var_43 = range_1d(end = gather_0, start = const_0, step = const_1)[name = tensor<string, []>("op_43")];
26
+ tensor<int32, [1]> var_44_axes_0 = const()[name = tensor<string, []>("op_44_axes_0"), val = tensor<int32, [1]>([0])];
27
+ tensor<int32, [1, ?]> var_44 = expand_dims(axes = var_44_axes_0, x = var_43)[name = tensor<string, []>("op_44")];
28
+ tensor<int32, [1]> var_45_axes_0 = const()[name = tensor<string, []>("op_45_axes_0"), val = tensor<int32, [1]>([1])];
29
+ tensor<int32, [1, 1]> var_45 = expand_dims(axes = var_45_axes_0, x = audio_length)[name = tensor<string, []>("op_45")];
30
+ tensor<bool, [1, ?]> timemask = less(x = var_44, y = var_45)[name = tensor<string, []>("timemask")];
31
+ tensor<int32, [2]> var_48_begin_0 = const()[name = tensor<string, []>("op_48_begin_0"), val = tensor<int32, [2]>([0, 0])];
32
+ tensor<int32, [2]> var_48_end_0 = const()[name = tensor<string, []>("op_48_end_0"), val = tensor<int32, [2]>([1, 1])];
33
+ tensor<bool, [2]> var_48_end_mask_0 = const()[name = tensor<string, []>("op_48_end_mask_0"), val = tensor<bool, [2]>([true, false])];
34
+ tensor<bool, [2]> var_48_squeeze_mask_0 = const()[name = tensor<string, []>("op_48_squeeze_mask_0"), val = tensor<bool, [2]>([false, true])];
35
+ tensor<fp32, [1]> var_48 = slice_by_index(begin = var_48_begin_0, end = var_48_end_0, end_mask = var_48_end_mask_0, squeeze_mask = var_48_squeeze_mask_0, x = audio)[name = tensor<string, []>("op_48")];
36
+ tensor<int32, [1]> var_49_axes_0 = const()[name = tensor<string, []>("op_49_axes_0"), val = tensor<int32, [1]>([1])];
37
+ tensor<fp32, [1, 1]> var_49 = expand_dims(axes = var_49_axes_0, x = var_48)[name = tensor<string, []>("op_49")];
38
+ tensor<int32, [2]> var_51_begin_0 = const()[name = tensor<string, []>("op_51_begin_0"), val = tensor<int32, [2]>([0, 1])];
39
+ tensor<int32, [2]> var_51_end_0 = const()[name = tensor<string, []>("op_51_end_0"), val = tensor<int32, [2]>([1, 0])];
40
+ tensor<bool, [2]> var_51_end_mask_0 = const()[name = tensor<string, []>("op_51_end_mask_0"), val = tensor<bool, [2]>([true, true])];
41
+ tensor<fp32, [1, ?]> var_51 = slice_by_index(begin = var_51_begin_0, end = var_51_end_0, end_mask = var_51_end_mask_0, x = audio)[name = tensor<string, []>("op_51")];
42
+ tensor<int32, [2]> var_53_begin_0 = const()[name = tensor<string, []>("op_53_begin_0"), val = tensor<int32, [2]>([0, 0])];
43
+ tensor<int32, [2]> var_53_end_0 = const()[name = tensor<string, []>("op_53_end_0"), val = tensor<int32, [2]>([1, -1])];
44
+ tensor<bool, [2]> var_53_end_mask_0 = const()[name = tensor<string, []>("op_53_end_mask_0"), val = tensor<bool, [2]>([true, false])];
45
+ tensor<fp32, [1, ?]> var_53 = slice_by_index(begin = var_53_begin_0, end = var_53_end_0, end_mask = var_53_end_mask_0, x = audio)[name = tensor<string, []>("op_53")];
46
+ tensor<fp32, []> var_54 = const()[name = tensor<string, []>("op_54"), val = tensor<fp32, []>(0x1.f0a3d8p-1)];
47
+ tensor<fp32, [1, ?]> var_55 = mul(x = var_53, y = var_54)[name = tensor<string, []>("op_55")];
48
+ tensor<fp32, [1, ?]> var_56 = sub(x = var_51, y = var_55)[name = tensor<string, []>("op_56")];
49
+ tensor<bool, []> x_3_interleave_0 = const()[name = tensor<string, []>("x_3_interleave_0"), val = tensor<bool, []>(false)];
50
+ tensor<fp32, [1, ?]> x_3 = concat(axis = var_9, interleave = x_3_interleave_0, values = (var_49, var_56))[name = tensor<string, []>("x_3")];
51
+ tensor<bool, [1, ?]> var_59 = logical_not(x = timemask)[name = tensor<string, []>("op_59")];
52
+ tensor<fp32, [1, ?]> input_1 = select(a = var_16, b = x_3, cond = var_59)[name = tensor<string, []>("input_1")];
53
+ tensor<int32, [3]> concat_1x = const()[name = tensor<string, []>("concat_1x"), val = tensor<int32, [3]>([1, 1, -1])];
54
+ tensor<fp32, [1, 1, ?]> input_3 = reshape(shape = concat_1x, x = input_1)[name = tensor<string, []>("input_3")];
55
+ tensor<fp32, []> const_3 = const()[name = tensor<string, []>("const_3"), val = tensor<fp32, []>(0x0p+0)];
56
+ tensor<int32, [6]> input_5_pad_0 = const()[name = tensor<string, []>("input_5_pad_0"), val = tensor<int32, [6]>([0, 0, 0, 0, 256, 256])];
57
+ tensor<string, []> input_5_mode_0 = const()[name = tensor<string, []>("input_5_mode_0"), val = tensor<string, []>("constant")];
58
+ tensor<fp32, [1, 1, ?]> input_5 = pad(constant_val = const_3, mode = input_5_mode_0, pad = input_5_pad_0, x = input_3)[name = tensor<string, []>("input_5")];
59
+ tensor<int32, [2]> concat_2x = const()[name = tensor<string, []>("concat_2x"), val = tensor<int32, [2]>([1, -1])];
60
+ tensor<fp32, [1, ?]> input = reshape(shape = concat_2x, x = input_5)[name = tensor<string, []>("input")];
61
+ tensor<fp32, [257, 1, 512]> expand_dims_1 = const()[name = tensor<string, []>("expand_dims_1"), val = tensor<fp32, [257, 1, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
62
+ tensor<fp32, [257, 1, 512]> expand_dims_2 = const()[name = tensor<string, []>("expand_dims_2"), val = tensor<fp32, [257, 1, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(526464)))];
63
+ tensor<int32, [1]> expand_dims_3 = const()[name = tensor<string, []>("expand_dims_3"), val = tensor<int32, [1]>([160])];
64
+ tensor<int32, [1]> expand_dims_4_axes_0 = const()[name = tensor<string, []>("expand_dims_4_axes_0"), val = tensor<int32, [1]>([1])];
65
+ tensor<fp32, [1, 1, ?]> expand_dims_4 = expand_dims(axes = expand_dims_4_axes_0, x = input)[name = tensor<string, []>("expand_dims_4")];
66
+ tensor<string, []> conv_0_pad_type_0 = const()[name = tensor<string, []>("conv_0_pad_type_0"), val = tensor<string, []>("valid")];
67
+ tensor<int32, [2]> conv_0_pad_0 = const()[name = tensor<string, []>("conv_0_pad_0"), val = tensor<int32, [2]>([0, 0])];
68
+ tensor<int32, [1]> conv_0_dilations_0 = const()[name = tensor<string, []>("conv_0_dilations_0"), val = tensor<int32, [1]>([1])];
69
+ tensor<int32, []> conv_0_groups_0 = const()[name = tensor<string, []>("conv_0_groups_0"), val = tensor<int32, []>(1)];
70
+ tensor<fp32, [1, 257, ?]> conv_0 = conv(dilations = conv_0_dilations_0, groups = conv_0_groups_0, pad = conv_0_pad_0, pad_type = conv_0_pad_type_0, strides = expand_dims_3, weight = expand_dims_1, x = expand_dims_4)[name = tensor<string, []>("conv_0")];
71
+ tensor<string, []> conv_1_pad_type_0 = const()[name = tensor<string, []>("conv_1_pad_type_0"), val = tensor<string, []>("valid")];
72
+ tensor<int32, [2]> conv_1_pad_0 = const()[name = tensor<string, []>("conv_1_pad_0"), val = tensor<int32, [2]>([0, 0])];
73
+ tensor<int32, [1]> conv_1_dilations_0 = const()[name = tensor<string, []>("conv_1_dilations_0"), val = tensor<int32, [1]>([1])];
74
+ tensor<int32, []> conv_1_groups_0 = const()[name = tensor<string, []>("conv_1_groups_0"), val = tensor<int32, []>(1)];
75
+ tensor<fp32, [1, 257, ?]> conv_1 = conv(dilations = conv_1_dilations_0, groups = conv_1_groups_0, pad = conv_1_pad_0, pad_type = conv_1_pad_type_0, strides = expand_dims_3, weight = expand_dims_2, x = expand_dims_4)[name = tensor<string, []>("conv_1")];
76
+ tensor<int32, []> stack_0_axis_0 = const()[name = tensor<string, []>("stack_0_axis_0"), val = tensor<int32, []>(-1)];
77
+ tensor<fp32, [1, 257, ?, 2]> stack_0 = stack(axis = stack_0_axis_0, values = (conv_0, conv_1))[name = tensor<string, []>("stack_0")];
78
+ tensor<fp32, []> var_19_promoted = const()[name = tensor<string, []>("op_19_promoted"), val = tensor<fp32, []>(0x1p+1)];
79
+ tensor<fp32, [1, 257, ?, 2]> var_74 = pow(x = stack_0, y = var_19_promoted)[name = tensor<string, []>("op_74")];
80
+ tensor<int32, [1]> var_76_axes_0 = const()[name = tensor<string, []>("op_76_axes_0"), val = tensor<int32, [1]>([-1])];
81
+ tensor<bool, []> var_76_keep_dims_0 = const()[name = tensor<string, []>("op_76_keep_dims_0"), val = tensor<bool, []>(false)];
82
+ tensor<fp32, [1, 257, ?]> var_76 = reduce_sum(axes = var_76_axes_0, keep_dims = var_76_keep_dims_0, x = var_74)[name = tensor<string, []>("op_76")];
83
+ tensor<fp32, [1, 257, ?]> x_11 = identity(x = var_76)[name = tensor<string, []>("x_11")];
84
+ tensor<fp32, [1, 128, 257]> const_4 = const()[name = tensor<string, []>("const_4"), val = tensor<fp32, [1, 128, 257]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1052864)))];
85
+ tensor<bool, []> x_13_transpose_x_0 = const()[name = tensor<string, []>("x_13_transpose_x_0"), val = tensor<bool, []>(false)];
86
+ tensor<bool, []> x_13_transpose_y_0 = const()[name = tensor<string, []>("x_13_transpose_y_0"), val = tensor<bool, []>(false)];
87
+ tensor<fp32, [1, 128, ?]> x_13 = matmul(transpose_x = x_13_transpose_x_0, transpose_y = x_13_transpose_y_0, x = const_4, y = x_11)[name = tensor<string, []>("x_13")];
88
+ tensor<fp32, []> var_83 = const()[name = tensor<string, []>("op_83"), val = tensor<fp32, []>(0x1p-24)];
89
+ tensor<fp32, [1, 128, ?]> var_84 = add(x = x_13, y = var_83)[name = tensor<string, []>("op_84")];
90
+ tensor<fp32, []> x_epsilon_0 = const()[name = tensor<string, []>("x_epsilon_0"), val = tensor<fp32, []>(0x1p-149)];
91
+ tensor<fp32, [1, 128, ?]> x = log(epsilon = x_epsilon_0, x = var_84)[name = tensor<string, []>("x")];
92
+ tensor<int32, [3]> var_86_shape = shape(x = x)[name = tensor<string, []>("op_86_shape")];
93
+ tensor<int32, []> gather_5_batch_dims_0 = const()[name = tensor<string, []>("gather_5_batch_dims_0"), val = tensor<int32, []>(0)];
94
+ tensor<bool, []> gather_5_validate_indices_0 = const()[name = tensor<string, []>("gather_5_validate_indices_0"), val = tensor<bool, []>(false)];
95
+ tensor<int32, []> select_3 = const()[name = tensor<string, []>("select_3"), val = tensor<int32, []>(2)];
96
+ tensor<int32, []> gather_5_axis_1 = const()[name = tensor<string, []>("gather_5_axis_1"), val = tensor<int32, []>(0)];
97
+ tensor<int32, []> gather_5 = gather(axis = gather_5_axis_1, batch_dims = gather_5_batch_dims_0, indices = select_3, validate_indices = gather_5_validate_indices_0, x = var_86_shape)[name = tensor<string, []>("gather_5")];
98
+ tensor<int32, []> const_5 = const()[name = tensor<string, []>("const_5"), val = tensor<int32, []>(0)];
99
+ tensor<int32, []> const_6 = const()[name = tensor<string, []>("const_6"), val = tensor<int32, []>(1)];
100
+ tensor<int32, [?]> mask_1 = range_1d(end = gather_5, start = const_5, step = const_6)[name = tensor<string, []>("mask_1")];
101
+ tensor<int32, [1]> expand_dims_0_axes_0 = const()[name = tensor<string, []>("expand_dims_0_axes_0"), val = tensor<int32, [1]>([0])];
102
+ tensor<int32, [1, ?]> expand_dims_0 = expand_dims(axes = expand_dims_0_axes_0, x = mask_1)[name = tensor<string, []>("expand_dims_0")];
103
+ tensor<int32, [1]> var_91_axes_0 = const()[name = tensor<string, []>("op_91_axes_0"), val = tensor<int32, [1]>([1])];
104
+ tensor<int32, [1, 1]> var_91 = expand_dims(axes = var_91_axes_0, x = mel_length)[name = tensor<string, []>("op_91")];
105
+ tensor<bool, [1, ?]> mask = greater_equal(x = expand_dims_0, y = var_91)[name = tensor<string, []>("mask")];
106
+ tensor<int32, [1]> var_93_axes_0 = const()[name = tensor<string, []>("op_93_axes_0"), val = tensor<int32, [1]>([1])];
107
+ tensor<bool, [1, 1, ?]> var_93 = expand_dims(axes = var_93_axes_0, x = mask)[name = tensor<string, []>("op_93")];
108
+ tensor<fp32, [1, 128, ?]> mel = select(a = var_16, b = x, cond = var_93)[name = tensor<string, []>("processed_signal")];
109
+ } -> (mel, mel_length);
110
+ }
models/preprocessor.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eff8082d1cc59b4aeaf963d61fa982f84e805554ede7506aed89d9dfd0d2549e
3
+ size 1184512
models/preprocessor.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6be7b66a9ff7e469719957fd58676fa5a5f8c432f67638ea24e756ec34b97e4
3
+ size 12961
models/preprocessor.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eff8082d1cc59b4aeaf963d61fa982f84e805554ede7506aed89d9dfd0d2549e
3
+ size 1184512
models/preprocessor.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "A4922046-212C-4752-B1A4-F82AFD0BE152": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Weights",
7
+ "name": "weights",
8
+ "path": "com.apple.CoreML/weights"
9
+ },
10
+ "C2E826E5-D793-4300-AA2D-A7E743CF5F83": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Specification",
13
+ "name": "model.mlmodel",
14
+ "path": "com.apple.CoreML/model.mlmodel"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "C2E826E5-D793-4300-AA2D-A7E743CF5F83"
18
+ }
models/tokenizer.json ADDED
@@ -0,0 +1,1026 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "0": "<unk>",
3
+ "1": "\u2581t",
4
+ "2": "\u2581th",
5
+ "3": "\u2581a",
6
+ "4": "in",
7
+ "5": "\u2581the",
8
+ "6": "re",
9
+ "7": "\u2581w",
10
+ "8": "\u2581o",
11
+ "9": "\u2581s",
12
+ "10": "er",
13
+ "11": "at",
14
+ "12": "ou",
15
+ "13": "nd",
16
+ "14": "it",
17
+ "15": "is",
18
+ "16": "\u2581h",
19
+ "17": "\u2581b",
20
+ "18": "on",
21
+ "19": "\u2581c",
22
+ "20": "ing",
23
+ "21": "en",
24
+ "22": "\u2581to",
25
+ "23": "\u2581m",
26
+ "24": "\u2581f",
27
+ "25": "\u2581p",
28
+ "26": "or",
29
+ "27": "an",
30
+ "28": "es",
31
+ "29": "\u2581of",
32
+ "30": "\u2581d",
33
+ "31": "ed",
34
+ "32": "ll",
35
+ "33": "\u2581and",
36
+ "34": "\u2581I",
37
+ "35": "\u2581in",
38
+ "36": "\u2581l",
39
+ "37": "ar",
40
+ "38": "\u2581y",
41
+ "39": "\u2581g",
42
+ "40": "as",
43
+ "41": "\u2581you",
44
+ "42": "om",
45
+ "43": "\u2581n",
46
+ "44": "ic",
47
+ "45": "ve",
48
+ "46": "al",
49
+ "47": "ion",
50
+ "48": "us",
51
+ "49": "\u2581be",
52
+ "50": "ow",
53
+ "51": "le",
54
+ "52": "\u2581wh",
55
+ "53": "\u2581e",
56
+ "54": "ot",
57
+ "55": "ut",
58
+ "56": "\u2581it",
59
+ "57": "\u2581is",
60
+ "58": "\u2581we",
61
+ "59": "\u2581T",
62
+ "60": "\u2581re",
63
+ "61": "et",
64
+ "62": "\u2581A",
65
+ "63": "ent",
66
+ "64": "\u2581on",
67
+ "65": "\u2581ha",
68
+ "66": "ay",
69
+ "67": "\u2581S",
70
+ "68": "ct",
71
+ "69": "\u2581Th",
72
+ "70": "ver",
73
+ "71": "id",
74
+ "72": "ig",
75
+ "73": "im",
76
+ "74": "ro",
77
+ "75": "\u2581for",
78
+ "76": "ly",
79
+ "77": "\u2581he",
80
+ "78": "ke",
81
+ "79": "ld",
82
+ "80": "se",
83
+ "81": "st",
84
+ "82": "ch",
85
+ "83": "\u2581st",
86
+ "84": "all",
87
+ "85": "ce",
88
+ "86": "ur",
89
+ "87": "ith",
90
+ "88": "am",
91
+ "89": "if",
92
+ "90": "ir",
93
+ "91": "\u2581go",
94
+ "92": "\u2581u",
95
+ "93": "\u2581as",
96
+ "94": "\u2581was",
97
+ "95": "ad",
98
+ "96": "\u2581W",
99
+ "97": "\u2581k",
100
+ "98": "\u2581an",
101
+ "99": "ht",
102
+ "100": "th",
103
+ "101": "\u2581r",
104
+ "102": "\u2581are",
105
+ "103": "ere",
106
+ "104": "\u2581se",
107
+ "105": "\u2581do",
108
+ "106": "\u2581B",
109
+ "107": "\u2581so",
110
+ "108": "\u2581sh",
111
+ "109": "\u2581not",
112
+ "110": "\u2581li",
113
+ "111": "od",
114
+ "112": "\u2581C",
115
+ "113": "ust",
116
+ "114": "ill",
117
+ "115": "ight",
118
+ "116": "ally",
119
+ "117": "\u2581And",
120
+ "118": "ter",
121
+ "119": "\u2581or",
122
+ "120": "\u2581me",
123
+ "121": "\u2581M",
124
+ "122": "ome",
125
+ "123": "op",
126
+ "124": "\u2581at",
127
+ "125": "il",
128
+ "126": "\u2581The",
129
+ "127": "ould",
130
+ "128": "\u2581j",
131
+ "129": "ant",
132
+ "130": "\u2581So",
133
+ "131": "\u2581H",
134
+ "132": "ol",
135
+ "133": "ain",
136
+ "134": "\u2581can",
137
+ "135": "\u2581de",
138
+ "136": "\u2581ne",
139
+ "137": "ore",
140
+ "138": "\u2581con",
141
+ "139": "\u2581kn",
142
+ "140": "ck",
143
+ "141": "ul",
144
+ "142": "\u2581fr",
145
+ "143": "\u2581ab",
146
+ "144": "ers",
147
+ "145": "ess",
148
+ "146": "ge",
149
+ "147": "\u2581pro",
150
+ "148": "pe",
151
+ "149": "ate",
152
+ "150": "\u2581su",
153
+ "151": "\u2581com",
154
+ "152": "\u2581but",
155
+ "153": "\u2581all",
156
+ "154": "est",
157
+ "155": "qu",
158
+ "156": "\u2581ex",
159
+ "157": "\u2581al",
160
+ "158": "ra",
161
+ "159": "\u2581O",
162
+ "160": "out",
163
+ "161": "use",
164
+ "162": "very",
165
+ "163": "pp",
166
+ "164": "\u2581Y",
167
+ "165": "\u2581ch",
168
+ "166": "ri",
169
+ "167": "ist",
170
+ "168": "\u2581v",
171
+ "169": "\u2581lo",
172
+ "170": "ment",
173
+ "171": "art",
174
+ "172": "\u2581P",
175
+ "173": "nt",
176
+ "174": "ab",
177
+ "175": "\u2581one",
178
+ "176": "\u2581N",
179
+ "177": "ive",
180
+ "178": "\u2581wor",
181
+ "179": "ions",
182
+ "180": "ort",
183
+ "181": "\u2581L",
184
+ "182": "\u2581by",
185
+ "183": "ich",
186
+ "184": "\u2581my",
187
+ "185": "ity",
188
+ "186": "ok",
189
+ "187": "\u2581G",
190
+ "188": "res",
191
+ "189": "\u2581up",
192
+ "190": "un",
193
+ "191": "um",
194
+ "192": "ea",
195
+ "193": "ind",
196
+ "194": "and",
197
+ "195": "ink",
198
+ "196": "el",
199
+ "197": "\u2581D",
200
+ "198": "em",
201
+ "199": "\u2581E",
202
+ "200": "os",
203
+ "201": "oug",
204
+ "202": "\u2581if",
205
+ "203": "ca",
206
+ "204": "\u2581out",
207
+ "205": "\u2581int",
208
+ "206": "ie",
209
+ "207": "\u2581F",
210
+ "208": "\u2581It",
211
+ "209": "\u2581his",
212
+ "210": "ard",
213
+ "211": "\u2581had",
214
+ "212": "\u2581tr",
215
+ "213": "her",
216
+ "214": "our",
217
+ "215": "ies",
218
+ "216": "ake",
219
+ "217": "\u2581R",
220
+ "218": "\u2581We",
221
+ "219": "\u2581get",
222
+ "220": "\u2581don",
223
+ "221": "\u2581us",
224
+ "222": "ak",
225
+ "223": "\u2581pl",
226
+ "224": "ect",
227
+ "225": "ure",
228
+ "226": "ame",
229
+ "227": "ast",
230
+ "228": "\u2581who",
231
+ "229": "ack",
232
+ "230": "\u2581le",
233
+ "231": "\u2581sa",
234
+ "232": "iv",
235
+ "233": "ci",
236
+ "234": "ide",
237
+ "235": "\u2581tim",
238
+ "236": "\u2581our",
239
+ "237": "ound",
240
+ "238": "ous",
241
+ "239": "\u2581co",
242
+ "240": "\u2581pe",
243
+ "241": "ose",
244
+ "242": "ud",
245
+ "243": "\u2581see",
246
+ "244": "ough",
247
+ "245": "\u2581man",
248
+ "246": "\u2581qu",
249
+ "247": "\u2581You",
250
+ "248": "so",
251
+ "249": "ople",
252
+ "250": "\u2581Wh",
253
+ "251": "ong",
254
+ "252": "ap",
255
+ "253": "ther",
256
+ "254": "\u2581J",
257
+ "255": "are",
258
+ "256": "ine",
259
+ "257": "\u2581say",
260
+ "258": "\u2581im",
261
+ "259": "\u2581But",
262
+ "260": "ings",
263
+ "261": "\u2581has",
264
+ "262": "\u2581ag",
265
+ "263": "ff",
266
+ "264": "\u2581her",
267
+ "265": "itt",
268
+ "266": "one",
269
+ "267": "\u2581en",
270
+ "268": "\u2581ar",
271
+ "269": "\u2581fe",
272
+ "270": "ven",
273
+ "271": "\u2581any",
274
+ "272": "\u2581mo",
275
+ "273": "reat",
276
+ "274": "ag",
277
+ "275": "\u2581how",
278
+ "276": "\u2581cl",
279
+ "277": "pt",
280
+ "278": "\u2581now",
281
+ "279": "own",
282
+ "280": "ber",
283
+ "281": "\u2581him",
284
+ "282": "\u2581act",
285
+ "283": "hing",
286
+ "284": "ice",
287
+ "285": "\u2581no",
288
+ "286": "ans",
289
+ "287": "iz",
290
+ "288": "\u2581fa",
291
+ "289": "per",
292
+ "290": "pl",
293
+ "291": "\u2581te",
294
+ "292": "\u2581ad",
295
+ "293": "age",
296
+ "294": "ree",
297
+ "295": "\u2581tw",
298
+ "296": "ank",
299
+ "297": "\u2581He",
300
+ "298": "ple",
301
+ "299": "ite",
302
+ "300": "ry",
303
+ "301": "\u2581U",
304
+ "302": "ish",
305
+ "303": "ire",
306
+ "304": "ue",
307
+ "305": "\u2581In",
308
+ "306": "\u2581she",
309
+ "307": "ble",
310
+ "308": "cc",
311
+ "309": "nder",
312
+ "310": "\u2581way",
313
+ "311": "\u2581pr",
314
+ "312": "ear",
315
+ "313": "\u2581did",
316
+ "314": "\u2581po",
317
+ "315": "eah",
318
+ "316": "\u2581un",
319
+ "317": "omet",
320
+ "318": "ence",
321
+ "319": "ep",
322
+ "320": "uch",
323
+ "321": "\u2581sp",
324
+ "322": "ach",
325
+ "323": "og",
326
+ "324": "ance",
327
+ "325": "able",
328
+ "326": "iff",
329
+ "327": "sel",
330
+ "328": "\u2581got",
331
+ "329": "way",
332
+ "330": "\u2581gr",
333
+ "331": "alk",
334
+ "332": "\u2581res",
335
+ "333": "ated",
336
+ "334": "irst",
337
+ "335": "ick",
338
+ "336": "ass",
339
+ "337": "\u2581two",
340
+ "338": "\u2581dis",
341
+ "339": "ord",
342
+ "340": "\u2581pre",
343
+ "341": "ount",
344
+ "342": "ase",
345
+ "343": "ip",
346
+ "344": "ult",
347
+ "345": "ical",
348
+ "346": "orm",
349
+ "347": "ary",
350
+ "348": "ace",
351
+ "349": "\u2581spe",
352
+ "350": "\u2581Ch",
353
+ "351": "\u2581thr",
354
+ "352": "\u2581imp",
355
+ "353": "int",
356
+ "354": "\u2581am",
357
+ "355": "\u2581off",
358
+ "356": "act",
359
+ "357": "ia",
360
+ "358": "\u2581ro",
361
+ "359": "ress",
362
+ "360": "\u2581per",
363
+ "361": "\u2581fo",
364
+ "362": "\u2581br",
365
+ "363": "\u2581K",
366
+ "364": "vel",
367
+ "365": "\u2581gu",
368
+ "366": "\u2581bo",
369
+ "367": "ang",
370
+ "368": "kay",
371
+ "369": "ub",
372
+ "370": "ign",
373
+ "371": "\u2581may",
374
+ "372": "ving",
375
+ "373": "ces",
376
+ "374": "ens",
377
+ "375": "cl",
378
+ "376": "\u2581lot",
379
+ "377": "ru",
380
+ "378": "ade",
381
+ "379": "\u2581bet",
382
+ "380": "\u2581bl",
383
+ "381": "\u2581let",
384
+ "382": "fore",
385
+ "383": "co",
386
+ "384": "ild",
387
+ "385": "ning",
388
+ "386": "xt",
389
+ "387": "ile",
390
+ "388": "ark",
391
+ "389": "self",
392
+ "390": "\u2581app",
393
+ "391": "ory",
394
+ "392": "du",
395
+ "393": "\u2581day",
396
+ "394": "\u2581St",
397
+ "395": "ater",
398
+ "396": "\u2581use",
399
+ "397": "ys",
400
+ "398": "fter",
401
+ "399": "\u2581new",
402
+ "400": "ious",
403
+ "401": "ial",
404
+ "402": "he",
405
+ "403": "wn",
406
+ "404": "ved",
407
+ "405": "red",
408
+ "406": "\u2581fl",
409
+ "407": "iss",
410
+ "408": "ody",
411
+ "409": "form",
412
+ "410": "ian",
413
+ "411": "tain",
414
+ "412": "\u2581bu",
415
+ "413": "\u2581V",
416
+ "414": "\u2581rec",
417
+ "415": "ty",
418
+ "416": "be",
419
+ "417": "\u2581sc",
420
+ "418": "ors",
421
+ "419": "vers",
422
+ "420": "\u2581put",
423
+ "421": "ife",
424
+ "422": "\u2581If",
425
+ "423": "we",
426
+ "424": "te",
427
+ "425": "ject",
428
+ "426": "ath",
429
+ "427": "ting",
430
+ "428": "\u2581rem",
431
+ "429": "\u2581acc",
432
+ "430": "ull",
433
+ "431": "ons",
434
+ "432": "\u2581ind",
435
+ "433": "\u2581ser",
436
+ "434": "\u2581ke",
437
+ "435": "ates",
438
+ "436": "ves",
439
+ "437": "na",
440
+ "438": "lic",
441
+ "439": "\u2581des",
442
+ "440": "\u2581its",
443
+ "441": "ful",
444
+ "442": "ents",
445
+ "443": "erm",
446
+ "444": "ac",
447
+ "445": "ered",
448
+ "446": "ise",
449
+ "447": "\u2581sy",
450
+ "448": "urn",
451
+ "449": "\u2581em",
452
+ "450": "oth",
453
+ "451": "ual",
454
+ "452": "ne",
455
+ "453": "ward",
456
+ "454": "ib",
457
+ "455": "\u2581try",
458
+ "456": "\u2581pos",
459
+ "457": "nds",
460
+ "458": "ft",
461
+ "459": "get",
462
+ "460": "ph",
463
+ "461": "\u2581ob",
464
+ "462": "ady",
465
+ "463": "igh",
466
+ "464": "ood",
467
+ "465": "\u2581rel",
468
+ "466": "\u2581wr",
469
+ "467": "ug",
470
+ "468": "ears",
471
+ "469": "ail",
472
+ "470": "\u2581Now",
473
+ "471": "\u2581bit",
474
+ "472": "ng",
475
+ "473": "\u2581Oh",
476
+ "474": "\u2581hel",
477
+ "475": "ange",
478
+ "476": "\u2581reg",
479
+ "477": "\u2581rep",
480
+ "478": "\u2581bel",
481
+ "479": "\u2581sm",
482
+ "480": "ost",
483
+ "481": "tern",
484
+ "482": "gr",
485
+ "483": "\u2581own",
486
+ "484": "\u2581end",
487
+ "485": "pect",
488
+ "486": "ily",
489
+ "487": "day",
490
+ "488": "ied",
491
+ "489": "ific",
492
+ "490": "ower",
493
+ "491": "\u2581add",
494
+ "492": "cess",
495
+ "493": "ict",
496
+ "494": "ible",
497
+ "495": "\u2581bas",
498
+ "496": "\u2581i",
499
+ "497": "\u2581op",
500
+ "498": "cial",
501
+ "499": "ular",
502
+ "500": "\u2581Be",
503
+ "501": "ced",
504
+ "502": "\u2581too",
505
+ "503": "ks",
506
+ "504": "ew",
507
+ "505": "mer",
508
+ "506": "\u2581ph",
509
+ "507": "ob",
510
+ "508": "==",
511
+ "509": "\u2581la",
512
+ "510": "\u2581set",
513
+ "511": "\u2581min",
514
+ "512": "\u2581sub",
515
+ "513": "\u2581gen",
516
+ "514": "atch",
517
+ "515": "..",
518
+ "516": "\u2581inv",
519
+ "517": "\u2581As",
520
+ "518": "\u2581nat",
521
+ "519": "\u2581sl",
522
+ "520": "\u2581num",
523
+ "521": "av",
524
+ "522": "ways",
525
+ "523": "\u2581God",
526
+ "524": "stem",
527
+ "525": "\u2581ac",
528
+ "526": "\u2581att",
529
+ "527": "\u2581ev",
530
+ "528": "\u2581def",
531
+ "529": "llow",
532
+ "530": "\u2581str",
533
+ "531": "lect",
534
+ "532": "ars",
535
+ "533": "\u2581cr",
536
+ "534": "\u2581Is",
537
+ "535": "olog",
538
+ "536": "les",
539
+ "537": "oy",
540
+ "538": "\u2581ask",
541
+ "539": "\u2581inc",
542
+ "540": "body",
543
+ "541": "\u2581ent",
544
+ "542": "\u2581pol",
545
+ "543": "ness",
546
+ "544": "ix",
547
+ "545": "\u2581why",
548
+ "546": "onna",
549
+ "547": "\u2581ear",
550
+ "548": "\u2581tak",
551
+ "549": "\u2581Un",
552
+ "550": "ited",
553
+ "551": "mun",
554
+ "552": "li",
555
+ "553": "ute",
556
+ "554": "ract",
557
+ "555": "\u2581dec",
558
+ "556": "uro",
559
+ "557": "\u2581mak",
560
+ "558": "\u2581fin",
561
+ "559": "ween",
562
+ "560": "\u2581No",
563
+ "561": "arch",
564
+ "562": "\u2581bec",
565
+ "563": "gan",
566
+ "564": "old",
567
+ "565": "cy",
568
+ "566": "\u2581big",
569
+ "567": "\u2581For",
570
+ "568": "ren",
571
+ "569": "als",
572
+ "570": "und",
573
+ "571": "\u2581Al",
574
+ "572": "\u2581All",
575
+ "573": "ss",
576
+ "574": "ows",
577
+ "575": "\u2581mod",
578
+ "576": "ock",
579
+ "577": "\u2581id",
580
+ "578": "ism",
581
+ "579": "cus",
582
+ "580": "\u2581gl",
583
+ "581": "ably",
584
+ "582": "\u2581ass",
585
+ "583": "\u2581car",
586
+ "584": "ata",
587
+ "585": "ppen",
588
+ "586": "led",
589
+ "587": "\u2581sim",
590
+ "588": "\u2581mon",
591
+ "589": "ics",
592
+ "590": "\u2581giv",
593
+ "591": "cept",
594
+ "592": "\u2581Mr",
595
+ "593": "pan",
596
+ "594": "\u2581pub",
597
+ "595": "\u2581eff",
598
+ "596": "\u2581How",
599
+ "597": "ps",
600
+ "598": "vern",
601
+ "599": "end",
602
+ "600": "hip",
603
+ "601": "iew",
604
+ "602": "ope",
605
+ "603": "\u2581An",
606
+ "604": "\u2581She",
607
+ "605": "\u2581Com",
608
+ "606": "ee",
609
+ "607": "ures",
610
+ "608": "ell",
611
+ "609": "ouse",
612
+ "610": "cond",
613
+ "611": "king",
614
+ "612": "oc",
615
+ "613": "ues",
616
+ "614": "ever",
617
+ "615": "\u2581To",
618
+ "616": "clud",
619
+ "617": "\u2581ins",
620
+ "618": "\u2581exp",
621
+ "619": "\u2581old",
622
+ "620": "\u2581mem",
623
+ "621": "\u2581ref",
624
+ "622": "\u2581tra",
625
+ "623": "\u2581far",
626
+ "624": "ave",
627
+ "625": "rat",
628
+ "626": "\u2581sur",
629
+ "627": "ruct",
630
+ "628": "rib",
631
+ "629": "duct",
632
+ "630": "uff",
633
+ "631": "\u2581met",
634
+ "632": "\u2581sch",
635
+ "633": "ince",
636
+ "634": "\u2581run",
637
+ "635": "ense",
638
+ "636": "\u2581cle",
639
+ "637": "\u2581==",
640
+ "638": "mon",
641
+ "639": "ize",
642
+ "640": "\u2581ord",
643
+ "641": "blem",
644
+ "642": "tin",
645
+ "643": "\u2581Let",
646
+ "644": "ner",
647
+ "645": "ond",
648
+ "646": "its",
649
+ "647": "\u2581cor",
650
+ "648": "land",
651
+ "649": "\u2581cur",
652
+ "650": "\u2581Re",
653
+ "651": "\u2581bus",
654
+ "652": "\u2581uh",
655
+ "653": "air",
656
+ "654": "ote",
657
+ "655": "ants",
658
+ "656": "ason",
659
+ "657": "ric",
660
+ "658": "\u2581el",
661
+ "659": "\u2581cer",
662
+ "660": "nce",
663
+ "661": "\u2581fam",
664
+ "662": "\u2581cap",
665
+ "663": "uck",
666
+ "664": "ool",
667
+ "665": "ried",
668
+ "666": "\u2581cou",
669
+ "667": "\u2581fun",
670
+ "668": "\u2581wom",
671
+ "669": "\u2581hum",
672
+ "670": "\u2581ty",
673
+ "671": "\u2581ap",
674
+ "672": "ike",
675
+ "673": "\u2581few",
676
+ "674": "oney",
677
+ "675": "\u2581inf",
678
+ "676": "ont",
679
+ "677": "ese",
680
+ "678": "ook",
681
+ "679": "gy",
682
+ "680": "uth",
683
+ "681": "ulat",
684
+ "682": "ieve",
685
+ "683": "ized",
686
+ "684": "ross",
687
+ "685": "\u2581ple",
688
+ "686": "\u2581um",
689
+ "687": "\u2581val",
690
+ "688": "\u2581equ",
691
+ "689": "\u2581lea",
692
+ "690": "\u2581lar",
693
+ "691": "ah",
694
+ "692": "eral",
695
+ "693": "\u2581ed",
696
+ "694": "ared",
697
+ "695": "lish",
698
+ "696": "arn",
699
+ "697": "ds",
700
+ "698": "esn",
701
+ "699": "\u2581iss",
702
+ "700": "\u2581ca",
703
+ "701": "ted",
704
+ "702": "ices",
705
+ "703": "\u2581wee",
706
+ "704": "ash",
707
+ "705": "\u2581top",
708
+ "706": "ten",
709
+ "707": "up",
710
+ "708": "ts",
711
+ "709": "gin",
712
+ "710": "con",
713
+ "711": "ari",
714
+ "712": "\u2581opp",
715
+ "713": "osed",
716
+ "714": "\u2581eas",
717
+ "715": "\u2581ext",
718
+ "716": "gg",
719
+ "717": "az",
720
+ "718": "\u2581Fr",
721
+ "719": "ideo",
722
+ "720": "izat",
723
+ "721": "\u2581men",
724
+ "722": "\u2581mom",
725
+ "723": "\u2581ret",
726
+ "724": "tty",
727
+ "725": "rist",
728
+ "726": "\u2581gra",
729
+ "727": "alth",
730
+ "728": "ef",
731
+ "729": "\u2581det",
732
+ "730": "ax",
733
+ "731": "\u2581mat",
734
+ "732": "chn",
735
+ "733": "ern",
736
+ "734": "peri",
737
+ "735": "\u2581bre",
738
+ "736": "\u2581Sh",
739
+ "737": "sw",
740
+ "738": "erat",
741
+ "739": "\u2581sit",
742
+ "740": "ters",
743
+ "741": "ale",
744
+ "742": "man",
745
+ "743": "\u2581sol",
746
+ "744": "ork",
747
+ "745": "\u2581adv",
748
+ "746": "ety",
749
+ "747": "\u2581vis",
750
+ "748": "\u2581med",
751
+ "749": "uc",
752
+ "750": "less",
753
+ "751": "\u2581unt",
754
+ "752": "gram",
755
+ "753": "ets",
756
+ "754": "ists",
757
+ "755": "\u2581ey",
758
+ "756": "\u2581col",
759
+ "757": "imes",
760
+ "758": "\u2581law",
761
+ "759": "\u2581pri",
762
+ "760": "sid",
763
+ "761": "\u2581On",
764
+ "762": "\u2581mot",
765
+ "763": "ield",
766
+ "764": "\u2581Do",
767
+ "765": "\u2581At",
768
+ "766": "ages",
769
+ "767": "amp",
770
+ "768": "\u2581art",
771
+ "769": "miss",
772
+ "770": "\u2581sk",
773
+ "771": "alf",
774
+ "772": "pr",
775
+ "773": "ier",
776
+ "774": "\u2581beh",
777
+ "775": "\u2581Yes",
778
+ "776": "ural",
779
+ "777": "ime",
780
+ "778": "\u2581wa",
781
+ "779": "oks",
782
+ "780": "bers",
783
+ "781": "ger",
784
+ "782": "ient",
785
+ "783": "ries",
786
+ "784": "...",
787
+ "785": "\u2581che",
788
+ "786": "\u2581Br",
789
+ "787": "ird",
790
+ "788": "\u2581Ar",
791
+ "789": "\u2581war",
792
+ "790": "inat",
793
+ "791": "\u2581My",
794
+ "792": "ital",
795
+ "793": "wh",
796
+ "794": "med",
797
+ "795": "\u2581pur",
798
+ "796": "ully",
799
+ "797": "\u2581One",
800
+ "798": "\u2581rat",
801
+ "799": "ines",
802
+ "800": "\u2581Of",
803
+ "801": "io",
804
+ "802": "\u2581loc",
805
+ "803": "ret",
806
+ "804": "ctor",
807
+ "805": "\u2581leg",
808
+ "806": "stit",
809
+ "807": "ined",
810
+ "808": "ught",
811
+ "809": "\u2581dur",
812
+ "810": "\u2581es",
813
+ "811": "vent",
814
+ "812": "aj",
815
+ "813": "\u2581bro",
816
+ "814": "\u2581saw",
817
+ "815": "\u2581sec",
818
+ "816": "ream",
819
+ "817": "\u2581pop",
820
+ "818": "reen",
821
+ "819": "\u2581Ind",
822
+ "820": "els",
823
+ "821": "\u2581yet",
824
+ "822": "ired",
825
+ "823": "\u2581sw",
826
+ "824": "tro",
827
+ "825": "oup",
828
+ "826": "most",
829
+ "827": "pean",
830
+ "828": "eds",
831
+ "829": "ush",
832
+ "830": "oh",
833
+ "831": "\u2581Se",
834
+ "832": "\u2581tea",
835
+ "833": "ann",
836
+ "834": "ilit",
837
+ "835": "err",
838
+ "836": "pend",
839
+ "837": "ton",
840
+ "838": "ased",
841
+ "839": "\u2581aff",
842
+ "840": "\u2581mor",
843
+ "841": "\u2581dra",
844
+ "842": "put",
845
+ "843": "\u2581dr",
846
+ "844": "ins",
847
+ "845": "uat",
848
+ "846": "nect",
849
+ "847": "cri",
850
+ "848": "outh",
851
+ "849": "\u2581ra",
852
+ "850": "\u2581pay",
853
+ "851": "ms",
854
+ "852": "\u2581av",
855
+ "853": "bs",
856
+ "854": "ling",
857
+ "855": "\u2581De",
858
+ "856": "\u2581Or",
859
+ "857": "ove",
860
+ "858": "\u2581Can",
861
+ "859": "\u2581eng",
862
+ "860": "ames",
863
+ "861": "ided",
864
+ "862": "\u2581Go",
865
+ "863": "mitt",
866
+ "864": "ode",
867
+ "865": "\u2581cre",
868
+ "866": "par",
869
+ "867": "ides",
870
+ "868": "pos",
871
+ "869": "\u2581fav",
872
+ "870": "\u2581air",
873
+ "871": "\u2581New",
874
+ "872": "\u2581bad",
875
+ "873": "\u2581six",
876
+ "874": "vat",
877
+ "875": "\u2581pat",
878
+ "876": "not",
879
+ "877": "\u2581di",
880
+ "878": "rop",
881
+ "879": "ral",
882
+ "880": "orn",
883
+ "881": "\u2581par",
884
+ "882": "cing",
885
+ "883": "\u2581aw",
886
+ "884": "orts",
887
+ "885": "ox",
888
+ "886": "\u2581yes",
889
+ "887": "cuss",
890
+ "888": "eng",
891
+ "889": "ives",
892
+ "890": "erms",
893
+ "891": "\u2581job",
894
+ "892": "mand",
895
+ "893": "ying",
896
+ "894": "\u2581occ",
897
+ "895": "aps",
898
+ "896": "ases",
899
+ "897": "\u2581Not",
900
+ "898": "rent",
901
+ "899": "ency",
902
+ "900": "att",
903
+ "901": "ised",
904
+ "902": "vice",
905
+ "903": "\u2581Eng",
906
+ "904": "\u2581est",
907
+ "905": "oked",
908
+ "906": "\u2581Q",
909
+ "907": "iron",
910
+ "908": "idd",
911
+ "909": "me",
912
+ "910": "unch",
913
+ "911": "ane",
914
+ "912": "\u2581z",
915
+ "913": "br",
916
+ "914": "arts",
917
+ "915": "\u2581fat",
918
+ "916": "ery",
919
+ "917": "anks",
920
+ "918": "\u2581jo",
921
+ "919": "\u2581mar",
922
+ "920": "aw",
923
+ "921": "ott",
924
+ "922": "ards",
925
+ "923": "\u2581oh",
926
+ "924": "ians",
927
+ "925": "\u2581sci",
928
+ "926": "row",
929
+ "927": "unt",
930
+ "928": "ury",
931
+ "929": "\u2581abs",
932
+ "930": "ergy",
933
+ "931": "\u2581Z",
934
+ "932": "ump",
935
+ "933": "\u2581Am",
936
+ "934": "ened",
937
+ "935": "angu",
938
+ "936": "\u2581Pro",
939
+ "937": "icat",
940
+ "938": "itch",
941
+ "939": "\u2581dri",
942
+ "940": "iat",
943
+ "941": "\u2581",
944
+ "942": "e",
945
+ "943": "t",
946
+ "944": "o",
947
+ "945": "a",
948
+ "946": "n",
949
+ "947": "i",
950
+ "948": "s",
951
+ "949": "r",
952
+ "950": "h",
953
+ "951": "l",
954
+ "952": "d",
955
+ "953": "u",
956
+ "954": "c",
957
+ "955": "m",
958
+ "956": "y",
959
+ "957": "g",
960
+ "958": "w",
961
+ "959": "f",
962
+ "960": "p",
963
+ "961": ",",
964
+ "962": ".",
965
+ "963": "b",
966
+ "964": "v",
967
+ "965": "k",
968
+ "966": "'",
969
+ "967": "I",
970
+ "968": "T",
971
+ "969": "A",
972
+ "970": "S",
973
+ "971": "x",
974
+ "972": "W",
975
+ "973": "j",
976
+ "974": "C",
977
+ "975": "B",
978
+ "976": "M",
979
+ "977": "?",
980
+ "978": "H",
981
+ "979": "O",
982
+ "980": "0",
983
+ "981": "P",
984
+ "982": "q",
985
+ "983": "Y",
986
+ "984": "N",
987
+ "985": "L",
988
+ "986": "D",
989
+ "987": "1",
990
+ "988": "E",
991
+ "989": "G",
992
+ "990": "z",
993
+ "991": "F",
994
+ "992": "R",
995
+ "993": "-",
996
+ "994": "2",
997
+ "995": "J",
998
+ "996": "U",
999
+ "997": "9",
1000
+ "998": "K",
1001
+ "999": "5",
1002
+ "1000": "3",
1003
+ "1001": "V",
1004
+ "1002": "=",
1005
+ "1003": "4",
1006
+ "1004": "8",
1007
+ "1005": "6",
1008
+ "1006": "7",
1009
+ "1007": "!",
1010
+ "1008": "%",
1011
+ "1009": ":",
1012
+ "1010": "Q",
1013
+ "1011": "Z",
1014
+ "1012": "$",
1015
+ "1013": "X",
1016
+ "1014": "\"",
1017
+ "1015": "&",
1018
+ "1016": "*",
1019
+ "1017": "/",
1020
+ "1018": "\u00a3",
1021
+ "1019": "+",
1022
+ "1020": "\u20ac",
1023
+ "1021": "_",
1024
+ "1022": "^",
1025
+ "1023": "\u00a5"
1026
+ }
pyproject.toml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "nemotron-streaming"
3
+ version = "0.1.0"
4
+ description = "NeMo Nemotron Streaming Reference Implementation"
5
+ requires-python = ">=3.10,<3.11"
6
+ dependencies = [
7
+ "torch>=2.0.0",
8
+ "nemo_toolkit[asr]>=2.0.0",
9
+ "soundfile>=0.12.0",
10
+ "numpy>=1.24.0",
11
+ ]
12
+
13
+ [tool.uv]
14
+ dev-dependencies = []
scripts/benchmark_wer.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ WER Benchmark for Nemotron Streaming 0.6b on LibriSpeech test-clean
4
+ """
5
+ import glob
6
+ import numpy as np
7
+ import soundfile as sf
8
+ import torch
9
+ from pathlib import Path
10
+ import nemo.collections.asr as nemo_asr
11
+ from nemo.collections.asr.parts.utils.streaming_utils import CacheAwareStreamingAudioBuffer
12
+
13
+
14
+ def load_ground_truth(librispeech_path: str) -> dict:
15
+ """Load all ground truth transcriptions."""
16
+ gt = {}
17
+ for trans_file in glob.glob(f"{librispeech_path}/**/*.trans.txt", recursive=True):
18
+ with open(trans_file) as f:
19
+ for line in f:
20
+ parts = line.strip().split(" ", 1)
21
+ if len(parts) == 2:
22
+ file_id, text = parts
23
+ gt[file_id] = text.lower()
24
+ return gt
25
+
26
+
27
+ def normalize_text(text: str) -> str:
28
+ """Normalize text for WER calculation - remove punctuation, lowercase."""
29
+ import re
30
+ text = re.sub(r'[^\w\s]', '', text)
31
+ return ' '.join(text.lower().split())
32
+
33
+
34
+ def compute_wer(reference: str, hypothesis: str) -> tuple:
35
+ """Compute WER between reference and hypothesis."""
36
+ ref_words = normalize_text(reference).split()
37
+ hyp_words = normalize_text(hypothesis).split()
38
+
39
+ d = np.zeros((len(ref_words) + 1, len(hyp_words) + 1), dtype=np.uint32)
40
+ for i in range(len(ref_words) + 1):
41
+ d[i, 0] = i
42
+ for j in range(len(hyp_words) + 1):
43
+ d[0, j] = j
44
+
45
+ for i in range(1, len(ref_words) + 1):
46
+ for j in range(1, len(hyp_words) + 1):
47
+ if ref_words[i-1] == hyp_words[j-1]:
48
+ d[i, j] = d[i-1, j-1]
49
+ else:
50
+ d[i, j] = min(d[i-1, j] + 1, d[i, j-1] + 1, d[i-1, j-1] + 1)
51
+
52
+ errors = d[len(ref_words), len(hyp_words)]
53
+ return errors, len(ref_words)
54
+
55
+
56
+ def calc_drop_extra_pre_encoded(model, step_num, pad_and_drop_preencoded):
57
+ """Calculate drop_extra_pre_encoded value per NVIDIA's reference."""
58
+ if step_num == 0 and not pad_and_drop_preencoded:
59
+ return 0
60
+ return model.encoder.streaming_cfg.drop_extra_pre_encoded
61
+
62
+
63
+ def transcribe_streaming(model, audio: np.ndarray, pad_and_drop_preencoded: bool = False) -> str:
64
+ """Streaming transcription using conformer_stream_step API."""
65
+ model.encoder.setup_streaming_params()
66
+
67
+ streaming_buffer = CacheAwareStreamingAudioBuffer(
68
+ model=model,
69
+ pad_and_drop_preencoded=pad_and_drop_preencoded,
70
+ )
71
+ streaming_buffer.reset_buffer()
72
+ streaming_buffer.append_audio(audio)
73
+
74
+ cache_last_channel, cache_last_time, cache_last_channel_len = \
75
+ model.encoder.get_initial_cache_state(batch_size=1)
76
+
77
+ previous_hypotheses = None
78
+ pred_out_stream = None
79
+ final_text = ""
80
+
81
+ with torch.inference_mode():
82
+ for step_num, (chunk_audio, chunk_lengths) in enumerate(streaming_buffer):
83
+ (
84
+ pred_out_stream,
85
+ transcribed_texts,
86
+ cache_last_channel,
87
+ cache_last_time,
88
+ cache_last_channel_len,
89
+ previous_hypotheses,
90
+ ) = model.conformer_stream_step(
91
+ processed_signal=chunk_audio,
92
+ processed_signal_length=chunk_lengths,
93
+ cache_last_channel=cache_last_channel,
94
+ cache_last_time=cache_last_time,
95
+ cache_last_channel_len=cache_last_channel_len,
96
+ keep_all_outputs=streaming_buffer.is_buffer_empty(),
97
+ previous_hypotheses=previous_hypotheses,
98
+ previous_pred_out=pred_out_stream,
99
+ drop_extra_pre_encoded=calc_drop_extra_pre_encoded(model, step_num, pad_and_drop_preencoded),
100
+ return_transcription=True,
101
+ )
102
+
103
+ if transcribed_texts and len(transcribed_texts) > 0:
104
+ text = transcribed_texts[0]
105
+ if hasattr(text, 'text'):
106
+ final_text = text.text
107
+ else:
108
+ final_text = str(text)
109
+
110
+ return final_text
111
+
112
+
113
+ def main():
114
+ import argparse
115
+ parser = argparse.ArgumentParser()
116
+ parser.add_argument("--num-files", type=int, default=100)
117
+ parser.add_argument("--dataset", type=str, default="datasets/LibriSpeech/test-clean")
118
+ args = parser.parse_args()
119
+
120
+ print("=" * 70)
121
+ print("NEMOTRON STREAMING 0.6B - WER BENCHMARK")
122
+ print("=" * 70)
123
+
124
+ # Load ground truth
125
+ print(f"\nLoading ground truth from {args.dataset}...")
126
+ gt = load_ground_truth(args.dataset)
127
+ print(f"Loaded {len(gt)} transcriptions")
128
+
129
+ # Get audio files
130
+ audio_files = sorted(glob.glob(f"{args.dataset}/**/*.flac", recursive=True))[:args.num_files]
131
+ print(f"Testing on {len(audio_files)} files")
132
+
133
+ # Load model
134
+ print("\nLoading model...")
135
+ model = nemo_asr.models.ASRModel.from_pretrained("nvidia/nemotron-speech-streaming-en-0.6b")
136
+ model.eval()
137
+
138
+ # Streaming transcription
139
+ print("\n[STREAMING MODE]")
140
+ stream_errors = 0
141
+ stream_words = 0
142
+
143
+ for i, audio_path in enumerate(audio_files):
144
+ file_id = Path(audio_path).stem
145
+ print(f" [{i+1}/{len(audio_files)}] {file_id}", end=" ", flush=True)
146
+
147
+ audio, sr = sf.read(audio_path, dtype="float32")
148
+ hyp = transcribe_streaming(model, audio)
149
+
150
+ if file_id in gt:
151
+ errors, words = compute_wer(gt[file_id], hyp)
152
+ stream_errors += errors
153
+ stream_words += words
154
+ current_wer = 100 * stream_errors / stream_words
155
+ print(f"-> {errors} errs, WER so far: {current_wer:.2f}%")
156
+ else:
157
+ print("-> (no ground truth)")
158
+
159
+ stream_wer = 100 * stream_errors / stream_words if stream_words > 0 else 0
160
+
161
+ # Summary
162
+ print("\n" + "=" * 70)
163
+ print("SUMMARY")
164
+ print("=" * 70)
165
+ print(f"Files tested: {len(audio_files)}")
166
+ print(f"Streaming WER: {stream_wer:.2f}%")
167
+ print(f"NVIDIA claimed: 2.31%")
168
+
169
+
170
+ if __name__ == "__main__":
171
+ main()
scripts/convert_nemotron_streaming.py ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Export Nemotron Speech Streaming 0.6B to CoreML.
3
+
4
+ Exports 4 components for streaming RNNT inference:
5
+ 1. Preprocessor: audio → mel
6
+ 2. Encoder: mel + cache → encoded + new_cache
7
+ 3. Decoder: token + state → decoder_out + new_state
8
+ 4. Joint: encoder + decoder → logits
9
+ """
10
+ from __future__ import annotations
11
+
12
+ import json
13
+ from pathlib import Path
14
+ from typing import Dict, Optional, Tuple
15
+
16
+ import coremltools as ct
17
+ import numpy as np
18
+ import torch
19
+ import typer
20
+
21
+ import nemo.collections.asr as nemo_asr
22
+
23
+ from individual_components import (
24
+ DecoderWrapper,
25
+ EncoderStreamingWrapper,
26
+ ExportSettings,
27
+ JointWrapper,
28
+ PreprocessorWrapper,
29
+ _coreml_convert,
30
+ )
31
+
32
+ DEFAULT_MODEL_ID = "nvidia/nemotron-speech-streaming-en-0.6b"
33
+
34
+ # Streaming config from model:
35
+ # chunk_size=[105, 112], pre_encode_cache_size=[0, 9], valid_out_len=14
36
+ CHUNK_MEL_FRAMES = 112
37
+ PRE_ENCODE_CACHE = 9
38
+ TOTAL_MEL_FRAMES = CHUNK_MEL_FRAMES + PRE_ENCODE_CACHE # 121
39
+
40
+
41
+ def _tensor_shape(t: torch.Tensor) -> Tuple[int, ...]:
42
+ return tuple(int(d) for d in t.shape)
43
+
44
+
45
def _parse_cu(name: str) -> ct.ComputeUnit:
    """Map a compute-unit name (case-insensitive) to the CoreML enum.

    NOTE: unrecognised names silently fall back to CPU_ONLY rather than
    raising, so a CLI typo degrades to the safest compute unit.
    """
    key = name.upper()
    if key == "ALL":
        return ct.ComputeUnit.ALL
    if key == "CPU_AND_GPU":
        return ct.ComputeUnit.CPU_AND_GPU
    if key == "CPU_AND_NE":
        return ct.ComputeUnit.CPU_AND_NE
    # "CPU_ONLY" and any unknown string both land here.
    return ct.ComputeUnit.CPU_ONLY
53
+
54
+
55
+ app = typer.Typer(add_completion=False)
56
+
57
+
58
@app.command()
def convert(
    output_dir: Path = typer.Option(Path("nemotron_coreml"), help="Output directory"),
    encoder_cu: str = typer.Option("CPU_AND_NE", help="Encoder compute units"),
    precision: str = typer.Option("FLOAT32", help="FLOAT32 or FLOAT16"),
) -> None:
    """Export Nemotron Streaming to CoreML.

    Produces four ``.mlpackage`` bundles (preprocessor, encoder, decoder,
    joint) plus ``metadata.json`` and ``tokenizer.json`` in ``output_dir``.
    Only the encoder honours ``encoder_cu``; the other three components are
    pinned to CPU_ONLY at conversion time.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    typer.echo("Loading model...")
    model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(DEFAULT_MODEL_ID, map_location="cpu")
    model.eval()

    sample_rate = int(model.cfg.preprocessor.sample_rate)
    encoder = model.encoder
    encoder.setup_streaming_params()

    # Get cache shapes. NeMo hands the caches back layer-first; cache_len is
    # cast to int32 to match the CoreML input signature declared below.
    cache_channel, cache_time, cache_len = encoder.get_initial_cache_state(batch_size=1, device="cpu")
    cache_len = cache_len.to(torch.int32)

    # Transpose to [B, L, ...] for CoreML (EncoderStreamingWrapper transposes
    # back to NeMo's layout internally).
    cache_channel_b = cache_channel.transpose(0, 1)
    cache_time_b = cache_time.transpose(0, 1)

    typer.echo(f"Cache shapes: channel={cache_channel_b.shape}, time={cache_time_b.shape}")

    # Create wrappers (each sub-module switched to eval mode before tracing)
    preprocessor = PreprocessorWrapper(model.preprocessor.eval())
    encoder_streaming = EncoderStreamingWrapper(encoder.eval())
    decoder = DecoderWrapper(model.decoder.eval())
    joint = JointWrapper(model.joint.eval())

    # presumably switches the prediction network to its export-friendly
    # forward path — TODO confirm against NeMo's RNNT decoder implementation
    model.decoder._rnnt_export = True

    settings = ExportSettings(
        output_dir=output_dir,
        compute_units=ct.ComputeUnit.CPU_ONLY,
        deployment_target=ct.target.iOS17,
        compute_precision=ct.precision.FLOAT16 if precision.upper() == "FLOAT16" else ct.precision.FLOAT32,
        max_audio_seconds=30.0,
        max_symbol_steps=1,
        chunk_size_frames=14,
        cache_size=cache_channel.shape[2],
    )

    # === Preprocessor ===
    typer.echo("Exporting preprocessor...")
    max_samples = 30 * sample_rate
    audio = torch.randn(1, max_samples)
    audio_len = torch.tensor([max_samples], dtype=torch.int32)

    traced = torch.jit.trace(preprocessor, (audio, audio_len), strict=False)
    inputs = [
        # Flexible input: anything from 1 sample up to 30 s of audio.
        ct.TensorType(name="audio", shape=(1, ct.RangeDim(1, max_samples)), dtype=np.float32),
        ct.TensorType(name="audio_length", shape=(1,), dtype=np.int32),
    ]
    outputs = [
        ct.TensorType(name="mel", dtype=np.float32),
        ct.TensorType(name="mel_length", dtype=np.int32),
    ]
    mlmodel = _coreml_convert(traced, inputs, outputs, settings, ct.ComputeUnit.CPU_ONLY)
    mlmodel.save(str(output_dir / "preprocessor.mlpackage"))

    # === Encoder (streaming) ===
    # Traced with a fixed-size mel chunk (TOTAL_MEL_FRAMES = chunk + pre-encode
    # cache) so the streaming step always sees the same input shape.
    typer.echo("Exporting encoder...")
    mel_features = int(model.cfg.preprocessor.features)  # 128 for this model
    mel = torch.randn(1, mel_features, TOTAL_MEL_FRAMES)
    mel_len = torch.tensor([TOTAL_MEL_FRAMES], dtype=torch.int32)

    traced = torch.jit.trace(
        encoder_streaming,
        (mel, mel_len, cache_channel_b, cache_time_b, cache_len),
        strict=False
    )
    inputs = [
        ct.TensorType(name="mel", shape=_tensor_shape(mel), dtype=np.float32),
        ct.TensorType(name="mel_length", shape=(1,), dtype=np.int32),
        ct.TensorType(name="cache_channel", shape=_tensor_shape(cache_channel_b), dtype=np.float32),
        ct.TensorType(name="cache_time", shape=_tensor_shape(cache_time_b), dtype=np.float32),
        ct.TensorType(name="cache_len", shape=(1,), dtype=np.int32),
    ]
    outputs = [
        ct.TensorType(name="encoded", dtype=np.float32),
        ct.TensorType(name="encoded_length", dtype=np.int32),
        ct.TensorType(name="cache_channel_out", dtype=np.float32),
        ct.TensorType(name="cache_time_out", dtype=np.float32),
        ct.TensorType(name="cache_len_out", dtype=np.int32),
    ]
    # Only the encoder gets the user-selected compute units (e.g. CPU_AND_NE).
    mlmodel = _coreml_convert(traced, inputs, outputs, settings, _parse_cu(encoder_cu))
    mlmodel.save(str(output_dir / "encoder.mlpackage"))

    # === Decoder ===
    # Traced with a single blank token and zeroed LSTM state.
    typer.echo("Exporting decoder...")
    decoder_hidden = int(model.decoder.pred_hidden)
    decoder_layers = int(model.decoder.pred_rnn_layers)

    targets = torch.tensor([[model.decoder.blank_idx]], dtype=torch.int32)
    target_len = torch.tensor([1], dtype=torch.int32)
    h = torch.zeros(decoder_layers, 1, decoder_hidden)
    c = torch.zeros(decoder_layers, 1, decoder_hidden)

    traced = torch.jit.trace(decoder, (targets, target_len, h, c), strict=False)
    inputs = [
        ct.TensorType(name="token", shape=(1, 1), dtype=np.int32),
        ct.TensorType(name="token_length", shape=(1,), dtype=np.int32),
        ct.TensorType(name="h_in", shape=_tensor_shape(h), dtype=np.float32),
        ct.TensorType(name="c_in", shape=_tensor_shape(c), dtype=np.float32),
    ]
    outputs = [
        ct.TensorType(name="decoder_out", dtype=np.float32),
        ct.TensorType(name="h_out", dtype=np.float32),
        ct.TensorType(name="c_out", dtype=np.float32),
    ]
    mlmodel = _coreml_convert(traced, inputs, outputs, settings, ct.ComputeUnit.CPU_ONLY)
    mlmodel.save(str(output_dir / "decoder.mlpackage"))

    # === Joint ===
    # Real encoder/decoder activations are used as example inputs so the
    # trace sees realistic shapes; one time step and one symbol step each.
    typer.echo("Exporting joint...")
    with torch.no_grad():
        mel_test, _ = preprocessor(audio[:, :sample_rate], torch.tensor([sample_rate], dtype=torch.int32))
        # Run through encoder wrapper (not model.encoder directly to avoid typed method issues)
        enc_out, _, _, _, _ = encoder_streaming(
            mel_test,
            torch.tensor([mel_test.shape[2]], dtype=torch.int32),
            cache_channel_b,
            cache_time_b,
            cache_len
        )
        dec_out, _, _ = decoder(targets, target_len, h, c)

    # Single step: [B, D, 1]
    enc_step = enc_out[:, :, :1].contiguous()
    dec_step = dec_out[:, :, :1].contiguous()

    traced = torch.jit.trace(joint, (enc_step, dec_step), strict=False)
    inputs = [
        ct.TensorType(name="encoder", shape=_tensor_shape(enc_step), dtype=np.float32),
        ct.TensorType(name="decoder", shape=_tensor_shape(dec_step), dtype=np.float32),
    ]
    outputs = [ct.TensorType(name="logits", dtype=np.float32)]
    mlmodel = _coreml_convert(traced, inputs, outputs, settings, ct.ComputeUnit.CPU_ONLY)
    mlmodel.save(str(output_dir / "joint.mlpackage"))

    # === Metadata ===
    # Everything the runtime needs to wire the four components together.
    vocab_size = int(model.tokenizer.vocab_size)
    metadata = {
        "model": DEFAULT_MODEL_ID,
        "sample_rate": sample_rate,
        "mel_features": mel_features,
        "chunk_mel_frames": CHUNK_MEL_FRAMES,
        "pre_encode_cache": PRE_ENCODE_CACHE,
        "total_mel_frames": TOTAL_MEL_FRAMES,
        "vocab_size": vocab_size,
        "blank_idx": int(model.decoder.blank_idx),
        "cache_channel_shape": list(cache_channel_b.shape),
        "cache_time_shape": list(cache_time_b.shape),
        "decoder_hidden": decoder_hidden,
        "decoder_layers": decoder_layers,
        "encoder_dim": int(enc_out.shape[1]),
    }
    (output_dir / "metadata.json").write_text(json.dumps(metadata, indent=2))

    # Tokenizer: flat id -> token-string map for host-side detokenisation.
    tokenizer = {str(i): model.tokenizer.ids_to_tokens([i])[0] for i in range(vocab_size)}
    (output_dir / "tokenizer.json").write_text(json.dumps(tokenizer, indent=2))

    typer.echo(f"Done! Exported to {output_dir}")
226
+
227
+
228
+ if __name__ == "__main__":
229
+ app()
scripts/individual_components.py ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Export Parakeet Realtime EOU RNNT components into CoreML.
3
+
4
+ This model uses a cache-aware streaming FastConformer encoder.
5
+ The encoder requires splitting into:
6
+ 1. Initial encoder (no cache, for first chunk)
7
+ 2. Streaming encoder (with cache inputs/outputs)
8
+ """
9
+ from __future__ import annotations
10
+
11
+ from dataclasses import dataclass
12
+ from pathlib import Path
13
+ from typing import Optional, Tuple
14
+
15
+ import coremltools as ct
16
+ import torch
17
+
18
+
19
@dataclass
class ExportSettings:
    """Shared configuration handed to every CoreML component conversion."""

    output_dir: Path  # directory that receives the exported .mlpackage bundles
    compute_units: ct.ComputeUnit  # default compute units; _coreml_convert may override per component
    deployment_target: Optional[ct.target]  # e.g. iOS17; None = converter default
    compute_precision: Optional[ct.precision]  # FLOAT16/FLOAT32; None = converter default
    max_audio_seconds: float  # NOTE(review): not read in the visible conversion path — confirm consumer
    max_symbol_steps: int  # NOTE(review): not read in the visible conversion path — confirm consumer
    # Streaming-specific settings
    chunk_size_frames: int  # Number of frames per chunk (after subsampling)
    cache_size: int  # Size of the channel cache
30
+
31
+
32
class PreprocessorWrapper(torch.nn.Module):
    """Tracing wrapper around the NeMo preprocessor (audio -> mel).

    Forwards the waveform via keyword arguments and casts the length to
    int64, which is what the wrapped module expects.
    """

    def __init__(self, module: torch.nn.Module) -> None:
        super().__init__()
        self.module = module

    def forward(
        self, audio_signal: torch.Tensor, length: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        lengths_i64 = length.to(dtype=torch.long)
        outputs = self.module(input_signal=audio_signal, length=lengths_i64)
        mel, mel_length = outputs
        return mel, mel_length
46
+
47
+
48
class EncoderInitialWrapper(torch.nn.Module):
    """Encoder wrapper for the very first chunk (no cache input).

    Used when no previous cache state exists yet; it simply runs the
    wrapped encoder on the mel features.
    """

    def __init__(self, module: torch.nn.Module) -> None:
        super().__init__()
        self.module = module

    def forward(
        self, features: torch.Tensor, length: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Run the encoder without any cache.

        Args:
            features: Mel spectrogram [B, D, T].
            length: Sequence lengths [B].

        Returns:
            Encoder output [B, D, T_enc] and output lengths [B].
        """
        result = self.module(audio_signal=features, length=length.to(dtype=torch.long))
        return result[0], result[1]
77
+
78
+
79
class EncoderStreamingWrapper(torch.nn.Module):
    """Encoder wrapper for cache-aware streaming chunks.

    The CoreML-facing cache layout is batch-first [B, L, ...] while the
    wrapped NeMo encoder wants layer-first [L, B, ...]; caches are
    transposed on the way in and back again on the way out.  Length
    outputs are cast to int32 for the CoreML signature.
    """

    def __init__(self, module: torch.nn.Module) -> None:
        super().__init__()
        self.module = module

    def forward(
        self,
        features: torch.Tensor,
        length: torch.Tensor,
        cache_last_channel: torch.Tensor,
        cache_last_time: torch.Tensor,
        cache_last_channel_len: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """Run one streaming step; returns encoded output and updated caches."""
        outputs = self.module(
            audio_signal=features,
            length=length.to(dtype=torch.long),
            cache_last_channel=cache_last_channel.transpose(0, 1),
            cache_last_time=cache_last_time.transpose(0, 1),
            cache_last_channel_len=cache_last_channel_len.to(dtype=torch.int64),
        )
        encoded, encoded_lengths, next_channel, next_time, next_len = outputs

        return (
            encoded,
            encoded_lengths.to(dtype=torch.int32),
            next_channel.transpose(0, 1),
            next_time.transpose(0, 1),
            next_len.to(dtype=torch.int32),
        )
124
+
125
+
126
class DecoderWrapper(torch.nn.Module):
    """Wrapper for the RNNT prediction network (decoder).

    Packs the LSTM hidden/cell tensors into the list the wrapped module
    expects and unpacks the updated state for CoreML's flat signature.
    """

    def __init__(self, module: torch.nn.Module) -> None:
        super().__init__()
        self.module = module

    def forward(
        self,
        targets: torch.Tensor,
        target_lengths: torch.Tensor,
        h_in: torch.Tensor,
        c_in: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        outputs = self.module(
            targets=targets.to(dtype=torch.long),
            target_length=target_lengths.to(dtype=torch.long),
            states=[h_in, c_in],
        )
        decoder_output = outputs[0]
        new_state = outputs[2]
        return decoder_output, new_state[0], new_state[1]
147
+
148
+
149
class JointWrapper(torch.nn.Module):
    """Wrapper for the RNNT joint network.

    Re-implements the joint combination explicitly (projection, broadcast
    add over T and U, then the joint net) so the trace is CoreML-friendly.
    """

    def __init__(self, module: torch.nn.Module) -> None:
        super().__init__()
        self.module = module

    def forward(
        self, encoder_outputs: torch.Tensor, decoder_outputs: torch.Tensor
    ) -> torch.Tensor:
        # [B, D, T] -> [B, T, D] and [B, D, U] -> [B, U, D] for the projections.
        enc_btd = encoder_outputs.transpose(1, 2)
        dec_bud = decoder_outputs.transpose(1, 2)

        enc_proj = self.module.enc(enc_btd)    # [B, T, joint_dim]
        dec_proj = self.module.pred(dec_bud)   # [B, U, joint_dim]

        # Broadcast-add: [B, T, 1, J] + [B, 1, U, J] -> [B, T, U, J].
        combined = enc_proj.unsqueeze(2) + dec_proj.unsqueeze(1)
        activated = self.module.joint_net[0](combined)  # ReLU
        dropped = self.module.joint_net[1](activated)   # Dropout (no-op in eval)
        return self.module.joint_net[2](dropped)        # Linear -> logits
174
+
175
+
176
class MelEncoderWrapper(torch.nn.Module):
    """Fused wrapper: waveform -> mel -> encoder (no cache, initial chunk)."""

    def __init__(
        self, preprocessor: PreprocessorWrapper, encoder: EncoderInitialWrapper
    ) -> None:
        super().__init__()
        self.preprocessor = preprocessor
        self.encoder = encoder

    def forward(
        self, audio_signal: torch.Tensor, audio_length: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        # Mel extraction first, then the cacheless encoder pass.
        mel, mel_len = self.preprocessor(audio_signal, audio_length)
        return self.encoder(mel, mel_len.to(dtype=torch.int32))
192
+
193
+
194
class MelEncoderStreamingWrapper(torch.nn.Module):
    """Fused wrapper: waveform -> mel -> encoder (with cache, streaming)."""

    def __init__(
        self, preprocessor: PreprocessorWrapper, encoder: EncoderStreamingWrapper
    ) -> None:
        super().__init__()
        self.preprocessor = preprocessor
        self.encoder = encoder

    def forward(
        self,
        audio_signal: torch.Tensor,
        audio_length: torch.Tensor,
        cache_last_channel: torch.Tensor,
        cache_last_time: torch.Tensor,
        cache_last_channel_len: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        # Mel extraction first, then the cache-aware streaming encoder step.
        mel, mel_len = self.preprocessor(audio_signal, audio_length)
        caches = (cache_last_channel, cache_last_time, cache_last_channel_len)
        return self.encoder(mel, mel_len.to(dtype=torch.int32), *caches)
226
+
227
+
228
class JointDecisionWrapper(torch.nn.Module):
    """Joint + decision head: outputs label id and label probability.

    Unlike TDT, EOU models have no duration outputs; a dedicated EOU
    token marks the end of utterance instead.
    """

    def __init__(self, joint: JointWrapper, vocab_size: int) -> None:
        super().__init__()
        self.joint = joint
        self.vocab_with_blank = int(vocab_size) + 1

    def forward(
        self, encoder_outputs: torch.Tensor, decoder_outputs: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        full_logits = self.joint(encoder_outputs, decoder_outputs)
        # Keep only real tokens + blank; any trailing heads are dropped.
        token_logits = full_logits[..., : self.vocab_with_blank]

        best = torch.argmax(token_logits, dim=-1)
        probs = torch.softmax(token_logits, dim=-1)
        best_prob = torch.gather(probs, -1, best.unsqueeze(-1)).squeeze(-1)

        return best.to(dtype=torch.int32), best_prob
254
+
255
+
256
class JointDecisionSingleStep(torch.nn.Module):
    """Single-step joint decision for streaming.

    Inputs:
        - encoder_step: [B=1, D, T=1]
        - decoder_step: [B=1, D, U=1]

    Returns:
        - token_id: [1, 1, 1] int32
        - token_prob: [1, 1, 1] float32
        - top_k_ids: [1, 1, 1, K] int32
        - top_k_logits: [1, 1, 1, K] float32
    """

    def __init__(self, joint: JointWrapper, vocab_size: int, top_k: int = 64) -> None:
        super().__init__()
        self.joint = joint
        self.vocab_with_blank = int(vocab_size) + 1
        self.top_k = int(top_k)

    def forward(
        self, encoder_step: torch.Tensor, decoder_step: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        # [1, 1, 1, V] logits, trimmed to tokens + blank.
        token_logits = self.joint(encoder_step, decoder_step)[..., : self.vocab_with_blank]

        best = torch.argmax(token_logits, dim=-1, keepdim=False)
        probs = torch.softmax(token_logits, dim=-1)
        best_prob = torch.gather(probs, -1, best.unsqueeze(-1)).squeeze(-1)

        # Top-K candidates for host-side re-ranking; K is clamped to the
        # vocabulary size so small vocabs never over-ask topk.
        k = min(self.top_k, token_logits.shape[-1])
        topk_logits, topk_ids = torch.topk(token_logits, k=k, dim=-1)

        return best.to(dtype=torch.int32), best_prob, topk_ids.to(dtype=torch.int32), topk_logits
296
+
297
+
298
def _coreml_convert(
    traced: torch.jit.ScriptModule,
    inputs,
    outputs,
    settings: ExportSettings,
    compute_units_override: Optional[ct.ComputeUnit] = None,
) -> ct.models.MLModel:
    """Convert a traced module to an ML Program.

    ``compute_units_override``, when given, takes precedence over
    ``settings.compute_units``.  Deployment target and precision are only
    forwarded when explicitly set in ``settings``.
    """
    if compute_units_override is not None:
        cu = compute_units_override
    else:
        cu = settings.compute_units

    kwargs = {
        "convert_to": "mlprogram",
        "inputs": inputs,
        "outputs": outputs,
        "compute_units": cu,
    }
    print("Converting:", traced.__class__.__name__)
    print("Conversion kwargs:", kwargs)

    if settings.deployment_target is not None:
        kwargs["minimum_deployment_target"] = settings.deployment_target
    if settings.compute_precision is not None:
        kwargs["compute_precision"] = settings.compute_precision

    return ct.convert(traced, **kwargs)
scripts/nemo_streaming_reference.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ NeMo Nemotron Streaming Reference Implementation
4
+
5
+ Streaming inference with nemotron-speech-streaming-en-0.6b using 1.12s chunks.
6
+ Uses conformer_stream_step API with CacheAwareStreamingAudioBuffer.
7
+ """
8
+ import numpy as np
9
+ import soundfile as sf
10
+ import torch
11
+ import nemo.collections.asr as nemo_asr
12
+ from nemo.collections.asr.parts.utils.streaming_utils import CacheAwareStreamingAudioBuffer
13
+
14
+
15
def calc_drop_extra_pre_encoded(model, step_num, pad_and_drop_preencoded):
    """Calculate drop_extra_pre_encoded value per NVIDIA's reference."""
    # The very first chunk keeps all pre-encoded frames unless the caller
    # explicitly pads-and-drops; every later step uses the encoder config.
    first_step_keeps_all = step_num == 0 and not pad_and_drop_preencoded
    if first_step_keeps_all:
        return 0
    return model.encoder.streaming_cfg.drop_extra_pre_encoded
20
+
21
+
22
def transcribe_streaming(model, audio: np.ndarray, sr: int = 16000, pad_and_drop_preencoded: bool = False) -> str:
    """
    Streaming transcription using NeMo's conformer_stream_step API.

    Args:
        model: NeMo ASR model (must support streaming)
        audio: Audio samples as float32 numpy array
        sr: Sample rate (must be 16000).
            NOTE(review): `sr` is not actually read in this function —
            the buffer presumably assumes the model's configured rate;
            callers must resample beforehand.
        pad_and_drop_preencoded: Whether to pad and drop preencoded frames.
            False (default) gives better WER, True is needed for ONNX export.

    Returns:
        Transcribed text
    """
    model.encoder.setup_streaming_params()

    # The buffer splits the full waveform into streaming chunks for us.
    streaming_buffer = CacheAwareStreamingAudioBuffer(
        model=model,
        pad_and_drop_preencoded=pad_and_drop_preencoded,
    )
    streaming_buffer.reset_buffer()
    streaming_buffer.append_audio(audio)

    # Fresh encoder cache state; updated in-place each loop iteration below.
    cache_last_channel, cache_last_time, cache_last_channel_len = \
        model.encoder.get_initial_cache_state(batch_size=1)

    previous_hypotheses = None
    pred_out_stream = None
    final_text = ""

    with torch.inference_mode():
        for step_num, (chunk_audio, chunk_lengths) in enumerate(streaming_buffer):
            # Each step threads the caches and decoding state from the
            # previous step back into the model (rebinding all six outputs).
            (
                pred_out_stream,
                transcribed_texts,
                cache_last_channel,
                cache_last_time,
                cache_last_channel_len,
                previous_hypotheses,
            ) = model.conformer_stream_step(
                processed_signal=chunk_audio,
                processed_signal_length=chunk_lengths,
                cache_last_channel=cache_last_channel,
                cache_last_time=cache_last_time,
                cache_last_channel_len=cache_last_channel_len,
                # Keep trailing outputs only on the final (buffer-empty) step.
                keep_all_outputs=streaming_buffer.is_buffer_empty(),
                previous_hypotheses=previous_hypotheses,
                previous_pred_out=pred_out_stream,
                drop_extra_pre_encoded=calc_drop_extra_pre_encoded(model, step_num, pad_and_drop_preencoded),
                return_transcription=True,
            )

            # The running transcription supersedes earlier ones, so only the
            # latest value is retained.
            if transcribed_texts and len(transcribed_texts) > 0:
                text = transcribed_texts[0]
                # May be a Hypothesis object (with .text) or a plain string.
                if hasattr(text, 'text'):
                    final_text = text.text
                else:
                    final_text = str(text)

    return final_text
82
+
83
+
84
def main():
    """CLI entry point: stream-transcribe a single audio file and print it."""
    import argparse

    arg_parser = argparse.ArgumentParser(description="NeMo Streaming Reference")
    arg_parser.add_argument("--audio", type=str, required=True, help="Path to audio file")
    arg_parser.add_argument("--duration", type=float, default=None, help="Duration in seconds to transcribe")
    opts = arg_parser.parse_args()

    samples, rate = sf.read(opts.audio, dtype="float32")
    # Optionally truncate to the requested number of seconds.
    if opts.duration:
        samples = samples[: int(opts.duration * rate)]

    banner = "=" * 70
    print(banner)
    print("NEMOTRON STREAMING")
    print(banner)
    print(f"Audio: {len(samples)/rate:.1f}s @ {rate}Hz")

    print("\nLoading model...")
    asr_model = nemo_asr.models.ASRModel.from_pretrained("nvidia/nemotron-speech-streaming-en-0.6b")
    asr_model.eval()

    print("\n[STREAMING MODE] (1.12s chunks)")
    result = transcribe_streaming(asr_model, samples, rate)
    print(f"  {result}")
107
+
108
+
109
+ if __name__ == "__main__":
110
+ main()