Upload 54 files
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- BENCHMARK_RESULTS.md +41 -0
- README.md +216 -0
- models/.DS_Store +0 -0
- models/decoder.mlmodelc/analytics/coremldata.bin +3 -0
- models/decoder.mlmodelc/coremldata.bin +3 -0
- models/decoder.mlmodelc/metadata.json +120 -0
- models/decoder.mlmodelc/model.mil +57 -0
- models/decoder.mlmodelc/weights/weight.bin +3 -0
- models/decoder.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
- models/decoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
- models/decoder.mlpackage/Manifest.json +18 -0
- models/encoder/encoder_float32.mlmodelc/analytics/coremldata.bin +3 -0
- models/encoder/encoder_float32.mlmodelc/coremldata.bin +3 -0
- models/encoder/encoder_float32.mlmodelc/metadata.json +168 -0
- models/encoder/encoder_float32.mlmodelc/model.mil +0 -0
- models/encoder/encoder_float32.mlmodelc/weights/weight.bin +3 -0
- models/encoder/encoder_float32.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
- models/encoder/encoder_float32.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
- models/encoder/encoder_float32.mlpackage/Manifest.json +18 -0
- models/encoder/encoder_int8.mlmodelc/analytics/coremldata.bin +3 -0
- models/encoder/encoder_int8.mlmodelc/coremldata.bin +3 -0
- models/encoder/encoder_int8.mlmodelc/metadata.json +171 -0
- models/encoder/encoder_int8.mlmodelc/model.mil +0 -0
- models/encoder/encoder_int8.mlmodelc/weights/weight.bin +3 -0
- models/encoder/encoder_int8.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
- models/encoder/encoder_int8.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
- models/encoder/encoder_int8.mlpackage/Manifest.json +18 -0
- models/joint.mlmodelc/analytics/coremldata.bin +3 -0
- models/joint.mlmodelc/coremldata.bin +3 -0
- models/joint.mlmodelc/metadata.json +75 -0
- models/joint.mlmodelc/model.mil +25 -0
- models/joint.mlmodelc/weights/weight.bin +3 -0
- models/joint.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
- models/joint.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
- models/joint.mlpackage/Manifest.json +18 -0
- models/metadata.json +23 -0
- models/preprocessor.mlmodelc/analytics/coremldata.bin +3 -0
- models/preprocessor.mlmodelc/coremldata.bin +3 -0
- models/preprocessor.mlmodelc/metadata.json +106 -0
- models/preprocessor.mlmodelc/model.mil +110 -0
- models/preprocessor.mlmodelc/weights/weight.bin +3 -0
- models/preprocessor.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
- models/preprocessor.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
- models/preprocessor.mlpackage/Manifest.json +18 -0
- models/tokenizer.json +1026 -0
- pyproject.toml +14 -0
- scripts/benchmark_wer.py +171 -0
- scripts/convert_nemotron_streaming.py +229 -0
- scripts/individual_components.py +322 -0
- scripts/nemo_streaming_reference.py +110 -0
BENCHMARK_RESULTS.md
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Nemotron Streaming 0.6B - WER Benchmark Results
|
| 2 |
+
|
| 3 |
+
Model: `nvidia/nemotron-speech-streaming-en-0.6b`
|
| 4 |
+
Dataset: LibriSpeech test-clean
|
| 5 |
+
Chunk size: 1.12s
|
| 6 |
+
|
| 7 |
+
## Results
|
| 8 |
+
|
| 9 |
+
### 10 Files
|
| 10 |
+
|
| 11 |
+
| Mode | WER | Errors | Words |
|
| 12 |
+
|------|-----|--------|-------|
|
| 13 |
+
| `pad_and_drop_preencoded=False` | 1.79% | 3 | 168 |
|
| 14 |
+
| `pad_and_drop_preencoded=True` | 3.57% | 6 | 168 |
|
| 15 |
+
|
| 16 |
+
### 100 Files
|
| 17 |
+
|
| 18 |
+
| Mode | WER | Errors | Words |
|
| 19 |
+
|------|-----|--------|-------|
|
| 20 |
+
| `pad_and_drop_preencoded=False` | 1.88% | - | - |
|
| 21 |
+
|
| 22 |
+
### NVIDIA Claimed
|
| 23 |
+
|
| 24 |
+
| Dataset | WER |
|
| 25 |
+
|---------|-----|
|
| 26 |
+
| LibriSpeech test-clean (1.12s chunks) | 2.31% |
|
| 27 |
+
|
| 28 |
+
## Notes
|
| 29 |
+
|
| 30 |
+
- `pad_and_drop_preencoded=False`: Better WER, but cannot be exported to ONNX/CoreML
|
| 31 |
+
- `pad_and_drop_preencoded=True`: Worse WER (~3%), but required for ONNX/CoreML export
|
| 32 |
+
- NVIDIA's 2.31% likely uses `pad_and_drop_preencoded=True` on full 2620 files
|
| 33 |
+
- Our implementation uses `conformer_stream_step` API with `CacheAwareStreamingAudioBuffer`
|
| 34 |
+
|
| 35 |
+
## Run Benchmark
|
| 36 |
+
|
| 37 |
+
```bash
|
| 38 |
+
cd nemotron-speech-streaming-0.6b/coreml
|
| 39 |
+
uv sync
|
| 40 |
+
uv run python benchmark_wer.py --num-files 100
|
| 41 |
+
```
|
README.md
ADDED
|
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Nemotron Speech Streaming 0.6B - CoreML Conversion
|
| 2 |
+
|
| 3 |
+
CoreML conversion of NVIDIA's `nvidia/nemotron-speech-streaming-en-0.6b` for real-time streaming ASR on Apple devices.
|
| 4 |
+
|
| 5 |
+
## Model Overview
|
| 6 |
+
|
| 7 |
+
| Property | Value |
|
| 8 |
+
|----------|-------|
|
| 9 |
+
| Source Model | `nvidia/nemotron-speech-streaming-en-0.6b` |
|
| 10 |
+
| Architecture | FastConformer RNNT (Streaming) |
|
| 11 |
+
| Parameters | 0.6B |
|
| 12 |
+
| Chunk Size | 1.12 seconds (112 mel frames) |
|
| 13 |
+
| Sample Rate | 16kHz |
|
| 14 |
+
| Mel Features | 128 bins |
|
| 15 |
+
|
| 16 |
+
## CoreML Models
|
| 17 |
+
|
| 18 |
+
4 mlpackage files for the streaming RNNT pipeline:
|
| 19 |
+
|
| 20 |
+
| Model | Size | Function |
|
| 21 |
+
|-------|------|----------|
|
| 22 |
+
| `preprocessor.mlpackage` | 1.2M | audio → 128-dim mel spectrogram |
|
| 23 |
+
| `encoder.mlpackage` | 2.2G | mel + cache → encoded + new_cache |
|
| 24 |
+
| `decoder.mlpackage` | 28M | token + LSTM state → decoder_out + new_state |
|
| 25 |
+
| `joint.mlpackage` | 6.6M | encoder + decoder → logits |
|
| 26 |
+
|
| 27 |
+
Plus:
|
| 28 |
+
- `metadata.json` - Model configuration
|
| 29 |
+
- `tokenizer.json` - Vocabulary (1024 tokens)
|
| 30 |
+
|
| 31 |
+
## Streaming Configuration
|
| 32 |
+
|
| 33 |
+
```json
|
| 34 |
+
{
|
| 35 |
+
"sample_rate": 16000,
|
| 36 |
+
"mel_features": 128,
|
| 37 |
+
"chunk_mel_frames": 112,
|
| 38 |
+
"pre_encode_cache": 9,
|
| 39 |
+
"total_mel_frames": 121,
|
| 40 |
+
"vocab_size": 1024,
|
| 41 |
+
"blank_idx": 1024,
|
| 42 |
+
"encoder_dim": 1024,
|
| 43 |
+
"decoder_hidden": 640,
|
| 44 |
+
"decoder_layers": 2
|
| 45 |
+
}
|
| 46 |
+
```
|
| 47 |
+
|
| 48 |
+
### Chunk Timing
|
| 49 |
+
|
| 50 |
+
| Parameter | Value |
|
| 51 |
+
|-----------|-------|
|
| 52 |
+
| window_stride | 10ms |
|
| 53 |
+
| chunk_mel_frames | 112 |
|
| 54 |
+
| **chunk duration** | 112 × 10ms = **1.120s** |
|
| 55 |
+
| samples per chunk | 17,920 |
|
| 56 |
+
|
| 57 |
+
### Cache Shapes
|
| 58 |
+
|
| 59 |
+
| Cache | Shape | Description |
|
| 60 |
+
|-------|-------|-------------|
|
| 61 |
+
| cache_channel | [1, 24, 70, 1024] | Attention context cache |
|
| 62 |
+
| cache_time | [1, 24, 1024, 8] | Convolution time cache |
|
| 63 |
+
| cache_len | [1] | Cache fill level |
|
| 64 |
+
|
| 65 |
+
## Benchmark Results
|
| 66 |
+
|
| 67 |
+
### WER on LibriSpeech test-clean
|
| 68 |
+
|
| 69 |
+
| Mode | Files | WER | Notes |
|
| 70 |
+
|------|-------|-----|-------|
|
| 71 |
+
| PyTorch `pad_and_drop=False` | 100 | 1.88% | Non-streaming (full context) |
|
| 72 |
+
| PyTorch `pad_and_drop=True` | 10 | 3.57% | True streaming |
|
| 73 |
+
| CoreML Non-streaming | 100 | 1.83% | Full audio preprocessed |
|
| 74 |
+
| CoreML Streaming | 100 | 1.79% | Audio chunked at 1.12s |
|
| 75 |
+
| NVIDIA Claimed | 2620 | 2.31% | Full test-clean |
|
| 76 |
+
|
| 77 |
+
### Streaming Modes Explained
|
| 78 |
+
|
| 79 |
+
```
|
| 80 |
+
NON-STREAMING (test_coreml_inference.py):
|
| 81 |
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 82 |
+
1. Full audio → preprocessor → FULL mel (one continuous spectrogram)
|
| 83 |
+
2. Slice mel into chunks for encoder
|
| 84 |
+
3. Each slice has natural continuity (no chunk boundaries)
|
| 85 |
+
|
| 86 |
+
CHEAT: The mel was computed with full audio context
|
| 87 |
+
WER: ~1.83%
|
| 88 |
+
```
|
| 89 |
+
|
| 90 |
+
```
|
| 91 |
+
TRUE STREAMING (test_coreml_streaming.py):
|
| 92 |
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 93 |
+
1. Audio chunk 1 → preprocessor → mel_1
|
| 94 |
+
2. Audio chunk 2 → preprocessor → mel_2 (computed separately!)
|
| 95 |
+
3. Prepend last 9 frames of mel_1 to mel_2 (mel_cache)
|
| 96 |
+
|
| 97 |
+
mel_cache = bridge between separately-computed mels (NOT cheating)
|
| 98 |
+
WER: ~1.79%
|
| 99 |
+
```
|
| 100 |
+
|
| 101 |
+
### What is mel_cache?
|
| 102 |
+
|
| 103 |
+
The encoder's subsampling layer needs 9 frames (~90ms) of look-back context:
|
| 104 |
+
|
| 105 |
+
```
|
| 106 |
+
ENCODER INPUT (needs 121 frames = 9 cache + 112 new)
|
| 107 |
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 108 |
+
│9│ 112 frames │
|
| 109 |
+
↑
|
| 110 |
+
mel_cache = last 9 frames from PREVIOUS chunk's mel
|
| 111 |
+
|
| 112 |
+
Chunk 1: [000000000][mel_chunk_1] ← pad with zeros (no previous)
|
| 113 |
+
Chunk 2: [mel_1_end][mel_chunk_2] ← 9 frames from chunk 1
|
| 114 |
+
Chunk 3: [mel_2_end][mel_chunk_3] ← 9 frames from chunk 2
|
| 115 |
+
```
|
| 116 |
+
|
| 117 |
+
This is **NOT cheating** - in real-time streaming you DO have the previous 90ms of audio.
|
| 118 |
+
|
| 119 |
+
## Inference Pipeline
|
| 120 |
+
|
| 121 |
+
```
|
| 122 |
+
┌─────────────────────────────────────────────────────────────────┐
|
| 123 |
+
│ STREAMING RNNT PIPELINE │
|
| 124 |
+
└─────────────────────────────────────────────────────────────────┘
|
| 125 |
+
|
| 126 |
+
1. PREPROCESSOR (per 1.12s audio chunk)
|
| 127 |
+
audio [1, 17920] → mel [1, 128, 112]
|
| 128 |
+
|
| 129 |
+
2. ENCODER (with cache)
|
| 130 |
+
mel [1, 128, 121] + cache → encoded [1, 1024, 14] + new_cache
|
| 131 |
+
(121 = 9 mel_cache + 112 new frames)
|
| 132 |
+
(14 output frames after 8x subsampling)
|
| 133 |
+
|
| 134 |
+
3. DECODER + JOINT (greedy loop per encoder frame)
|
| 135 |
+
For each of 14 encoder frames:
|
| 136 |
+
┌──────────────────────────────────────────┐
|
| 137 |
+
│ token → DECODER → decoder_out │
|
| 138 |
+
│ encoder_step + decoder_out → JOINT │
|
| 139 |
+
│ → logits → argmax → predicted token │
|
| 140 |
+
│ if token == BLANK: next encoder frame │
|
| 141 |
+
│ else: emit token, update decoder state │
|
| 142 |
+
└──────────────────────────────────────────┘
|
| 143 |
+
```
|
| 144 |
+
|
| 145 |
+
## Usage
|
| 146 |
+
|
| 147 |
+
### Convert to CoreML
|
| 148 |
+
|
| 149 |
+
```bash
|
| 150 |
+
cd conversion_scripts
|
| 151 |
+
uv sync
|
| 152 |
+
uv run python convert_nemotron_streaming.py --output-dir ../nemotron_coreml
|
| 153 |
+
```
|
| 154 |
+
|
| 155 |
+
Options:
|
| 156 |
+
- `--encoder-cu`: Encoder compute units (default: CPU_AND_NE)
|
| 157 |
+
- `--precision`: FLOAT32 or FLOAT16
|
| 158 |
+
|
| 159 |
+
### Run WER Benchmark (PyTorch)
|
| 160 |
+
|
| 161 |
+
```bash
|
| 162 |
+
cd conversion_scripts
|
| 163 |
+
uv run python ../benchmark_wer.py --num-files 100
|
| 164 |
+
```
|
| 165 |
+
|
| 166 |
+
### Test CoreML Inference
|
| 167 |
+
|
| 168 |
+
Non-streaming (full audio preprocessing):
|
| 169 |
+
```bash
|
| 170 |
+
uv run python ../test_coreml_inference.py --model-dir ../nemotron_coreml --num-files 10
|
| 171 |
+
```
|
| 172 |
+
|
| 173 |
+
True streaming (audio chunked at 1.12s):
|
| 174 |
+
```bash
|
| 175 |
+
uv run python ../test_coreml_streaming.py --model-dir ../nemotron_coreml --num-files 10
|
| 176 |
+
```
|
| 177 |
+
|
| 178 |
+
## Files
|
| 179 |
+
|
| 180 |
+
```
|
| 181 |
+
nemotron-speech-streaming-0.6b/coreml/
|
| 182 |
+
├── README.md # This file
|
| 183 |
+
├── BENCHMARK_RESULTS.md # WER benchmark results
|
| 184 |
+
├── benchmark_wer.py # PyTorch streaming WER benchmark
|
| 185 |
+
├── nemo_streaming_reference.py # NeMo streaming reference implementation
|
| 186 |
+
├── test_coreml_inference.py # CoreML non-streaming test
|
| 187 |
+
├── test_coreml_streaming.py # CoreML true streaming test
|
| 188 |
+
├── conversion_scripts/
|
| 189 |
+
│ ├── pyproject.toml # Python dependencies (uv)
|
| 190 |
+
│ ├── convert_nemotron_streaming.py # Main conversion script
|
| 191 |
+
│ └── individual_components.py # Wrapper classes for export
|
| 192 |
+
├── nemotron_coreml/ # Exported CoreML models
|
| 193 |
+
│ ├── preprocessor.mlpackage
|
| 194 |
+
│ ├── encoder.mlpackage
|
| 195 |
+
│ ├── decoder.mlpackage
|
| 196 |
+
│ ├── joint.mlpackage
|
| 197 |
+
│ ├── metadata.json
|
| 198 |
+
│ └── tokenizer.json
|
| 199 |
+
└── datasets/
|
| 200 |
+
└── LibriSpeech/test-clean/ # 2620 test files
|
| 201 |
+
```
|
| 202 |
+
|
| 203 |
+
## Dependencies
|
| 204 |
+
|
| 205 |
+
- Python 3.10
|
| 206 |
+
- PyTorch 2.x
|
| 207 |
+
- NeMo Toolkit 2.x
|
| 208 |
+
- CoreMLTools 7.x
|
| 209 |
+
- soundfile, numpy, typer
|
| 210 |
+
|
| 211 |
+
## Notes
|
| 212 |
+
|
| 213 |
+
- The encoder is the largest model (2.2GB) with 24 Conformer layers
|
| 214 |
+
- Model uses 128 mel bins (not the typical 80)
|
| 215 |
+
- RNNT blank token index is 1024 (vocab_size)
|
| 216 |
+
- Decoder uses 2-layer LSTM with 640 hidden units
|
models/.DS_Store
ADDED
|
Binary file (8.2 kB). View file
|
|
|
models/decoder.mlmodelc/analytics/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:496b8a25e0697cb5f15196251c89730c25574d1c9eed4111f70adc6457198b8b
|
| 3 |
+
size 243
|
models/decoder.mlmodelc/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2483acce3793dcef37b4c27d99af51125c9c0f6f11641e3a76fab5518391203b
|
| 3 |
+
size 492
|
models/decoder.mlmodelc/metadata.json
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"metadataOutputVersion" : "3.0",
|
| 4 |
+
"storagePrecision" : "Float32",
|
| 5 |
+
"outputSchema" : [
|
| 6 |
+
{
|
| 7 |
+
"hasShapeFlexibility" : "0",
|
| 8 |
+
"isOptional" : "0",
|
| 9 |
+
"dataType" : "Float32",
|
| 10 |
+
"formattedType" : "MultiArray (Float32 1 × 640 × 1)",
|
| 11 |
+
"shortDescription" : "",
|
| 12 |
+
"shape" : "[1, 640, 1]",
|
| 13 |
+
"name" : "decoder_out",
|
| 14 |
+
"type" : "MultiArray"
|
| 15 |
+
},
|
| 16 |
+
{
|
| 17 |
+
"hasShapeFlexibility" : "0",
|
| 18 |
+
"isOptional" : "0",
|
| 19 |
+
"dataType" : "Float32",
|
| 20 |
+
"formattedType" : "MultiArray (Float32 2 × 1 × 640)",
|
| 21 |
+
"shortDescription" : "",
|
| 22 |
+
"shape" : "[2, 1, 640]",
|
| 23 |
+
"name" : "h_out",
|
| 24 |
+
"type" : "MultiArray"
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"hasShapeFlexibility" : "0",
|
| 28 |
+
"isOptional" : "0",
|
| 29 |
+
"dataType" : "Float32",
|
| 30 |
+
"formattedType" : "MultiArray (Float32 2 × 1 × 640)",
|
| 31 |
+
"shortDescription" : "",
|
| 32 |
+
"shape" : "[2, 1, 640]",
|
| 33 |
+
"name" : "c_out",
|
| 34 |
+
"type" : "MultiArray"
|
| 35 |
+
}
|
| 36 |
+
],
|
| 37 |
+
"modelParameters" : [
|
| 38 |
+
|
| 39 |
+
],
|
| 40 |
+
"specificationVersion" : 8,
|
| 41 |
+
"mlProgramOperationTypeHistogram" : {
|
| 42 |
+
"Select" : 1,
|
| 43 |
+
"Ios17.squeeze" : 4,
|
| 44 |
+
"Ios17.gather" : 1,
|
| 45 |
+
"Ios17.lstm" : 2,
|
| 46 |
+
"Identity" : 1,
|
| 47 |
+
"Ios17.transpose" : 2,
|
| 48 |
+
"Split" : 2,
|
| 49 |
+
"Ios17.add" : 1,
|
| 50 |
+
"Ios17.greaterEqual" : 1,
|
| 51 |
+
"Stack" : 2
|
| 52 |
+
},
|
| 53 |
+
"computePrecision" : "Mixed (Float32, Int32)",
|
| 54 |
+
"isUpdatable" : "0",
|
| 55 |
+
"stateSchema" : [
|
| 56 |
+
|
| 57 |
+
],
|
| 58 |
+
"availability" : {
|
| 59 |
+
"macOS" : "14.0",
|
| 60 |
+
"tvOS" : "17.0",
|
| 61 |
+
"visionOS" : "1.0",
|
| 62 |
+
"watchOS" : "10.0",
|
| 63 |
+
"iOS" : "17.0",
|
| 64 |
+
"macCatalyst" : "17.0"
|
| 65 |
+
},
|
| 66 |
+
"modelType" : {
|
| 67 |
+
"name" : "MLModelType_mlProgram"
|
| 68 |
+
},
|
| 69 |
+
"userDefinedMetadata" : {
|
| 70 |
+
"com.github.apple.coremltools.conversion_date" : "2026-01-11",
|
| 71 |
+
"com.github.apple.coremltools.source" : "torch==2.9.1",
|
| 72 |
+
"com.github.apple.coremltools.version" : "9.0",
|
| 73 |
+
"com.github.apple.coremltools.source_dialect" : "TorchScript"
|
| 74 |
+
},
|
| 75 |
+
"inputSchema" : [
|
| 76 |
+
{
|
| 77 |
+
"hasShapeFlexibility" : "0",
|
| 78 |
+
"isOptional" : "0",
|
| 79 |
+
"dataType" : "Int32",
|
| 80 |
+
"formattedType" : "MultiArray (Int32 1 × 1)",
|
| 81 |
+
"shortDescription" : "",
|
| 82 |
+
"shape" : "[1, 1]",
|
| 83 |
+
"name" : "token",
|
| 84 |
+
"type" : "MultiArray"
|
| 85 |
+
},
|
| 86 |
+
{
|
| 87 |
+
"hasShapeFlexibility" : "0",
|
| 88 |
+
"isOptional" : "0",
|
| 89 |
+
"dataType" : "Int32",
|
| 90 |
+
"formattedType" : "MultiArray (Int32 1)",
|
| 91 |
+
"shortDescription" : "",
|
| 92 |
+
"shape" : "[1]",
|
| 93 |
+
"name" : "token_length",
|
| 94 |
+
"type" : "MultiArray"
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"hasShapeFlexibility" : "0",
|
| 98 |
+
"isOptional" : "0",
|
| 99 |
+
"dataType" : "Float32",
|
| 100 |
+
"formattedType" : "MultiArray (Float32 2 × 1 × 640)",
|
| 101 |
+
"shortDescription" : "",
|
| 102 |
+
"shape" : "[2, 1, 640]",
|
| 103 |
+
"name" : "h_in",
|
| 104 |
+
"type" : "MultiArray"
|
| 105 |
+
},
|
| 106 |
+
{
|
| 107 |
+
"hasShapeFlexibility" : "0",
|
| 108 |
+
"isOptional" : "0",
|
| 109 |
+
"dataType" : "Float32",
|
| 110 |
+
"formattedType" : "MultiArray (Float32 2 × 1 × 640)",
|
| 111 |
+
"shortDescription" : "",
|
| 112 |
+
"shape" : "[2, 1, 640]",
|
| 113 |
+
"name" : "c_in",
|
| 114 |
+
"type" : "MultiArray"
|
| 115 |
+
}
|
| 116 |
+
],
|
| 117 |
+
"generatedClassName" : "decoder",
|
| 118 |
+
"method" : "predict"
|
| 119 |
+
}
|
| 120 |
+
]
|
models/decoder.mlmodelc/model.mil
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
program(1.0)
|
| 2 |
+
[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.9.1"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "9.0"}})]
|
| 3 |
+
{
|
| 4 |
+
func main<ios17>(tensor<fp32, [2, 1, 640]> c_in, tensor<fp32, [2, 1, 640]> h_in, tensor<int32, [1, 1]> token, tensor<int32, [1]> token_length) {
|
| 5 |
+
tensor<fp32, [1025, 640]> module_prediction_embed_weight = const()[name = tensor<string, []>("module_prediction_embed_weight"), val = tensor<fp32, [1025, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
|
| 6 |
+
tensor<int32, []> y_batch_dims_0 = const()[name = tensor<string, []>("y_batch_dims_0"), val = tensor<int32, []>(0)];
|
| 7 |
+
tensor<bool, []> y_validate_indices_0 = const()[name = tensor<string, []>("y_validate_indices_0"), val = tensor<bool, []>(false)];
|
| 8 |
+
tensor<int32, []> greater_equal_0_y_0 = const()[name = tensor<string, []>("greater_equal_0_y_0"), val = tensor<int32, []>(0)];
|
| 9 |
+
tensor<bool, [1, 1]> greater_equal_0 = greater_equal(x = token, y = greater_equal_0_y_0)[name = tensor<string, []>("greater_equal_0")];
|
| 10 |
+
tensor<int32, []> slice_by_index_0 = const()[name = tensor<string, []>("slice_by_index_0"), val = tensor<int32, []>(1025)];
|
| 11 |
+
tensor<int32, [1, 1]> add_2 = add(x = token, y = slice_by_index_0)[name = tensor<string, []>("add_2")];
|
| 12 |
+
tensor<int32, [1, 1]> select_0 = select(a = token, b = add_2, cond = greater_equal_0)[name = tensor<string, []>("select_0")];
|
| 13 |
+
tensor<int32, []> y_axis_1 = const()[name = tensor<string, []>("y_axis_1"), val = tensor<int32, []>(0)];
|
| 14 |
+
tensor<fp32, [1, 1, 640]> y = gather(axis = y_axis_1, batch_dims = y_batch_dims_0, indices = select_0, validate_indices = y_validate_indices_0, x = module_prediction_embed_weight)[name = tensor<string, []>("y")];
|
| 15 |
+
tensor<int32, [3]> input_3_perm_0 = const()[name = tensor<string, []>("input_3_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
|
| 16 |
+
tensor<int32, []> split_0_num_splits_0 = const()[name = tensor<string, []>("split_0_num_splits_0"), val = tensor<int32, []>(2)];
|
| 17 |
+
tensor<int32, []> split_0_axis_0 = const()[name = tensor<string, []>("split_0_axis_0"), val = tensor<int32, []>(0)];
|
| 18 |
+
tensor<fp32, [1, 1, 640]> split_0_0, tensor<fp32, [1, 1, 640]> split_0_1 = split(axis = split_0_axis_0, num_splits = split_0_num_splits_0, x = h_in)[name = tensor<string, []>("split_0")];
|
| 19 |
+
tensor<int32, []> split_1_num_splits_0 = const()[name = tensor<string, []>("split_1_num_splits_0"), val = tensor<int32, []>(2)];
|
| 20 |
+
tensor<int32, []> split_1_axis_0 = const()[name = tensor<string, []>("split_1_axis_0"), val = tensor<int32, []>(0)];
|
| 21 |
+
tensor<fp32, [1, 1, 640]> split_1_0, tensor<fp32, [1, 1, 640]> split_1_1 = split(axis = split_1_axis_0, num_splits = split_1_num_splits_0, x = c_in)[name = tensor<string, []>("split_1")];
|
| 22 |
+
tensor<fp32, [2560]> concat_0 = const()[name = tensor<string, []>("concat_0"), val = tensor<fp32, [2560]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(2624128)))];
|
| 23 |
+
tensor<fp32, [2560, 640]> concat_1 = const()[name = tensor<string, []>("concat_1"), val = tensor<fp32, [2560, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(2634432)))];
|
| 24 |
+
tensor<fp32, [2560, 640]> concat_2 = const()[name = tensor<string, []>("concat_2"), val = tensor<fp32, [2560, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9188096)))];
|
| 25 |
+
tensor<int32, [1]> input_lstm_layer_0_lstm_h0_squeeze_axes_0 = const()[name = tensor<string, []>("input_lstm_layer_0_lstm_h0_squeeze_axes_0"), val = tensor<int32, [1]>([0])];
|
| 26 |
+
tensor<fp32, [1, 640]> input_lstm_layer_0_lstm_h0_squeeze = squeeze(axes = input_lstm_layer_0_lstm_h0_squeeze_axes_0, x = split_0_0)[name = tensor<string, []>("input_lstm_layer_0_lstm_h0_squeeze")];
|
| 27 |
+
tensor<int32, [1]> input_lstm_layer_0_lstm_c0_squeeze_axes_0 = const()[name = tensor<string, []>("input_lstm_layer_0_lstm_c0_squeeze_axes_0"), val = tensor<int32, [1]>([0])];
|
| 28 |
+
tensor<fp32, [1, 640]> input_lstm_layer_0_lstm_c0_squeeze = squeeze(axes = input_lstm_layer_0_lstm_c0_squeeze_axes_0, x = split_1_0)[name = tensor<string, []>("input_lstm_layer_0_lstm_c0_squeeze")];
|
| 29 |
+
tensor<string, []> input_lstm_layer_0_direction_0 = const()[name = tensor<string, []>("input_lstm_layer_0_direction_0"), val = tensor<string, []>("forward")];
|
| 30 |
+
tensor<bool, []> input_lstm_layer_0_output_sequence_0 = const()[name = tensor<string, []>("input_lstm_layer_0_output_sequence_0"), val = tensor<bool, []>(true)];
|
| 31 |
+
tensor<string, []> input_lstm_layer_0_recurrent_activation_0 = const()[name = tensor<string, []>("input_lstm_layer_0_recurrent_activation_0"), val = tensor<string, []>("sigmoid")];
|
| 32 |
+
tensor<string, []> input_lstm_layer_0_cell_activation_0 = const()[name = tensor<string, []>("input_lstm_layer_0_cell_activation_0"), val = tensor<string, []>("tanh")];
|
| 33 |
+
tensor<string, []> input_lstm_layer_0_activation_0 = const()[name = tensor<string, []>("input_lstm_layer_0_activation_0"), val = tensor<string, []>("tanh")];
|
| 34 |
+
tensor<fp32, [1, 1, 640]> input_3 = transpose(perm = input_3_perm_0, x = y)[name = tensor<string, []>("transpose_2")];
|
| 35 |
+
tensor<fp32, [1, 1, 640]> input_lstm_layer_0_0, tensor<fp32, [1, 640]> input_lstm_layer_0_1, tensor<fp32, [1, 640]> input_lstm_layer_0_2 = lstm(activation = input_lstm_layer_0_activation_0, bias = concat_0, cell_activation = input_lstm_layer_0_cell_activation_0, direction = input_lstm_layer_0_direction_0, initial_c = input_lstm_layer_0_lstm_c0_squeeze, initial_h = input_lstm_layer_0_lstm_h0_squeeze, output_sequence = input_lstm_layer_0_output_sequence_0, recurrent_activation = input_lstm_layer_0_recurrent_activation_0, weight_hh = concat_2, weight_ih = concat_1, x = input_3)[name = tensor<string, []>("input_lstm_layer_0")];
|
| 36 |
+
tensor<fp32, [2560]> concat_3 = const()[name = tensor<string, []>("concat_3"), val = tensor<fp32, [2560]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15741760)))];
|
| 37 |
+
tensor<fp32, [2560, 640]> concat_4 = const()[name = tensor<string, []>("concat_4"), val = tensor<fp32, [2560, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15752064)))];
|
| 38 |
+
tensor<fp32, [2560, 640]> concat_5 = const()[name = tensor<string, []>("concat_5"), val = tensor<fp32, [2560, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22305728)))];
|
| 39 |
+
tensor<int32, [1]> input_lstm_h0_squeeze_axes_0 = const()[name = tensor<string, []>("input_lstm_h0_squeeze_axes_0"), val = tensor<int32, [1]>([0])];
|
| 40 |
+
tensor<fp32, [1, 640]> input_lstm_h0_squeeze = squeeze(axes = input_lstm_h0_squeeze_axes_0, x = split_0_1)[name = tensor<string, []>("input_lstm_h0_squeeze")];
|
| 41 |
+
tensor<int32, [1]> input_lstm_c0_squeeze_axes_0 = const()[name = tensor<string, []>("input_lstm_c0_squeeze_axes_0"), val = tensor<int32, [1]>([0])];
|
| 42 |
+
tensor<fp32, [1, 640]> input_lstm_c0_squeeze = squeeze(axes = input_lstm_c0_squeeze_axes_0, x = split_1_1)[name = tensor<string, []>("input_lstm_c0_squeeze")];
|
| 43 |
+
tensor<string, []> input_direction_0 = const()[name = tensor<string, []>("input_direction_0"), val = tensor<string, []>("forward")];
|
| 44 |
+
tensor<bool, []> input_output_sequence_0 = const()[name = tensor<string, []>("input_output_sequence_0"), val = tensor<bool, []>(true)];
|
| 45 |
+
tensor<string, []> input_recurrent_activation_0 = const()[name = tensor<string, []>("input_recurrent_activation_0"), val = tensor<string, []>("sigmoid")];
|
| 46 |
+
tensor<string, []> input_cell_activation_0 = const()[name = tensor<string, []>("input_cell_activation_0"), val = tensor<string, []>("tanh")];
|
| 47 |
+
tensor<string, []> input_activation_0 = const()[name = tensor<string, []>("input_activation_0"), val = tensor<string, []>("tanh")];
|
| 48 |
+
tensor<fp32, [1, 1, 640]> input_0, tensor<fp32, [1, 640]> input_1, tensor<fp32, [1, 640]> input_2 = lstm(activation = input_activation_0, bias = concat_3, cell_activation = input_cell_activation_0, direction = input_direction_0, initial_c = input_lstm_c0_squeeze, initial_h = input_lstm_h0_squeeze, output_sequence = input_output_sequence_0, recurrent_activation = input_recurrent_activation_0, weight_hh = concat_5, weight_ih = concat_4, x = input_lstm_layer_0_0)[name = tensor<string, []>("input")];
|
| 49 |
+
tensor<int32, []> obj_3_axis_0 = const()[name = tensor<string, []>("obj_3_axis_0"), val = tensor<int32, []>(0)];
|
| 50 |
+
tensor<fp32, [2, 1, 640]> h_out = stack(axis = obj_3_axis_0, values = (input_lstm_layer_0_1, input_1))[name = tensor<string, []>("obj_3")];
|
| 51 |
+
tensor<int32, []> obj_axis_0 = const()[name = tensor<string, []>("obj_axis_0"), val = tensor<int32, []>(0)];
|
| 52 |
+
tensor<fp32, [2, 1, 640]> c_out = stack(axis = obj_axis_0, values = (input_lstm_layer_0_2, input_2))[name = tensor<string, []>("obj")];
|
| 53 |
+
tensor<int32, [3]> transpose_0_perm_0 = const()[name = tensor<string, []>("transpose_0_perm_0"), val = tensor<int32, [3]>([1, 2, 0])];
|
| 54 |
+
tensor<fp32, [1, 640, 1]> decoder_out = transpose(perm = transpose_0_perm_0, x = input_0)[name = tensor<string, []>("transpose_1")];
|
| 55 |
+
tensor<int32, [1]> token_length_tmp = identity(x = token_length)[name = tensor<string, []>("token_length_tmp")];
|
| 56 |
+
} -> (decoder_out, h_out, c_out);
|
| 57 |
+
}
|
models/decoder.mlmodelc/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:03add11563357087393b3ef162925bfb93fa5caf070aa4b91abd909cbbab1aed
|
| 3 |
+
size 28859392
|
models/decoder.mlpackage/Data/com.apple.CoreML/model.mlmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b766ca2ac7f121abd7ac1d8c4ebdee568912084c3f3fa356e0097ac155597834
|
| 3 |
+
size 8734
|
models/decoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:03add11563357087393b3ef162925bfb93fa5caf070aa4b91abd909cbbab1aed
|
| 3 |
+
size 28859392
|
models/decoder.mlpackage/Manifest.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"fileFormatVersion": "1.0.0",
|
| 3 |
+
"itemInfoEntries": {
|
| 4 |
+
"6493F213-8E94-4135-BA80-88CBEAF57D4F": {
|
| 5 |
+
"author": "com.apple.CoreML",
|
| 6 |
+
"description": "CoreML Model Weights",
|
| 7 |
+
"name": "weights",
|
| 8 |
+
"path": "com.apple.CoreML/weights"
|
| 9 |
+
},
|
| 10 |
+
"9ACB4EE7-C027-4AAA-B75C-BA0B33F7B714": {
|
| 11 |
+
"author": "com.apple.CoreML",
|
| 12 |
+
"description": "CoreML Model Specification",
|
| 13 |
+
"name": "model.mlmodel",
|
| 14 |
+
"path": "com.apple.CoreML/model.mlmodel"
|
| 15 |
+
}
|
| 16 |
+
},
|
| 17 |
+
"rootModelIdentifier": "9ACB4EE7-C027-4AAA-B75C-BA0B33F7B714"
|
| 18 |
+
}
|
models/encoder/encoder_float32.mlmodelc/analytics/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7353a9c695cce6d4431164021f0551e5a9dd8515ed1ee3a0945212fb5c3db961
|
| 3 |
+
size 243
|
models/encoder/encoder_float32.mlmodelc/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:28ba4f3d7a7dc602ee2fdd0f7869a9e81aaac4576601aa2f57633187a810aa90
|
| 3 |
+
size 607
|
models/encoder/encoder_float32.mlmodelc/metadata.json
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"metadataOutputVersion" : "3.0",
|
| 4 |
+
"storagePrecision" : "Float32",
|
| 5 |
+
"outputSchema" : [
|
| 6 |
+
{
|
| 7 |
+
"hasShapeFlexibility" : "0",
|
| 8 |
+
"isOptional" : "0",
|
| 9 |
+
"dataType" : "Float32",
|
| 10 |
+
"formattedType" : "MultiArray (Float32 1 × 1024 × 14)",
|
| 11 |
+
"shortDescription" : "",
|
| 12 |
+
"shape" : "[1, 1024, 14]",
|
| 13 |
+
"name" : "encoded",
|
| 14 |
+
"type" : "MultiArray"
|
| 15 |
+
},
|
| 16 |
+
{
|
| 17 |
+
"hasShapeFlexibility" : "0",
|
| 18 |
+
"isOptional" : "0",
|
| 19 |
+
"dataType" : "Int32",
|
| 20 |
+
"formattedType" : "MultiArray (Int32 1)",
|
| 21 |
+
"shortDescription" : "",
|
| 22 |
+
"shape" : "[1]",
|
| 23 |
+
"name" : "encoded_length",
|
| 24 |
+
"type" : "MultiArray"
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"hasShapeFlexibility" : "0",
|
| 28 |
+
"isOptional" : "0",
|
| 29 |
+
"dataType" : "Float32",
|
| 30 |
+
"formattedType" : "MultiArray (Float32 1 × 24 × 70 × 1024)",
|
| 31 |
+
"shortDescription" : "",
|
| 32 |
+
"shape" : "[1, 24, 70, 1024]",
|
| 33 |
+
"name" : "cache_channel_out",
|
| 34 |
+
"type" : "MultiArray"
|
| 35 |
+
},
|
| 36 |
+
{
|
| 37 |
+
"hasShapeFlexibility" : "0",
|
| 38 |
+
"isOptional" : "0",
|
| 39 |
+
"dataType" : "Float32",
|
| 40 |
+
"formattedType" : "MultiArray (Float32 1 × 24 × 1024 × 8)",
|
| 41 |
+
"shortDescription" : "",
|
| 42 |
+
"shape" : "[1, 24, 1024, 8]",
|
| 43 |
+
"name" : "cache_time_out",
|
| 44 |
+
"type" : "MultiArray"
|
| 45 |
+
},
|
| 46 |
+
{
|
| 47 |
+
"hasShapeFlexibility" : "0",
|
| 48 |
+
"isOptional" : "0",
|
| 49 |
+
"dataType" : "Int32",
|
| 50 |
+
"formattedType" : "MultiArray (Int32 1)",
|
| 51 |
+
"shortDescription" : "",
|
| 52 |
+
"shape" : "[1]",
|
| 53 |
+
"name" : "cache_len_out",
|
| 54 |
+
"type" : "MultiArray"
|
| 55 |
+
}
|
| 56 |
+
],
|
| 57 |
+
"modelParameters" : [
|
| 58 |
+
|
| 59 |
+
],
|
| 60 |
+
"specificationVersion" : 8,
|
| 61 |
+
"mlProgramOperationTypeHistogram" : {
|
| 62 |
+
"Ios17.logicalAnd" : 3,
|
| 63 |
+
"Ios17.reshape" : 145,
|
| 64 |
+
"Ios16.softmax" : 24,
|
| 65 |
+
"Ios17.matmul" : 72,
|
| 66 |
+
"Ios17.transpose" : 224,
|
| 67 |
+
"Split" : 24,
|
| 68 |
+
"Ios17.expandDims" : 18,
|
| 69 |
+
"Select" : 72,
|
| 70 |
+
"Ios17.add" : 180,
|
| 71 |
+
"Tile" : 8,
|
| 72 |
+
"Ios17.sliceByIndex" : 147,
|
| 73 |
+
"Ios16.sigmoid" : 24,
|
| 74 |
+
"Pad" : 27,
|
| 75 |
+
"Ios17.logicalNot" : 2,
|
| 76 |
+
"Ios17.layerNorm" : 144,
|
| 77 |
+
"Ios17.less" : 5,
|
| 78 |
+
"Ios17.sub" : 4,
|
| 79 |
+
"Ios17.conv" : 77,
|
| 80 |
+
"Ios16.relu" : 3,
|
| 81 |
+
"Ios17.clip" : 2,
|
| 82 |
+
"Ios17.linear" : 193,
|
| 83 |
+
"Ios17.greaterEqual" : 1,
|
| 84 |
+
"Ios17.floorDiv" : 3,
|
| 85 |
+
"Ios17.cast" : 12,
|
| 86 |
+
"Ios16.silu" : 72,
|
| 87 |
+
"Ios17.concat" : 72,
|
| 88 |
+
"Stack" : 2,
|
| 89 |
+
"Ios17.mul" : 106
|
| 90 |
+
},
|
| 91 |
+
"computePrecision" : "Mixed (Float32, Int32)",
|
| 92 |
+
"isUpdatable" : "0",
|
| 93 |
+
"stateSchema" : [
|
| 94 |
+
|
| 95 |
+
],
|
| 96 |
+
"availability" : {
|
| 97 |
+
"macOS" : "14.0",
|
| 98 |
+
"tvOS" : "17.0",
|
| 99 |
+
"visionOS" : "1.0",
|
| 100 |
+
"watchOS" : "10.0",
|
| 101 |
+
"iOS" : "17.0",
|
| 102 |
+
"macCatalyst" : "17.0"
|
| 103 |
+
},
|
| 104 |
+
"modelType" : {
|
| 105 |
+
"name" : "MLModelType_mlProgram"
|
| 106 |
+
},
|
| 107 |
+
"userDefinedMetadata" : {
|
| 108 |
+
"com.github.apple.coremltools.conversion_date" : "2026-01-11",
|
| 109 |
+
"com.github.apple.coremltools.source" : "torch==2.9.1",
|
| 110 |
+
"com.github.apple.coremltools.version" : "9.0",
|
| 111 |
+
"com.github.apple.coremltools.source_dialect" : "TorchScript"
|
| 112 |
+
},
|
| 113 |
+
"inputSchema" : [
|
| 114 |
+
{
|
| 115 |
+
"hasShapeFlexibility" : "0",
|
| 116 |
+
"isOptional" : "0",
|
| 117 |
+
"dataType" : "Float32",
|
| 118 |
+
"formattedType" : "MultiArray (Float32 1 × 128 × 121)",
|
| 119 |
+
"shortDescription" : "",
|
| 120 |
+
"shape" : "[1, 128, 121]",
|
| 121 |
+
"name" : "mel",
|
| 122 |
+
"type" : "MultiArray"
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"hasShapeFlexibility" : "0",
|
| 126 |
+
"isOptional" : "0",
|
| 127 |
+
"dataType" : "Int32",
|
| 128 |
+
"formattedType" : "MultiArray (Int32 1)",
|
| 129 |
+
"shortDescription" : "",
|
| 130 |
+
"shape" : "[1]",
|
| 131 |
+
"name" : "mel_length",
|
| 132 |
+
"type" : "MultiArray"
|
| 133 |
+
},
|
| 134 |
+
{
|
| 135 |
+
"hasShapeFlexibility" : "0",
|
| 136 |
+
"isOptional" : "0",
|
| 137 |
+
"dataType" : "Float32",
|
| 138 |
+
"formattedType" : "MultiArray (Float32 1 × 24 × 70 × 1024)",
|
| 139 |
+
"shortDescription" : "",
|
| 140 |
+
"shape" : "[1, 24, 70, 1024]",
|
| 141 |
+
"name" : "cache_channel",
|
| 142 |
+
"type" : "MultiArray"
|
| 143 |
+
},
|
| 144 |
+
{
|
| 145 |
+
"hasShapeFlexibility" : "0",
|
| 146 |
+
"isOptional" : "0",
|
| 147 |
+
"dataType" : "Float32",
|
| 148 |
+
"formattedType" : "MultiArray (Float32 1 × 24 × 1024 × 8)",
|
| 149 |
+
"shortDescription" : "",
|
| 150 |
+
"shape" : "[1, 24, 1024, 8]",
|
| 151 |
+
"name" : "cache_time",
|
| 152 |
+
"type" : "MultiArray"
|
| 153 |
+
},
|
| 154 |
+
{
|
| 155 |
+
"hasShapeFlexibility" : "0",
|
| 156 |
+
"isOptional" : "0",
|
| 157 |
+
"dataType" : "Int32",
|
| 158 |
+
"formattedType" : "MultiArray (Int32 1)",
|
| 159 |
+
"shortDescription" : "",
|
| 160 |
+
"shape" : "[1]",
|
| 161 |
+
"name" : "cache_len",
|
| 162 |
+
"type" : "MultiArray"
|
| 163 |
+
}
|
| 164 |
+
],
|
| 165 |
+
"generatedClassName" : "encoder_float32",
|
| 166 |
+
"method" : "predict"
|
| 167 |
+
}
|
| 168 |
+
]
|
models/encoder/encoder_float32.mlmodelc/model.mil
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
models/encoder/encoder_float32.mlmodelc/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:97ea4cb0d81aded8be4a2fbc8cbfb0621b77bd4a87c2d286d7cdf63a0f1d3e71
|
| 3 |
+
size 2352382336
|
models/encoder/encoder_float32.mlpackage/Data/com.apple.CoreML/model.mlmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a055d64d270aa66f578b06e238a189ea0e4d299ae4cbc484c8e1ea9b6256914b
|
| 3 |
+
size 640913
|
models/encoder/encoder_float32.mlpackage/Data/com.apple.CoreML/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:97ea4cb0d81aded8be4a2fbc8cbfb0621b77bd4a87c2d286d7cdf63a0f1d3e71
|
| 3 |
+
size 2352382336
|
models/encoder/encoder_float32.mlpackage/Manifest.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"fileFormatVersion": "1.0.0",
|
| 3 |
+
"itemInfoEntries": {
|
| 4 |
+
"0AA85F5B-F286-49A1-9DD7-1F815799BAC6": {
|
| 5 |
+
"author": "com.apple.CoreML",
|
| 6 |
+
"description": "CoreML Model Specification",
|
| 7 |
+
"name": "model.mlmodel",
|
| 8 |
+
"path": "com.apple.CoreML/model.mlmodel"
|
| 9 |
+
},
|
| 10 |
+
"E3E8C673-979B-4A80-9FB0-0C4D8418551A": {
|
| 11 |
+
"author": "com.apple.CoreML",
|
| 12 |
+
"description": "CoreML Model Weights",
|
| 13 |
+
"name": "weights",
|
| 14 |
+
"path": "com.apple.CoreML/weights"
|
| 15 |
+
}
|
| 16 |
+
},
|
| 17 |
+
"rootModelIdentifier": "0AA85F5B-F286-49A1-9DD7-1F815799BAC6"
|
| 18 |
+
}
|
models/encoder/encoder_int8.mlmodelc/analytics/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aa63528a4d283b724b63a62c78e2a179e4dc16c61d819131b0a0ac26518d342f
|
| 3 |
+
size 243
|
models/encoder/encoder_int8.mlmodelc/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7d4b6aa7cb0c9c89c59ab979b7f1ce0688688db66eb117b53f2bd0a5caa61a53
|
| 3 |
+
size 669
|
models/encoder/encoder_int8.mlmodelc/metadata.json
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"metadataOutputVersion" : "3.0",
|
| 4 |
+
"shortDescription" : "Nemotron Streaming Encoder (int8 quantized)",
|
| 5 |
+
"outputSchema" : [
|
| 6 |
+
{
|
| 7 |
+
"hasShapeFlexibility" : "0",
|
| 8 |
+
"isOptional" : "0",
|
| 9 |
+
"dataType" : "Float32",
|
| 10 |
+
"formattedType" : "MultiArray (Float32 1 × 1024 × 14)",
|
| 11 |
+
"shortDescription" : "",
|
| 12 |
+
"shape" : "[1, 1024, 14]",
|
| 13 |
+
"name" : "encoded",
|
| 14 |
+
"type" : "MultiArray"
|
| 15 |
+
},
|
| 16 |
+
{
|
| 17 |
+
"hasShapeFlexibility" : "0",
|
| 18 |
+
"isOptional" : "0",
|
| 19 |
+
"dataType" : "Int32",
|
| 20 |
+
"formattedType" : "MultiArray (Int32 1)",
|
| 21 |
+
"shortDescription" : "",
|
| 22 |
+
"shape" : "[1]",
|
| 23 |
+
"name" : "encoded_length",
|
| 24 |
+
"type" : "MultiArray"
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"hasShapeFlexibility" : "0",
|
| 28 |
+
"isOptional" : "0",
|
| 29 |
+
"dataType" : "Float32",
|
| 30 |
+
"formattedType" : "MultiArray (Float32 1 × 24 × 70 × 1024)",
|
| 31 |
+
"shortDescription" : "",
|
| 32 |
+
"shape" : "[1, 24, 70, 1024]",
|
| 33 |
+
"name" : "cache_channel_out",
|
| 34 |
+
"type" : "MultiArray"
|
| 35 |
+
},
|
| 36 |
+
{
|
| 37 |
+
"hasShapeFlexibility" : "0",
|
| 38 |
+
"isOptional" : "0",
|
| 39 |
+
"dataType" : "Float32",
|
| 40 |
+
"formattedType" : "MultiArray (Float32 1 × 24 × 1024 × 8)",
|
| 41 |
+
"shortDescription" : "",
|
| 42 |
+
"shape" : "[1, 24, 1024, 8]",
|
| 43 |
+
"name" : "cache_time_out",
|
| 44 |
+
"type" : "MultiArray"
|
| 45 |
+
},
|
| 46 |
+
{
|
| 47 |
+
"hasShapeFlexibility" : "0",
|
| 48 |
+
"isOptional" : "0",
|
| 49 |
+
"dataType" : "Int32",
|
| 50 |
+
"formattedType" : "MultiArray (Int32 1)",
|
| 51 |
+
"shortDescription" : "",
|
| 52 |
+
"shape" : "[1]",
|
| 53 |
+
"name" : "cache_len_out",
|
| 54 |
+
"type" : "MultiArray"
|
| 55 |
+
}
|
| 56 |
+
],
|
| 57 |
+
"storagePrecision" : "Mixed (Float32, Int8)",
|
| 58 |
+
"modelParameters" : [
|
| 59 |
+
|
| 60 |
+
],
|
| 61 |
+
"author" : "Fluid Inference",
|
| 62 |
+
"specificationVersion" : 8,
|
| 63 |
+
"mlProgramOperationTypeHistogram" : {
|
| 64 |
+
"Ios17.logicalAnd" : 3,
|
| 65 |
+
"Ios17.reshape" : 145,
|
| 66 |
+
"Ios16.softmax" : 24,
|
| 67 |
+
"Ios17.matmul" : 72,
|
| 68 |
+
"Ios17.transpose" : 224,
|
| 69 |
+
"Split" : 24,
|
| 70 |
+
"Ios17.expandDims" : 18,
|
| 71 |
+
"Select" : 72,
|
| 72 |
+
"Ios17.add" : 180,
|
| 73 |
+
"Tile" : 8,
|
| 74 |
+
"Ios17.sliceByIndex" : 147,
|
| 75 |
+
"Ios16.sigmoid" : 24,
|
| 76 |
+
"Pad" : 27,
|
| 77 |
+
"Ios17.logicalNot" : 2,
|
| 78 |
+
"Ios17.layerNorm" : 144,
|
| 79 |
+
"Ios16.constexprAffineDequantize" : 294,
|
| 80 |
+
"Ios17.less" : 5,
|
| 81 |
+
"Ios17.sub" : 4,
|
| 82 |
+
"Ios17.conv" : 77,
|
| 83 |
+
"Ios16.relu" : 3,
|
| 84 |
+
"Ios17.clip" : 2,
|
| 85 |
+
"Ios17.linear" : 193,
|
| 86 |
+
"Ios17.greaterEqual" : 1,
|
| 87 |
+
"Ios17.floorDiv" : 3,
|
| 88 |
+
"Ios17.cast" : 12,
|
| 89 |
+
"Ios16.silu" : 72,
|
| 90 |
+
"Ios17.concat" : 72,
|
| 91 |
+
"Stack" : 2,
|
| 92 |
+
"Ios17.mul" : 106
|
| 93 |
+
},
|
| 94 |
+
"computePrecision" : "Mixed (Float32, Int32)",
|
| 95 |
+
"isUpdatable" : "0",
|
| 96 |
+
"stateSchema" : [
|
| 97 |
+
|
| 98 |
+
],
|
| 99 |
+
"availability" : {
|
| 100 |
+
"macOS" : "14.0",
|
| 101 |
+
"tvOS" : "17.0",
|
| 102 |
+
"visionOS" : "1.0",
|
| 103 |
+
"watchOS" : "10.0",
|
| 104 |
+
"iOS" : "17.0",
|
| 105 |
+
"macCatalyst" : "17.0"
|
| 106 |
+
},
|
| 107 |
+
"modelType" : {
|
| 108 |
+
"name" : "MLModelType_mlProgram"
|
| 109 |
+
},
|
| 110 |
+
"inputSchema" : [
|
| 111 |
+
{
|
| 112 |
+
"hasShapeFlexibility" : "0",
|
| 113 |
+
"isOptional" : "0",
|
| 114 |
+
"dataType" : "Float32",
|
| 115 |
+
"formattedType" : "MultiArray (Float32 1 × 128 × 121)",
|
| 116 |
+
"shortDescription" : "",
|
| 117 |
+
"shape" : "[1, 128, 121]",
|
| 118 |
+
"name" : "mel",
|
| 119 |
+
"type" : "MultiArray"
|
| 120 |
+
},
|
| 121 |
+
{
|
| 122 |
+
"hasShapeFlexibility" : "0",
|
| 123 |
+
"isOptional" : "0",
|
| 124 |
+
"dataType" : "Int32",
|
| 125 |
+
"formattedType" : "MultiArray (Int32 1)",
|
| 126 |
+
"shortDescription" : "",
|
| 127 |
+
"shape" : "[1]",
|
| 128 |
+
"name" : "mel_length",
|
| 129 |
+
"type" : "MultiArray"
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"hasShapeFlexibility" : "0",
|
| 133 |
+
"isOptional" : "0",
|
| 134 |
+
"dataType" : "Float32",
|
| 135 |
+
"formattedType" : "MultiArray (Float32 1 × 24 × 70 × 1024)",
|
| 136 |
+
"shortDescription" : "",
|
| 137 |
+
"shape" : "[1, 24, 70, 1024]",
|
| 138 |
+
"name" : "cache_channel",
|
| 139 |
+
"type" : "MultiArray"
|
| 140 |
+
},
|
| 141 |
+
{
|
| 142 |
+
"hasShapeFlexibility" : "0",
|
| 143 |
+
"isOptional" : "0",
|
| 144 |
+
"dataType" : "Float32",
|
| 145 |
+
"formattedType" : "MultiArray (Float32 1 × 24 × 1024 × 8)",
|
| 146 |
+
"shortDescription" : "",
|
| 147 |
+
"shape" : "[1, 24, 1024, 8]",
|
| 148 |
+
"name" : "cache_time",
|
| 149 |
+
"type" : "MultiArray"
|
| 150 |
+
},
|
| 151 |
+
{
|
| 152 |
+
"hasShapeFlexibility" : "0",
|
| 153 |
+
"isOptional" : "0",
|
| 154 |
+
"dataType" : "Int32",
|
| 155 |
+
"formattedType" : "MultiArray (Int32 1)",
|
| 156 |
+
"shortDescription" : "",
|
| 157 |
+
"shape" : "[1]",
|
| 158 |
+
"name" : "cache_len",
|
| 159 |
+
"type" : "MultiArray"
|
| 160 |
+
}
|
| 161 |
+
],
|
| 162 |
+
"userDefinedMetadata" : {
|
| 163 |
+
"com.github.apple.coremltools.conversion_date" : "2026-01-11",
|
| 164 |
+
"com.github.apple.coremltools.source" : "torch==2.9.1",
|
| 165 |
+
"com.github.apple.coremltools.version" : "9.0",
|
| 166 |
+
"com.github.apple.coremltools.source_dialect" : "TorchScript"
|
| 167 |
+
},
|
| 168 |
+
"generatedClassName" : "encoder_int8",
|
| 169 |
+
"method" : "predict"
|
| 170 |
+
}
|
| 171 |
+
]
|
models/encoder/encoder_int8.mlmodelc/model.mil
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
models/encoder/encoder_int8.mlmodelc/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a3f944f82ea23f5642118b8a660d5969d6ebbb779bb2e9890c0eb46b5042d7e2
|
| 3 |
+
size 591463516
|
models/encoder/encoder_int8.mlpackage/Data/com.apple.CoreML/model.mlmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6e52e534bd7638bdaa3ccf7cfb8625be1aa71de1337954a54f9bd8295b3f9b1d
|
| 3 |
+
size 707354
|
models/encoder/encoder_int8.mlpackage/Data/com.apple.CoreML/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a3f944f82ea23f5642118b8a660d5969d6ebbb779bb2e9890c0eb46b5042d7e2
|
| 3 |
+
size 591463516
|
models/encoder/encoder_int8.mlpackage/Manifest.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"fileFormatVersion": "1.0.0",
|
| 3 |
+
"itemInfoEntries": {
|
| 4 |
+
"7F7741C1-C7D8-4BE3-B8EC-4E0951D7E1E7": {
|
| 5 |
+
"author": "com.apple.CoreML",
|
| 6 |
+
"description": "CoreML Model Weights",
|
| 7 |
+
"name": "weights",
|
| 8 |
+
"path": "com.apple.CoreML/weights"
|
| 9 |
+
},
|
| 10 |
+
"E4F8549F-5A90-413F-ABFB-09C4684D9BB5": {
|
| 11 |
+
"author": "com.apple.CoreML",
|
| 12 |
+
"description": "CoreML Model Specification",
|
| 13 |
+
"name": "model.mlmodel",
|
| 14 |
+
"path": "com.apple.CoreML/model.mlmodel"
|
| 15 |
+
}
|
| 16 |
+
},
|
| 17 |
+
"rootModelIdentifier": "E4F8549F-5A90-413F-ABFB-09C4684D9BB5"
|
| 18 |
+
}
|
models/joint.mlmodelc/analytics/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4b0c121a00eec47e7b3c60afad3c8237c623ab3ddef1230360bf55615f82e3ca
|
| 3 |
+
size 243
|
models/joint.mlmodelc/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2c83c073a351da6afa49aa3c9b8e22d3f62951a01b92a67746c88d23500c24dd
|
| 3 |
+
size 400
|
models/joint.mlmodelc/metadata.json
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"metadataOutputVersion" : "3.0",
|
| 4 |
+
"storagePrecision" : "Float32",
|
| 5 |
+
"outputSchema" : [
|
| 6 |
+
{
|
| 7 |
+
"hasShapeFlexibility" : "0",
|
| 8 |
+
"isOptional" : "0",
|
| 9 |
+
"dataType" : "Float32",
|
| 10 |
+
"formattedType" : "MultiArray (Float32 1 × 1 × 1 × 1025)",
|
| 11 |
+
"shortDescription" : "",
|
| 12 |
+
"shape" : "[1, 1, 1, 1025]",
|
| 13 |
+
"name" : "logits",
|
| 14 |
+
"type" : "MultiArray"
|
| 15 |
+
}
|
| 16 |
+
],
|
| 17 |
+
"modelParameters" : [
|
| 18 |
+
|
| 19 |
+
],
|
| 20 |
+
"specificationVersion" : 8,
|
| 21 |
+
"mlProgramOperationTypeHistogram" : {
|
| 22 |
+
"Ios17.expandDims" : 2,
|
| 23 |
+
"Ios17.transpose" : 2,
|
| 24 |
+
"Ios17.linear" : 3,
|
| 25 |
+
"Ios17.add" : 1,
|
| 26 |
+
"Ios16.relu" : 1
|
| 27 |
+
},
|
| 28 |
+
"computePrecision" : "Mixed (Float32, Int32)",
|
| 29 |
+
"isUpdatable" : "0",
|
| 30 |
+
"stateSchema" : [
|
| 31 |
+
|
| 32 |
+
],
|
| 33 |
+
"availability" : {
|
| 34 |
+
"macOS" : "14.0",
|
| 35 |
+
"tvOS" : "17.0",
|
| 36 |
+
"visionOS" : "1.0",
|
| 37 |
+
"watchOS" : "10.0",
|
| 38 |
+
"iOS" : "17.0",
|
| 39 |
+
"macCatalyst" : "17.0"
|
| 40 |
+
},
|
| 41 |
+
"modelType" : {
|
| 42 |
+
"name" : "MLModelType_mlProgram"
|
| 43 |
+
},
|
| 44 |
+
"userDefinedMetadata" : {
|
| 45 |
+
"com.github.apple.coremltools.conversion_date" : "2026-01-11",
|
| 46 |
+
"com.github.apple.coremltools.source" : "torch==2.9.1",
|
| 47 |
+
"com.github.apple.coremltools.version" : "9.0",
|
| 48 |
+
"com.github.apple.coremltools.source_dialect" : "TorchScript"
|
| 49 |
+
},
|
| 50 |
+
"inputSchema" : [
|
| 51 |
+
{
|
| 52 |
+
"hasShapeFlexibility" : "0",
|
| 53 |
+
"isOptional" : "0",
|
| 54 |
+
"dataType" : "Float32",
|
| 55 |
+
"formattedType" : "MultiArray (Float32 1 × 1024 × 1)",
|
| 56 |
+
"shortDescription" : "",
|
| 57 |
+
"shape" : "[1, 1024, 1]",
|
| 58 |
+
"name" : "encoder",
|
| 59 |
+
"type" : "MultiArray"
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"hasShapeFlexibility" : "0",
|
| 63 |
+
"isOptional" : "0",
|
| 64 |
+
"dataType" : "Float32",
|
| 65 |
+
"formattedType" : "MultiArray (Float32 1 × 640 × 1)",
|
| 66 |
+
"shortDescription" : "",
|
| 67 |
+
"shape" : "[1, 640, 1]",
|
| 68 |
+
"name" : "decoder",
|
| 69 |
+
"type" : "MultiArray"
|
| 70 |
+
}
|
| 71 |
+
],
|
| 72 |
+
"generatedClassName" : "joint",
|
| 73 |
+
"method" : "predict"
|
| 74 |
+
}
|
| 75 |
+
]
|
models/joint.mlmodelc/model.mil
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
program(1.0)
|
| 2 |
+
[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.9.1"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "9.0"}})]
|
| 3 |
+
{
|
| 4 |
+
func main<ios17>(tensor<fp32, [1, 640, 1]> decoder, tensor<fp32, [1, 1024, 1]> encoder) {
|
| 5 |
+
tensor<fp32, [640]> module_enc_bias = const()[name = tensor<string, []>("module_enc_bias"), val = tensor<fp32, [640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
|
| 6 |
+
tensor<fp32, [640, 1024]> module_enc_weight = const()[name = tensor<string, []>("module_enc_weight"), val = tensor<fp32, [640, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(2688)))];
|
| 7 |
+
tensor<fp32, [640]> module_pred_bias = const()[name = tensor<string, []>("module_pred_bias"), val = tensor<fp32, [640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(2624192)))];
|
| 8 |
+
tensor<fp32, [640, 640]> module_pred_weight = const()[name = tensor<string, []>("module_pred_weight"), val = tensor<fp32, [640, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(2626816)))];
|
| 9 |
+
tensor<fp32, [1025]> module_joint_net_2_bias = const()[name = tensor<string, []>("module_joint_net_2_bias"), val = tensor<fp32, [1025]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4265280)))];
|
| 10 |
+
tensor<fp32, [1025, 640]> module_joint_net_2_weight = const()[name = tensor<string, []>("module_joint_net_2_weight"), val = tensor<fp32, [1025, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4269504)))];
|
| 11 |
+
tensor<int32, [3]> input_1_perm_0 = const()[name = tensor<string, []>("input_1_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
|
| 12 |
+
tensor<int32, [3]> input_3_perm_0 = const()[name = tensor<string, []>("input_3_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
|
| 13 |
+
tensor<fp32, [1, 1, 1024]> input_1 = transpose(perm = input_1_perm_0, x = encoder)[name = tensor<string, []>("transpose_1")];
|
| 14 |
+
tensor<fp32, [1, 1, 640]> enc_proj = linear(bias = module_enc_bias, weight = module_enc_weight, x = input_1)[name = tensor<string, []>("linear_0")];
|
| 15 |
+
tensor<fp32, [1, 1, 640]> input_3 = transpose(perm = input_3_perm_0, x = decoder)[name = tensor<string, []>("transpose_0")];
|
| 16 |
+
tensor<fp32, [1, 1, 640]> dec_proj = linear(bias = module_pred_bias, weight = module_pred_weight, x = input_3)[name = tensor<string, []>("linear_1")];
|
| 17 |
+
tensor<int32, [1]> var_23_axes_0 = const()[name = tensor<string, []>("op_23_axes_0"), val = tensor<int32, [1]>([2])];
|
| 18 |
+
tensor<fp32, [1, 1, 1, 640]> var_23 = expand_dims(axes = var_23_axes_0, x = enc_proj)[name = tensor<string, []>("op_23")];
|
| 19 |
+
tensor<int32, [1]> var_25_axes_0 = const()[name = tensor<string, []>("op_25_axes_0"), val = tensor<int32, [1]>([1])];
|
| 20 |
+
tensor<fp32, [1, 1, 1, 640]> var_25 = expand_dims(axes = var_25_axes_0, x = dec_proj)[name = tensor<string, []>("op_25")];
|
| 21 |
+
tensor<fp32, [1, 1, 1, 640]> input_5 = add(x = var_23, y = var_25)[name = tensor<string, []>("input_5")];
|
| 22 |
+
tensor<fp32, [1, 1, 1, 640]> input_7 = relu(x = input_5)[name = tensor<string, []>("input_7")];
|
| 23 |
+
tensor<fp32, [1, 1, 1, 1025]> logits = linear(bias = module_joint_net_2_bias, weight = module_joint_net_2_weight, x = input_7)[name = tensor<string, []>("linear_2")];
|
| 24 |
+
} -> (logits);
|
| 25 |
+
}
|
models/joint.mlmodelc/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1a578e03371ba424fc0c426b34857d4f8646020bf60d8a329c759fe36e430cf1
|
| 3 |
+
size 6893568
|
models/joint.mlpackage/Data/com.apple.CoreML/model.mlmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b3d03b858b4ae47ce1b6afdc775be8bf8c649dccc7836d6a1db3dc4a606ff400
|
| 3 |
+
size 3326
|
models/joint.mlpackage/Data/com.apple.CoreML/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1a578e03371ba424fc0c426b34857d4f8646020bf60d8a329c759fe36e430cf1
|
| 3 |
+
size 6893568
|
models/joint.mlpackage/Manifest.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"fileFormatVersion": "1.0.0",
|
| 3 |
+
"itemInfoEntries": {
|
| 4 |
+
"37CC94B4-E114-47FE-9862-7A63DD114FF2": {
|
| 5 |
+
"author": "com.apple.CoreML",
|
| 6 |
+
"description": "CoreML Model Specification",
|
| 7 |
+
"name": "model.mlmodel",
|
| 8 |
+
"path": "com.apple.CoreML/model.mlmodel"
|
| 9 |
+
},
|
| 10 |
+
"A81F6D51-BC94-457E-AA1C-CC93B9D57D96": {
|
| 11 |
+
"author": "com.apple.CoreML",
|
| 12 |
+
"description": "CoreML Model Weights",
|
| 13 |
+
"name": "weights",
|
| 14 |
+
"path": "com.apple.CoreML/weights"
|
| 15 |
+
}
|
| 16 |
+
},
|
| 17 |
+
"rootModelIdentifier": "37CC94B4-E114-47FE-9862-7A63DD114FF2"
|
| 18 |
+
}
|
models/metadata.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": "nvidia/nemotron-speech-streaming-en-0.6b",
|
| 3 |
+
"sample_rate": 16000,
|
| 4 |
+
"chunk_mel_frames": 112,
|
| 5 |
+
"pre_encode_cache": 9,
|
| 6 |
+
"total_mel_frames": 121,
|
| 7 |
+
"vocab_size": 1024,
|
| 8 |
+
"blank_idx": 1024,
|
| 9 |
+
"cache_channel_shape": [
|
| 10 |
+
1,
|
| 11 |
+
24,
|
| 12 |
+
70,
|
| 13 |
+
1024
|
| 14 |
+
],
|
| 15 |
+
"cache_time_shape": [
|
| 16 |
+
1,
|
| 17 |
+
24,
|
| 18 |
+
1024,
|
| 19 |
+
8
|
| 20 |
+
],
|
| 21 |
+
"decoder_hidden": 640,
|
| 22 |
+
"decoder_layers": 2
|
| 23 |
+
}
|
models/preprocessor.mlmodelc/analytics/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f8ab0fe255177fdc29f9a59582bfc1d328d26da8e40717f1e3fa14e90b814419
|
| 3 |
+
size 243
|
models/preprocessor.mlmodelc/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8c0a4b8a4452288e4dcc4a7d59eca80470641eae784a4f7ad2228785b53a07b7
|
| 3 |
+
size 430
|
models/preprocessor.mlmodelc/metadata.json
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"metadataOutputVersion" : "3.0",
|
| 4 |
+
"storagePrecision" : "Float32",
|
| 5 |
+
"outputSchema" : [
|
| 6 |
+
{
|
| 7 |
+
"hasShapeFlexibility" : "0",
|
| 8 |
+
"isOptional" : "0",
|
| 9 |
+
"dataType" : "Float32",
|
| 10 |
+
"formattedType" : "MultiArray (Float32)",
|
| 11 |
+
"shortDescription" : "",
|
| 12 |
+
"shape" : "[]",
|
| 13 |
+
"name" : "mel",
|
| 14 |
+
"type" : "MultiArray"
|
| 15 |
+
},
|
| 16 |
+
{
|
| 17 |
+
"hasShapeFlexibility" : "0",
|
| 18 |
+
"isOptional" : "0",
|
| 19 |
+
"dataType" : "Int32",
|
| 20 |
+
"formattedType" : "MultiArray (Int32 1)",
|
| 21 |
+
"shortDescription" : "",
|
| 22 |
+
"shape" : "[1]",
|
| 23 |
+
"name" : "mel_length",
|
| 24 |
+
"type" : "MultiArray"
|
| 25 |
+
}
|
| 26 |
+
],
|
| 27 |
+
"modelParameters" : [
|
| 28 |
+
|
| 29 |
+
],
|
| 30 |
+
"specificationVersion" : 8,
|
| 31 |
+
"mlProgramOperationTypeHistogram" : {
|
| 32 |
+
"Range1d" : 2,
|
| 33 |
+
"Ios17.equal" : 1,
|
| 34 |
+
"Ios17.reshape" : 2,
|
| 35 |
+
"Identity" : 1,
|
| 36 |
+
"Ios17.matmul" : 1,
|
| 37 |
+
"Select" : 3,
|
| 38 |
+
"Ios17.expandDims" : 7,
|
| 39 |
+
"Ios17.add" : 2,
|
| 40 |
+
"Ios17.sliceByIndex" : 3,
|
| 41 |
+
"Ios16.reduceSum" : 1,
|
| 42 |
+
"Shape" : 2,
|
| 43 |
+
"Ios17.gather" : 2,
|
| 44 |
+
"Ios17.logicalNot" : 1,
|
| 45 |
+
"Pad" : 1,
|
| 46 |
+
"Ios17.log" : 1,
|
| 47 |
+
"Ios17.less" : 1,
|
| 48 |
+
"Ios17.sub" : 2,
|
| 49 |
+
"Ios17.conv" : 2,
|
| 50 |
+
"Ios17.pow" : 1,
|
| 51 |
+
"Ios17.concat" : 1,
|
| 52 |
+
"Stack" : 1,
|
| 53 |
+
"Ios17.floorDiv" : 1,
|
| 54 |
+
"Ios17.greaterEqual" : 1,
|
| 55 |
+
"Ios17.mul" : 1
|
| 56 |
+
},
|
| 57 |
+
"computePrecision" : "Mixed (Float32, Int32)",
|
| 58 |
+
"isUpdatable" : "0",
|
| 59 |
+
"stateSchema" : [
|
| 60 |
+
|
| 61 |
+
],
|
| 62 |
+
"availability" : {
|
| 63 |
+
"macOS" : "14.0",
|
| 64 |
+
"tvOS" : "17.0",
|
| 65 |
+
"visionOS" : "1.0",
|
| 66 |
+
"watchOS" : "10.0",
|
| 67 |
+
"iOS" : "17.0",
|
| 68 |
+
"macCatalyst" : "17.0"
|
| 69 |
+
},
|
| 70 |
+
"modelType" : {
|
| 71 |
+
"name" : "MLModelType_mlProgram"
|
| 72 |
+
},
|
| 73 |
+
"userDefinedMetadata" : {
|
| 74 |
+
"com.github.apple.coremltools.conversion_date" : "2026-01-11",
|
| 75 |
+
"com.github.apple.coremltools.source" : "torch==2.9.1",
|
| 76 |
+
"com.github.apple.coremltools.version" : "9.0",
|
| 77 |
+
"com.github.apple.coremltools.source_dialect" : "TorchScript"
|
| 78 |
+
},
|
| 79 |
+
"inputSchema" : [
|
| 80 |
+
{
|
| 81 |
+
"dataType" : "Float32",
|
| 82 |
+
"hasShapeFlexibility" : "1",
|
| 83 |
+
"isOptional" : "0",
|
| 84 |
+
"shapeFlexibility" : "1 × 1...480000",
|
| 85 |
+
"shapeRange" : "[[1, 1], [1, 480000]]",
|
| 86 |
+
"formattedType" : "MultiArray (Float32 1 × 1)",
|
| 87 |
+
"type" : "MultiArray",
|
| 88 |
+
"shape" : "[1, 1]",
|
| 89 |
+
"name" : "audio",
|
| 90 |
+
"shortDescription" : ""
|
| 91 |
+
},
|
| 92 |
+
{
|
| 93 |
+
"hasShapeFlexibility" : "0",
|
| 94 |
+
"isOptional" : "0",
|
| 95 |
+
"dataType" : "Int32",
|
| 96 |
+
"formattedType" : "MultiArray (Int32 1)",
|
| 97 |
+
"shortDescription" : "",
|
| 98 |
+
"shape" : "[1]",
|
| 99 |
+
"name" : "audio_length",
|
| 100 |
+
"type" : "MultiArray"
|
| 101 |
+
}
|
| 102 |
+
],
|
| 103 |
+
"generatedClassName" : "preprocessor",
|
| 104 |
+
"method" : "predict"
|
| 105 |
+
}
|
| 106 |
+
]
|
models/preprocessor.mlmodelc/model.mil
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
program(1.0)
|
| 2 |
+
[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.9.1"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "9.0"}})]
|
| 3 |
+
{
|
| 4 |
+
func main<ios17>(tensor<fp32, [1, ?]> audio, tensor<int32, [1]> audio_length) [FlexibleShapeInformation = tuple<tuple<tensor<string, []>, dict<tensor<string, []>, tensor<int32, [?]>>>, tuple<tensor<string, []>, dict<tensor<string, []>, list<tensor<int32, [2]>, ?>>>>((("DefaultShapes", {{"audio", [1, 1]}}), ("RangeDims", {{"audio", [[1, 1], [1, 480000]]}})))] {
|
| 5 |
+
tensor<int32, []> var_9 = const()[name = tensor<string, []>("op_9"), val = tensor<int32, []>(1)];
|
| 6 |
+
tensor<int32, []> var_10 = const()[name = tensor<string, []>("op_10"), val = tensor<int32, []>(160)];
|
| 7 |
+
tensor<int32, []> var_12 = const()[name = tensor<string, []>("op_12"), val = tensor<int32, []>(0)];
|
| 8 |
+
tensor<fp32, []> var_16 = const()[name = tensor<string, []>("op_16"), val = tensor<fp32, []>(0x0p+0)];
|
| 9 |
+
tensor<int32, []> var_33 = const()[name = tensor<string, []>("op_33"), val = tensor<int32, []>(512)];
|
| 10 |
+
tensor<int32, [1]> var_34 = add(x = audio_length, y = var_33)[name = tensor<string, []>("op_34")];
|
| 11 |
+
tensor<int32, []> var_35 = const()[name = tensor<string, []>("op_35"), val = tensor<int32, []>(512)];
|
| 12 |
+
tensor<int32, [1]> var_36 = sub(x = var_34, y = var_35)[name = tensor<string, []>("op_36")];
|
| 13 |
+
tensor<int32, [1]> floor_div_0 = floor_div(x = var_36, y = var_10)[name = tensor<string, []>("floor_div_0")];
|
| 14 |
+
tensor<bool, [1]> var_39 = equal(x = audio_length, y = var_12)[name = tensor<string, []>("op_39")];
|
| 15 |
+
tensor<int32, [1]> var_40 = const()[name = tensor<string, []>("op_40"), val = tensor<int32, [1]>([0])];
|
| 16 |
+
tensor<int32, [1]> mel_length = select(a = var_40, b = floor_div_0, cond = var_39)[name = tensor<string, []>("seq_len")];
|
| 17 |
+
tensor<int32, [2]> var_42_shape = shape(x = audio)[name = tensor<string, []>("op_42_shape")];
|
| 18 |
+
tensor<int32, []> gather_0_batch_dims_0 = const()[name = tensor<string, []>("gather_0_batch_dims_0"), val = tensor<int32, []>(0)];
|
| 19 |
+
tensor<bool, []> gather_0_validate_indices_0 = const()[name = tensor<string, []>("gather_0_validate_indices_0"), val = tensor<bool, []>(false)];
|
| 20 |
+
tensor<int32, []> select_0 = const()[name = tensor<string, []>("select_0"), val = tensor<int32, []>(1)];
|
| 21 |
+
tensor<int32, []> gather_0_axis_1 = const()[name = tensor<string, []>("gather_0_axis_1"), val = tensor<int32, []>(0)];
|
| 22 |
+
tensor<int32, []> gather_0 = gather(axis = gather_0_axis_1, batch_dims = gather_0_batch_dims_0, indices = select_0, validate_indices = gather_0_validate_indices_0, x = var_42_shape)[name = tensor<string, []>("gather_0")];
|
| 23 |
+
tensor<int32, []> const_0 = const()[name = tensor<string, []>("const_0"), val = tensor<int32, []>(0)];
|
| 24 |
+
tensor<int32, []> const_1 = const()[name = tensor<string, []>("const_1"), val = tensor<int32, []>(1)];
|
| 25 |
+
tensor<int32, [?]> var_43 = range_1d(end = gather_0, start = const_0, step = const_1)[name = tensor<string, []>("op_43")];
|
| 26 |
+
tensor<int32, [1]> var_44_axes_0 = const()[name = tensor<string, []>("op_44_axes_0"), val = tensor<int32, [1]>([0])];
|
| 27 |
+
tensor<int32, [1, ?]> var_44 = expand_dims(axes = var_44_axes_0, x = var_43)[name = tensor<string, []>("op_44")];
|
| 28 |
+
tensor<int32, [1]> var_45_axes_0 = const()[name = tensor<string, []>("op_45_axes_0"), val = tensor<int32, [1]>([1])];
|
| 29 |
+
tensor<int32, [1, 1]> var_45 = expand_dims(axes = var_45_axes_0, x = audio_length)[name = tensor<string, []>("op_45")];
|
| 30 |
+
tensor<bool, [1, ?]> timemask = less(x = var_44, y = var_45)[name = tensor<string, []>("timemask")];
|
| 31 |
+
tensor<int32, [2]> var_48_begin_0 = const()[name = tensor<string, []>("op_48_begin_0"), val = tensor<int32, [2]>([0, 0])];
|
| 32 |
+
tensor<int32, [2]> var_48_end_0 = const()[name = tensor<string, []>("op_48_end_0"), val = tensor<int32, [2]>([1, 1])];
|
| 33 |
+
tensor<bool, [2]> var_48_end_mask_0 = const()[name = tensor<string, []>("op_48_end_mask_0"), val = tensor<bool, [2]>([true, false])];
|
| 34 |
+
tensor<bool, [2]> var_48_squeeze_mask_0 = const()[name = tensor<string, []>("op_48_squeeze_mask_0"), val = tensor<bool, [2]>([false, true])];
|
| 35 |
+
tensor<fp32, [1]> var_48 = slice_by_index(begin = var_48_begin_0, end = var_48_end_0, end_mask = var_48_end_mask_0, squeeze_mask = var_48_squeeze_mask_0, x = audio)[name = tensor<string, []>("op_48")];
|
| 36 |
+
tensor<int32, [1]> var_49_axes_0 = const()[name = tensor<string, []>("op_49_axes_0"), val = tensor<int32, [1]>([1])];
|
| 37 |
+
tensor<fp32, [1, 1]> var_49 = expand_dims(axes = var_49_axes_0, x = var_48)[name = tensor<string, []>("op_49")];
|
| 38 |
+
tensor<int32, [2]> var_51_begin_0 = const()[name = tensor<string, []>("op_51_begin_0"), val = tensor<int32, [2]>([0, 1])];
|
| 39 |
+
tensor<int32, [2]> var_51_end_0 = const()[name = tensor<string, []>("op_51_end_0"), val = tensor<int32, [2]>([1, 0])];
|
| 40 |
+
tensor<bool, [2]> var_51_end_mask_0 = const()[name = tensor<string, []>("op_51_end_mask_0"), val = tensor<bool, [2]>([true, true])];
|
| 41 |
+
tensor<fp32, [1, ?]> var_51 = slice_by_index(begin = var_51_begin_0, end = var_51_end_0, end_mask = var_51_end_mask_0, x = audio)[name = tensor<string, []>("op_51")];
|
| 42 |
+
tensor<int32, [2]> var_53_begin_0 = const()[name = tensor<string, []>("op_53_begin_0"), val = tensor<int32, [2]>([0, 0])];
|
| 43 |
+
tensor<int32, [2]> var_53_end_0 = const()[name = tensor<string, []>("op_53_end_0"), val = tensor<int32, [2]>([1, -1])];
|
| 44 |
+
tensor<bool, [2]> var_53_end_mask_0 = const()[name = tensor<string, []>("op_53_end_mask_0"), val = tensor<bool, [2]>([true, false])];
|
| 45 |
+
tensor<fp32, [1, ?]> var_53 = slice_by_index(begin = var_53_begin_0, end = var_53_end_0, end_mask = var_53_end_mask_0, x = audio)[name = tensor<string, []>("op_53")];
|
| 46 |
+
tensor<fp32, []> var_54 = const()[name = tensor<string, []>("op_54"), val = tensor<fp32, []>(0x1.f0a3d8p-1)];
|
| 47 |
+
tensor<fp32, [1, ?]> var_55 = mul(x = var_53, y = var_54)[name = tensor<string, []>("op_55")];
|
| 48 |
+
tensor<fp32, [1, ?]> var_56 = sub(x = var_51, y = var_55)[name = tensor<string, []>("op_56")];
|
| 49 |
+
tensor<bool, []> x_3_interleave_0 = const()[name = tensor<string, []>("x_3_interleave_0"), val = tensor<bool, []>(false)];
|
| 50 |
+
tensor<fp32, [1, ?]> x_3 = concat(axis = var_9, interleave = x_3_interleave_0, values = (var_49, var_56))[name = tensor<string, []>("x_3")];
|
| 51 |
+
tensor<bool, [1, ?]> var_59 = logical_not(x = timemask)[name = tensor<string, []>("op_59")];
|
| 52 |
+
tensor<fp32, [1, ?]> input_1 = select(a = var_16, b = x_3, cond = var_59)[name = tensor<string, []>("input_1")];
|
| 53 |
+
tensor<int32, [3]> concat_1x = const()[name = tensor<string, []>("concat_1x"), val = tensor<int32, [3]>([1, 1, -1])];
|
| 54 |
+
tensor<fp32, [1, 1, ?]> input_3 = reshape(shape = concat_1x, x = input_1)[name = tensor<string, []>("input_3")];
|
| 55 |
+
tensor<fp32, []> const_3 = const()[name = tensor<string, []>("const_3"), val = tensor<fp32, []>(0x0p+0)];
|
| 56 |
+
tensor<int32, [6]> input_5_pad_0 = const()[name = tensor<string, []>("input_5_pad_0"), val = tensor<int32, [6]>([0, 0, 0, 0, 256, 256])];
|
| 57 |
+
tensor<string, []> input_5_mode_0 = const()[name = tensor<string, []>("input_5_mode_0"), val = tensor<string, []>("constant")];
|
| 58 |
+
tensor<fp32, [1, 1, ?]> input_5 = pad(constant_val = const_3, mode = input_5_mode_0, pad = input_5_pad_0, x = input_3)[name = tensor<string, []>("input_5")];
|
| 59 |
+
tensor<int32, [2]> concat_2x = const()[name = tensor<string, []>("concat_2x"), val = tensor<int32, [2]>([1, -1])];
|
| 60 |
+
tensor<fp32, [1, ?]> input = reshape(shape = concat_2x, x = input_5)[name = tensor<string, []>("input")];
|
| 61 |
+
tensor<fp32, [257, 1, 512]> expand_dims_1 = const()[name = tensor<string, []>("expand_dims_1"), val = tensor<fp32, [257, 1, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
|
| 62 |
+
tensor<fp32, [257, 1, 512]> expand_dims_2 = const()[name = tensor<string, []>("expand_dims_2"), val = tensor<fp32, [257, 1, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(526464)))];
|
| 63 |
+
tensor<int32, [1]> expand_dims_3 = const()[name = tensor<string, []>("expand_dims_3"), val = tensor<int32, [1]>([160])];
|
| 64 |
+
tensor<int32, [1]> expand_dims_4_axes_0 = const()[name = tensor<string, []>("expand_dims_4_axes_0"), val = tensor<int32, [1]>([1])];
|
| 65 |
+
tensor<fp32, [1, 1, ?]> expand_dims_4 = expand_dims(axes = expand_dims_4_axes_0, x = input)[name = tensor<string, []>("expand_dims_4")];
|
| 66 |
+
tensor<string, []> conv_0_pad_type_0 = const()[name = tensor<string, []>("conv_0_pad_type_0"), val = tensor<string, []>("valid")];
|
| 67 |
+
tensor<int32, [2]> conv_0_pad_0 = const()[name = tensor<string, []>("conv_0_pad_0"), val = tensor<int32, [2]>([0, 0])];
|
| 68 |
+
tensor<int32, [1]> conv_0_dilations_0 = const()[name = tensor<string, []>("conv_0_dilations_0"), val = tensor<int32, [1]>([1])];
|
| 69 |
+
tensor<int32, []> conv_0_groups_0 = const()[name = tensor<string, []>("conv_0_groups_0"), val = tensor<int32, []>(1)];
|
| 70 |
+
tensor<fp32, [1, 257, ?]> conv_0 = conv(dilations = conv_0_dilations_0, groups = conv_0_groups_0, pad = conv_0_pad_0, pad_type = conv_0_pad_type_0, strides = expand_dims_3, weight = expand_dims_1, x = expand_dims_4)[name = tensor<string, []>("conv_0")];
|
| 71 |
+
tensor<string, []> conv_1_pad_type_0 = const()[name = tensor<string, []>("conv_1_pad_type_0"), val = tensor<string, []>("valid")];
|
| 72 |
+
tensor<int32, [2]> conv_1_pad_0 = const()[name = tensor<string, []>("conv_1_pad_0"), val = tensor<int32, [2]>([0, 0])];
|
| 73 |
+
tensor<int32, [1]> conv_1_dilations_0 = const()[name = tensor<string, []>("conv_1_dilations_0"), val = tensor<int32, [1]>([1])];
|
| 74 |
+
tensor<int32, []> conv_1_groups_0 = const()[name = tensor<string, []>("conv_1_groups_0"), val = tensor<int32, []>(1)];
|
| 75 |
+
tensor<fp32, [1, 257, ?]> conv_1 = conv(dilations = conv_1_dilations_0, groups = conv_1_groups_0, pad = conv_1_pad_0, pad_type = conv_1_pad_type_0, strides = expand_dims_3, weight = expand_dims_2, x = expand_dims_4)[name = tensor<string, []>("conv_1")];
|
| 76 |
+
tensor<int32, []> stack_0_axis_0 = const()[name = tensor<string, []>("stack_0_axis_0"), val = tensor<int32, []>(-1)];
|
| 77 |
+
tensor<fp32, [1, 257, ?, 2]> stack_0 = stack(axis = stack_0_axis_0, values = (conv_0, conv_1))[name = tensor<string, []>("stack_0")];
|
| 78 |
+
tensor<fp32, []> var_19_promoted = const()[name = tensor<string, []>("op_19_promoted"), val = tensor<fp32, []>(0x1p+1)];
|
| 79 |
+
tensor<fp32, [1, 257, ?, 2]> var_74 = pow(x = stack_0, y = var_19_promoted)[name = tensor<string, []>("op_74")];
|
| 80 |
+
tensor<int32, [1]> var_76_axes_0 = const()[name = tensor<string, []>("op_76_axes_0"), val = tensor<int32, [1]>([-1])];
|
| 81 |
+
tensor<bool, []> var_76_keep_dims_0 = const()[name = tensor<string, []>("op_76_keep_dims_0"), val = tensor<bool, []>(false)];
|
| 82 |
+
tensor<fp32, [1, 257, ?]> var_76 = reduce_sum(axes = var_76_axes_0, keep_dims = var_76_keep_dims_0, x = var_74)[name = tensor<string, []>("op_76")];
|
| 83 |
+
tensor<fp32, [1, 257, ?]> x_11 = identity(x = var_76)[name = tensor<string, []>("x_11")];
|
| 84 |
+
tensor<fp32, [1, 128, 257]> const_4 = const()[name = tensor<string, []>("const_4"), val = tensor<fp32, [1, 128, 257]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1052864)))];
|
| 85 |
+
tensor<bool, []> x_13_transpose_x_0 = const()[name = tensor<string, []>("x_13_transpose_x_0"), val = tensor<bool, []>(false)];
|
| 86 |
+
tensor<bool, []> x_13_transpose_y_0 = const()[name = tensor<string, []>("x_13_transpose_y_0"), val = tensor<bool, []>(false)];
|
| 87 |
+
tensor<fp32, [1, 128, ?]> x_13 = matmul(transpose_x = x_13_transpose_x_0, transpose_y = x_13_transpose_y_0, x = const_4, y = x_11)[name = tensor<string, []>("x_13")];
|
| 88 |
+
tensor<fp32, []> var_83 = const()[name = tensor<string, []>("op_83"), val = tensor<fp32, []>(0x1p-24)];
|
| 89 |
+
tensor<fp32, [1, 128, ?]> var_84 = add(x = x_13, y = var_83)[name = tensor<string, []>("op_84")];
|
| 90 |
+
tensor<fp32, []> x_epsilon_0 = const()[name = tensor<string, []>("x_epsilon_0"), val = tensor<fp32, []>(0x1p-149)];
|
| 91 |
+
tensor<fp32, [1, 128, ?]> x = log(epsilon = x_epsilon_0, x = var_84)[name = tensor<string, []>("x")];
|
| 92 |
+
tensor<int32, [3]> var_86_shape = shape(x = x)[name = tensor<string, []>("op_86_shape")];
|
| 93 |
+
tensor<int32, []> gather_5_batch_dims_0 = const()[name = tensor<string, []>("gather_5_batch_dims_0"), val = tensor<int32, []>(0)];
|
| 94 |
+
tensor<bool, []> gather_5_validate_indices_0 = const()[name = tensor<string, []>("gather_5_validate_indices_0"), val = tensor<bool, []>(false)];
|
| 95 |
+
tensor<int32, []> select_3 = const()[name = tensor<string, []>("select_3"), val = tensor<int32, []>(2)];
|
| 96 |
+
tensor<int32, []> gather_5_axis_1 = const()[name = tensor<string, []>("gather_5_axis_1"), val = tensor<int32, []>(0)];
|
| 97 |
+
tensor<int32, []> gather_5 = gather(axis = gather_5_axis_1, batch_dims = gather_5_batch_dims_0, indices = select_3, validate_indices = gather_5_validate_indices_0, x = var_86_shape)[name = tensor<string, []>("gather_5")];
|
| 98 |
+
tensor<int32, []> const_5 = const()[name = tensor<string, []>("const_5"), val = tensor<int32, []>(0)];
|
| 99 |
+
tensor<int32, []> const_6 = const()[name = tensor<string, []>("const_6"), val = tensor<int32, []>(1)];
|
| 100 |
+
tensor<int32, [?]> mask_1 = range_1d(end = gather_5, start = const_5, step = const_6)[name = tensor<string, []>("mask_1")];
|
| 101 |
+
tensor<int32, [1]> expand_dims_0_axes_0 = const()[name = tensor<string, []>("expand_dims_0_axes_0"), val = tensor<int32, [1]>([0])];
|
| 102 |
+
tensor<int32, [1, ?]> expand_dims_0 = expand_dims(axes = expand_dims_0_axes_0, x = mask_1)[name = tensor<string, []>("expand_dims_0")];
|
| 103 |
+
tensor<int32, [1]> var_91_axes_0 = const()[name = tensor<string, []>("op_91_axes_0"), val = tensor<int32, [1]>([1])];
|
| 104 |
+
tensor<int32, [1, 1]> var_91 = expand_dims(axes = var_91_axes_0, x = mel_length)[name = tensor<string, []>("op_91")];
|
| 105 |
+
tensor<bool, [1, ?]> mask = greater_equal(x = expand_dims_0, y = var_91)[name = tensor<string, []>("mask")];
|
| 106 |
+
tensor<int32, [1]> var_93_axes_0 = const()[name = tensor<string, []>("op_93_axes_0"), val = tensor<int32, [1]>([1])];
|
| 107 |
+
tensor<bool, [1, 1, ?]> var_93 = expand_dims(axes = var_93_axes_0, x = mask)[name = tensor<string, []>("op_93")];
|
| 108 |
+
tensor<fp32, [1, 128, ?]> mel = select(a = var_16, b = x, cond = var_93)[name = tensor<string, []>("processed_signal")];
|
| 109 |
+
} -> (mel, mel_length);
|
| 110 |
+
}
|
models/preprocessor.mlmodelc/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eff8082d1cc59b4aeaf963d61fa982f84e805554ede7506aed89d9dfd0d2549e
|
| 3 |
+
size 1184512
|
models/preprocessor.mlpackage/Data/com.apple.CoreML/model.mlmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c6be7b66a9ff7e469719957fd58676fa5a5f8c432f67638ea24e756ec34b97e4
|
| 3 |
+
size 12961
|
models/preprocessor.mlpackage/Data/com.apple.CoreML/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eff8082d1cc59b4aeaf963d61fa982f84e805554ede7506aed89d9dfd0d2549e
|
| 3 |
+
size 1184512
|
models/preprocessor.mlpackage/Manifest.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"fileFormatVersion": "1.0.0",
|
| 3 |
+
"itemInfoEntries": {
|
| 4 |
+
"A4922046-212C-4752-B1A4-F82AFD0BE152": {
|
| 5 |
+
"author": "com.apple.CoreML",
|
| 6 |
+
"description": "CoreML Model Weights",
|
| 7 |
+
"name": "weights",
|
| 8 |
+
"path": "com.apple.CoreML/weights"
|
| 9 |
+
},
|
| 10 |
+
"C2E826E5-D793-4300-AA2D-A7E743CF5F83": {
|
| 11 |
+
"author": "com.apple.CoreML",
|
| 12 |
+
"description": "CoreML Model Specification",
|
| 13 |
+
"name": "model.mlmodel",
|
| 14 |
+
"path": "com.apple.CoreML/model.mlmodel"
|
| 15 |
+
}
|
| 16 |
+
},
|
| 17 |
+
"rootModelIdentifier": "C2E826E5-D793-4300-AA2D-A7E743CF5F83"
|
| 18 |
+
}
|
models/tokenizer.json
ADDED
|
@@ -0,0 +1,1026 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"0": "<unk>",
|
| 3 |
+
"1": "\u2581t",
|
| 4 |
+
"2": "\u2581th",
|
| 5 |
+
"3": "\u2581a",
|
| 6 |
+
"4": "in",
|
| 7 |
+
"5": "\u2581the",
|
| 8 |
+
"6": "re",
|
| 9 |
+
"7": "\u2581w",
|
| 10 |
+
"8": "\u2581o",
|
| 11 |
+
"9": "\u2581s",
|
| 12 |
+
"10": "er",
|
| 13 |
+
"11": "at",
|
| 14 |
+
"12": "ou",
|
| 15 |
+
"13": "nd",
|
| 16 |
+
"14": "it",
|
| 17 |
+
"15": "is",
|
| 18 |
+
"16": "\u2581h",
|
| 19 |
+
"17": "\u2581b",
|
| 20 |
+
"18": "on",
|
| 21 |
+
"19": "\u2581c",
|
| 22 |
+
"20": "ing",
|
| 23 |
+
"21": "en",
|
| 24 |
+
"22": "\u2581to",
|
| 25 |
+
"23": "\u2581m",
|
| 26 |
+
"24": "\u2581f",
|
| 27 |
+
"25": "\u2581p",
|
| 28 |
+
"26": "or",
|
| 29 |
+
"27": "an",
|
| 30 |
+
"28": "es",
|
| 31 |
+
"29": "\u2581of",
|
| 32 |
+
"30": "\u2581d",
|
| 33 |
+
"31": "ed",
|
| 34 |
+
"32": "ll",
|
| 35 |
+
"33": "\u2581and",
|
| 36 |
+
"34": "\u2581I",
|
| 37 |
+
"35": "\u2581in",
|
| 38 |
+
"36": "\u2581l",
|
| 39 |
+
"37": "ar",
|
| 40 |
+
"38": "\u2581y",
|
| 41 |
+
"39": "\u2581g",
|
| 42 |
+
"40": "as",
|
| 43 |
+
"41": "\u2581you",
|
| 44 |
+
"42": "om",
|
| 45 |
+
"43": "\u2581n",
|
| 46 |
+
"44": "ic",
|
| 47 |
+
"45": "ve",
|
| 48 |
+
"46": "al",
|
| 49 |
+
"47": "ion",
|
| 50 |
+
"48": "us",
|
| 51 |
+
"49": "\u2581be",
|
| 52 |
+
"50": "ow",
|
| 53 |
+
"51": "le",
|
| 54 |
+
"52": "\u2581wh",
|
| 55 |
+
"53": "\u2581e",
|
| 56 |
+
"54": "ot",
|
| 57 |
+
"55": "ut",
|
| 58 |
+
"56": "\u2581it",
|
| 59 |
+
"57": "\u2581is",
|
| 60 |
+
"58": "\u2581we",
|
| 61 |
+
"59": "\u2581T",
|
| 62 |
+
"60": "\u2581re",
|
| 63 |
+
"61": "et",
|
| 64 |
+
"62": "\u2581A",
|
| 65 |
+
"63": "ent",
|
| 66 |
+
"64": "\u2581on",
|
| 67 |
+
"65": "\u2581ha",
|
| 68 |
+
"66": "ay",
|
| 69 |
+
"67": "\u2581S",
|
| 70 |
+
"68": "ct",
|
| 71 |
+
"69": "\u2581Th",
|
| 72 |
+
"70": "ver",
|
| 73 |
+
"71": "id",
|
| 74 |
+
"72": "ig",
|
| 75 |
+
"73": "im",
|
| 76 |
+
"74": "ro",
|
| 77 |
+
"75": "\u2581for",
|
| 78 |
+
"76": "ly",
|
| 79 |
+
"77": "\u2581he",
|
| 80 |
+
"78": "ke",
|
| 81 |
+
"79": "ld",
|
| 82 |
+
"80": "se",
|
| 83 |
+
"81": "st",
|
| 84 |
+
"82": "ch",
|
| 85 |
+
"83": "\u2581st",
|
| 86 |
+
"84": "all",
|
| 87 |
+
"85": "ce",
|
| 88 |
+
"86": "ur",
|
| 89 |
+
"87": "ith",
|
| 90 |
+
"88": "am",
|
| 91 |
+
"89": "if",
|
| 92 |
+
"90": "ir",
|
| 93 |
+
"91": "\u2581go",
|
| 94 |
+
"92": "\u2581u",
|
| 95 |
+
"93": "\u2581as",
|
| 96 |
+
"94": "\u2581was",
|
| 97 |
+
"95": "ad",
|
| 98 |
+
"96": "\u2581W",
|
| 99 |
+
"97": "\u2581k",
|
| 100 |
+
"98": "\u2581an",
|
| 101 |
+
"99": "ht",
|
| 102 |
+
"100": "th",
|
| 103 |
+
"101": "\u2581r",
|
| 104 |
+
"102": "\u2581are",
|
| 105 |
+
"103": "ere",
|
| 106 |
+
"104": "\u2581se",
|
| 107 |
+
"105": "\u2581do",
|
| 108 |
+
"106": "\u2581B",
|
| 109 |
+
"107": "\u2581so",
|
| 110 |
+
"108": "\u2581sh",
|
| 111 |
+
"109": "\u2581not",
|
| 112 |
+
"110": "\u2581li",
|
| 113 |
+
"111": "od",
|
| 114 |
+
"112": "\u2581C",
|
| 115 |
+
"113": "ust",
|
| 116 |
+
"114": "ill",
|
| 117 |
+
"115": "ight",
|
| 118 |
+
"116": "ally",
|
| 119 |
+
"117": "\u2581And",
|
| 120 |
+
"118": "ter",
|
| 121 |
+
"119": "\u2581or",
|
| 122 |
+
"120": "\u2581me",
|
| 123 |
+
"121": "\u2581M",
|
| 124 |
+
"122": "ome",
|
| 125 |
+
"123": "op",
|
| 126 |
+
"124": "\u2581at",
|
| 127 |
+
"125": "il",
|
| 128 |
+
"126": "\u2581The",
|
| 129 |
+
"127": "ould",
|
| 130 |
+
"128": "\u2581j",
|
| 131 |
+
"129": "ant",
|
| 132 |
+
"130": "\u2581So",
|
| 133 |
+
"131": "\u2581H",
|
| 134 |
+
"132": "ol",
|
| 135 |
+
"133": "ain",
|
| 136 |
+
"134": "\u2581can",
|
| 137 |
+
"135": "\u2581de",
|
| 138 |
+
"136": "\u2581ne",
|
| 139 |
+
"137": "ore",
|
| 140 |
+
"138": "\u2581con",
|
| 141 |
+
"139": "\u2581kn",
|
| 142 |
+
"140": "ck",
|
| 143 |
+
"141": "ul",
|
| 144 |
+
"142": "\u2581fr",
|
| 145 |
+
"143": "\u2581ab",
|
| 146 |
+
"144": "ers",
|
| 147 |
+
"145": "ess",
|
| 148 |
+
"146": "ge",
|
| 149 |
+
"147": "\u2581pro",
|
| 150 |
+
"148": "pe",
|
| 151 |
+
"149": "ate",
|
| 152 |
+
"150": "\u2581su",
|
| 153 |
+
"151": "\u2581com",
|
| 154 |
+
"152": "\u2581but",
|
| 155 |
+
"153": "\u2581all",
|
| 156 |
+
"154": "est",
|
| 157 |
+
"155": "qu",
|
| 158 |
+
"156": "\u2581ex",
|
| 159 |
+
"157": "\u2581al",
|
| 160 |
+
"158": "ra",
|
| 161 |
+
"159": "\u2581O",
|
| 162 |
+
"160": "out",
|
| 163 |
+
"161": "use",
|
| 164 |
+
"162": "very",
|
| 165 |
+
"163": "pp",
|
| 166 |
+
"164": "\u2581Y",
|
| 167 |
+
"165": "\u2581ch",
|
| 168 |
+
"166": "ri",
|
| 169 |
+
"167": "ist",
|
| 170 |
+
"168": "\u2581v",
|
| 171 |
+
"169": "\u2581lo",
|
| 172 |
+
"170": "ment",
|
| 173 |
+
"171": "art",
|
| 174 |
+
"172": "\u2581P",
|
| 175 |
+
"173": "nt",
|
| 176 |
+
"174": "ab",
|
| 177 |
+
"175": "\u2581one",
|
| 178 |
+
"176": "\u2581N",
|
| 179 |
+
"177": "ive",
|
| 180 |
+
"178": "\u2581wor",
|
| 181 |
+
"179": "ions",
|
| 182 |
+
"180": "ort",
|
| 183 |
+
"181": "\u2581L",
|
| 184 |
+
"182": "\u2581by",
|
| 185 |
+
"183": "ich",
|
| 186 |
+
"184": "\u2581my",
|
| 187 |
+
"185": "ity",
|
| 188 |
+
"186": "ok",
|
| 189 |
+
"187": "\u2581G",
|
| 190 |
+
"188": "res",
|
| 191 |
+
"189": "\u2581up",
|
| 192 |
+
"190": "un",
|
| 193 |
+
"191": "um",
|
| 194 |
+
"192": "ea",
|
| 195 |
+
"193": "ind",
|
| 196 |
+
"194": "and",
|
| 197 |
+
"195": "ink",
|
| 198 |
+
"196": "el",
|
| 199 |
+
"197": "\u2581D",
|
| 200 |
+
"198": "em",
|
| 201 |
+
"199": "\u2581E",
|
| 202 |
+
"200": "os",
|
| 203 |
+
"201": "oug",
|
| 204 |
+
"202": "\u2581if",
|
| 205 |
+
"203": "ca",
|
| 206 |
+
"204": "\u2581out",
|
| 207 |
+
"205": "\u2581int",
|
| 208 |
+
"206": "ie",
|
| 209 |
+
"207": "\u2581F",
|
| 210 |
+
"208": "\u2581It",
|
| 211 |
+
"209": "\u2581his",
|
| 212 |
+
"210": "ard",
|
| 213 |
+
"211": "\u2581had",
|
| 214 |
+
"212": "\u2581tr",
|
| 215 |
+
"213": "her",
|
| 216 |
+
"214": "our",
|
| 217 |
+
"215": "ies",
|
| 218 |
+
"216": "ake",
|
| 219 |
+
"217": "\u2581R",
|
| 220 |
+
"218": "\u2581We",
|
| 221 |
+
"219": "\u2581get",
|
| 222 |
+
"220": "\u2581don",
|
| 223 |
+
"221": "\u2581us",
|
| 224 |
+
"222": "ak",
|
| 225 |
+
"223": "\u2581pl",
|
| 226 |
+
"224": "ect",
|
| 227 |
+
"225": "ure",
|
| 228 |
+
"226": "ame",
|
| 229 |
+
"227": "ast",
|
| 230 |
+
"228": "\u2581who",
|
| 231 |
+
"229": "ack",
|
| 232 |
+
"230": "\u2581le",
|
| 233 |
+
"231": "\u2581sa",
|
| 234 |
+
"232": "iv",
|
| 235 |
+
"233": "ci",
|
| 236 |
+
"234": "ide",
|
| 237 |
+
"235": "\u2581tim",
|
| 238 |
+
"236": "\u2581our",
|
| 239 |
+
"237": "ound",
|
| 240 |
+
"238": "ous",
|
| 241 |
+
"239": "\u2581co",
|
| 242 |
+
"240": "\u2581pe",
|
| 243 |
+
"241": "ose",
|
| 244 |
+
"242": "ud",
|
| 245 |
+
"243": "\u2581see",
|
| 246 |
+
"244": "ough",
|
| 247 |
+
"245": "\u2581man",
|
| 248 |
+
"246": "\u2581qu",
|
| 249 |
+
"247": "\u2581You",
|
| 250 |
+
"248": "so",
|
| 251 |
+
"249": "ople",
|
| 252 |
+
"250": "\u2581Wh",
|
| 253 |
+
"251": "ong",
|
| 254 |
+
"252": "ap",
|
| 255 |
+
"253": "ther",
|
| 256 |
+
"254": "\u2581J",
|
| 257 |
+
"255": "are",
|
| 258 |
+
"256": "ine",
|
| 259 |
+
"257": "\u2581say",
|
| 260 |
+
"258": "\u2581im",
|
| 261 |
+
"259": "\u2581But",
|
| 262 |
+
"260": "ings",
|
| 263 |
+
"261": "\u2581has",
|
| 264 |
+
"262": "\u2581ag",
|
| 265 |
+
"263": "ff",
|
| 266 |
+
"264": "\u2581her",
|
| 267 |
+
"265": "itt",
|
| 268 |
+
"266": "one",
|
| 269 |
+
"267": "\u2581en",
|
| 270 |
+
"268": "\u2581ar",
|
| 271 |
+
"269": "\u2581fe",
|
| 272 |
+
"270": "ven",
|
| 273 |
+
"271": "\u2581any",
|
| 274 |
+
"272": "\u2581mo",
|
| 275 |
+
"273": "reat",
|
| 276 |
+
"274": "ag",
|
| 277 |
+
"275": "\u2581how",
|
| 278 |
+
"276": "\u2581cl",
|
| 279 |
+
"277": "pt",
|
| 280 |
+
"278": "\u2581now",
|
| 281 |
+
"279": "own",
|
| 282 |
+
"280": "ber",
|
| 283 |
+
"281": "\u2581him",
|
| 284 |
+
"282": "\u2581act",
|
| 285 |
+
"283": "hing",
|
| 286 |
+
"284": "ice",
|
| 287 |
+
"285": "\u2581no",
|
| 288 |
+
"286": "ans",
|
| 289 |
+
"287": "iz",
|
| 290 |
+
"288": "\u2581fa",
|
| 291 |
+
"289": "per",
|
| 292 |
+
"290": "pl",
|
| 293 |
+
"291": "\u2581te",
|
| 294 |
+
"292": "\u2581ad",
|
| 295 |
+
"293": "age",
|
| 296 |
+
"294": "ree",
|
| 297 |
+
"295": "\u2581tw",
|
| 298 |
+
"296": "ank",
|
| 299 |
+
"297": "\u2581He",
|
| 300 |
+
"298": "ple",
|
| 301 |
+
"299": "ite",
|
| 302 |
+
"300": "ry",
|
| 303 |
+
"301": "\u2581U",
|
| 304 |
+
"302": "ish",
|
| 305 |
+
"303": "ire",
|
| 306 |
+
"304": "ue",
|
| 307 |
+
"305": "\u2581In",
|
| 308 |
+
"306": "\u2581she",
|
| 309 |
+
"307": "ble",
|
| 310 |
+
"308": "cc",
|
| 311 |
+
"309": "nder",
|
| 312 |
+
"310": "\u2581way",
|
| 313 |
+
"311": "\u2581pr",
|
| 314 |
+
"312": "ear",
|
| 315 |
+
"313": "\u2581did",
|
| 316 |
+
"314": "\u2581po",
|
| 317 |
+
"315": "eah",
|
| 318 |
+
"316": "\u2581un",
|
| 319 |
+
"317": "omet",
|
| 320 |
+
"318": "ence",
|
| 321 |
+
"319": "ep",
|
| 322 |
+
"320": "uch",
|
| 323 |
+
"321": "\u2581sp",
|
| 324 |
+
"322": "ach",
|
| 325 |
+
"323": "og",
|
| 326 |
+
"324": "ance",
|
| 327 |
+
"325": "able",
|
| 328 |
+
"326": "iff",
|
| 329 |
+
"327": "sel",
|
| 330 |
+
"328": "\u2581got",
|
| 331 |
+
"329": "way",
|
| 332 |
+
"330": "\u2581gr",
|
| 333 |
+
"331": "alk",
|
| 334 |
+
"332": "\u2581res",
|
| 335 |
+
"333": "ated",
|
| 336 |
+
"334": "irst",
|
| 337 |
+
"335": "ick",
|
| 338 |
+
"336": "ass",
|
| 339 |
+
"337": "\u2581two",
|
| 340 |
+
"338": "\u2581dis",
|
| 341 |
+
"339": "ord",
|
| 342 |
+
"340": "\u2581pre",
|
| 343 |
+
"341": "ount",
|
| 344 |
+
"342": "ase",
|
| 345 |
+
"343": "ip",
|
| 346 |
+
"344": "ult",
|
| 347 |
+
"345": "ical",
|
| 348 |
+
"346": "orm",
|
| 349 |
+
"347": "ary",
|
| 350 |
+
"348": "ace",
|
| 351 |
+
"349": "\u2581spe",
|
| 352 |
+
"350": "\u2581Ch",
|
| 353 |
+
"351": "\u2581thr",
|
| 354 |
+
"352": "\u2581imp",
|
| 355 |
+
"353": "int",
|
| 356 |
+
"354": "\u2581am",
|
| 357 |
+
"355": "\u2581off",
|
| 358 |
+
"356": "act",
|
| 359 |
+
"357": "ia",
|
| 360 |
+
"358": "\u2581ro",
|
| 361 |
+
"359": "ress",
|
| 362 |
+
"360": "\u2581per",
|
| 363 |
+
"361": "\u2581fo",
|
| 364 |
+
"362": "\u2581br",
|
| 365 |
+
"363": "\u2581K",
|
| 366 |
+
"364": "vel",
|
| 367 |
+
"365": "\u2581gu",
|
| 368 |
+
"366": "\u2581bo",
|
| 369 |
+
"367": "ang",
|
| 370 |
+
"368": "kay",
|
| 371 |
+
"369": "ub",
|
| 372 |
+
"370": "ign",
|
| 373 |
+
"371": "\u2581may",
|
| 374 |
+
"372": "ving",
|
| 375 |
+
"373": "ces",
|
| 376 |
+
"374": "ens",
|
| 377 |
+
"375": "cl",
|
| 378 |
+
"376": "\u2581lot",
|
| 379 |
+
"377": "ru",
|
| 380 |
+
"378": "ade",
|
| 381 |
+
"379": "\u2581bet",
|
| 382 |
+
"380": "\u2581bl",
|
| 383 |
+
"381": "\u2581let",
|
| 384 |
+
"382": "fore",
|
| 385 |
+
"383": "co",
|
| 386 |
+
"384": "ild",
|
| 387 |
+
"385": "ning",
|
| 388 |
+
"386": "xt",
|
| 389 |
+
"387": "ile",
|
| 390 |
+
"388": "ark",
|
| 391 |
+
"389": "self",
|
| 392 |
+
"390": "\u2581app",
|
| 393 |
+
"391": "ory",
|
| 394 |
+
"392": "du",
|
| 395 |
+
"393": "\u2581day",
|
| 396 |
+
"394": "\u2581St",
|
| 397 |
+
"395": "ater",
|
| 398 |
+
"396": "\u2581use",
|
| 399 |
+
"397": "ys",
|
| 400 |
+
"398": "fter",
|
| 401 |
+
"399": "\u2581new",
|
| 402 |
+
"400": "ious",
|
| 403 |
+
"401": "ial",
|
| 404 |
+
"402": "he",
|
| 405 |
+
"403": "wn",
|
| 406 |
+
"404": "ved",
|
| 407 |
+
"405": "red",
|
| 408 |
+
"406": "\u2581fl",
|
| 409 |
+
"407": "iss",
|
| 410 |
+
"408": "ody",
|
| 411 |
+
"409": "form",
|
| 412 |
+
"410": "ian",
|
| 413 |
+
"411": "tain",
|
| 414 |
+
"412": "\u2581bu",
|
| 415 |
+
"413": "\u2581V",
|
| 416 |
+
"414": "\u2581rec",
|
| 417 |
+
"415": "ty",
|
| 418 |
+
"416": "be",
|
| 419 |
+
"417": "\u2581sc",
|
| 420 |
+
"418": "ors",
|
| 421 |
+
"419": "vers",
|
| 422 |
+
"420": "\u2581put",
|
| 423 |
+
"421": "ife",
|
| 424 |
+
"422": "\u2581If",
|
| 425 |
+
"423": "we",
|
| 426 |
+
"424": "te",
|
| 427 |
+
"425": "ject",
|
| 428 |
+
"426": "ath",
|
| 429 |
+
"427": "ting",
|
| 430 |
+
"428": "\u2581rem",
|
| 431 |
+
"429": "\u2581acc",
|
| 432 |
+
"430": "ull",
|
| 433 |
+
"431": "ons",
|
| 434 |
+
"432": "\u2581ind",
|
| 435 |
+
"433": "\u2581ser",
|
| 436 |
+
"434": "\u2581ke",
|
| 437 |
+
"435": "ates",
|
| 438 |
+
"436": "ves",
|
| 439 |
+
"437": "na",
|
| 440 |
+
"438": "lic",
|
| 441 |
+
"439": "\u2581des",
|
| 442 |
+
"440": "\u2581its",
|
| 443 |
+
"441": "ful",
|
| 444 |
+
"442": "ents",
|
| 445 |
+
"443": "erm",
|
| 446 |
+
"444": "ac",
|
| 447 |
+
"445": "ered",
|
| 448 |
+
"446": "ise",
|
| 449 |
+
"447": "\u2581sy",
|
| 450 |
+
"448": "urn",
|
| 451 |
+
"449": "\u2581em",
|
| 452 |
+
"450": "oth",
|
| 453 |
+
"451": "ual",
|
| 454 |
+
"452": "ne",
|
| 455 |
+
"453": "ward",
|
| 456 |
+
"454": "ib",
|
| 457 |
+
"455": "\u2581try",
|
| 458 |
+
"456": "\u2581pos",
|
| 459 |
+
"457": "nds",
|
| 460 |
+
"458": "ft",
|
| 461 |
+
"459": "get",
|
| 462 |
+
"460": "ph",
|
| 463 |
+
"461": "\u2581ob",
|
| 464 |
+
"462": "ady",
|
| 465 |
+
"463": "igh",
|
| 466 |
+
"464": "ood",
|
| 467 |
+
"465": "\u2581rel",
|
| 468 |
+
"466": "\u2581wr",
|
| 469 |
+
"467": "ug",
|
| 470 |
+
"468": "ears",
|
| 471 |
+
"469": "ail",
|
| 472 |
+
"470": "\u2581Now",
|
| 473 |
+
"471": "\u2581bit",
|
| 474 |
+
"472": "ng",
|
| 475 |
+
"473": "\u2581Oh",
|
| 476 |
+
"474": "\u2581hel",
|
| 477 |
+
"475": "ange",
|
| 478 |
+
"476": "\u2581reg",
|
| 479 |
+
"477": "\u2581rep",
|
| 480 |
+
"478": "\u2581bel",
|
| 481 |
+
"479": "\u2581sm",
|
| 482 |
+
"480": "ost",
|
| 483 |
+
"481": "tern",
|
| 484 |
+
"482": "gr",
|
| 485 |
+
"483": "\u2581own",
|
| 486 |
+
"484": "\u2581end",
|
| 487 |
+
"485": "pect",
|
| 488 |
+
"486": "ily",
|
| 489 |
+
"487": "day",
|
| 490 |
+
"488": "ied",
|
| 491 |
+
"489": "ific",
|
| 492 |
+
"490": "ower",
|
| 493 |
+
"491": "\u2581add",
|
| 494 |
+
"492": "cess",
|
| 495 |
+
"493": "ict",
|
| 496 |
+
"494": "ible",
|
| 497 |
+
"495": "\u2581bas",
|
| 498 |
+
"496": "\u2581i",
|
| 499 |
+
"497": "\u2581op",
|
| 500 |
+
"498": "cial",
|
| 501 |
+
"499": "ular",
|
| 502 |
+
"500": "\u2581Be",
|
| 503 |
+
"501": "ced",
|
| 504 |
+
"502": "\u2581too",
|
| 505 |
+
"503": "ks",
|
| 506 |
+
"504": "ew",
|
| 507 |
+
"505": "mer",
|
| 508 |
+
"506": "\u2581ph",
|
| 509 |
+
"507": "ob",
|
| 510 |
+
"508": "==",
|
| 511 |
+
"509": "\u2581la",
|
| 512 |
+
"510": "\u2581set",
|
| 513 |
+
"511": "\u2581min",
|
| 514 |
+
"512": "\u2581sub",
|
| 515 |
+
"513": "\u2581gen",
|
| 516 |
+
"514": "atch",
|
| 517 |
+
"515": "..",
|
| 518 |
+
"516": "\u2581inv",
|
| 519 |
+
"517": "\u2581As",
|
| 520 |
+
"518": "\u2581nat",
|
| 521 |
+
"519": "\u2581sl",
|
| 522 |
+
"520": "\u2581num",
|
| 523 |
+
"521": "av",
|
| 524 |
+
"522": "ways",
|
| 525 |
+
"523": "\u2581God",
|
| 526 |
+
"524": "stem",
|
| 527 |
+
"525": "\u2581ac",
|
| 528 |
+
"526": "\u2581att",
|
| 529 |
+
"527": "\u2581ev",
|
| 530 |
+
"528": "\u2581def",
|
| 531 |
+
"529": "llow",
|
| 532 |
+
"530": "\u2581str",
|
| 533 |
+
"531": "lect",
|
| 534 |
+
"532": "ars",
|
| 535 |
+
"533": "\u2581cr",
|
| 536 |
+
"534": "\u2581Is",
|
| 537 |
+
"535": "olog",
|
| 538 |
+
"536": "les",
|
| 539 |
+
"537": "oy",
|
| 540 |
+
"538": "\u2581ask",
|
| 541 |
+
"539": "\u2581inc",
|
| 542 |
+
"540": "body",
|
| 543 |
+
"541": "\u2581ent",
|
| 544 |
+
"542": "\u2581pol",
|
| 545 |
+
"543": "ness",
|
| 546 |
+
"544": "ix",
|
| 547 |
+
"545": "\u2581why",
|
| 548 |
+
"546": "onna",
|
| 549 |
+
"547": "\u2581ear",
|
| 550 |
+
"548": "\u2581tak",
|
| 551 |
+
"549": "\u2581Un",
|
| 552 |
+
"550": "ited",
|
| 553 |
+
"551": "mun",
|
| 554 |
+
"552": "li",
|
| 555 |
+
"553": "ute",
|
| 556 |
+
"554": "ract",
|
| 557 |
+
"555": "\u2581dec",
|
| 558 |
+
"556": "uro",
|
| 559 |
+
"557": "\u2581mak",
|
| 560 |
+
"558": "\u2581fin",
|
| 561 |
+
"559": "ween",
|
| 562 |
+
"560": "\u2581No",
|
| 563 |
+
"561": "arch",
|
| 564 |
+
"562": "\u2581bec",
|
| 565 |
+
"563": "gan",
|
| 566 |
+
"564": "old",
|
| 567 |
+
"565": "cy",
|
| 568 |
+
"566": "\u2581big",
|
| 569 |
+
"567": "\u2581For",
|
| 570 |
+
"568": "ren",
|
| 571 |
+
"569": "als",
|
| 572 |
+
"570": "und",
|
| 573 |
+
"571": "\u2581Al",
|
| 574 |
+
"572": "\u2581All",
|
| 575 |
+
"573": "ss",
|
| 576 |
+
"574": "ows",
|
| 577 |
+
"575": "\u2581mod",
|
| 578 |
+
"576": "ock",
|
| 579 |
+
"577": "\u2581id",
|
| 580 |
+
"578": "ism",
|
| 581 |
+
"579": "cus",
|
| 582 |
+
"580": "\u2581gl",
|
| 583 |
+
"581": "ably",
|
| 584 |
+
"582": "\u2581ass",
|
| 585 |
+
"583": "\u2581car",
|
| 586 |
+
"584": "ata",
|
| 587 |
+
"585": "ppen",
|
| 588 |
+
"586": "led",
|
| 589 |
+
"587": "\u2581sim",
|
| 590 |
+
"588": "\u2581mon",
|
| 591 |
+
"589": "ics",
|
| 592 |
+
"590": "\u2581giv",
|
| 593 |
+
"591": "cept",
|
| 594 |
+
"592": "\u2581Mr",
|
| 595 |
+
"593": "pan",
|
| 596 |
+
"594": "\u2581pub",
|
| 597 |
+
"595": "\u2581eff",
|
| 598 |
+
"596": "\u2581How",
|
| 599 |
+
"597": "ps",
|
| 600 |
+
"598": "vern",
|
| 601 |
+
"599": "end",
|
| 602 |
+
"600": "hip",
|
| 603 |
+
"601": "iew",
|
| 604 |
+
"602": "ope",
|
| 605 |
+
"603": "\u2581An",
|
| 606 |
+
"604": "\u2581She",
|
| 607 |
+
"605": "\u2581Com",
|
| 608 |
+
"606": "ee",
|
| 609 |
+
"607": "ures",
|
| 610 |
+
"608": "ell",
|
| 611 |
+
"609": "ouse",
|
| 612 |
+
"610": "cond",
|
| 613 |
+
"611": "king",
|
| 614 |
+
"612": "oc",
|
| 615 |
+
"613": "ues",
|
| 616 |
+
"614": "ever",
|
| 617 |
+
"615": "\u2581To",
|
| 618 |
+
"616": "clud",
|
| 619 |
+
"617": "\u2581ins",
|
| 620 |
+
"618": "\u2581exp",
|
| 621 |
+
"619": "\u2581old",
|
| 622 |
+
"620": "\u2581mem",
|
| 623 |
+
"621": "\u2581ref",
|
| 624 |
+
"622": "\u2581tra",
|
| 625 |
+
"623": "\u2581far",
|
| 626 |
+
"624": "ave",
|
| 627 |
+
"625": "rat",
|
| 628 |
+
"626": "\u2581sur",
|
| 629 |
+
"627": "ruct",
|
| 630 |
+
"628": "rib",
|
| 631 |
+
"629": "duct",
|
| 632 |
+
"630": "uff",
|
| 633 |
+
"631": "\u2581met",
|
| 634 |
+
"632": "\u2581sch",
|
| 635 |
+
"633": "ince",
|
| 636 |
+
"634": "\u2581run",
|
| 637 |
+
"635": "ense",
|
| 638 |
+
"636": "\u2581cle",
|
| 639 |
+
"637": "\u2581==",
|
| 640 |
+
"638": "mon",
|
| 641 |
+
"639": "ize",
|
| 642 |
+
"640": "\u2581ord",
|
| 643 |
+
"641": "blem",
|
| 644 |
+
"642": "tin",
|
| 645 |
+
"643": "\u2581Let",
|
| 646 |
+
"644": "ner",
|
| 647 |
+
"645": "ond",
|
| 648 |
+
"646": "its",
|
| 649 |
+
"647": "\u2581cor",
|
| 650 |
+
"648": "land",
|
| 651 |
+
"649": "\u2581cur",
|
| 652 |
+
"650": "\u2581Re",
|
| 653 |
+
"651": "\u2581bus",
|
| 654 |
+
"652": "\u2581uh",
|
| 655 |
+
"653": "air",
|
| 656 |
+
"654": "ote",
|
| 657 |
+
"655": "ants",
|
| 658 |
+
"656": "ason",
|
| 659 |
+
"657": "ric",
|
| 660 |
+
"658": "\u2581el",
|
| 661 |
+
"659": "\u2581cer",
|
| 662 |
+
"660": "nce",
|
| 663 |
+
"661": "\u2581fam",
|
| 664 |
+
"662": "\u2581cap",
|
| 665 |
+
"663": "uck",
|
| 666 |
+
"664": "ool",
|
| 667 |
+
"665": "ried",
|
| 668 |
+
"666": "\u2581cou",
|
| 669 |
+
"667": "\u2581fun",
|
| 670 |
+
"668": "\u2581wom",
|
| 671 |
+
"669": "\u2581hum",
|
| 672 |
+
"670": "\u2581ty",
|
| 673 |
+
"671": "\u2581ap",
|
| 674 |
+
"672": "ike",
|
| 675 |
+
"673": "\u2581few",
|
| 676 |
+
"674": "oney",
|
| 677 |
+
"675": "\u2581inf",
|
| 678 |
+
"676": "ont",
|
| 679 |
+
"677": "ese",
|
| 680 |
+
"678": "ook",
|
| 681 |
+
"679": "gy",
|
| 682 |
+
"680": "uth",
|
| 683 |
+
"681": "ulat",
|
| 684 |
+
"682": "ieve",
|
| 685 |
+
"683": "ized",
|
| 686 |
+
"684": "ross",
|
| 687 |
+
"685": "\u2581ple",
|
| 688 |
+
"686": "\u2581um",
|
| 689 |
+
"687": "\u2581val",
|
| 690 |
+
"688": "\u2581equ",
|
| 691 |
+
"689": "\u2581lea",
|
| 692 |
+
"690": "\u2581lar",
|
| 693 |
+
"691": "ah",
|
| 694 |
+
"692": "eral",
|
| 695 |
+
"693": "\u2581ed",
|
| 696 |
+
"694": "ared",
|
| 697 |
+
"695": "lish",
|
| 698 |
+
"696": "arn",
|
| 699 |
+
"697": "ds",
|
| 700 |
+
"698": "esn",
|
| 701 |
+
"699": "\u2581iss",
|
| 702 |
+
"700": "\u2581ca",
|
| 703 |
+
"701": "ted",
|
| 704 |
+
"702": "ices",
|
| 705 |
+
"703": "\u2581wee",
|
| 706 |
+
"704": "ash",
|
| 707 |
+
"705": "\u2581top",
|
| 708 |
+
"706": "ten",
|
| 709 |
+
"707": "up",
|
| 710 |
+
"708": "ts",
|
| 711 |
+
"709": "gin",
|
| 712 |
+
"710": "con",
|
| 713 |
+
"711": "ari",
|
| 714 |
+
"712": "\u2581opp",
|
| 715 |
+
"713": "osed",
|
| 716 |
+
"714": "\u2581eas",
|
| 717 |
+
"715": "\u2581ext",
|
| 718 |
+
"716": "gg",
|
| 719 |
+
"717": "az",
|
| 720 |
+
"718": "\u2581Fr",
|
| 721 |
+
"719": "ideo",
|
| 722 |
+
"720": "izat",
|
| 723 |
+
"721": "\u2581men",
|
| 724 |
+
"722": "\u2581mom",
|
| 725 |
+
"723": "\u2581ret",
|
| 726 |
+
"724": "tty",
|
| 727 |
+
"725": "rist",
|
| 728 |
+
"726": "\u2581gra",
|
| 729 |
+
"727": "alth",
|
| 730 |
+
"728": "ef",
|
| 731 |
+
"729": "\u2581det",
|
| 732 |
+
"730": "ax",
|
| 733 |
+
"731": "\u2581mat",
|
| 734 |
+
"732": "chn",
|
| 735 |
+
"733": "ern",
|
| 736 |
+
"734": "peri",
|
| 737 |
+
"735": "\u2581bre",
|
| 738 |
+
"736": "\u2581Sh",
|
| 739 |
+
"737": "sw",
|
| 740 |
+
"738": "erat",
|
| 741 |
+
"739": "\u2581sit",
|
| 742 |
+
"740": "ters",
|
| 743 |
+
"741": "ale",
|
| 744 |
+
"742": "man",
|
| 745 |
+
"743": "\u2581sol",
|
| 746 |
+
"744": "ork",
|
| 747 |
+
"745": "\u2581adv",
|
| 748 |
+
"746": "ety",
|
| 749 |
+
"747": "\u2581vis",
|
| 750 |
+
"748": "\u2581med",
|
| 751 |
+
"749": "uc",
|
| 752 |
+
"750": "less",
|
| 753 |
+
"751": "\u2581unt",
|
| 754 |
+
"752": "gram",
|
| 755 |
+
"753": "ets",
|
| 756 |
+
"754": "ists",
|
| 757 |
+
"755": "\u2581ey",
|
| 758 |
+
"756": "\u2581col",
|
| 759 |
+
"757": "imes",
|
| 760 |
+
"758": "\u2581law",
|
| 761 |
+
"759": "\u2581pri",
|
| 762 |
+
"760": "sid",
|
| 763 |
+
"761": "\u2581On",
|
| 764 |
+
"762": "\u2581mot",
|
| 765 |
+
"763": "ield",
|
| 766 |
+
"764": "\u2581Do",
|
| 767 |
+
"765": "\u2581At",
|
| 768 |
+
"766": "ages",
|
| 769 |
+
"767": "amp",
|
| 770 |
+
"768": "\u2581art",
|
| 771 |
+
"769": "miss",
|
| 772 |
+
"770": "\u2581sk",
|
| 773 |
+
"771": "alf",
|
| 774 |
+
"772": "pr",
|
| 775 |
+
"773": "ier",
|
| 776 |
+
"774": "\u2581beh",
|
| 777 |
+
"775": "\u2581Yes",
|
| 778 |
+
"776": "ural",
|
| 779 |
+
"777": "ime",
|
| 780 |
+
"778": "\u2581wa",
|
| 781 |
+
"779": "oks",
|
| 782 |
+
"780": "bers",
|
| 783 |
+
"781": "ger",
|
| 784 |
+
"782": "ient",
|
| 785 |
+
"783": "ries",
|
| 786 |
+
"784": "...",
|
| 787 |
+
"785": "\u2581che",
|
| 788 |
+
"786": "\u2581Br",
|
| 789 |
+
"787": "ird",
|
| 790 |
+
"788": "\u2581Ar",
|
| 791 |
+
"789": "\u2581war",
|
| 792 |
+
"790": "inat",
|
| 793 |
+
"791": "\u2581My",
|
| 794 |
+
"792": "ital",
|
| 795 |
+
"793": "wh",
|
| 796 |
+
"794": "med",
|
| 797 |
+
"795": "\u2581pur",
|
| 798 |
+
"796": "ully",
|
| 799 |
+
"797": "\u2581One",
|
| 800 |
+
"798": "\u2581rat",
|
| 801 |
+
"799": "ines",
|
| 802 |
+
"800": "\u2581Of",
|
| 803 |
+
"801": "io",
|
| 804 |
+
"802": "\u2581loc",
|
| 805 |
+
"803": "ret",
|
| 806 |
+
"804": "ctor",
|
| 807 |
+
"805": "\u2581leg",
|
| 808 |
+
"806": "stit",
|
| 809 |
+
"807": "ined",
|
| 810 |
+
"808": "ught",
|
| 811 |
+
"809": "\u2581dur",
|
| 812 |
+
"810": "\u2581es",
|
| 813 |
+
"811": "vent",
|
| 814 |
+
"812": "aj",
|
| 815 |
+
"813": "\u2581bro",
|
| 816 |
+
"814": "\u2581saw",
|
| 817 |
+
"815": "\u2581sec",
|
| 818 |
+
"816": "ream",
|
| 819 |
+
"817": "\u2581pop",
|
| 820 |
+
"818": "reen",
|
| 821 |
+
"819": "\u2581Ind",
|
| 822 |
+
"820": "els",
|
| 823 |
+
"821": "\u2581yet",
|
| 824 |
+
"822": "ired",
|
| 825 |
+
"823": "\u2581sw",
|
| 826 |
+
"824": "tro",
|
| 827 |
+
"825": "oup",
|
| 828 |
+
"826": "most",
|
| 829 |
+
"827": "pean",
|
| 830 |
+
"828": "eds",
|
| 831 |
+
"829": "ush",
|
| 832 |
+
"830": "oh",
|
| 833 |
+
"831": "\u2581Se",
|
| 834 |
+
"832": "\u2581tea",
|
| 835 |
+
"833": "ann",
|
| 836 |
+
"834": "ilit",
|
| 837 |
+
"835": "err",
|
| 838 |
+
"836": "pend",
|
| 839 |
+
"837": "ton",
|
| 840 |
+
"838": "ased",
|
| 841 |
+
"839": "\u2581aff",
|
| 842 |
+
"840": "\u2581mor",
|
| 843 |
+
"841": "\u2581dra",
|
| 844 |
+
"842": "put",
|
| 845 |
+
"843": "\u2581dr",
|
| 846 |
+
"844": "ins",
|
| 847 |
+
"845": "uat",
|
| 848 |
+
"846": "nect",
|
| 849 |
+
"847": "cri",
|
| 850 |
+
"848": "outh",
|
| 851 |
+
"849": "\u2581ra",
|
| 852 |
+
"850": "\u2581pay",
|
| 853 |
+
"851": "ms",
|
| 854 |
+
"852": "\u2581av",
|
| 855 |
+
"853": "bs",
|
| 856 |
+
"854": "ling",
|
| 857 |
+
"855": "\u2581De",
|
| 858 |
+
"856": "\u2581Or",
|
| 859 |
+
"857": "ove",
|
| 860 |
+
"858": "\u2581Can",
|
| 861 |
+
"859": "\u2581eng",
|
| 862 |
+
"860": "ames",
|
| 863 |
+
"861": "ided",
|
| 864 |
+
"862": "\u2581Go",
|
| 865 |
+
"863": "mitt",
|
| 866 |
+
"864": "ode",
|
| 867 |
+
"865": "\u2581cre",
|
| 868 |
+
"866": "par",
|
| 869 |
+
"867": "ides",
|
| 870 |
+
"868": "pos",
|
| 871 |
+
"869": "\u2581fav",
|
| 872 |
+
"870": "\u2581air",
|
| 873 |
+
"871": "\u2581New",
|
| 874 |
+
"872": "\u2581bad",
|
| 875 |
+
"873": "\u2581six",
|
| 876 |
+
"874": "vat",
|
| 877 |
+
"875": "\u2581pat",
|
| 878 |
+
"876": "not",
|
| 879 |
+
"877": "\u2581di",
|
| 880 |
+
"878": "rop",
|
| 881 |
+
"879": "ral",
|
| 882 |
+
"880": "orn",
|
| 883 |
+
"881": "\u2581par",
|
| 884 |
+
"882": "cing",
|
| 885 |
+
"883": "\u2581aw",
|
| 886 |
+
"884": "orts",
|
| 887 |
+
"885": "ox",
|
| 888 |
+
"886": "\u2581yes",
|
| 889 |
+
"887": "cuss",
|
| 890 |
+
"888": "eng",
|
| 891 |
+
"889": "ives",
|
| 892 |
+
"890": "erms",
|
| 893 |
+
"891": "\u2581job",
|
| 894 |
+
"892": "mand",
|
| 895 |
+
"893": "ying",
|
| 896 |
+
"894": "\u2581occ",
|
| 897 |
+
"895": "aps",
|
| 898 |
+
"896": "ases",
|
| 899 |
+
"897": "\u2581Not",
|
| 900 |
+
"898": "rent",
|
| 901 |
+
"899": "ency",
|
| 902 |
+
"900": "att",
|
| 903 |
+
"901": "ised",
|
| 904 |
+
"902": "vice",
|
| 905 |
+
"903": "\u2581Eng",
|
| 906 |
+
"904": "\u2581est",
|
| 907 |
+
"905": "oked",
|
| 908 |
+
"906": "\u2581Q",
|
| 909 |
+
"907": "iron",
|
| 910 |
+
"908": "idd",
|
| 911 |
+
"909": "me",
|
| 912 |
+
"910": "unch",
|
| 913 |
+
"911": "ane",
|
| 914 |
+
"912": "\u2581z",
|
| 915 |
+
"913": "br",
|
| 916 |
+
"914": "arts",
|
| 917 |
+
"915": "\u2581fat",
|
| 918 |
+
"916": "ery",
|
| 919 |
+
"917": "anks",
|
| 920 |
+
"918": "\u2581jo",
|
| 921 |
+
"919": "\u2581mar",
|
| 922 |
+
"920": "aw",
|
| 923 |
+
"921": "ott",
|
| 924 |
+
"922": "ards",
|
| 925 |
+
"923": "\u2581oh",
|
| 926 |
+
"924": "ians",
|
| 927 |
+
"925": "\u2581sci",
|
| 928 |
+
"926": "row",
|
| 929 |
+
"927": "unt",
|
| 930 |
+
"928": "ury",
|
| 931 |
+
"929": "\u2581abs",
|
| 932 |
+
"930": "ergy",
|
| 933 |
+
"931": "\u2581Z",
|
| 934 |
+
"932": "ump",
|
| 935 |
+
"933": "\u2581Am",
|
| 936 |
+
"934": "ened",
|
| 937 |
+
"935": "angu",
|
| 938 |
+
"936": "\u2581Pro",
|
| 939 |
+
"937": "icat",
|
| 940 |
+
"938": "itch",
|
| 941 |
+
"939": "\u2581dri",
|
| 942 |
+
"940": "iat",
|
| 943 |
+
"941": "\u2581",
|
| 944 |
+
"942": "e",
|
| 945 |
+
"943": "t",
|
| 946 |
+
"944": "o",
|
| 947 |
+
"945": "a",
|
| 948 |
+
"946": "n",
|
| 949 |
+
"947": "i",
|
| 950 |
+
"948": "s",
|
| 951 |
+
"949": "r",
|
| 952 |
+
"950": "h",
|
| 953 |
+
"951": "l",
|
| 954 |
+
"952": "d",
|
| 955 |
+
"953": "u",
|
| 956 |
+
"954": "c",
|
| 957 |
+
"955": "m",
|
| 958 |
+
"956": "y",
|
| 959 |
+
"957": "g",
|
| 960 |
+
"958": "w",
|
| 961 |
+
"959": "f",
|
| 962 |
+
"960": "p",
|
| 963 |
+
"961": ",",
|
| 964 |
+
"962": ".",
|
| 965 |
+
"963": "b",
|
| 966 |
+
"964": "v",
|
| 967 |
+
"965": "k",
|
| 968 |
+
"966": "'",
|
| 969 |
+
"967": "I",
|
| 970 |
+
"968": "T",
|
| 971 |
+
"969": "A",
|
| 972 |
+
"970": "S",
|
| 973 |
+
"971": "x",
|
| 974 |
+
"972": "W",
|
| 975 |
+
"973": "j",
|
| 976 |
+
"974": "C",
|
| 977 |
+
"975": "B",
|
| 978 |
+
"976": "M",
|
| 979 |
+
"977": "?",
|
| 980 |
+
"978": "H",
|
| 981 |
+
"979": "O",
|
| 982 |
+
"980": "0",
|
| 983 |
+
"981": "P",
|
| 984 |
+
"982": "q",
|
| 985 |
+
"983": "Y",
|
| 986 |
+
"984": "N",
|
| 987 |
+
"985": "L",
|
| 988 |
+
"986": "D",
|
| 989 |
+
"987": "1",
|
| 990 |
+
"988": "E",
|
| 991 |
+
"989": "G",
|
| 992 |
+
"990": "z",
|
| 993 |
+
"991": "F",
|
| 994 |
+
"992": "R",
|
| 995 |
+
"993": "-",
|
| 996 |
+
"994": "2",
|
| 997 |
+
"995": "J",
|
| 998 |
+
"996": "U",
|
| 999 |
+
"997": "9",
|
| 1000 |
+
"998": "K",
|
| 1001 |
+
"999": "5",
|
| 1002 |
+
"1000": "3",
|
| 1003 |
+
"1001": "V",
|
| 1004 |
+
"1002": "=",
|
| 1005 |
+
"1003": "4",
|
| 1006 |
+
"1004": "8",
|
| 1007 |
+
"1005": "6",
|
| 1008 |
+
"1006": "7",
|
| 1009 |
+
"1007": "!",
|
| 1010 |
+
"1008": "%",
|
| 1011 |
+
"1009": ":",
|
| 1012 |
+
"1010": "Q",
|
| 1013 |
+
"1011": "Z",
|
| 1014 |
+
"1012": "$",
|
| 1015 |
+
"1013": "X",
|
| 1016 |
+
"1014": "\"",
|
| 1017 |
+
"1015": "&",
|
| 1018 |
+
"1016": "*",
|
| 1019 |
+
"1017": "/",
|
| 1020 |
+
"1018": "\u00a3",
|
| 1021 |
+
"1019": "+",
|
| 1022 |
+
"1020": "\u20ac",
|
| 1023 |
+
"1021": "_",
|
| 1024 |
+
"1022": "^",
|
| 1025 |
+
"1023": "\u00a5"
|
| 1026 |
+
}
|
pyproject.toml
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "nemotron-streaming"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = "NeMo Nemotron Streaming Reference Implementation"
|
| 5 |
+
requires-python = ">=3.10,<3.11"
|
| 6 |
+
dependencies = [
|
| 7 |
+
"torch>=2.0.0",
|
| 8 |
+
"nemo_toolkit[asr]>=2.0.0",
|
| 9 |
+
"soundfile>=0.12.0",
|
| 10 |
+
"numpy>=1.24.0",
|
| 11 |
+
]
|
| 12 |
+
|
| 13 |
+
[tool.uv]
|
| 14 |
+
dev-dependencies = []
|
scripts/benchmark_wer.py
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
WER Benchmark for Nemotron Streaming 0.6b on LibriSpeech test-clean
|
| 4 |
+
"""
|
| 5 |
+
import glob
|
| 6 |
+
import numpy as np
|
| 7 |
+
import soundfile as sf
|
| 8 |
+
import torch
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
import nemo.collections.asr as nemo_asr
|
| 11 |
+
from nemo.collections.asr.parts.utils.streaming_utils import CacheAwareStreamingAudioBuffer
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def load_ground_truth(librispeech_path: str) -> dict:
    """Load all LibriSpeech ground-truth transcriptions.

    Recursively scans *librispeech_path* for ``*.trans.txt`` files. Each
    non-empty line of a trans file has the form ``<utterance-id> <TRANSCRIPT>``;
    the transcript is lowercased for case-insensitive WER scoring.

    Args:
        librispeech_path: Root directory of a LibriSpeech split
            (e.g. ``datasets/LibriSpeech/test-clean``).

    Returns:
        Mapping of utterance id -> lowercased reference transcript.
    """
    gt = {}
    for trans_file in glob.glob(f"{librispeech_path}/**/*.trans.txt", recursive=True):
        # Pin the encoding so decoding does not depend on the platform's
        # locale default (the original call used the implicit default).
        with open(trans_file, encoding="utf-8") as f:
            for line in f:
                # Split only on the first space: the transcript itself
                # contains spaces.
                parts = line.strip().split(" ", 1)
                if len(parts) == 2:
                    file_id, text = parts
                    gt[file_id] = text.lower()
    return gt
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def normalize_text(text: str) -> str:
    """Normalize text for WER scoring: drop punctuation, lowercase, collapse whitespace."""
    import re

    # Strip everything that is neither a word character nor whitespace,
    # then lowercase and re-join on single spaces.
    stripped = re.sub(r'[^\w\s]', '', text)
    words = stripped.lower().split()
    return ' '.join(words)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def compute_wer(reference: str, hypothesis: str) -> tuple:
    """Compute the word-level edit distance between reference and hypothesis.

    Runs the standard Levenshtein dynamic program over normalized words,
    keeping only two DP rows at a time (O(len(hyp)) memory instead of the
    full O(len(ref) * len(hyp)) matrix the original numpy version built).

    Args:
        reference: Ground-truth transcript.
        hypothesis: Model output transcript.

    Returns:
        Tuple ``(errors, num_reference_words)`` where ``errors`` is the
        minimum number of substitutions + insertions + deletions (a plain
        Python int, unlike the previous ``np.uint32``).
    """
    ref_words = normalize_text(reference).split()
    hyp_words = normalize_text(hypothesis).split()

    # prev[j] = edit distance between the first i-1 reference words and the
    # first j hypothesis words. Row 0 is pure insertion cost.
    prev = list(range(len(hyp_words) + 1))
    for i, ref_w in enumerate(ref_words, start=1):
        curr = [i]  # matching i reference words against an empty hypothesis
        for j, hyp_w in enumerate(hyp_words, start=1):
            if ref_w == hyp_w:
                curr.append(prev[j - 1])
            else:
                # deletion, insertion, substitution — all cost 1
                curr.append(min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + 1))
        prev = curr

    return prev[len(hyp_words)], len(ref_words)
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def calc_drop_extra_pre_encoded(model, step_num, pad_and_drop_preencoded):
    """Calculate drop_extra_pre_encoded value per NVIDIA's reference.

    The very first chunk keeps all pre-encoded frames unless
    pad_and_drop_preencoded was requested; later chunks always drop the
    count configured on the encoder's streaming config.
    """
    is_first_chunk = step_num == 0
    if is_first_chunk and not pad_and_drop_preencoded:
        return 0
    return model.encoder.streaming_cfg.drop_extra_pre_encoded
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def transcribe_streaming(model, audio: np.ndarray, pad_and_drop_preencoded: bool = False) -> str:
    """Streaming transcription using conformer_stream_step API.

    Feeds *audio* chunk-by-chunk through a CacheAwareStreamingAudioBuffer and
    threads the encoder cache tensors and decoder hypotheses across steps,
    returning the transcription text observed after the last chunk.

    Args:
        model: NeMo ASR model exposing ``encoder`` and ``conformer_stream_step``.
        audio: Waveform samples; assumes a mono 1-D float array as produced by
            the caller's ``sf.read`` — TODO confirm for stereo inputs.
        pad_and_drop_preencoded: Forwarded to the streaming buffer and to
            calc_drop_extra_pre_encoded (controls first-chunk frame dropping).

    Returns:
        Final transcription text, or "" if the buffer yields no chunks.
    """
    model.encoder.setup_streaming_params()

    # Buffer splits the waveform into model-sized streaming chunks.
    streaming_buffer = CacheAwareStreamingAudioBuffer(
        model=model,
        pad_and_drop_preencoded=pad_and_drop_preencoded,
    )
    streaming_buffer.reset_buffer()
    streaming_buffer.append_audio(audio)

    # Fresh encoder cache state for a batch of one utterance.
    cache_last_channel, cache_last_time, cache_last_channel_len = \
        model.encoder.get_initial_cache_state(batch_size=1)

    previous_hypotheses = None
    pred_out_stream = None
    final_text = ""

    with torch.inference_mode():
        for step_num, (chunk_audio, chunk_lengths) in enumerate(streaming_buffer):
            # Each step consumes one chunk and returns updated cache/state
            # that must be fed back into the next step.
            (
                pred_out_stream,
                transcribed_texts,
                cache_last_channel,
                cache_last_time,
                cache_last_channel_len,
                previous_hypotheses,
            ) = model.conformer_stream_step(
                processed_signal=chunk_audio,
                processed_signal_length=chunk_lengths,
                cache_last_channel=cache_last_channel,
                cache_last_time=cache_last_time,
                cache_last_channel_len=cache_last_channel_len,
                # Keep trailing outputs only once the buffer is exhausted.
                keep_all_outputs=streaming_buffer.is_buffer_empty(),
                previous_hypotheses=previous_hypotheses,
                previous_pred_out=pred_out_stream,
                drop_extra_pre_encoded=calc_drop_extra_pre_encoded(model, step_num, pad_and_drop_preencoded),
                return_transcription=True,
            )

            # Track the latest transcription; entries may be Hypothesis-like
            # objects (with .text) or plain strings.
            if transcribed_texts and len(transcribed_texts) > 0:
                text = transcribed_texts[0]
                if hasattr(text, 'text'):
                    final_text = text.text
                else:
                    final_text = str(text)

    return final_text
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
def main():
    """CLI entry point: benchmark streaming WER on a LibriSpeech split."""
    import argparse

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--num-files", type=int, default=100)
    arg_parser.add_argument("--dataset", type=str, default="datasets/LibriSpeech/test-clean")
    opts = arg_parser.parse_args()

    banner = "=" * 70
    print(banner)
    print("NEMOTRON STREAMING 0.6B - WER BENCHMARK")
    print(banner)

    # Reference transcripts keyed by utterance id.
    print(f"\nLoading ground truth from {opts.dataset}...")
    references = load_ground_truth(opts.dataset)
    print(f"Loaded {len(references)} transcriptions")

    # Deterministic file order, truncated to the requested count.
    flac_paths = sorted(glob.glob(f"{opts.dataset}/**/*.flac", recursive=True))
    flac_paths = flac_paths[:opts.num_files]
    print(f"Testing on {len(flac_paths)} files")

    print("\nLoading model...")
    asr_model = nemo_asr.models.ASRModel.from_pretrained("nvidia/nemotron-speech-streaming-en-0.6b")
    asr_model.eval()

    print("\n[STREAMING MODE]")
    total_errors = 0
    total_words = 0

    for idx, flac_path in enumerate(flac_paths, start=1):
        utt_id = Path(flac_path).stem
        print(f"  [{idx}/{len(flac_paths)}] {utt_id}", end=" ", flush=True)

        samples, _sample_rate = sf.read(flac_path, dtype="float32")
        hypothesis = transcribe_streaming(asr_model, samples)

        if utt_id not in references:
            print("-> (no ground truth)")
            continue

        errors, words = compute_wer(references[utt_id], hypothesis)
        total_errors += errors
        total_words += words
        running_wer = 100 * total_errors / total_words
        print(f"-> {errors} errs, WER so far: {running_wer:.2f}%")

    overall_wer = 100 * total_errors / total_words if total_words > 0 else 0

    print("\n" + banner)
    print("SUMMARY")
    print(banner)
    print(f"Files tested: {len(flac_paths)}")
    print(f"Streaming WER: {overall_wer:.2f}%")
    print(f"NVIDIA claimed: 2.31%")


if __name__ == "__main__":
    main()
|
scripts/convert_nemotron_streaming.py
ADDED
|
@@ -0,0 +1,229 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Export Nemotron Speech Streaming 0.6B to CoreML.
|
| 3 |
+
|
| 4 |
+
Exports 4 components for streaming RNNT inference:
|
| 5 |
+
1. Preprocessor: audio → mel
|
| 6 |
+
2. Encoder: mel + cache → encoded + new_cache
|
| 7 |
+
3. Decoder: token + state → decoder_out + new_state
|
| 8 |
+
4. Joint: encoder + decoder → logits
|
| 9 |
+
"""
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
|
| 12 |
+
import json
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
from typing import Dict, Optional, Tuple
|
| 15 |
+
|
| 16 |
+
import coremltools as ct
|
| 17 |
+
import numpy as np
|
| 18 |
+
import torch
|
| 19 |
+
import typer
|
| 20 |
+
|
| 21 |
+
import nemo.collections.asr as nemo_asr
|
| 22 |
+
|
| 23 |
+
from individual_components import (
|
| 24 |
+
DecoderWrapper,
|
| 25 |
+
EncoderStreamingWrapper,
|
| 26 |
+
ExportSettings,
|
| 27 |
+
JointWrapper,
|
| 28 |
+
PreprocessorWrapper,
|
| 29 |
+
_coreml_convert,
|
| 30 |
+
)
|
| 31 |
+
|
| 32 |
+
DEFAULT_MODEL_ID = "nvidia/nemotron-speech-streaming-en-0.6b"
|
| 33 |
+
|
| 34 |
+
# Streaming config from model:
|
| 35 |
+
# chunk_size=[105, 112], pre_encode_cache_size=[0, 9], valid_out_len=14
|
| 36 |
+
CHUNK_MEL_FRAMES = 112
|
| 37 |
+
PRE_ENCODE_CACHE = 9
|
| 38 |
+
TOTAL_MEL_FRAMES = CHUNK_MEL_FRAMES + PRE_ENCODE_CACHE # 121
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def _tensor_shape(t: torch.Tensor) -> Tuple[int, ...]:
|
| 42 |
+
return tuple(int(d) for d in t.shape)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def _parse_cu(name: str) -> ct.ComputeUnit:
    """Resolve a compute-unit name to a CoreML ComputeUnit.

    Lookup is case-insensitive; any unrecognized name falls back to
    CPU_ONLY.
    """
    units = {
        "ALL": ct.ComputeUnit.ALL,
        "CPU_ONLY": ct.ComputeUnit.CPU_ONLY,
        "CPU_AND_GPU": ct.ComputeUnit.CPU_AND_GPU,
        "CPU_AND_NE": ct.ComputeUnit.CPU_AND_NE,
    }
    key = name.upper()
    if key in units:
        return units[key]
    return ct.ComputeUnit.CPU_ONLY
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
app = typer.Typer(add_completion=False)
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
@app.command()
def convert(
    output_dir: Path = typer.Option(Path("nemotron_coreml"), help="Output directory"),
    encoder_cu: str = typer.Option("CPU_AND_NE", help="Encoder compute units"),
    precision: str = typer.Option("FLOAT32", help="FLOAT32 or FLOAT16"),
) -> None:
    """Export Nemotron Streaming to CoreML.

    Writes four ``.mlpackage`` artifacts (preprocessor, encoder, decoder,
    joint) plus ``metadata.json`` and ``tokenizer.json`` into ``output_dir``.
    Only the encoder honors ``encoder_cu``; the other components are pinned
    to CPU_ONLY.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    typer.echo("Loading model...")
    model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(DEFAULT_MODEL_ID, map_location="cpu")
    model.eval()

    sample_rate = int(model.cfg.preprocessor.sample_rate)
    encoder = model.encoder
    # Configure the encoder's cache-aware streaming attributes before
    # querying cache state below.
    encoder.setup_streaming_params()

    # Get cache shapes
    cache_channel, cache_time, cache_len = encoder.get_initial_cache_state(batch_size=1, device="cpu")
    cache_len = cache_len.to(torch.int32)

    # Transpose to [B, L, ...] for CoreML
    # (EncoderStreamingWrapper transposes back to NeMo's layer-major layout.)
    cache_channel_b = cache_channel.transpose(0, 1)
    cache_time_b = cache_time.transpose(0, 1)

    typer.echo(f"Cache shapes: channel={cache_channel_b.shape}, time={cache_time_b.shape}")

    # Create wrappers
    preprocessor = PreprocessorWrapper(model.preprocessor.eval())
    encoder_streaming = EncoderStreamingWrapper(encoder.eval())
    decoder = DecoderWrapper(model.decoder.eval())
    joint = JointWrapper(model.joint.eval())

    # NOTE(review): private NeMo flag; presumably switches the prediction
    # network into an export-friendly forward path — confirm against the
    # NeMo RNNT decoder source.
    model.decoder._rnnt_export = True

    settings = ExportSettings(
        output_dir=output_dir,
        compute_units=ct.ComputeUnit.CPU_ONLY,
        deployment_target=ct.target.iOS17,
        compute_precision=ct.precision.FLOAT16 if precision.upper() == "FLOAT16" else ct.precision.FLOAT32,
        max_audio_seconds=30.0,
        max_symbol_steps=1,
        chunk_size_frames=14,
        cache_size=cache_channel.shape[2],
    )

    # === Preprocessor ===
    typer.echo("Exporting preprocessor...")
    max_samples = 30 * sample_rate
    audio = torch.randn(1, max_samples)
    audio_len = torch.tensor([max_samples], dtype=torch.int32)

    traced = torch.jit.trace(preprocessor, (audio, audio_len), strict=False)
    inputs = [
        # RangeDim makes the audio length flexible up to 30 s of samples.
        ct.TensorType(name="audio", shape=(1, ct.RangeDim(1, max_samples)), dtype=np.float32),
        ct.TensorType(name="audio_length", shape=(1,), dtype=np.int32),
    ]
    outputs = [
        ct.TensorType(name="mel", dtype=np.float32),
        ct.TensorType(name="mel_length", dtype=np.int32),
    ]
    mlmodel = _coreml_convert(traced, inputs, outputs, settings, ct.ComputeUnit.CPU_ONLY)
    mlmodel.save(str(output_dir / "preprocessor.mlpackage"))

    # === Encoder (streaming) ===
    typer.echo("Exporting encoder...")
    mel_features = int(model.cfg.preprocessor.features)  # 128 for this model
    mel = torch.randn(1, mel_features, TOTAL_MEL_FRAMES)
    mel_len = torch.tensor([TOTAL_MEL_FRAMES], dtype=torch.int32)

    traced = torch.jit.trace(
        encoder_streaming,
        (mel, mel_len, cache_channel_b, cache_time_b, cache_len),
        strict=False
    )
    inputs = [
        # Fixed shapes: streaming inference always feeds TOTAL_MEL_FRAMES.
        ct.TensorType(name="mel", shape=_tensor_shape(mel), dtype=np.float32),
        ct.TensorType(name="mel_length", shape=(1,), dtype=np.int32),
        ct.TensorType(name="cache_channel", shape=_tensor_shape(cache_channel_b), dtype=np.float32),
        ct.TensorType(name="cache_time", shape=_tensor_shape(cache_time_b), dtype=np.float32),
        ct.TensorType(name="cache_len", shape=(1,), dtype=np.int32),
    ]
    outputs = [
        ct.TensorType(name="encoded", dtype=np.float32),
        ct.TensorType(name="encoded_length", dtype=np.int32),
        ct.TensorType(name="cache_channel_out", dtype=np.float32),
        ct.TensorType(name="cache_time_out", dtype=np.float32),
        ct.TensorType(name="cache_len_out", dtype=np.int32),
    ]
    # The encoder is the heavy component; it alone gets the user-selected
    # compute units (e.g. CPU_AND_NE for the Neural Engine).
    mlmodel = _coreml_convert(traced, inputs, outputs, settings, _parse_cu(encoder_cu))
    mlmodel.save(str(output_dir / "encoder.mlpackage"))

    # === Decoder ===
    typer.echo("Exporting decoder...")
    decoder_hidden = int(model.decoder.pred_hidden)
    decoder_layers = int(model.decoder.pred_rnn_layers)

    # Trace with the blank token and zeroed LSTM state as example inputs.
    targets = torch.tensor([[model.decoder.blank_idx]], dtype=torch.int32)
    target_len = torch.tensor([1], dtype=torch.int32)
    h = torch.zeros(decoder_layers, 1, decoder_hidden)
    c = torch.zeros(decoder_layers, 1, decoder_hidden)

    traced = torch.jit.trace(decoder, (targets, target_len, h, c), strict=False)
    inputs = [
        ct.TensorType(name="token", shape=(1, 1), dtype=np.int32),
        ct.TensorType(name="token_length", shape=(1,), dtype=np.int32),
        ct.TensorType(name="h_in", shape=_tensor_shape(h), dtype=np.float32),
        ct.TensorType(name="c_in", shape=_tensor_shape(c), dtype=np.float32),
    ]
    outputs = [
        ct.TensorType(name="decoder_out", dtype=np.float32),
        ct.TensorType(name="h_out", dtype=np.float32),
        ct.TensorType(name="c_out", dtype=np.float32),
    ]
    mlmodel = _coreml_convert(traced, inputs, outputs, settings, ct.ComputeUnit.CPU_ONLY)
    mlmodel.save(str(output_dir / "decoder.mlpackage"))

    # === Joint ===
    typer.echo("Exporting joint...")
    # Produce realistic example activations by running 1 s of the random
    # audio through the preprocessor, encoder, and decoder wrappers.
    with torch.no_grad():
        mel_test, _ = preprocessor(audio[:, :sample_rate], torch.tensor([sample_rate], dtype=torch.int32))
        # Run through encoder wrapper (not model.encoder directly to avoid typed method issues)
        enc_out, _, _, _, _ = encoder_streaming(
            mel_test,
            torch.tensor([mel_test.shape[2]], dtype=torch.int32),
            cache_channel_b,
            cache_time_b,
            cache_len
        )
        dec_out, _, _ = decoder(targets, target_len, h, c)

    # Single step: [B, D, 1]
    enc_step = enc_out[:, :, :1].contiguous()
    dec_step = dec_out[:, :, :1].contiguous()

    traced = torch.jit.trace(joint, (enc_step, dec_step), strict=False)
    inputs = [
        ct.TensorType(name="encoder", shape=_tensor_shape(enc_step), dtype=np.float32),
        ct.TensorType(name="decoder", shape=_tensor_shape(dec_step), dtype=np.float32),
    ]
    outputs = [ct.TensorType(name="logits", dtype=np.float32)]
    mlmodel = _coreml_convert(traced, inputs, outputs, settings, ct.ComputeUnit.CPU_ONLY)
    mlmodel.save(str(output_dir / "joint.mlpackage"))

    # === Metadata ===
    # Everything a host runtime needs to drive the four packages.
    vocab_size = int(model.tokenizer.vocab_size)
    metadata = {
        "model": DEFAULT_MODEL_ID,
        "sample_rate": sample_rate,
        "mel_features": mel_features,
        "chunk_mel_frames": CHUNK_MEL_FRAMES,
        "pre_encode_cache": PRE_ENCODE_CACHE,
        "total_mel_frames": TOTAL_MEL_FRAMES,
        "vocab_size": vocab_size,
        "blank_idx": int(model.decoder.blank_idx),
        "cache_channel_shape": list(cache_channel_b.shape),
        "cache_time_shape": list(cache_time_b.shape),
        "decoder_hidden": decoder_hidden,
        "decoder_layers": decoder_layers,
        "encoder_dim": int(enc_out.shape[1]),
    }
    (output_dir / "metadata.json").write_text(json.dumps(metadata, indent=2))

    # Tokenizer
    tokenizer = {str(i): model.tokenizer.ids_to_tokens([i])[0] for i in range(vocab_size)}
    (output_dir / "tokenizer.json").write_text(json.dumps(tokenizer, indent=2))

    typer.echo(f"Done! Exported to {output_dir}")
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
if __name__ == "__main__":
    # Typer parses CLI arguments and dispatches to the `convert` command.
    app()
|
scripts/individual_components.py
ADDED
|
@@ -0,0 +1,322 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Export Parakeet Realtime EOU RNNT components into CoreML.
|
| 3 |
+
|
| 4 |
+
This model uses a cache-aware streaming FastConformer encoder.
|
| 5 |
+
The encoder requires splitting into:
|
| 6 |
+
1. Initial encoder (no cache, for first chunk)
|
| 7 |
+
2. Streaming encoder (with cache inputs/outputs)
|
| 8 |
+
"""
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
from dataclasses import dataclass
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
from typing import Optional, Tuple
|
| 14 |
+
|
| 15 |
+
import coremltools as ct
|
| 16 |
+
import torch
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
@dataclass
class ExportSettings:
    """Shared configuration for every CoreML conversion in one export run."""

    # Directory the .mlpackage artifacts are written into.
    output_dir: Path
    # Default compute units for ct.convert; callers may override per component.
    compute_units: ct.ComputeUnit
    # Minimum deployment target for ct.convert; None keeps the coremltools default.
    deployment_target: Optional[ct.target]
    # FLOAT16/FLOAT32 weight/activation precision; None keeps the coremltools default.
    compute_precision: Optional[ct.precision]
    # Upper bound on audio length in seconds (used when sizing example inputs).
    # NOTE(review): not read by _coreml_convert; presumably consumed by export
    # scripts — confirm against callers.
    max_audio_seconds: float
    # Max symbol steps per encoder frame for greedy RNNT decoding.
    # NOTE(review): also not read here; verify against the host decoding loop.
    max_symbol_steps: int
    # Streaming-specific settings
    chunk_size_frames: int  # Number of frames per chunk (after subsampling)
    cache_size: int  # Size of the channel cache
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class PreprocessorWrapper(torch.nn.Module):
    """Adapt the NeMo preprocessor (mel extraction) to a trace-friendly call.

    Presents a plain positional ``(audio, length) -> (mel, mel_length)``
    interface and upcasts lengths to int64 as NeMo expects.
    """

    def __init__(self, module: torch.nn.Module) -> None:
        super().__init__()
        self.module = module

    def forward(
        self, audio_signal: torch.Tensor, length: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        # NeMo requires int64 lengths; CoreML-side callers hand us int32.
        length_i64 = length.to(dtype=torch.long)
        result = self.module(input_signal=audio_signal, length=length_i64)
        return result[0], result[1]
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
class EncoderInitialWrapper(torch.nn.Module):
    """Encoder wrapper for the first chunk, where no cache exists yet.

    Runs a cache-free encoder forward and returns the encoded features
    together with their lengths.
    """

    def __init__(self, module: torch.nn.Module) -> None:
        super().__init__()
        self.module = module

    def forward(
        self, features: torch.Tensor, length: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Encode the initial chunk.

        Args:
            features: Mel spectrogram [B, D, T].
            length: Per-batch sequence lengths [B].

        Returns:
            Tuple of encoder output [B, D, T_enc] and output lengths [B].
        """
        length_i64 = length.to(dtype=torch.long)  # NeMo expects int64 lengths
        result = self.module(audio_signal=features, length=length_i64)
        return result[0], result[1]
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
class EncoderStreamingWrapper(torch.nn.Module):
    """Cache-aware streaming encoder wrapper.

    Accepts batch-major caches ([B, L, ...]) at the CoreML boundary,
    transposes them to NeMo's layer-major layout ([L, B, ...]) for the
    inner forward, and transposes the updated caches back on return.
    """

    def __init__(self, module: torch.nn.Module) -> None:
        super().__init__()
        self.module = module

    def forward(
        self,
        features: torch.Tensor,
        length: torch.Tensor,
        cache_last_channel: torch.Tensor,
        cache_last_time: torch.Tensor,
        cache_last_channel_len: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """Streaming forward pass; returns encoded features plus new caches."""
        # NeMo expects layer-major caches and int64 lengths.
        result = self.module(
            audio_signal=features,
            length=length.to(dtype=torch.long),
            cache_last_channel=cache_last_channel.transpose(0, 1),
            cache_last_time=cache_last_time.transpose(0, 1),
            cache_last_channel_len=cache_last_channel_len.to(dtype=torch.int64),
        )
        encoded, encoded_lengths, next_channel, next_time, next_len = result

        # Back to batch-major for the CoreML interface; int32 lengths.
        return (
            encoded,
            encoded_lengths.to(dtype=torch.int32),
            next_channel.transpose(0, 1),
            next_time.transpose(0, 1),
            next_len.to(dtype=torch.int32),
        )
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
class DecoderWrapper(torch.nn.Module):
    """RNNT prediction-network wrapper with flattened LSTM state I/O.

    Exposes (h, c) as two separate tensors instead of NeMo's state list so
    the traced graph has a fixed, CoreML-friendly signature.
    """

    def __init__(self, module: torch.nn.Module) -> None:
        super().__init__()
        self.module = module

    def forward(
        self,
        targets: torch.Tensor,
        target_lengths: torch.Tensor,
        h_in: torch.Tensor,
        c_in: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        # NeMo takes int64 targets/lengths and a [h, c] state list.
        decoder_output, _, updated_state = self.module(
            targets=targets.to(dtype=torch.long),
            target_length=target_lengths.to(dtype=torch.long),
            states=[h_in, c_in],
        )
        h_out, c_out = updated_state[0], updated_state[1]
        return decoder_output, h_out, c_out
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
class JointWrapper(torch.nn.Module):
    """RNNT joint network wrapper with explicit projection + broadcast-add."""

    def __init__(self, module: torch.nn.Module) -> None:
        super().__init__()
        self.module = module

    def forward(
        self, encoder_outputs: torch.Tensor, decoder_outputs: torch.Tensor
    ) -> torch.Tensor:
        # Channel-last for the linear projections:
        # [B, D, T] -> [B, T, D] and [B, D, U] -> [B, U, D].
        enc_btd = encoder_outputs.transpose(1, 2)
        dec_bud = decoder_outputs.transpose(1, 2)

        enc_proj = self.module.enc(enc_btd)    # [B, T, joint_dim]
        dec_proj = self.module.pred(dec_bud)   # [B, U, joint_dim]

        # Broadcast-add over T and U -> [B, T, U, joint_dim].
        combined = enc_proj.unsqueeze(2) + dec_proj.unsqueeze(1)

        # joint_net = [activation, dropout (eval no-op), output linear].
        activation = self.module.joint_net[0]
        dropout = self.module.joint_net[1]
        projection = self.module.joint_net[2]
        return projection(dropout(activation(combined)))
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
class MelEncoderWrapper(torch.nn.Module):
    """Fused pipeline for the initial chunk: waveform -> mel -> encoder."""

    def __init__(
        self, preprocessor: PreprocessorWrapper, encoder: EncoderInitialWrapper
    ) -> None:
        super().__init__()
        self.preprocessor = preprocessor
        self.encoder = encoder

    def forward(
        self, audio_signal: torch.Tensor, audio_length: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        mel, mel_length = self.preprocessor(audio_signal, audio_length)
        # Encoder wrapper expects int32 mel lengths at its boundary.
        return self.encoder(mel, mel_length.to(dtype=torch.int32))
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
class MelEncoderStreamingWrapper(torch.nn.Module):
    """Fused streaming pipeline: waveform -> mel -> cache-aware encoder."""

    def __init__(
        self, preprocessor: PreprocessorWrapper, encoder: EncoderStreamingWrapper
    ) -> None:
        super().__init__()
        self.preprocessor = preprocessor
        self.encoder = encoder

    def forward(
        self,
        audio_signal: torch.Tensor,
        audio_length: torch.Tensor,
        cache_last_channel: torch.Tensor,
        cache_last_time: torch.Tensor,
        cache_last_channel_len: torch.Tensor,
    ) -> Tuple[
        torch.Tensor,
        torch.Tensor,
        torch.Tensor,
        torch.Tensor,
        torch.Tensor,
    ]:
        mel, mel_length = self.preprocessor(audio_signal, audio_length)
        mel_length_i32 = mel_length.to(dtype=torch.int32)
        # The streaming encoder returns (encoded, lengths, new caches...).
        return self.encoder(
            mel,
            mel_length_i32,
            cache_last_channel,
            cache_last_time,
            cache_last_channel_len,
        )
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
class JointDecisionWrapper(torch.nn.Module):
    """Joint network plus greedy decision head: (label id, label prob).

    Unlike TDT models there is no duration output here; EOU models mark end
    of utterance with a dedicated token inside the vocabulary.
    """

    def __init__(self, joint: JointWrapper, vocab_size: int) -> None:
        super().__init__()
        self.joint = joint
        # +1 accounts for the RNNT blank appended after the vocabulary.
        self.vocab_with_blank = int(vocab_size) + 1

    def forward(
        self, encoder_outputs: torch.Tensor, decoder_outputs: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        full_logits = self.joint(encoder_outputs, decoder_outputs)
        # Keep only vocab + blank; anything past that is ignored here.
        token_logits = full_logits[..., : self.vocab_with_blank]

        best_ids = torch.argmax(token_logits, dim=-1).to(dtype=torch.int32)
        probs = torch.softmax(token_logits, dim=-1)
        best_prob = torch.gather(
            probs, dim=-1, index=best_ids.long().unsqueeze(-1)
        ).squeeze(-1)

        return best_ids, best_prob
|
| 254 |
+
|
| 255 |
+
|
| 256 |
+
class JointDecisionSingleStep(torch.nn.Module):
    """Single-step decision head for streaming greedy decoding.

    Inputs:
        encoder_step: [B=1, D, T=1]
        decoder_step: [B=1, D, U=1]

    Returns:
        token_id: [1, 1, 1] int32 — argmax over vocab + blank
        token_prob: [1, 1, 1] float32 — softmax prob of that token
        top_k_ids: [1, 1, 1, K] int32 — candidates for host-side re-ranking
        top_k_logits: [1, 1, 1, K] float32 — their raw logits
    """

    def __init__(self, joint: JointWrapper, vocab_size: int, top_k: int = 64) -> None:
        super().__init__()
        self.joint = joint
        self.vocab_with_blank = int(vocab_size) + 1
        self.top_k = int(top_k)

    def forward(
        self, encoder_step: torch.Tensor, decoder_step: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        logits = self.joint(encoder_step, decoder_step)  # [1, 1, 1, V]
        token_logits = logits[..., : self.vocab_with_blank]

        best = torch.argmax(token_logits, dim=-1, keepdim=False)
        token_ids = best.to(dtype=torch.int32)
        probs = torch.softmax(token_logits, dim=-1)
        token_prob = torch.gather(
            probs, dim=-1, index=token_ids.long().unsqueeze(-1)
        ).squeeze(-1)

        # Clamp K so small vocabularies don't break torch.topk.
        k = min(self.top_k, token_logits.shape[-1])
        topk_logits, topk_indices = torch.topk(token_logits, k=k, dim=-1)
        return token_ids, token_prob, topk_indices.to(dtype=torch.int32), topk_logits
|
| 296 |
+
|
| 297 |
+
|
| 298 |
+
def _coreml_convert(
    traced: torch.jit.ScriptModule,
    inputs,
    outputs,
    settings: ExportSettings,
    compute_units_override: Optional[ct.ComputeUnit] = None,
) -> ct.models.MLModel:
    """Run ct.convert on a traced module with the run's shared settings.

    ``compute_units_override``, when given, takes precedence over
    ``settings.compute_units`` for this single conversion.
    """
    if compute_units_override is not None:
        cu = compute_units_override
    else:
        cu = settings.compute_units

    kwargs = {
        "convert_to": "mlprogram",
        "inputs": inputs,
        "outputs": outputs,
        "compute_units": cu,
    }
    # Debug trace of what we are about to convert (kwargs printed before the
    # optional target/precision keys are added, matching prior behavior).
    print("Converting:", traced.__class__.__name__)
    print("Conversion kwargs:", kwargs)

    if settings.deployment_target is not None:
        kwargs["minimum_deployment_target"] = settings.deployment_target
    if settings.compute_precision is not None:
        kwargs["compute_precision"] = settings.compute_precision
    return ct.convert(traced, **kwargs)
|
scripts/nemo_streaming_reference.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
NeMo Nemotron Streaming Reference Implementation
|
| 4 |
+
|
| 5 |
+
Streaming inference with nemotron-speech-streaming-en-0.6b using 1.12s chunks.
|
| 6 |
+
Uses conformer_stream_step API with CacheAwareStreamingAudioBuffer.
|
| 7 |
+
"""
|
| 8 |
+
import numpy as np
|
| 9 |
+
import soundfile as sf
|
| 10 |
+
import torch
|
| 11 |
+
import nemo.collections.asr as nemo_asr
|
| 12 |
+
from nemo.collections.asr.parts.utils.streaming_utils import CacheAwareStreamingAudioBuffer
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def calc_drop_extra_pre_encoded(model, step_num, pad_and_drop_preencoded):
    """Return how many pre-encoded frames to drop for this streaming step.

    Mirrors NVIDIA's reference: the very first step keeps everything unless
    ``pad_and_drop_preencoded`` is set; later steps drop the encoder's
    configured ``drop_extra_pre_encoded`` count.
    """
    first_step_keeps_all = step_num == 0 and not pad_and_drop_preencoded
    if first_step_keeps_all:
        return 0
    return model.encoder.streaming_cfg.drop_extra_pre_encoded
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def transcribe_streaming(model, audio: np.ndarray, sr: int = 16000, pad_and_drop_preencoded: bool = False) -> str:
    """
    Streaming transcription using NeMo's conformer_stream_step API.

    Args:
        model: NeMo ASR model (must support streaming)
        audio: Audio samples as float32 numpy array
        sr: Sample rate (must be 16000; not validated here)
        pad_and_drop_preencoded: Whether to pad and drop preencoded frames.
            False (default) gives better WER, True is needed for ONNX export.

    Returns:
        Transcribed text
    """
    model.encoder.setup_streaming_params()

    # The buffer slices `audio` into model-sized chunks when iterated.
    streaming_buffer = CacheAwareStreamingAudioBuffer(
        model=model,
        pad_and_drop_preencoded=pad_and_drop_preencoded,
    )
    streaming_buffer.reset_buffer()
    streaming_buffer.append_audio(audio)

    # Zero-initialized encoder caches; updated in place across steps below.
    cache_last_channel, cache_last_time, cache_last_channel_len = \
        model.encoder.get_initial_cache_state(batch_size=1)

    previous_hypotheses = None
    pred_out_stream = None
    final_text = ""

    with torch.inference_mode():
        for step_num, (chunk_audio, chunk_lengths) in enumerate(streaming_buffer):
            # NOTE: the right-hand side is evaluated before the tuple is
            # unpacked, so `previous_pred_out=pred_out_stream` passes the
            # PREVIOUS step's predictions, as the API expects.
            (
                pred_out_stream,
                transcribed_texts,
                cache_last_channel,
                cache_last_time,
                cache_last_channel_len,
                previous_hypotheses,
            ) = model.conformer_stream_step(
                processed_signal=chunk_audio,
                processed_signal_length=chunk_lengths,
                cache_last_channel=cache_last_channel,
                cache_last_time=cache_last_time,
                cache_last_channel_len=cache_last_channel_len,
                # Keep trailing frames only on the final (buffer-empty) step.
                keep_all_outputs=streaming_buffer.is_buffer_empty(),
                previous_hypotheses=previous_hypotheses,
                previous_pred_out=pred_out_stream,
                drop_extra_pre_encoded=calc_drop_extra_pre_encoded(model, step_num, pad_and_drop_preencoded),
                return_transcription=True,
            )

            # Each step returns the full running transcription, so we simply
            # overwrite `final_text` rather than accumulate.
            if transcribed_texts and len(transcribed_texts) > 0:
                text = transcribed_texts[0]
                # Hypothesis objects carry `.text`; fall back to str() otherwise.
                if hasattr(text, 'text'):
                    final_text = text.text
                else:
                    final_text = str(text)

    return final_text
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def main():
    """CLI entry point: load a wav file and stream-transcribe it."""
    import argparse
    parser = argparse.ArgumentParser(description="NeMo Streaming Reference")
    parser.add_argument("--audio", type=str, required=True, help="Path to audio file")
    parser.add_argument("--duration", type=float, default=None, help="Duration in seconds to transcribe")
    args = parser.parse_args()

    # NOTE(review): sf.read returns the file's native rate; the model assumes
    # 16 kHz and nothing resamples or downmixes multi-channel audio here —
    # confirm input files are 16 kHz mono.
    audio, sr = sf.read(args.audio, dtype="float32")
    if args.duration:
        # Truncate to the requested number of seconds.
        audio = audio[:int(args.duration * sr)]

    print("=" * 70)
    print("NEMOTRON STREAMING")
    print("=" * 70)
    print(f"Audio: {len(audio)/sr:.1f}s @ {sr}Hz")

    print("\nLoading model...")
    # Downloads/caches the checkpoint on first use.
    model = nemo_asr.models.ASRModel.from_pretrained("nvidia/nemotron-speech-streaming-en-0.6b")
    model.eval()

    print("\n[STREAMING MODE] (1.12s chunks)")
    text = transcribe_streaming(model, audio, sr)
    print(f" {text}")
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
if __name__ == "__main__":
    # Script entry point.
    main()
|