ronay-nv commited on
Commit
214d72a
·
verified ·
1 Parent(s): 7873b5f

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
.ipynb_checkpoints/config-checkpoint.json ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3VLNemotronEmbedModel"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "modeling_qwen3_vl_nemotron_embed.Qwen3VLNemotronEmbedConfig",
7
+ "AutoModel": "modeling_qwen3_vl_nemotron_embed.Qwen3VLNemotronEmbedModel",
8
+ "AutoProcessor": "processing_qwen3_vl_nemotron_embed.Qwen3VLNemotronEmbedProcessor"
9
+ },
10
+ "dtype": "bfloat16",
11
+ "image_token_id": 151655,
12
+ "model_type": "qwen3_vl_nemotron_embed",
13
+ "pooling": "colbert",
14
+ "text_config": {
15
+ "attention_bias": false,
16
+ "attention_dropout": 0.0,
17
+ "bos_token_id": 151643,
18
+ "dtype": "bfloat16",
19
+ "eos_token_id": 151645,
20
+ "head_dim": 128,
21
+ "hidden_act": "silu",
22
+ "hidden_size": 2560,
23
+ "initializer_range": 0.02,
24
+ "intermediate_size": 9728,
25
+ "max_position_embeddings": 262144,
26
+ "model_type": "qwen3_vl_text",
27
+ "num_attention_heads": 32,
28
+ "num_hidden_layers": 36,
29
+ "num_key_value_heads": 8,
30
+ "rms_norm_eps": 1e-06,
31
+ "rope_parameters": {
32
+ "mrope_interleaved": true,
33
+ "mrope_section": [
34
+ 24,
35
+ 20,
36
+ 20
37
+ ],
38
+ "rope_theta": 5000000,
39
+ "rope_type": "default"
40
+ },
41
+ "tie_word_embeddings": true,
42
+ "use_cache": true,
43
+ "vocab_size": 151936
44
+ },
45
+ "tie_word_embeddings": true,
46
+ "transformers_version": "5.0.0rc0",
47
+ "use_cache": false,
48
+ "video_token_id": 151656,
49
+ "vision_config": {
50
+ "deepstack_visual_indexes": [
51
+ 5,
52
+ 11,
53
+ 17
54
+ ],
55
+ "depth": 24,
56
+ "dtype": "bfloat16",
57
+ "hidden_act": "gelu_pytorch_tanh",
58
+ "hidden_size": 1024,
59
+ "in_channels": 3,
60
+ "initializer_range": 0.02,
61
+ "intermediate_size": 4096,
62
+ "model_type": "qwen3_vl",
63
+ "num_heads": 16,
64
+ "num_position_embeddings": 2304,
65
+ "out_hidden_size": 2560,
66
+ "patch_size": 16,
67
+ "spatial_merge_size": 2,
68
+ "temporal_patch_size": 2
69
+ },
70
+ "vision_end_token_id": 151653,
71
+ "vision_start_token_id": 151652
72
+ }
.ipynb_checkpoints/generation_config-checkpoint.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 151643,
4
+ "eos_token_id": 151645,
5
+ "transformers_version": "5.0.0rc0",
6
+ "use_cache": false
7
+ }
.ipynb_checkpoints/mteb2_eval-checkpoint.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --------------------------------------------------------
2
+ # Copyright (c) 2026 NVIDIA
3
+ # Licensed under customized NSCLv1 [see LICENSE.md for details]
4
+ # --------------------------------------------------------
5
+
6
+ """
7
+ pip install "mteb>=2.6.5, <3.0.0"
8
+ python3 mteb2_eval.py --model_name nvidia/nemotron-colembed-4b-v2 --batch_size 16 --benchmark "ViDoRe(v3)" --task-list Vidore3ComputerScienceRetrieval
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import argparse
14
+ import os
15
+
16
+ import mteb
17
+
18
+
19
+ def main():
20
+ parser = argparse.ArgumentParser()
21
+ parser.add_argument("--model_name", type=str, required=True)
22
+ parser.add_argument("--batch_size", type=int, default=16, required=False)
23
+ parser.add_argument(
24
+ "--results_folder", type=str, default="results_csv", required=False
25
+ )
26
+ parser.add_argument("--predictions_folder", type=str, default=None, required=False)
27
+ parser.add_argument(
28
+ "--benchmark",
29
+ type=str,
30
+ required=False,
31
+ default="ViDoRe(v3)",
32
+ choices=[
33
+ "ViDoRe(v3)", # Vidore V3
34
+ "VisualDocumentRetrieval", # Vidore V1 & V2
35
+ ],
36
+ )
37
+ parser.add_argument(
38
+ "--task-list",
39
+ type=str,
40
+ nargs="+", # Accept one or more space-separated string arguments
41
+ default=None, # Default to None if the argument is not provided
42
+ help="Optional: A list of task class names to run. If not provided, all tasks will be run.",
43
+ )
44
+ args = parser.parse_args()
45
+
46
+ print(f"Loading model: {args.model_name}")
47
+ model = mteb.get_model_meta(args.model_name)
48
+
49
+ # Loads all benchmark tasks
50
+ all_tasks = mteb.get_benchmark(args.benchmark).tasks
51
+ all_tasks_names = " ".join([task.__class__.__name__ for task in all_tasks])
52
+ print(f"Available tasks in benchmark {args.benchmark}: {all_tasks_names}")
53
+
54
+ # filter tasks
55
+ if args.task_list:
56
+ # If user provided a list, filter all_tasks
57
+ print(f"Running evaluation on specified tasks: {args.task_list}")
58
+ requested_task_names = set(args.task_list)
59
+ tasks = [
60
+ task
61
+ for task in all_tasks
62
+ if task.__class__.__name__ in requested_task_names
63
+ ]
64
+
65
+ # Optional: Warn if a requested task was not found
66
+ found_names = {t.__class__.__name__ for t in tasks}
67
+ missing = requested_task_names - found_names
68
+ if missing:
69
+ print(
70
+ f"Warning: The following requested tasks were not found and will be skipped: {missing}"
71
+ )
72
+ else:
73
+ # If --task-list was not provided, use all tasks
74
+ print("Running evaluation on all available tasks.")
75
+ tasks = all_tasks
76
+
77
+ tasks_names = " ".join([task.__class__.__name__ for task in tasks])
78
+ print(f"Evaluating tasks: {tasks_names}")
79
+
80
+ results = mteb.evaluate(
81
+ model=model,
82
+ tasks=tasks,
83
+ encode_kwargs={
84
+ "batch_size": args.batch_size,
85
+ },
86
+ prediction_folder=args.predictions_folder,
87
+ overwrite_strategy="always",
88
+ )
89
+
90
+ print(results)
91
+
92
+ print(f"Saving results to {args.results_folder}")
93
+ os.makedirs(args.results_folder, exist_ok=True)
94
+ model_name = args.model_name.replace("/", "_")
95
+ output_path = os.path.join(
96
+ args.results_folder, f"{model_name}-{tasks_names.replace(' ', '-')}.csv"
97
+ )
98
+ df = results.to_dataframe()
99
+ df.to_csv(output_path, index=False)
100
+
101
+
102
+ if __name__ == "__main__":
103
+ main()
LICENSE ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # NVIDIA License
2
+
3
+ ## 1. Definitions
4
+
5
+ “Licensor” means any person or entity that distributes its Work.
6
+ “Work” means (a) the original work of authorship made available under this license, which may include software, documentation, or other files, and (b) any additions to or derivative works thereof that are made available under this license.
7
+ The terms “reproduce,” “reproduction,” “derivative works,” and “distribution” have the meaning as provided under U.S. copyright law; provided, however, that for the purposes of this license, derivative works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work.
8
+ Works are “made available” under this license by including in or with the Work either (a) a copyright notice referencing the applicability of this license to the Work, or (b) a copy of this license.
9
+
10
+ ## 2. License Grant
11
+
12
+ 2.1 Copyright Grant. Subject to the terms and conditions of this license, each Licensor grants to you a perpetual, worldwide, non-exclusive, royalty-free, copyright license to use, reproduce, prepare derivative works of, publicly display, publicly perform, sublicense and distribute its Work and any resulting derivative works in any form.
13
+
14
+ ## 3. Limitations
15
+
16
+ 3.1 Redistribution. You may reproduce or distribute the Work only if (a) you do so under this license, (b) you include a complete copy of this license with your distribution, and (c) you retain without modification any copyright, patent, trademark, or attribution notices that are present in the Work.
17
+
18
+ 3.2 Derivative Works. You may specify that additional or different terms apply to the use, reproduction, and distribution of your derivative works of the Work (“Your Terms”) only if (a) Your Terms provide that the use limitation in Section 3.3 applies to your derivative works, and (b) you identify the specific derivative works that are subject to Your Terms. Notwithstanding Your Terms, this license (including the redistribution requirements in Section 3.1) will continue to apply to the Work itself.
19
+
20
+ 3.3 Use Limitation. The Work and any derivative works thereof only may be used or intended for use non-commercially. Notwithstanding the foregoing, NVIDIA Corporation and its affiliates may use the Work and any derivative works commercially. As used herein, “non-commercially” means for non-commercial research and educational purposes only.
21
+
22
+ 3.4 Patent Claims. If you bring or threaten to bring a patent claim against any Licensor (including any claim, cross-claim or counterclaim in a lawsuit) to enforce any patents that you allege are infringed by any Work, then your rights under this license from such Licensor (including the grant in Section 2.1) will terminate immediately.
23
+
24
+ 3.5 Trademarks. This license does not grant any rights to use any Licensor’s or its affiliates’ names, logos, or trademarks, except as necessary to reproduce the notices described in this license.
25
+
26
+ 3.6 Termination. If you violate any term of this license, then your rights under this license (including the grant in Section 2.1) will terminate immediately.
27
+
28
+ ## 4. Disclaimer of Warranty.
29
+
30
+ THE WORK IS PROVIDED “AS IS” WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF
31
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER THIS LICENSE.
32
+
33
+ ## 5. Limitation of Liability.
34
+
35
+ EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK (INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
README.md ADDED
@@ -0,0 +1,337 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: other
3
+ license_name: customized-nscl-v1
4
+ license_link: LICENSE
5
+ tags:
6
+ - text
7
+ - image
8
+ - vidore
9
+ - colpali
10
+ - multimodal-embedding
11
+ - multilingual-embedding
12
+ - Text-to-Visual Document (T→VD) retrieval
13
+ - feature-extraction
14
+ language:
15
+ - multilingual
16
+ inference: false
17
+ library_name: transformers
18
+ pipeline_tag: visual-document-retrieval
19
+ ---
20
+ # **Model Overview**
21
+
22
+ ## Description
23
+
24
+ The **nvidia/nemotron-colembed-4b-v2** is a state-of-the-art late-interaction embedding model that ranks No. 2 on ViDoRe V3 — a comprehensive benchmark evaluating retrieval for enterprise use cases — with a score of `61.74` on 8 public tasks (as of Jan 21, 2026). The model was fine-tuned for query-document retrieval. Users can input `queries`, which are text, or `documents`, which are page images, to the model. The model outputs ColBERT-style multi-vector numerical representations for input queries and documents.
25
+
26
+
27
+ ✨ **Key Improvements:**
28
+ * ⚗️ **Advanced Model Merging:** Utilizes post-training model merging to combine the strengths of multiple fine-tuned checkpoints. This delivers the accuracy stability of an ensemble without any additional inference latency.
29
+ * 🌍 **Enhanced Synthetic Data:** We significantly enriched our training mixture with diverse multilingual synthetic data, improving semantic alignment across languages and complex document types.
30
+
31
+ This model is for non-commercial/research use only.
32
+
33
+
34
+ ### Deployment Geography
35
+ Global
36
+
37
+ ### Use Case
38
+ `nemotron-colembed-4b-v2` is intended for researchers exploring applications that must understand or retrieve information across both text and image modalities. It is instrumental in multimodal RAG systems, where queries are in text format and documents are images, such as pages, text, charts, tables or infographics. Potential applications include multimedia search engines, cross-modal retrieval systems, and conversational AI with rich input understanding.
39
+
40
+ ### License/Terms of Use
41
+ The use of this model is governed by the [NVIDIA Non-Commercial License Agreement](https://huggingface.co/nvidia/nemotron-colembed-4b-v2/blob/main/LICENSE) and the use of the post-processing scripts are licensed under [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0.txt). Additional Information: Built with Qwen3-VL which is released under [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0.txt).
42
+
43
+ This project will download and install additional third-party open source software projects. Review the license terms of these open source projects before use.
44
+
45
+ ### Release Date
46
+ 01/21/2026 via [https://huggingface.co/nvidia/nemotron-colembed-4b-v2](https://huggingface.co/nvidia/nemotron-colembed-4b-v2)
47
+
48
+ ## Model Architecture
49
+
50
+ - **Architecture Type:** Transformer
51
+ - **Network Architecture:** [Qwen3-VL-4B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct) based encoder.
52
+
53
+ The `nemotron-colembed-4b-v2` is a transformer-based multimodal embedding model built from `Qwen3-VL-4B-Instruct`, which adopts a three-module architecture comprising a vision encoder (the [SigLIP-2 architecture](https://huggingface.co/google/siglip2-large-patch16-256)), an MLP-based vision–language merger, and a large language model (LLM) (see [technical report](https://arxiv.org/pdf/2511.21631) for details). It has approximately 4.8B parameters.
54
+
55
+
56
+ ## Input(s):
57
+ **Input Type(s):** Image, Text <br>
58
+
59
+ **Input Format(s):** <br>
60
+ - Image: List of images- Red, Green, Blue (RGB) <br>
61
+ - Text: List of Strings <br>
62
+
63
+ **Input Parameters:** <br>
64
+ - Image: Two-Dimensional (2D) <br>
65
+ - Text: One-Dimensional (1D) <br>
66
+
67
+
68
+ **Other Properties Related to Input:**
69
+ - The maximum context length at which we evaluated the model is 10240 tokens. <br>
70
+ - Each image tile consumes 256 tokens. We have tested this model extensively with these settings on config.json - `max_input_tiles = 8`, `use_thumbnails = True`, so that every image is split into maximum 8 tiles + 1 thumbnail (whole image at lower resolution). Images must be python PIL format. The model will scale the image into multiple tiles of 512x512.
71
+
72
+
73
+ ## Outputs
74
+
75
+ - **Output Type:** Floats
76
+ - **Output Format:** List of float arrays
77
+ - **Output Parameters:** A list of float arrays with shape [batch_size x seq_length x embedding_dim]
78
+ - **Other Properties Related to Output:** The model outputs one embedding vector per input token.
79
+
80
+ Our AI models are designed and/or optimized to run on NVIDIA GPU-accelerated systems. By leveraging NVIDIA’s hardware (e.g. GPU cores) and software frameworks (e.g., CUDA libraries), the model achieves faster training and inference times compared to CPU-only solutions.
81
+
82
+
83
+ ### Installation
84
+ The model requires transformers version 5.0.0rc0 and flash attention installed.
85
+
86
+ ```bash
87
+ pip install transformers==5.0.0rc0
88
+ pip install flash-attn==2.6.3 --no-build-isolation
89
+ ```
90
+ Depending on your environment you might need to upgrade polars and pydantic:
91
+
92
+ ```bash
93
+ pip install -U datasets polars
94
+ pip install -U pydantic
95
+ ```
96
+
97
+ ### Transformers Usage
98
+
99
+ ```python
100
+ import requests
101
+ from PIL import Image
102
+ from io import BytesIO
103
+ import torch
104
+ from transformers import AutoModel
105
+ from transformers.image_utils import load_image
106
+ # Load Model
107
+
108
+ model = AutoModel.from_pretrained(
109
+ 'nvidia/nemotron-colembed-4b-v2',
110
+ device_map='cuda',
111
+ trust_remote_code=True,
112
+ torch_dtype=torch.bfloat16,
113
+ attn_implementation="flash_attention_2"
114
+ ).eval()
115
+
116
+ # Queries
117
+ queries = [
118
+ 'How is AI improving the intelligence and capabilities of robots?',
119
+ 'Canary, a multilingual model that transcribes speech in English, Spanish, German, and French with punctuation and capitalization.',
120
+ 'Generative AI can generate DNA sequences that can be translated into proteins for bioengineering.'
121
+ ]
122
+
123
+ image_urls = [
124
+ "https://developer.download.nvidia.com/images/isaac/nvidia-isaac-lab-1920x1080.jpg",
125
+ "https://developer-blogs.nvidia.com/wp-content/uploads/2024/03/asr-nemo-canary-featured.jpg",
126
+ "https://blogs.nvidia.com/wp-content/uploads/2023/02/genome-sequencing-helix.jpg"
127
+ ]
128
+
129
+ # Load all images (load_image handles both local paths and URLs)
130
+ images = [load_image(img_path) for img_path in image_urls]
131
+
132
+ # Encoding
133
+ query_embeddings = model.forward_queries(queries, batch_size=8)
134
+ image_embeddings = model.forward_images(images, batch_size=8)
135
+
136
+ scores = model.get_scores(
137
+ query_embeddings,
138
+ image_embeddings
139
+ )
140
+ # Diagonal should have higher scores
141
+ print(scores)
142
+
143
+ # tensor([[21.5332, 21.1848, 20.9185],
144
+ # [32.4948, 33.2485, 32.5982],
145
+ # [26.0623, 26.1014, 26.5692]], device='cuda:0')
146
+ ```
147
+
148
+
149
+ ## Software Integration: <br>
150
+
151
+ Runtime Engine(s): TensorRT, Triton <br>
152
+ Supported Hardware Microarchitecture Compatibility: A100 40GB, A100 80GB, H100 80GB <br>
153
+ Supported Operating System(s): Linux
154
+
155
+ ## Model Version(s)
156
+ **nemotron-colembed-4b-v2**
157
+
158
+ # Training and Evaluation Datasets
159
+
160
+ ## Training Dataset
161
+
162
+ The model was trained on publicly available datasets, including [DocMatix-IR](https://huggingface.co/datasets/Tevatron/docmatix-ir), [VDR](https://huggingface.co/datasets/vdr-multilingual-train), [Vidore-ColPali-Training](https://huggingface.co/datasets/vidore/colpali_train_set), [VisRAG-Ret-Train-Synthetic-data](https://huggingface.co/datasets/openbmb/VisRAG-Ret-Train-Synthetic-data), [VisRAG-Ret-Train-In-domain-data](https://huggingface.co/datasets/openbmb/VisRAG-Ret-Train-In-domain-data), and [Wiki-SS-NQ](https://huggingface.co/datasets/Tevatron/wiki-ss-nq).
163
+
164
+ **Data Modality**: Image
165
+
166
+ **Image Training Data Size**
167
+ - Less than a Million Images
168
+
169
+ **Data Collection Method by dataset:** Hybrid: Automated, Human, Synthetic <br>
170
+ **Labeling Method by dataset:** Hybrid: Automated, Human, Synthetic <br>
171
+ **Properties:** Training: The vision embedding model was fine-tuned on approximately 500k image samples.
172
+
173
+ ## Evaluation Dataset
174
+
175
+ We evaluate the model on the datasets from [ViDoRe](https://huggingface.co/spaces/vidore/README) V1, V2 and V3 Visual Document Retrieval benchmarks.
176
+
177
+
178
+ [ViDoRe](https://huggingface.co/spaces/vidore/README) is a premier benchmark for Visual Document Retrieval and it is composed of various page-level retrieving tasks spanning multiple domains, languages, and settings. The latest version of the benchmark is [Vidore V3](https://huggingface.co/blog/QuentinJG/introducing-vidore-v3), a comprehensive evaluation of retrieval for enterprise use-cases.
179
+
180
+ We provide a [script](https://huggingface.co/nvidia/nemotron-colembed-4b-v2/blob/main/mteb2_eval.py) using [MTEB 2](https://github.com/embeddings-benchmark/mteb/tree/main/mteb) library to evaluate ColEmbed models on ViDoRe benchmarks.
181
+
182
+ - **Data Collection Method by dataset:** Hybrid: Automated, Human, Synthetic
183
+ - **Labeling Method by dataset:** Hybrid: Automated, Human, Synthetic
184
+ - **Properties:** More details on ViDoRe V1 and ViDoRe V2 can be found on their leaderboard. [Visual Document Retrieval Benchmark](https://huggingface.co/vidore),
185
+
186
+
187
+ ## Evaluation Results
188
+
189
+ ### ViDoRE V1&V2 and V3 on MTEB leaderboards
190
+
191
+ ```bash
192
+ pip install "mteb>=2.7.0, <3.0.0"
193
+ # Evaluates with Vidore V1 and V2
194
+ CUDA_VISIBLE_DEVICES=0; python3 mteb2_eval.py --model_name nvidia/nemotron-colembed-4b-v2 --batch_size 16 --benchmark "VisualDocumentRetrieval"
195
+ # Evaluates with Vidore V3
196
+ CUDA_VISIBLE_DEVICES=0; python3 mteb2_eval.py --model_name nvidia/nemotron-colembed-4b-v2 --batch_size 16 --benchmark "ViDoRe(v3)"
197
+ # Evaluates with a specific task/dataset of Vidore V3: Vidore3ComputerScienceRetrieval
198
+ CUDA_VISIBLE_DEVICES=0; python3 mteb2_eval.py --model_name nvidia/nemotron-colembed-4b-v2 --batch_size 16 --benchmark "ViDoRe(v3)" --task-list Vidore3ComputerScienceRetrieval
199
+ ```
200
+
201
+ In this section, we evaluate the performance of nemotron-colembed-4b-v2 against other models that previously achieved top-five rankings on the leaderboards.
202
+
203
+ We report results on the ViDoRe benchmark suite. The tables below summarize the image-modality accuracy of nemotron-colembed-4b-v2 on the ViDoRe V1, V2, and V3 benchmarks, alongside other NVIDIA nemotron-colembed models. Note that (M)MTEB leaderboards use Borda ranking. Each task acts like a voter that ranks models based on how well they perform. Models earn more points when they rank higher on a task. The model with the most total points across all tasks gets the top overall rank.
204
+
205
+
206
+ #### ViDoRe V3 (NDCG@10)
207
+
208
+ | Model | **Avg** | CompSci | Energy | FinanceEn | FinanceFr | HR | Industrial | Pharma | Physics |
209
+ | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- |
210
+ | **nemotron-colembed-8b**| 63.50 | 79.45 | 69.39 | 67.54 | 51.44 | 66.41 | 56.07 | 67.11 | 50.59 |
211
+ | nemotron-colembed-4b | 61.74 | 78.41 | 66.77 | 64.76 | 48.98 | 66.13 | 53.83 | 66.13 | 48.88 |
212
+ | tomoro-colqwen3-8b| 61.60 | 75.35 | 68.41 | 65.08 | 49.10 | 63.98 | 54.41 | 66.36 | 50.13|
213
+ | tomoro-colqwen3-4b| 60.16 | 75.44 | 66.43 | 63.84 | 46.83 | 60.09 | 53.58 | 65.74 | 49.32 |
214
+ | [nemotron-colembed-3b-v2](https://huggingface.co/nvidia/llama-nemotron-colembed-3b-v2) | 59.70 | 77.09 | 64.88 | 64.23 | 44.41 | 62.28 | 51.71 | 66.04 | 46.93 |
215
+ | nomic-ai/colnomic-embed-multimodal-7b | 57.64 | 76.20 | 63.58 | 56.57 | 45.46 | 58.67 | 50.13 | 62.26 | 48.25 |
216
+ | jinaai/jina-embeddings-v4 | 57.54 | 71.81 | 63.50 | 59.30 | 46.10 | 59.53 | 50.38 | 63.09 | 46.63 |
217
+
218
+
219
+
220
+ #### ViDoRe V2 (NDCG@10)
221
+
222
+ | Model | **Avg** | BioMedicalLectures | ESGReportsHL | ESGReports | EconomicsReports |
223
+ | :--- | :--- | :--- | :--- | :--- | :--- |
224
+ | **nemotron-colembed-8b** | 67.35 | 68.48 | 73.43 | 67.93 | 59.54 |
225
+ | nemotron-colembed-4b | 66.92 | 67.84 | 73.00 | 67.76 | 59.08 |
226
+ | [nemotron-colembed-3b-v2](https://huggingface.co/nvidia/llama-nemotron-colembed-3b-v2)| 66.75 | 68.23 | 77.91 | 64.71 | 56.15 |
227
+ | tomoro-colqwen3-8b| 65.40 | 65.47 | 75.98 | 60.71 | 59.46 |
228
+ | EvoQwen2.5-VL-Retriever-7B-v1 | 65.24 | 65.20 | 76.98 | 59.67 | 59.13 |
229
+ | tomoro-colqwen3-4b| 64.69 | 65.38 | 74.65 | 62.44 | 56.30 |
230
+ | [nemotron-colembed-3b-v1](https://huggingface.co/nvidia/llama-nemoretriever-colembed-3b-v1)| 63.32 | 62.70 | 75.38 | 57.38 | 57.84 |
231
+
232
+
233
+
234
+ #### ViDoRe V1 (NDCG@10)
235
+
236
+ | Model | **Avg** | ArxivQA | DocVQA | InfoVQA | Shift | Syn-AI | Syn-Energy | Syn-Gov | Syn-Health | TabFQuAD | Tatdqa |
237
+ | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- |
238
+ | [nemotron-colembed-3b-v2](https://huggingface.co/nvidia/llama-nemotron-colembed-3b-v2)| 92.16 | 91.12 | 68.46 | 94.86 | 92.30 | 100.00 | 98.02 | 97.95 | 98.89 | 97.48 | 82.49 |
239
+ | nemotron-colembed-8b | 92.06 | 92.23 | 68.61 | 94.11 | 92.66 | 100.00 | 96.26 | 98.02 | 97.79 | 98.03 | 82.90 |
240
+ | nemotron-colembed-4b | 92.01 | 92.56 | 69.18 | 93.89 | 91.88 | 99.26 | 96.19 | 98.39 | 98.16 | 98.16 | 82.46 |
241
+ | [nemotron-colembed-3b-v1](https://huggingface.co/nvidia/llama-nemoretriever-colembed-3b-v1)| 91.00 | 88.35 | 66.21 | 94.92 | 90.70 | 99.63 | 96.63 | 97.82 | 99.26 | 95.94 | 80.57 |
242
+ | tomoro-colqwen3-8b| 90.76 | 91.15 | 66.37 | 94.48 | 87.89 | 99.26 | 96.71 | 97.58 | 99.06 | 94.23 | 80.92 |
243
+ | EvoQwen2.5-VL-Retriever-7B-v1 | 90.68 | 91.49 | 65.07 | 94.11 | 88.80 | 99.63 | 96.63 | 96.29 | 98.89 | 93.63 | 82.26 |
244
+ | tomoro-colqwen3-4b| 90.57 | 90.58 | 66.30 | 94.31 | 87.39 | 99.26 | 96.91 | 97.17 | 99.63 | 94.33 | 79.87 |
245
+
246
+
247
+ ## Inference:
248
+ **Acceleration Engine:** Not Applicable <br>
249
+ **Test Hardware:** A100 40GB, A100 80GB, H100 80GB
250
+
251
+
252
+
253
+ ### Citation
254
+
255
+ ```
256
+ @misc{xu2025llamanemoretrievercolembedtopperforming,
257
+ title={Llama Nemoretriever Colembed: Top-Performing Text-Image Retrieval Model},
258
+ author={Mengyao Xu and Gabriel Moreira and Ronay Ak and Radek Osmulski and Yauhen Babakhin and Zhiding Yu and Benedikt Schifferer and Even Oldridge},
259
+ year={2025},
260
+ eprint={2507.05513},
261
+ archivePrefix={arXiv},
262
+ primaryClass={cs.CV},
263
+ url={https://arxiv.org/abs/2507.05513},
264
+ }
265
+
266
+ @misc{moreira2025nvretrieverimprovingtextembedding,
267
+ title={NV-Retriever: Improving text embedding models with effective hard-negative mining},
268
+ author={Gabriel de Souza P. Moreira and Radek Osmulski and Mengyao Xu and Ronay Ak and Benedikt Schifferer and Even Oldridge},
269
+ year={2025},
270
+ eprint={2407.15831},
271
+ archivePrefix={arXiv},
272
+ primaryClass={cs.IR},
273
+ url={https://arxiv.org/abs/2407.15831},
274
+ }
275
+
276
+ @article{Qwen3-VL,
277
+ title={Qwen3-VL Technical Report},
278
+ author={Shuai Bai and Yuxuan Cai and Ruizhe Chen and Keqin Chen and Xionghui Chen and Zesen Cheng and Lianghao Deng and Wei Ding and Chang Gao and Chunjiang Ge and Wenbin Ge and Zhifang Guo and Qidong Huang and Jie Huang and Fei Huang and Binyuan Hui and Shutong Jiang and Zhaohai Li and Mingsheng Li and Mei Li and Kaixin Li and Zicheng Lin and Junyang Lin and Xuejing Liu and Jiawei Liu and Chenglong Liu and Yang Liu and Dayiheng Liu and Shixuan Liu and Dunjie Lu and Ruilin Luo and Chenxu Lv and Rui Men and Lingchen Meng and Xuancheng Ren and Xingzhang Ren and Sibo Song and Yuchong Sun and Jun Tang and Jianhong Tu and Jianqiang Wan and Peng Wang and Pengfei Wang and Qiuyue Wang and Yuxuan Wang and Tianbao Xie and Yiheng Xu and Haiyang Xu and Jin Xu and Zhibo Yang and Mingkun Yang and Jianxin Yang and An Yang and Bowen Yu and Fei Zhang and Hang Zhang and Xi Zhang and Bo Zheng and Humen Zhong and Jingren Zhou and Fan Zhou and Jing Zhou and Yuanzhi Zhu and Ke Zhu},
279
+ journal={arXiv preprint arXiv:2511.21631},
280
+ year={2025}
281
+ }
282
+ ```
283
+
284
+ ## **Ethical Considerations**
285
+
286
+ NVIDIA believes Trustworthy AI is a shared responsibility and we have established policies and practices to enable development for a wide array of AI applications. When downloaded or used in accordance with our terms of service, developers should work with their supporting model team to ensure this model meets requirements for the relevant industry and use case and addresses unforeseen product misuse.
287
+
288
+ For more detailed information on ethical considerations for this model, please see the Explainability, Bias, Safety, and Privacy sections.
289
+
290
+ Please report security vulnerabilities or NVIDIA AI Concerns [here](https://www.nvidia.com/en-us/support/submit-security-vulnerability/).
291
+
292
+
293
+ ## Bias
294
+
295
+ | Field | Response |
296
+ | ----- | ----- |
297
+ | Participation considerations from adversely impacted groups [protected classes](https://www.senate.ca.gov/content/protected-classes) in model design and testing | None |
298
+ | Measures taken to mitigate against unwanted bias | None |
299
+
300
+ ## Explainability
301
+
302
+ | Field | Response |
303
+ | ----- | ----- |
304
+ | Intended Application & Domain: | Document and query embedding for question and answer retrieval.|
305
+ | Model Type: | Transformer encoder. |
306
+ | Intended User: | Generative AI creators working with conversational AI models. Users who want to build a question and answer application over a large corpus, leveraging the latest dense retrieval technologies. The corpus can be images of PDFs, such as text, tables, charts or infographics, or extracted plain text. |
307
+ | Output: | Array of float numbers (Dense Vector Representation for the input text). |
308
+ | Describe how the model works: | Model transforms the input into a dense vector representation. |
309
+ | Technical Limitations: | The model's max sequence length is 10240. Longer text inputs should be truncated. |
310
+ | Name the adversely impacted groups this has been tested to deliver comparable outcomes regardless of: | N/A |
311
+ | Verified to have met prescribed NVIDIA quality standards: | Yes |
312
+ | Performance Metrics: | Accuracy, Throughput, and Latency. |
313
+ | Potential Known Risks: | This model does not guarantee to always retrieve the correct passage(s) for a given query. |
314
+ | Licensing & Terms of Use: | The use of this model is governed by the [NVIDIA Non-Commercial License](https://huggingface.co/nvidia/nemotron-colembed-4b-v2/blob/main/LICENSE) and the use of the post-processing scripts are licensed under [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0.txt). Additional Information: Built with Qwen3-VL which is released under [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0.txt).|
315
+
316
+ ## Privacy
317
+
318
+ | Field | Response |
319
+ | ----- | ----- |
320
+ | Generatable or reverse engineerable personal data? | No |
321
+ | Personal data used to create this model? | None Known |
322
+ | How often is dataset reviewed? | Dataset is initially reviewed upon addition, and subsequent reviews are conducted as needed or upon request for changes. |
323
+ | Is there provenance for all datasets used in training? | Yes |
324
+ | Does data labeling (annotation, metadata) comply with privacy laws? | Yes |
325
+ | Is data compliant with data subject requests for data correction or removal, if such a request was made? | No, not possible with externally-sourced data. |
326
+ | Was data from user interactions with the AI model (e.g. user input and prompts) used to train the model? | No |
327
+ | Was consent obtained for any personal data used? | Not Applicable |
328
+ | Applicable Privacy Policy | https://www.nvidia.com/en-us/about-nvidia/privacy-policy/ |
329
+
330
+ ## Safety
331
+
332
+ | Field | Response |
333
+ | ----- | ----- |
334
+ | Model Application(s): | Document Embedding for Retrieval. User queries can be text and documents can be text, document page images, charts, tables, and infographics. |
335
+ | Describe the physical safety impact (if present). | Not Applicable |
336
+ | Use Case Restrictions: | The use of this model is governed by the [NVIDIA Non-Commercial License](https://huggingface.co/nvidia/nemotron-colembed-4b-v2/blob/main/LICENSE) and the use of the post-processing scripts are licensed under [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0.txt). Additional Information: Built with Qwen3-VL which is released under [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0.txt).|
337
+ | Model and dataset restrictions: | The Principle of least privilege (PoLP) is applied limiting access for dataset generation and model development. Restrictions enforce dataset access during training, and dataset license constraints adhered to. |
chat_template.jinja ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {%- if messages[0].content is string %}
5
+ {{- messages[0].content }}
6
+ {%- else %}
7
+ {%- for content in messages[0].content %}
8
+ {%- if 'text' in content %}
9
+ {{- content.text }}
10
+ {%- endif %}
11
+ {%- endfor %}
12
+ {%- endif %}
13
+ {{- '\n\n' }}
14
+ {%- endif %}
15
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
16
+ {%- for tool in tools %}
17
+ {{- "\n" }}
18
+ {{- tool | tojson }}
19
+ {%- endfor %}
20
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
21
+ {%- else %}
22
+ {%- if messages[0].role == 'system' %}
23
+ {{- '<|im_start|>system\n' }}
24
+ {%- if messages[0].content is string %}
25
+ {{- messages[0].content }}
26
+ {%- else %}
27
+ {%- for content in messages[0].content %}
28
+ {%- if 'text' in content %}
29
+ {{- content.text }}
30
+ {%- endif %}
31
+ {%- endfor %}
32
+ {%- endif %}
33
+ {{- '<|im_end|>\n' }}
34
+ {%- endif %}
35
+ {%- endif %}
36
+ {%- set image_count = namespace(value=0) %}
37
+ {%- set video_count = namespace(value=0) %}
38
+ {%- for message in messages %}
39
+ {%- if message.role == "user" %}
40
+ {{- '<|im_start|>' + message.role + '\n' }}
41
+ {%- if message.content is string %}
42
+ {{- message.content }}
43
+ {%- else %}
44
+ {%- for content in message.content %}
45
+ {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}
46
+ {%- set image_count.value = image_count.value + 1 %}
47
+ {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}
48
+ <|vision_start|><|image_pad|><|vision_end|>
49
+ {%- elif content.type == 'video' or 'video' in content %}
50
+ {%- set video_count.value = video_count.value + 1 %}
51
+ {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}
52
+ <|vision_start|><|video_pad|><|vision_end|>
53
+ {%- elif 'text' in content %}
54
+ {{- content.text }}
55
+ {%- endif %}
56
+ {%- endfor %}
57
+ {%- endif %}
58
+ {{- '<|im_end|>\n' }}
59
+ {%- elif message.role == "assistant" %}
60
+ {{- '<|im_start|>' + message.role + '\n' }}
61
+ {%- if message.content is string %}
62
+ {{- message.content }}
63
+ {%- else %}
64
+ {%- for content_item in message.content %}
65
+ {%- if 'text' in content_item %}
66
+ {{- content_item.text }}
67
+ {%- endif %}
68
+ {%- endfor %}
69
+ {%- endif %}
70
+ {%- if message.tool_calls %}
71
+ {%- for tool_call in message.tool_calls %}
72
+ {%- if (loop.first and message.content) or (not loop.first) %}
73
+ {{- '\n' }}
74
+ {%- endif %}
75
+ {%- if tool_call.function %}
76
+ {%- set tool_call = tool_call.function %}
77
+ {%- endif %}
78
+ {{- '<tool_call>\n{"name": "' }}
79
+ {{- tool_call.name }}
80
+ {{- '", "arguments": ' }}
81
+ {%- if tool_call.arguments is string %}
82
+ {{- tool_call.arguments }}
83
+ {%- else %}
84
+ {{- tool_call.arguments | tojson }}
85
+ {%- endif %}
86
+ {{- '}\n</tool_call>' }}
87
+ {%- endfor %}
88
+ {%- endif %}
89
+ {{- '<|im_end|>\n' }}
90
+ {%- elif message.role == "tool" %}
91
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
92
+ {{- '<|im_start|>user' }}
93
+ {%- endif %}
94
+ {{- '\n<tool_response>\n' }}
95
+ {%- if message.content is string %}
96
+ {{- message.content }}
97
+ {%- else %}
98
+ {%- for content in message.content %}
99
+ {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}
100
+ {%- set image_count.value = image_count.value + 1 %}
101
+ {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}
102
+ <|vision_start|><|image_pad|><|vision_end|>
103
+ {%- elif content.type == 'video' or 'video' in content %}
104
+ {%- set video_count.value = video_count.value + 1 %}
105
+ {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}
106
+ <|vision_start|><|video_pad|><|vision_end|>
107
+ {%- elif 'text' in content %}
108
+ {{- content.text }}
109
+ {%- endif %}
110
+ {%- endfor %}
111
+ {%- endif %}
112
+ {{- '\n</tool_response>' }}
113
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
114
+ {{- '<|im_end|>\n' }}
115
+ {%- endif %}
116
+ {%- endif %}
117
+ {%- endfor %}
118
+ {%- if add_generation_prompt %}
119
+ {{- '<|im_start|>assistant\n' }}
120
+ {%- endif %}
config.json ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3VLNemotronEmbedModel"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "modeling_qwen3_vl_nemotron_embed.Qwen3VLNemotronEmbedConfig",
7
+ "AutoModel": "modeling_qwen3_vl_nemotron_embed.Qwen3VLNemotronEmbedModel",
8
+ "AutoProcessor": "processing_qwen3_vl_nemotron_embed.Qwen3VLNemotronEmbedProcessor"
9
+ },
10
+ "dtype": "bfloat16",
11
+ "image_token_id": 151655,
12
+ "model_type": "qwen3_vl_nemotron_embed",
13
+ "pooling": "colbert",
14
+ "text_config": {
15
+ "attention_bias": false,
16
+ "attention_dropout": 0.0,
17
+ "bos_token_id": 151643,
18
+ "dtype": "bfloat16",
19
+ "eos_token_id": 151645,
20
+ "head_dim": 128,
21
+ "hidden_act": "silu",
22
+ "hidden_size": 2560,
23
+ "initializer_range": 0.02,
24
+ "intermediate_size": 9728,
25
+ "max_position_embeddings": 262144,
26
+ "model_type": "qwen3_vl_text",
27
+ "num_attention_heads": 32,
28
+ "num_hidden_layers": 36,
29
+ "num_key_value_heads": 8,
30
+ "rms_norm_eps": 1e-06,
31
+ "rope_parameters": {
32
+ "mrope_interleaved": true,
33
+ "mrope_section": [
34
+ 24,
35
+ 20,
36
+ 20
37
+ ],
38
+ "rope_theta": 5000000,
39
+ "rope_type": "default"
40
+ },
41
+ "tie_word_embeddings": true,
42
+ "use_cache": true,
43
+ "vocab_size": 151936
44
+ },
45
+ "tie_word_embeddings": true,
46
+ "transformers_version": "5.0.0rc0",
47
+ "use_cache": false,
48
+ "video_token_id": 151656,
49
+ "vision_config": {
50
+ "deepstack_visual_indexes": [
51
+ 5,
52
+ 11,
53
+ 17
54
+ ],
55
+ "depth": 24,
56
+ "dtype": "bfloat16",
57
+ "hidden_act": "gelu_pytorch_tanh",
58
+ "hidden_size": 1024,
59
+ "in_channels": 3,
60
+ "initializer_range": 0.02,
61
+ "intermediate_size": 4096,
62
+ "model_type": "qwen3_vl",
63
+ "num_heads": 16,
64
+ "num_position_embeddings": 2304,
65
+ "out_hidden_size": 2560,
66
+ "patch_size": 16,
67
+ "spatial_merge_size": 2,
68
+ "temporal_patch_size": 2
69
+ },
70
+ "vision_end_token_id": 151653,
71
+ "vision_start_token_id": 151652
72
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 151643,
4
+ "eos_token_id": 151645,
5
+ "transformers_version": "5.0.0rc0",
6
+ "use_cache": false
7
+ }
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6223655cd16c06e3e48f3d7a3089c4d677990b336acea814a3ba3c6400da7a2c
3
+ size 4990497880
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1bfa21ee16dafffd08a7fdf904cea5c2f503a47660bbcb08ade8fe7f42aa7719
3
+ size 4663133960
model.safetensors.index.json ADDED
@@ -0,0 +1,722 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 4826771968,
4
+ "total_size": 9653543936
5
+ },
6
+ "weight_map": {
7
+ "lm_head.weight": "model-00002-of-00002.safetensors",
8
+ "model.language_model.embed_tokens.weight": "model-00001-of-00002.safetensors",
9
+ "model.language_model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
10
+ "model.language_model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
11
+ "model.language_model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
12
+ "model.language_model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
13
+ "model.language_model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
14
+ "model.language_model.layers.0.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
15
+ "model.language_model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
16
+ "model.language_model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
17
+ "model.language_model.layers.0.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
18
+ "model.language_model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
19
+ "model.language_model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
20
+ "model.language_model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
21
+ "model.language_model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
22
+ "model.language_model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
23
+ "model.language_model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
24
+ "model.language_model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
25
+ "model.language_model.layers.1.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
26
+ "model.language_model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
27
+ "model.language_model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
28
+ "model.language_model.layers.1.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
29
+ "model.language_model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
30
+ "model.language_model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
31
+ "model.language_model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
32
+ "model.language_model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
33
+ "model.language_model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
34
+ "model.language_model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
35
+ "model.language_model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
36
+ "model.language_model.layers.10.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
37
+ "model.language_model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
38
+ "model.language_model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
39
+ "model.language_model.layers.10.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
40
+ "model.language_model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
41
+ "model.language_model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
42
+ "model.language_model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
43
+ "model.language_model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
44
+ "model.language_model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
45
+ "model.language_model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
46
+ "model.language_model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
47
+ "model.language_model.layers.11.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
48
+ "model.language_model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
49
+ "model.language_model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
50
+ "model.language_model.layers.11.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
51
+ "model.language_model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
52
+ "model.language_model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
53
+ "model.language_model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
54
+ "model.language_model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
55
+ "model.language_model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
56
+ "model.language_model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
57
+ "model.language_model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
58
+ "model.language_model.layers.12.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
59
+ "model.language_model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
60
+ "model.language_model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
61
+ "model.language_model.layers.12.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
62
+ "model.language_model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
63
+ "model.language_model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
64
+ "model.language_model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
65
+ "model.language_model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
66
+ "model.language_model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
67
+ "model.language_model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
68
+ "model.language_model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
69
+ "model.language_model.layers.13.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
70
+ "model.language_model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
71
+ "model.language_model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
72
+ "model.language_model.layers.13.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
73
+ "model.language_model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
74
+ "model.language_model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
75
+ "model.language_model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
76
+ "model.language_model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
77
+ "model.language_model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
78
+ "model.language_model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
79
+ "model.language_model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
80
+ "model.language_model.layers.14.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
81
+ "model.language_model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
82
+ "model.language_model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
83
+ "model.language_model.layers.14.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
84
+ "model.language_model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
85
+ "model.language_model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
86
+ "model.language_model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
87
+ "model.language_model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
88
+ "model.language_model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
89
+ "model.language_model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
90
+ "model.language_model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
91
+ "model.language_model.layers.15.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
92
+ "model.language_model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
93
+ "model.language_model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
94
+ "model.language_model.layers.15.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
95
+ "model.language_model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
96
+ "model.language_model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
97
+ "model.language_model.layers.16.input_layernorm.weight": "model-00002-of-00002.safetensors",
98
+ "model.language_model.layers.16.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
99
+ "model.language_model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
100
+ "model.language_model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
101
+ "model.language_model.layers.16.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
102
+ "model.language_model.layers.16.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
103
+ "model.language_model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
104
+ "model.language_model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
105
+ "model.language_model.layers.16.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
106
+ "model.language_model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
107
+ "model.language_model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
108
+ "model.language_model.layers.17.input_layernorm.weight": "model-00002-of-00002.safetensors",
109
+ "model.language_model.layers.17.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
110
+ "model.language_model.layers.17.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
111
+ "model.language_model.layers.17.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
112
+ "model.language_model.layers.17.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
113
+ "model.language_model.layers.17.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
114
+ "model.language_model.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
115
+ "model.language_model.layers.17.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
116
+ "model.language_model.layers.17.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
117
+ "model.language_model.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
118
+ "model.language_model.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
119
+ "model.language_model.layers.18.input_layernorm.weight": "model-00002-of-00002.safetensors",
120
+ "model.language_model.layers.18.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
121
+ "model.language_model.layers.18.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
122
+ "model.language_model.layers.18.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
123
+ "model.language_model.layers.18.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
124
+ "model.language_model.layers.18.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
125
+ "model.language_model.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
126
+ "model.language_model.layers.18.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
127
+ "model.language_model.layers.18.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
128
+ "model.language_model.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
129
+ "model.language_model.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
130
+ "model.language_model.layers.19.input_layernorm.weight": "model-00002-of-00002.safetensors",
131
+ "model.language_model.layers.19.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
132
+ "model.language_model.layers.19.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
133
+ "model.language_model.layers.19.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
134
+ "model.language_model.layers.19.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
135
+ "model.language_model.layers.19.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
136
+ "model.language_model.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
137
+ "model.language_model.layers.19.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
138
+ "model.language_model.layers.19.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
139
+ "model.language_model.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
140
+ "model.language_model.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
141
+ "model.language_model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
142
+ "model.language_model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
143
+ "model.language_model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
144
+ "model.language_model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
145
+ "model.language_model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
146
+ "model.language_model.layers.2.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
147
+ "model.language_model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
148
+ "model.language_model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
149
+ "model.language_model.layers.2.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
150
+ "model.language_model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
151
+ "model.language_model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
152
+ "model.language_model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors",
153
+ "model.language_model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
154
+ "model.language_model.layers.20.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
155
+ "model.language_model.layers.20.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
156
+ "model.language_model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
157
+ "model.language_model.layers.20.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
158
+ "model.language_model.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
159
+ "model.language_model.layers.20.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
160
+ "model.language_model.layers.20.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
161
+ "model.language_model.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
162
+ "model.language_model.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
163
+ "model.language_model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
164
+ "model.language_model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
165
+ "model.language_model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
166
+ "model.language_model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
167
+ "model.language_model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
168
+ "model.language_model.layers.21.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
169
+ "model.language_model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
170
+ "model.language_model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
171
+ "model.language_model.layers.21.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
172
+ "model.language_model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
173
+ "model.language_model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
174
+ "model.language_model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
175
+ "model.language_model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
176
+ "model.language_model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
177
+ "model.language_model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
178
+ "model.language_model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
179
+ "model.language_model.layers.22.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
180
+ "model.language_model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
181
+ "model.language_model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
182
+ "model.language_model.layers.22.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
183
+ "model.language_model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
184
+ "model.language_model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
185
+ "model.language_model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
186
+ "model.language_model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
187
+ "model.language_model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
188
+ "model.language_model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
189
+ "model.language_model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
190
+ "model.language_model.layers.23.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
191
+ "model.language_model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
192
+ "model.language_model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
193
+ "model.language_model.layers.23.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
194
+ "model.language_model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
195
+ "model.language_model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
196
+ "model.language_model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
197
+ "model.language_model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
198
+ "model.language_model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
199
+ "model.language_model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
200
+ "model.language_model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
201
+ "model.language_model.layers.24.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
202
+ "model.language_model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
203
+ "model.language_model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
204
+ "model.language_model.layers.24.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
205
+ "model.language_model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
206
+ "model.language_model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
207
+ "model.language_model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
208
+ "model.language_model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
209
+ "model.language_model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
210
+ "model.language_model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
211
+ "model.language_model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
212
+ "model.language_model.layers.25.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
213
+ "model.language_model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
214
+ "model.language_model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
215
+ "model.language_model.layers.25.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
216
+ "model.language_model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
217
+ "model.language_model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
218
+ "model.language_model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
219
+ "model.language_model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
220
+ "model.language_model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
221
+ "model.language_model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
222
+ "model.language_model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
223
+ "model.language_model.layers.26.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
224
+ "model.language_model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
225
+ "model.language_model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
226
+ "model.language_model.layers.26.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
227
+ "model.language_model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
228
+ "model.language_model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
229
+ "model.language_model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
230
+ "model.language_model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
231
+ "model.language_model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
232
+ "model.language_model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
233
+ "model.language_model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
234
+ "model.language_model.layers.27.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
235
+ "model.language_model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
236
+ "model.language_model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
237
+ "model.language_model.layers.27.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
238
+ "model.language_model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
239
+ "model.language_model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
240
+ "model.language_model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors",
241
+ "model.language_model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
242
+ "model.language_model.layers.28.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
243
+ "model.language_model.layers.28.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
244
+ "model.language_model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
245
+ "model.language_model.layers.28.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
246
+ "model.language_model.layers.28.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
247
+ "model.language_model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
248
+ "model.language_model.layers.28.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
249
+ "model.language_model.layers.28.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
250
+ "model.language_model.layers.28.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
251
+ "model.language_model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors",
252
+ "model.language_model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
253
+ "model.language_model.layers.29.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
254
+ "model.language_model.layers.29.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
255
+ "model.language_model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
256
+ "model.language_model.layers.29.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
257
+ "model.language_model.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
258
+ "model.language_model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
259
+ "model.language_model.layers.29.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
260
+ "model.language_model.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
261
+ "model.language_model.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
262
+ "model.language_model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
263
+ "model.language_model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
264
+ "model.language_model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
265
+ "model.language_model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
266
+ "model.language_model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
267
+ "model.language_model.layers.3.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
268
+ "model.language_model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
269
+ "model.language_model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
270
+ "model.language_model.layers.3.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
271
+ "model.language_model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
272
+ "model.language_model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
273
+ "model.language_model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
274
+ "model.language_model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
275
+ "model.language_model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
276
+ "model.language_model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
277
+ "model.language_model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
278
+ "model.language_model.layers.30.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
279
+ "model.language_model.layers.30.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
280
+ "model.language_model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
281
+ "model.language_model.layers.30.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
282
+ "model.language_model.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
283
+ "model.language_model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
284
+ "model.language_model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
285
+ "model.language_model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
286
+ "model.language_model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
287
+ "model.language_model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
288
+ "model.language_model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
289
+ "model.language_model.layers.31.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
290
+ "model.language_model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
291
+ "model.language_model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
292
+ "model.language_model.layers.31.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
293
+ "model.language_model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
294
+ "model.language_model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
295
+ "model.language_model.layers.32.input_layernorm.weight": "model-00002-of-00002.safetensors",
296
+ "model.language_model.layers.32.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
297
+ "model.language_model.layers.32.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
298
+ "model.language_model.layers.32.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
299
+ "model.language_model.layers.32.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
300
+ "model.language_model.layers.32.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
301
+ "model.language_model.layers.32.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
302
+ "model.language_model.layers.32.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
303
+ "model.language_model.layers.32.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
304
+ "model.language_model.layers.32.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
305
+ "model.language_model.layers.32.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
306
+ "model.language_model.layers.33.input_layernorm.weight": "model-00002-of-00002.safetensors",
307
+ "model.language_model.layers.33.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
308
+ "model.language_model.layers.33.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
309
+ "model.language_model.layers.33.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
310
+ "model.language_model.layers.33.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
311
+ "model.language_model.layers.33.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
312
+ "model.language_model.layers.33.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
313
+ "model.language_model.layers.33.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
314
+ "model.language_model.layers.33.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
315
+ "model.language_model.layers.33.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
316
+ "model.language_model.layers.33.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
317
+ "model.language_model.layers.34.input_layernorm.weight": "model-00002-of-00002.safetensors",
318
+ "model.language_model.layers.34.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
319
+ "model.language_model.layers.34.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
320
+ "model.language_model.layers.34.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
321
+ "model.language_model.layers.34.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
322
+ "model.language_model.layers.34.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
323
+ "model.language_model.layers.34.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
324
+ "model.language_model.layers.34.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
325
+ "model.language_model.layers.34.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
326
+ "model.language_model.layers.34.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
327
+ "model.language_model.layers.34.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
328
+ "model.language_model.layers.35.input_layernorm.weight": "model-00002-of-00002.safetensors",
329
+ "model.language_model.layers.35.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
330
+ "model.language_model.layers.35.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
331
+ "model.language_model.layers.35.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
332
+ "model.language_model.layers.35.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
333
+ "model.language_model.layers.35.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
334
+ "model.language_model.layers.35.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
335
+ "model.language_model.layers.35.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
336
+ "model.language_model.layers.35.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
337
+ "model.language_model.layers.35.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
338
+ "model.language_model.layers.35.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
339
+ "model.language_model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
340
+ "model.language_model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
341
+ "model.language_model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
342
+ "model.language_model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
343
+ "model.language_model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
344
+ "model.language_model.layers.4.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
345
+ "model.language_model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
346
+ "model.language_model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
347
+ "model.language_model.layers.4.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
348
+ "model.language_model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
349
+ "model.language_model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
350
+ "model.language_model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
351
+ "model.language_model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
352
+ "model.language_model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
353
+ "model.language_model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
354
+ "model.language_model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
355
+ "model.language_model.layers.5.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
356
+ "model.language_model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
357
+ "model.language_model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
358
+ "model.language_model.layers.5.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
359
+ "model.language_model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
360
+ "model.language_model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
361
+ "model.language_model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
362
+ "model.language_model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
363
+ "model.language_model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
364
+ "model.language_model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
365
+ "model.language_model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
366
+ "model.language_model.layers.6.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
367
+ "model.language_model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
368
+ "model.language_model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
369
+ "model.language_model.layers.6.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
370
+ "model.language_model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
371
+ "model.language_model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
372
+ "model.language_model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
373
+ "model.language_model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
374
+ "model.language_model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
375
+ "model.language_model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
376
+ "model.language_model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
377
+ "model.language_model.layers.7.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
378
+ "model.language_model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
379
+ "model.language_model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
380
+ "model.language_model.layers.7.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
381
+ "model.language_model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
382
+ "model.language_model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
383
+ "model.language_model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
384
+ "model.language_model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
385
+ "model.language_model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
386
+ "model.language_model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
387
+ "model.language_model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
388
+ "model.language_model.layers.8.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
389
+ "model.language_model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
390
+ "model.language_model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
391
+ "model.language_model.layers.8.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
392
+ "model.language_model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
393
+ "model.language_model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
394
+ "model.language_model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
395
+ "model.language_model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
396
+ "model.language_model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
397
+ "model.language_model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
398
+ "model.language_model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
399
+ "model.language_model.layers.9.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
400
+ "model.language_model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
401
+ "model.language_model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
402
+ "model.language_model.layers.9.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
403
+ "model.language_model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
404
+ "model.language_model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
405
+ "model.language_model.norm.weight": "model-00002-of-00002.safetensors",
406
+ "model.visual.blocks.0.attn.proj.bias": "model-00001-of-00002.safetensors",
407
+ "model.visual.blocks.0.attn.proj.weight": "model-00001-of-00002.safetensors",
408
+ "model.visual.blocks.0.attn.qkv.bias": "model-00001-of-00002.safetensors",
409
+ "model.visual.blocks.0.attn.qkv.weight": "model-00001-of-00002.safetensors",
410
+ "model.visual.blocks.0.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
411
+ "model.visual.blocks.0.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
412
+ "model.visual.blocks.0.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
413
+ "model.visual.blocks.0.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
414
+ "model.visual.blocks.0.norm1.bias": "model-00001-of-00002.safetensors",
415
+ "model.visual.blocks.0.norm1.weight": "model-00001-of-00002.safetensors",
416
+ "model.visual.blocks.0.norm2.bias": "model-00001-of-00002.safetensors",
417
+ "model.visual.blocks.0.norm2.weight": "model-00001-of-00002.safetensors",
418
+ "model.visual.blocks.1.attn.proj.bias": "model-00001-of-00002.safetensors",
419
+ "model.visual.blocks.1.attn.proj.weight": "model-00001-of-00002.safetensors",
420
+ "model.visual.blocks.1.attn.qkv.bias": "model-00001-of-00002.safetensors",
421
+ "model.visual.blocks.1.attn.qkv.weight": "model-00001-of-00002.safetensors",
422
+ "model.visual.blocks.1.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
423
+ "model.visual.blocks.1.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
424
+ "model.visual.blocks.1.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
425
+ "model.visual.blocks.1.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
426
+ "model.visual.blocks.1.norm1.bias": "model-00001-of-00002.safetensors",
427
+ "model.visual.blocks.1.norm1.weight": "model-00001-of-00002.safetensors",
428
+ "model.visual.blocks.1.norm2.bias": "model-00001-of-00002.safetensors",
429
+ "model.visual.blocks.1.norm2.weight": "model-00001-of-00002.safetensors",
430
+ "model.visual.blocks.10.attn.proj.bias": "model-00001-of-00002.safetensors",
431
+ "model.visual.blocks.10.attn.proj.weight": "model-00001-of-00002.safetensors",
432
+ "model.visual.blocks.10.attn.qkv.bias": "model-00001-of-00002.safetensors",
433
+ "model.visual.blocks.10.attn.qkv.weight": "model-00001-of-00002.safetensors",
434
+ "model.visual.blocks.10.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
435
+ "model.visual.blocks.10.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
436
+ "model.visual.blocks.10.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
437
+ "model.visual.blocks.10.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
438
+ "model.visual.blocks.10.norm1.bias": "model-00001-of-00002.safetensors",
439
+ "model.visual.blocks.10.norm1.weight": "model-00001-of-00002.safetensors",
440
+ "model.visual.blocks.10.norm2.bias": "model-00001-of-00002.safetensors",
441
+ "model.visual.blocks.10.norm2.weight": "model-00001-of-00002.safetensors",
442
+ "model.visual.blocks.11.attn.proj.bias": "model-00001-of-00002.safetensors",
443
+ "model.visual.blocks.11.attn.proj.weight": "model-00001-of-00002.safetensors",
444
+ "model.visual.blocks.11.attn.qkv.bias": "model-00001-of-00002.safetensors",
445
+ "model.visual.blocks.11.attn.qkv.weight": "model-00001-of-00002.safetensors",
446
+ "model.visual.blocks.11.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
447
+ "model.visual.blocks.11.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
448
+ "model.visual.blocks.11.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
449
+ "model.visual.blocks.11.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
450
+ "model.visual.blocks.11.norm1.bias": "model-00001-of-00002.safetensors",
451
+ "model.visual.blocks.11.norm1.weight": "model-00001-of-00002.safetensors",
452
+ "model.visual.blocks.11.norm2.bias": "model-00001-of-00002.safetensors",
453
+ "model.visual.blocks.11.norm2.weight": "model-00001-of-00002.safetensors",
454
+ "model.visual.blocks.12.attn.proj.bias": "model-00001-of-00002.safetensors",
455
+ "model.visual.blocks.12.attn.proj.weight": "model-00001-of-00002.safetensors",
456
+ "model.visual.blocks.12.attn.qkv.bias": "model-00001-of-00002.safetensors",
457
+ "model.visual.blocks.12.attn.qkv.weight": "model-00001-of-00002.safetensors",
458
+ "model.visual.blocks.12.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
459
+ "model.visual.blocks.12.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
460
+ "model.visual.blocks.12.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
461
+ "model.visual.blocks.12.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
462
+ "model.visual.blocks.12.norm1.bias": "model-00001-of-00002.safetensors",
463
+ "model.visual.blocks.12.norm1.weight": "model-00001-of-00002.safetensors",
464
+ "model.visual.blocks.12.norm2.bias": "model-00001-of-00002.safetensors",
465
+ "model.visual.blocks.12.norm2.weight": "model-00001-of-00002.safetensors",
466
+ "model.visual.blocks.13.attn.proj.bias": "model-00001-of-00002.safetensors",
467
+ "model.visual.blocks.13.attn.proj.weight": "model-00001-of-00002.safetensors",
468
+ "model.visual.blocks.13.attn.qkv.bias": "model-00001-of-00002.safetensors",
469
+ "model.visual.blocks.13.attn.qkv.weight": "model-00001-of-00002.safetensors",
470
+ "model.visual.blocks.13.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
471
+ "model.visual.blocks.13.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
472
+ "model.visual.blocks.13.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
473
+ "model.visual.blocks.13.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
474
+ "model.visual.blocks.13.norm1.bias": "model-00001-of-00002.safetensors",
475
+ "model.visual.blocks.13.norm1.weight": "model-00001-of-00002.safetensors",
476
+ "model.visual.blocks.13.norm2.bias": "model-00001-of-00002.safetensors",
477
+ "model.visual.blocks.13.norm2.weight": "model-00001-of-00002.safetensors",
478
+ "model.visual.blocks.14.attn.proj.bias": "model-00001-of-00002.safetensors",
479
+ "model.visual.blocks.14.attn.proj.weight": "model-00001-of-00002.safetensors",
480
+ "model.visual.blocks.14.attn.qkv.bias": "model-00001-of-00002.safetensors",
481
+ "model.visual.blocks.14.attn.qkv.weight": "model-00001-of-00002.safetensors",
482
+ "model.visual.blocks.14.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
483
+ "model.visual.blocks.14.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
484
+ "model.visual.blocks.14.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
485
+ "model.visual.blocks.14.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
486
+ "model.visual.blocks.14.norm1.bias": "model-00001-of-00002.safetensors",
487
+ "model.visual.blocks.14.norm1.weight": "model-00001-of-00002.safetensors",
488
+ "model.visual.blocks.14.norm2.bias": "model-00001-of-00002.safetensors",
489
+ "model.visual.blocks.14.norm2.weight": "model-00001-of-00002.safetensors",
490
+ "model.visual.blocks.15.attn.proj.bias": "model-00001-of-00002.safetensors",
491
+ "model.visual.blocks.15.attn.proj.weight": "model-00001-of-00002.safetensors",
492
+ "model.visual.blocks.15.attn.qkv.bias": "model-00001-of-00002.safetensors",
493
+ "model.visual.blocks.15.attn.qkv.weight": "model-00001-of-00002.safetensors",
494
+ "model.visual.blocks.15.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
495
+ "model.visual.blocks.15.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
496
+ "model.visual.blocks.15.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
497
+ "model.visual.blocks.15.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
498
+ "model.visual.blocks.15.norm1.bias": "model-00001-of-00002.safetensors",
499
+ "model.visual.blocks.15.norm1.weight": "model-00001-of-00002.safetensors",
500
+ "model.visual.blocks.15.norm2.bias": "model-00001-of-00002.safetensors",
501
+ "model.visual.blocks.15.norm2.weight": "model-00001-of-00002.safetensors",
502
+ "model.visual.blocks.16.attn.proj.bias": "model-00001-of-00002.safetensors",
503
+ "model.visual.blocks.16.attn.proj.weight": "model-00001-of-00002.safetensors",
504
+ "model.visual.blocks.16.attn.qkv.bias": "model-00001-of-00002.safetensors",
505
+ "model.visual.blocks.16.attn.qkv.weight": "model-00001-of-00002.safetensors",
506
+ "model.visual.blocks.16.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
507
+ "model.visual.blocks.16.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
508
+ "model.visual.blocks.16.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
509
+ "model.visual.blocks.16.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
510
+ "model.visual.blocks.16.norm1.bias": "model-00001-of-00002.safetensors",
511
+ "model.visual.blocks.16.norm1.weight": "model-00001-of-00002.safetensors",
512
+ "model.visual.blocks.16.norm2.bias": "model-00001-of-00002.safetensors",
513
+ "model.visual.blocks.16.norm2.weight": "model-00001-of-00002.safetensors",
514
+ "model.visual.blocks.17.attn.proj.bias": "model-00001-of-00002.safetensors",
515
+ "model.visual.blocks.17.attn.proj.weight": "model-00001-of-00002.safetensors",
516
+ "model.visual.blocks.17.attn.qkv.bias": "model-00001-of-00002.safetensors",
517
+ "model.visual.blocks.17.attn.qkv.weight": "model-00001-of-00002.safetensors",
518
+ "model.visual.blocks.17.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
519
+ "model.visual.blocks.17.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
520
+ "model.visual.blocks.17.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
521
+ "model.visual.blocks.17.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
522
+ "model.visual.blocks.17.norm1.bias": "model-00001-of-00002.safetensors",
523
+ "model.visual.blocks.17.norm1.weight": "model-00001-of-00002.safetensors",
524
+ "model.visual.blocks.17.norm2.bias": "model-00001-of-00002.safetensors",
525
+ "model.visual.blocks.17.norm2.weight": "model-00001-of-00002.safetensors",
526
+ "model.visual.blocks.18.attn.proj.bias": "model-00001-of-00002.safetensors",
527
+ "model.visual.blocks.18.attn.proj.weight": "model-00001-of-00002.safetensors",
528
+ "model.visual.blocks.18.attn.qkv.bias": "model-00001-of-00002.safetensors",
529
+ "model.visual.blocks.18.attn.qkv.weight": "model-00001-of-00002.safetensors",
530
+ "model.visual.blocks.18.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
531
+ "model.visual.blocks.18.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
532
+ "model.visual.blocks.18.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
533
+ "model.visual.blocks.18.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
534
+ "model.visual.blocks.18.norm1.bias": "model-00001-of-00002.safetensors",
535
+ "model.visual.blocks.18.norm1.weight": "model-00001-of-00002.safetensors",
536
+ "model.visual.blocks.18.norm2.bias": "model-00001-of-00002.safetensors",
537
+ "model.visual.blocks.18.norm2.weight": "model-00001-of-00002.safetensors",
538
+ "model.visual.blocks.19.attn.proj.bias": "model-00001-of-00002.safetensors",
539
+ "model.visual.blocks.19.attn.proj.weight": "model-00001-of-00002.safetensors",
540
+ "model.visual.blocks.19.attn.qkv.bias": "model-00001-of-00002.safetensors",
541
+ "model.visual.blocks.19.attn.qkv.weight": "model-00001-of-00002.safetensors",
542
+ "model.visual.blocks.19.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
543
+ "model.visual.blocks.19.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
544
+ "model.visual.blocks.19.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
545
+ "model.visual.blocks.19.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
546
+ "model.visual.blocks.19.norm1.bias": "model-00001-of-00002.safetensors",
547
+ "model.visual.blocks.19.norm1.weight": "model-00001-of-00002.safetensors",
548
+ "model.visual.blocks.19.norm2.bias": "model-00001-of-00002.safetensors",
549
+ "model.visual.blocks.19.norm2.weight": "model-00001-of-00002.safetensors",
550
+ "model.visual.blocks.2.attn.proj.bias": "model-00001-of-00002.safetensors",
551
+ "model.visual.blocks.2.attn.proj.weight": "model-00001-of-00002.safetensors",
552
+ "model.visual.blocks.2.attn.qkv.bias": "model-00001-of-00002.safetensors",
553
+ "model.visual.blocks.2.attn.qkv.weight": "model-00001-of-00002.safetensors",
554
+ "model.visual.blocks.2.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
555
+ "model.visual.blocks.2.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
556
+ "model.visual.blocks.2.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
557
+ "model.visual.blocks.2.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
558
+ "model.visual.blocks.2.norm1.bias": "model-00001-of-00002.safetensors",
559
+ "model.visual.blocks.2.norm1.weight": "model-00001-of-00002.safetensors",
560
+ "model.visual.blocks.2.norm2.bias": "model-00001-of-00002.safetensors",
561
+ "model.visual.blocks.2.norm2.weight": "model-00001-of-00002.safetensors",
562
+ "model.visual.blocks.20.attn.proj.bias": "model-00001-of-00002.safetensors",
563
+ "model.visual.blocks.20.attn.proj.weight": "model-00001-of-00002.safetensors",
564
+ "model.visual.blocks.20.attn.qkv.bias": "model-00001-of-00002.safetensors",
565
+ "model.visual.blocks.20.attn.qkv.weight": "model-00001-of-00002.safetensors",
566
+ "model.visual.blocks.20.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
567
+ "model.visual.blocks.20.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
568
+ "model.visual.blocks.20.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
569
+ "model.visual.blocks.20.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
570
+ "model.visual.blocks.20.norm1.bias": "model-00001-of-00002.safetensors",
571
+ "model.visual.blocks.20.norm1.weight": "model-00001-of-00002.safetensors",
572
+ "model.visual.blocks.20.norm2.bias": "model-00001-of-00002.safetensors",
573
+ "model.visual.blocks.20.norm2.weight": "model-00001-of-00002.safetensors",
574
+ "model.visual.blocks.21.attn.proj.bias": "model-00001-of-00002.safetensors",
575
+ "model.visual.blocks.21.attn.proj.weight": "model-00001-of-00002.safetensors",
576
+ "model.visual.blocks.21.attn.qkv.bias": "model-00001-of-00002.safetensors",
577
+ "model.visual.blocks.21.attn.qkv.weight": "model-00001-of-00002.safetensors",
578
+ "model.visual.blocks.21.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
579
+ "model.visual.blocks.21.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
580
+ "model.visual.blocks.21.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
581
+ "model.visual.blocks.21.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
582
+ "model.visual.blocks.21.norm1.bias": "model-00001-of-00002.safetensors",
583
+ "model.visual.blocks.21.norm1.weight": "model-00001-of-00002.safetensors",
584
+ "model.visual.blocks.21.norm2.bias": "model-00001-of-00002.safetensors",
585
+ "model.visual.blocks.21.norm2.weight": "model-00001-of-00002.safetensors",
586
+ "model.visual.blocks.22.attn.proj.bias": "model-00001-of-00002.safetensors",
587
+ "model.visual.blocks.22.attn.proj.weight": "model-00001-of-00002.safetensors",
588
+ "model.visual.blocks.22.attn.qkv.bias": "model-00001-of-00002.safetensors",
589
+ "model.visual.blocks.22.attn.qkv.weight": "model-00001-of-00002.safetensors",
590
+ "model.visual.blocks.22.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
591
+ "model.visual.blocks.22.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
592
+ "model.visual.blocks.22.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
593
+ "model.visual.blocks.22.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
594
+ "model.visual.blocks.22.norm1.bias": "model-00001-of-00002.safetensors",
595
+ "model.visual.blocks.22.norm1.weight": "model-00001-of-00002.safetensors",
596
+ "model.visual.blocks.22.norm2.bias": "model-00001-of-00002.safetensors",
597
+ "model.visual.blocks.22.norm2.weight": "model-00001-of-00002.safetensors",
598
+ "model.visual.blocks.23.attn.proj.bias": "model-00001-of-00002.safetensors",
599
+ "model.visual.blocks.23.attn.proj.weight": "model-00001-of-00002.safetensors",
600
+ "model.visual.blocks.23.attn.qkv.bias": "model-00001-of-00002.safetensors",
601
+ "model.visual.blocks.23.attn.qkv.weight": "model-00001-of-00002.safetensors",
602
+ "model.visual.blocks.23.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
603
+ "model.visual.blocks.23.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
604
+ "model.visual.blocks.23.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
605
+ "model.visual.blocks.23.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
606
+ "model.visual.blocks.23.norm1.bias": "model-00001-of-00002.safetensors",
607
+ "model.visual.blocks.23.norm1.weight": "model-00001-of-00002.safetensors",
608
+ "model.visual.blocks.23.norm2.bias": "model-00001-of-00002.safetensors",
609
+ "model.visual.blocks.23.norm2.weight": "model-00001-of-00002.safetensors",
610
+ "model.visual.blocks.3.attn.proj.bias": "model-00001-of-00002.safetensors",
611
+ "model.visual.blocks.3.attn.proj.weight": "model-00001-of-00002.safetensors",
612
+ "model.visual.blocks.3.attn.qkv.bias": "model-00001-of-00002.safetensors",
613
+ "model.visual.blocks.3.attn.qkv.weight": "model-00001-of-00002.safetensors",
614
+ "model.visual.blocks.3.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
615
+ "model.visual.blocks.3.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
616
+ "model.visual.blocks.3.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
617
+ "model.visual.blocks.3.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
618
+ "model.visual.blocks.3.norm1.bias": "model-00001-of-00002.safetensors",
619
+ "model.visual.blocks.3.norm1.weight": "model-00001-of-00002.safetensors",
620
+ "model.visual.blocks.3.norm2.bias": "model-00001-of-00002.safetensors",
621
+ "model.visual.blocks.3.norm2.weight": "model-00001-of-00002.safetensors",
622
+ "model.visual.blocks.4.attn.proj.bias": "model-00001-of-00002.safetensors",
623
+ "model.visual.blocks.4.attn.proj.weight": "model-00001-of-00002.safetensors",
624
+ "model.visual.blocks.4.attn.qkv.bias": "model-00001-of-00002.safetensors",
625
+ "model.visual.blocks.4.attn.qkv.weight": "model-00001-of-00002.safetensors",
626
+ "model.visual.blocks.4.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
627
+ "model.visual.blocks.4.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
628
+ "model.visual.blocks.4.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
629
+ "model.visual.blocks.4.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
630
+ "model.visual.blocks.4.norm1.bias": "model-00001-of-00002.safetensors",
631
+ "model.visual.blocks.4.norm1.weight": "model-00001-of-00002.safetensors",
632
+ "model.visual.blocks.4.norm2.bias": "model-00001-of-00002.safetensors",
633
+ "model.visual.blocks.4.norm2.weight": "model-00001-of-00002.safetensors",
634
+ "model.visual.blocks.5.attn.proj.bias": "model-00001-of-00002.safetensors",
635
+ "model.visual.blocks.5.attn.proj.weight": "model-00001-of-00002.safetensors",
636
+ "model.visual.blocks.5.attn.qkv.bias": "model-00001-of-00002.safetensors",
637
+ "model.visual.blocks.5.attn.qkv.weight": "model-00001-of-00002.safetensors",
638
+ "model.visual.blocks.5.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
639
+ "model.visual.blocks.5.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
640
+ "model.visual.blocks.5.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
641
+ "model.visual.blocks.5.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
642
+ "model.visual.blocks.5.norm1.bias": "model-00001-of-00002.safetensors",
643
+ "model.visual.blocks.5.norm1.weight": "model-00001-of-00002.safetensors",
644
+ "model.visual.blocks.5.norm2.bias": "model-00001-of-00002.safetensors",
645
+ "model.visual.blocks.5.norm2.weight": "model-00001-of-00002.safetensors",
646
+ "model.visual.blocks.6.attn.proj.bias": "model-00001-of-00002.safetensors",
647
+ "model.visual.blocks.6.attn.proj.weight": "model-00001-of-00002.safetensors",
648
+ "model.visual.blocks.6.attn.qkv.bias": "model-00001-of-00002.safetensors",
649
+ "model.visual.blocks.6.attn.qkv.weight": "model-00001-of-00002.safetensors",
650
+ "model.visual.blocks.6.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
651
+ "model.visual.blocks.6.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
652
+ "model.visual.blocks.6.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
653
+ "model.visual.blocks.6.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
654
+ "model.visual.blocks.6.norm1.bias": "model-00001-of-00002.safetensors",
655
+ "model.visual.blocks.6.norm1.weight": "model-00001-of-00002.safetensors",
656
+ "model.visual.blocks.6.norm2.bias": "model-00001-of-00002.safetensors",
657
+ "model.visual.blocks.6.norm2.weight": "model-00001-of-00002.safetensors",
658
+ "model.visual.blocks.7.attn.proj.bias": "model-00001-of-00002.safetensors",
659
+ "model.visual.blocks.7.attn.proj.weight": "model-00001-of-00002.safetensors",
660
+ "model.visual.blocks.7.attn.qkv.bias": "model-00001-of-00002.safetensors",
661
+ "model.visual.blocks.7.attn.qkv.weight": "model-00001-of-00002.safetensors",
662
+ "model.visual.blocks.7.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
663
+ "model.visual.blocks.7.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
664
+ "model.visual.blocks.7.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
665
+ "model.visual.blocks.7.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
666
+ "model.visual.blocks.7.norm1.bias": "model-00001-of-00002.safetensors",
667
+ "model.visual.blocks.7.norm1.weight": "model-00001-of-00002.safetensors",
668
+ "model.visual.blocks.7.norm2.bias": "model-00001-of-00002.safetensors",
669
+ "model.visual.blocks.7.norm2.weight": "model-00001-of-00002.safetensors",
670
+ "model.visual.blocks.8.attn.proj.bias": "model-00001-of-00002.safetensors",
671
+ "model.visual.blocks.8.attn.proj.weight": "model-00001-of-00002.safetensors",
672
+ "model.visual.blocks.8.attn.qkv.bias": "model-00001-of-00002.safetensors",
673
+ "model.visual.blocks.8.attn.qkv.weight": "model-00001-of-00002.safetensors",
674
+ "model.visual.blocks.8.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
675
+ "model.visual.blocks.8.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
676
+ "model.visual.blocks.8.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
677
+ "model.visual.blocks.8.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
678
+ "model.visual.blocks.8.norm1.bias": "model-00001-of-00002.safetensors",
679
+ "model.visual.blocks.8.norm1.weight": "model-00001-of-00002.safetensors",
680
+ "model.visual.blocks.8.norm2.bias": "model-00001-of-00002.safetensors",
681
+ "model.visual.blocks.8.norm2.weight": "model-00001-of-00002.safetensors",
682
+ "model.visual.blocks.9.attn.proj.bias": "model-00001-of-00002.safetensors",
683
+ "model.visual.blocks.9.attn.proj.weight": "model-00001-of-00002.safetensors",
684
+ "model.visual.blocks.9.attn.qkv.bias": "model-00001-of-00002.safetensors",
685
+ "model.visual.blocks.9.attn.qkv.weight": "model-00001-of-00002.safetensors",
686
+ "model.visual.blocks.9.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
687
+ "model.visual.blocks.9.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
688
+ "model.visual.blocks.9.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
689
+ "model.visual.blocks.9.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
690
+ "model.visual.blocks.9.norm1.bias": "model-00001-of-00002.safetensors",
691
+ "model.visual.blocks.9.norm1.weight": "model-00001-of-00002.safetensors",
692
+ "model.visual.blocks.9.norm2.bias": "model-00001-of-00002.safetensors",
693
+ "model.visual.blocks.9.norm2.weight": "model-00001-of-00002.safetensors",
694
+ "model.visual.deepstack_merger_list.0.linear_fc1.bias": "model-00001-of-00002.safetensors",
695
+ "model.visual.deepstack_merger_list.0.linear_fc1.weight": "model-00001-of-00002.safetensors",
696
+ "model.visual.deepstack_merger_list.0.linear_fc2.bias": "model-00001-of-00002.safetensors",
697
+ "model.visual.deepstack_merger_list.0.linear_fc2.weight": "model-00001-of-00002.safetensors",
698
+ "model.visual.deepstack_merger_list.0.norm.bias": "model-00001-of-00002.safetensors",
699
+ "model.visual.deepstack_merger_list.0.norm.weight": "model-00001-of-00002.safetensors",
700
+ "model.visual.deepstack_merger_list.1.linear_fc1.bias": "model-00001-of-00002.safetensors",
701
+ "model.visual.deepstack_merger_list.1.linear_fc1.weight": "model-00001-of-00002.safetensors",
702
+ "model.visual.deepstack_merger_list.1.linear_fc2.bias": "model-00001-of-00002.safetensors",
703
+ "model.visual.deepstack_merger_list.1.linear_fc2.weight": "model-00001-of-00002.safetensors",
704
+ "model.visual.deepstack_merger_list.1.norm.bias": "model-00001-of-00002.safetensors",
705
+ "model.visual.deepstack_merger_list.1.norm.weight": "model-00001-of-00002.safetensors",
706
+ "model.visual.deepstack_merger_list.2.linear_fc1.bias": "model-00001-of-00002.safetensors",
707
+ "model.visual.deepstack_merger_list.2.linear_fc1.weight": "model-00001-of-00002.safetensors",
708
+ "model.visual.deepstack_merger_list.2.linear_fc2.bias": "model-00001-of-00002.safetensors",
709
+ "model.visual.deepstack_merger_list.2.linear_fc2.weight": "model-00001-of-00002.safetensors",
710
+ "model.visual.deepstack_merger_list.2.norm.bias": "model-00001-of-00002.safetensors",
711
+ "model.visual.deepstack_merger_list.2.norm.weight": "model-00001-of-00002.safetensors",
712
+ "model.visual.merger.linear_fc1.bias": "model-00001-of-00002.safetensors",
713
+ "model.visual.merger.linear_fc1.weight": "model-00001-of-00002.safetensors",
714
+ "model.visual.merger.linear_fc2.bias": "model-00001-of-00002.safetensors",
715
+ "model.visual.merger.linear_fc2.weight": "model-00001-of-00002.safetensors",
716
+ "model.visual.merger.norm.bias": "model-00001-of-00002.safetensors",
717
+ "model.visual.merger.norm.weight": "model-00001-of-00002.safetensors",
718
+ "model.visual.patch_embed.proj.bias": "model-00001-of-00002.safetensors",
719
+ "model.visual.patch_embed.proj.weight": "model-00001-of-00002.safetensors",
720
+ "model.visual.pos_embed.weight": "model-00001-of-00002.safetensors"
721
+ }
722
+ }
modeling_qwen3_vl_nemotron_embed.py ADDED
@@ -0,0 +1,480 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Qwen3VLNemotronEmbed: Vision-Language Embedding Model with ColBERT-style scoring.
2
+
3
+ This module provides a bidirectional vision-language model for document retrieval
4
+ and embedding tasks, based on the Qwen3VL architecture with bidirectional attention.
5
+ """
6
+
7
+ from contextlib import nullcontext
8
+ from typing import Dict, List, Optional, TypeVar, Union
9
+
10
+ import torch
11
+ import torch.nn.functional as F
12
+ from datasets import Dataset
13
+ from torch.utils.data import DataLoader
14
+ from torch.utils.data import Dataset as TorchDataset
15
+ from tqdm import tqdm
16
+ from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
17
+ from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLConfig
18
+ from transformers.models.qwen3_vl.modeling_qwen3_vl import (
19
+ BaseModelOutputWithPast,
20
+ Cache,
21
+ FlashAttentionKwargs,
22
+ Qwen3VLModel,
23
+ Qwen3VLPreTrainedModel,
24
+ Qwen3VLTextModel,
25
+ Unpack,
26
+ auto_docstring,
27
+ check_model_inputs,
28
+ )
29
+ from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
30
+
31
+ TV = TypeVar("TV")
32
+
33
+
34
class ListDataset(TorchDataset[TV]):
    """A thin ``torch`` Dataset view over an in-memory Python list.

    Used so plain lists (queries, documents) can be fed to a
    ``torch.utils.data.DataLoader`` without copying.
    """

    def __init__(self, elements: List[TV]):
        # Keep a reference to the caller's list; no copy is made.
        self.elements = elements

    def __len__(self) -> int:
        """Number of items in the backing list."""
        return len(self.elements)

    def __getitem__(self, idx: int) -> TV:
        """Return the element stored at position ``idx``."""
        return self.elements[idx]
45
+
46
+
47
class Qwen3VLNemotronEmbedConfig(Qwen3VLConfig):
    """Configuration for Qwen3VLNemotronEmbed models.

    Extends ``Qwen3VLConfig`` with a single extra knob:

    Args:
        pooling: Pooling strategy used when producing embeddings
            (default ``"colbert"``).
    """

    model_type = "qwen3_vl_nemotron_embed"

    # Declared at class level for type checkers; assigned in __init__.
    pooling: str

    def __init__(self, pooling: str = "colbert", **kwargs):
        # Keep the original ordering: the attribute is set before the
        # base-class initializer runs.
        self.pooling = pooling
        super().__init__(**kwargs)
61
+
62
+
63
+ def _create_bidirectional_mask(
64
+ config,
65
+ input_embeds: torch.Tensor,
66
+ attention_mask: Optional[torch.Tensor],
67
+ cache_position: torch.Tensor,
68
+ past_key_values: Optional[Cache],
69
+ position_ids: Optional[torch.Tensor] = None,
70
+ **kwargs,
71
+ ) -> Optional[torch.Tensor]:
72
+ """Create bidirectional attention mask based on attention implementation."""
73
+ if config._attn_implementation == "flash_attention_2":
74
+ if attention_mask is not None and (attention_mask == 0.0).any():
75
+ return attention_mask
76
+ return None
77
+ elif config._attn_implementation == "eager":
78
+ if attention_mask is not None:
79
+ return _prepare_4d_attention_mask(
80
+ attention_mask,
81
+ dtype=input_embeds.dtype,
82
+ tgt_len=input_embeds.shape[1],
83
+ )
84
+ return None
85
+ else:
86
+ if attention_mask is not None:
87
+ return _prepare_4d_attention_mask(
88
+ attention_mask,
89
+ dtype=input_embeds.dtype,
90
+ tgt_len=input_embeds.shape[1],
91
+ )
92
+ return None
93
+
94
+
95
class Qwen3VLNemotronEmbedTextModel(Qwen3VLTextModel):
    """Bidirectional text model for Qwen3VLNemotronEmbed.

    Identical to ``Qwen3VLTextModel`` except that every self-attention layer
    is switched to non-causal, so each token can attend to the full sequence
    (appropriate for embedding extraction rather than generation).
    """

    def __init__(self, config):
        super().__init__(config)
        # Disable the causal mask in every decoder layer: embeddings are
        # extracted with full bidirectional attention.
        for layer in self.layers:
            layer.self_attn.is_causal = False

    @check_model_inputs()
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        visual_pos_masks: Optional[torch.Tensor] = None,
        deepstack_visual_embeds: Optional[list[torch.Tensor]] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[tuple, BaseModelOutputWithPast]:
        """
        visual_pos_masks (`torch.Tensor`, *optional*):
            Boolean mask indicating positions of visual tokens in the sequence.
            Used for deepstack processing to identify where to inject visual features.
        deepstack_visual_embeds (`list[torch.Tensor]`, *optional*):
            List of visual embeddings from intermediate vision encoder layers.
            Each tensor corresponds to a decoder layer where visual features are injected.
        """
        # Exactly one of input_ids / inputs_embeds must be provided.
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError(
                "You must specify exactly one of input_ids or inputs_embeds"
            )

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        # Expand position ids to the 3-row mrope layout (one row per rope
        # section) when they were given as plain per-token indices.
        if position_ids is None:
            position_ids = cache_position.view(1, 1, -1).expand(
                3, inputs_embeds.shape[0], -1
            )
        elif position_ids.ndim == 2:
            position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1)

        # A 4-row layout carries an extra leading row of plain text positions;
        # split it off and keep the remaining 3 rows for rotary embeddings.
        if position_ids.ndim == 3 and position_ids.shape[0] == 4:
            text_position_ids = position_ids[0]
            position_ids = position_ids[1:]
        else:
            text_position_ids = position_ids[0]

        # Replace whatever mask was passed in with a bidirectional one.
        attention_mask = _create_bidirectional_mask(
            config=self.config,
            input_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            past_key_values=past_key_values,
            position_ids=text_position_ids,
        )

        hidden_states = inputs_embeds
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        for layer_idx, decoder_layer in enumerate(self.layers):
            # NOTE(review): the decoder layer is assumed to return the hidden
            # states tensor directly (not a tuple) under this transformers
            # version — confirm against Qwen3VLTextModel.
            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=attention_mask,
                position_ids=text_position_ids,
                past_key_values=past_key_values,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
                **kwargs,
            )
            hidden_states = layer_outputs

            # Inject deepstack visual features into the first
            # len(deepstack_visual_embeds) decoder layers, one tensor each.
            if deepstack_visual_embeds is not None and layer_idx in range(
                len(deepstack_visual_embeds)
            ):
                hidden_states = self._deepstack_process(
                    hidden_states,
                    visual_pos_masks,
                    deepstack_visual_embeds[layer_idx],
                )

        hidden_states = self.norm(hidden_states)

        # No cache / attentions are returned: this model is embedding-only.
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
        )
185
+
186
+
187
class Qwen3VLNemotronEmbedVisionLanguageModel(Qwen3VLModel):
    """Vision-language model with bidirectional text attention.

    Rebuilds the Qwen3VL backbone but swaps the text tower for
    ``Qwen3VLNemotronEmbedTextModel`` (non-causal attention).
    """

    def __init__(self, config):
        # Deliberately skip Qwen3VLModel.__init__ (which would build the
        # causal text model) and initialize only the PreTrainedModel base.
        Qwen3VLPreTrainedModel.__init__(self, config)

        # Local import keeps the module-level import list identical to what
        # the rest of the file needs.
        from transformers.models.qwen3_vl.modeling_qwen3_vl import Qwen3VLVisionModel

        self.visual = Qwen3VLVisionModel._from_config(config.vision_config)
        self.language_model = Qwen3VLNemotronEmbedTextModel._from_config(
            config.text_config
        )
        # Mirrors the attribute set by the stock Qwen3VLModel constructor.
        self.rope_deltas = None

        self.post_init()
202
+
203
+
204
class Qwen3VLNemotronEmbedForConditionalGeneration(Qwen3VLForConditionalGeneration):
    """Qwen3VLNemotronEmbed for conditional generation (base class)."""

    # Narrow the config type for type checkers / auto-mapping.
    config: Qwen3VLNemotronEmbedConfig

    def __init__(self, config):
        super().__init__(config)
        # Replace the backbone with the bidirectional variant.
        # NOTE(review): the parent __init__ presumably already constructed a
        # stock backbone that is discarded here — confirm there is no cheaper
        # construction path in this transformers version.
        self.model = Qwen3VLNemotronEmbedVisionLanguageModel._from_config(config)
212
+
213
+
214
class EmbeddingMixin:
    """Mixin providing high-level embedding extraction methods.

    Expects the host class to be an ``nn.Module`` (uses ``self.parameters()``
    and ``self(...)``) and to also provide ``padding_various_shape_tensor``
    (supplied by ``ColBERTScoringMixin`` in this file).
    """

    def _get_processor(self):
        """Lazily load and cache the processor matching this checkpoint."""
        # Cached on first use; AutoProcessor resolves the custom processor
        # class via the auto_map in this repo's config.
        if not hasattr(self, "_processor") or self._processor is None:
            self._processor = AutoProcessor.from_pretrained(
                self.config._name_or_path, trust_remote_code=True
            )
        return self._processor

    def process_queries(self, queries: List[str], **kwargs) -> Dict[str, torch.Tensor]:
        """Tokenize text queries via the checkpoint's processor."""
        return self._get_processor().process_queries(queries, **kwargs)

    def process_documents(
        self, documents: Union[Dict, List[Dict]], **kwargs
    ) -> Dict[str, torch.Tensor]:
        """Process documents (image + text) via the checkpoint's processor."""
        return self._get_processor().process_documents(documents, **kwargs)

    def _extract_embeddings(
        self, dataloader: DataLoader, is_query: bool
    ) -> torch.Tensor:
        """Extract L2-normalized token embeddings from a dataloader.

        Args:
            dataloader: DataLoader yielding batches of processed inputs
                (dicts of tensors; a None "pixel_values" entry is dropped).
            is_query: Whether these are query embeddings (progress label only).

        Returns:
            Tensor of embeddings with shape (num_samples, max_seq_len, hidden_dim),
            right-padded with zeros across batches of different lengths.
        """
        device = next(self.parameters()).device
        qs = []
        message = "query" if is_query else "document"

        for batch in tqdm(dataloader, desc=f"Extracting {message} embeddings..."):
            with torch.inference_mode():
                # bf16 autocast only on CUDA; CPU runs in the model dtype.
                with (
                    torch.autocast(device_type="cuda", dtype=torch.bfloat16)
                    if device.type == "cuda"
                    else nullcontext()
                ):
                    # Text-only batches may carry pixel_values=None; remove it
                    # so the .to(device) sweep below does not crash.
                    if "pixel_values" in batch and batch["pixel_values"] is None:
                        batch.pop("pixel_values")
                    batch = {k: v.to(device) for k, v in batch.items()}
                    embeddings = self(**batch, output_hidden_states=True).hidden_states[
                        -1
                    ]
                    # Zero out padding positions, then normalize per token;
                    # zero vectors stay zero after F.normalize.
                    embeddings = embeddings * batch["attention_mask"].unsqueeze(-1)
                    embeddings = F.normalize(embeddings, dim=-1)

                    # Fail fast on numerical blow-ups rather than silently
                    # returning corrupt embeddings.
                    if not torch.isfinite(embeddings).all():
                        raise ValueError("Embeddings contain NaN or Inf values")

                    qs.append(embeddings.detach().cpu())

        # Provided by ColBERTScoringMixin: pads batches to a common seq len.
        all_embeddings_tensor = self.padding_various_shape_tensor(qs)
        return all_embeddings_tensor

    def forward_queries(self, queries: List[str], batch_size: int = 8) -> torch.Tensor:
        """Forward text queries and extract embeddings.

        Args:
            queries: List of query strings (a pre-built DataLoader is also
                accepted; its dataset is reused).
            batch_size: Batch size for processing.

        Returns:
            Tensor of query embeddings with shape (num_queries, max_seq_len, hidden_dim).
        """
        if isinstance(queries, DataLoader):
            dataset = queries.dataset
        else:
            dataset = ListDataset[str](queries)

        dataloader = DataLoader(
            dataset=dataset,
            batch_size=batch_size,
            collate_fn=self.process_queries,
            shuffle=False,
            drop_last=False,
        )
        return self._extract_embeddings(dataloader=dataloader, is_query=True)

    def forward_documents(
        self, corpus: List[Dict], batch_size: int = 8
    ) -> torch.Tensor:
        """Forward documents (image + text) and extract embeddings.

        Args:
            corpus: List of dicts with "image" (PIL image or None) and
                "text" keys; missing keys default to None / "".
            batch_size: Batch size for processing.

        Returns:
            Tensor of document embeddings with shape (num_docs, max_seq_len, hidden_dim).
        """
        images = []
        texts = []
        for doc in corpus:
            text = doc.get("text", "")
            image = doc.get("image")
            # The processor expects RGB; convert palettized/greyscale inputs.
            if image is not None and image.mode != "RGB":
                image = image.convert("RGB")
            images.append(image)
            texts.append(text)

        dataset = Dataset.from_dict({"image": images, "text": texts})
        dataloader = DataLoader(
            dataset=dataset,
            batch_size=batch_size,
            collate_fn=self.process_documents,
            shuffle=False,
            num_workers=8,  # parallel image decoding; tune to the host
            pin_memory=True,
            drop_last=False,
        )
        return self._extract_embeddings(dataloader=dataloader, is_query=False)

    def forward_images(
        self, images: List, batch_size: int = 8, **kwargs
    ) -> torch.Tensor:
        """Forward images as image-only documents (empty text).

        Args:
            images: List of PIL Images.
            batch_size: Batch size for processing.

        Returns:
            Tensor of image embeddings.
        """
        corpus = [{"image": image, "text": ""} for image in images]
        return self.forward_documents(corpus, batch_size)

    def forward_passages(
        self, images: List, batch_size: int = 8, **kwargs
    ) -> torch.Tensor:
        """Forward passages as image-only documents (alias for forward_images)."""
        return self.forward_images(images, batch_size, **kwargs)
353
+
354
+
355
class ColBERTScoringMixin:
    """Mixin implementing ColBERT-style late-interaction (MaxSim) scoring."""

    def padding_various_shape_tensor(self, tensors: List[torch.Tensor]) -> torch.Tensor:
        """Right-pad per-batch tensors to one sequence length and stack them.

        Args:
            tensors: List of tensors, each shaped (batch, seq_len, hidden_dim)
                with possibly different seq_len.

        Returns:
            A single tensor concatenated along dim 0, zero-padded on the
            sequence dimension up to the longest input.
        """
        target_len = max(t.shape[1] for t in tensors)
        padded = []
        for t in tensors:
            missing = target_len - t.shape[1]
            # F.pad's last-dim-first pair ordering: (dim -1 left/right, dim -2 left/right)
            padded.append(F.pad(t, (0, 0, 0, missing), mode="constant", value=0))
        return torch.cat(padded, dim=0)

    def colbert_score(
        self,
        qs: Union[torch.Tensor, List[torch.Tensor]],
        ps: Union[torch.Tensor, List[torch.Tensor]],
        batch_size: int = 128,
        device: Optional[Union[str, torch.device]] = None,
    ) -> torch.Tensor:
        """Compute ColBERT MaxSim scores between queries and passages.

        score(q, p) = sum over query tokens of the max similarity against
        any passage token. Computed block-wise to bound memory.

        Args:
            qs: Query embeddings - tensor or list of (tokens, dim) tensors.
            ps: Passage embeddings - tensor or list of (tokens, dim) tensors.
            batch_size: Block size for the pairwise computation.
            device: Device to run on; defaults to the model's device.

        Returns:
            Score matrix of shape (num_queries, num_passages).
        """
        if batch_size is None:
            batch_size = 128
        if device is None:
            device = next(self.parameters()).device

        # Normalize both inputs to lists of per-item tensors.
        if isinstance(qs, torch.Tensor):
            qs = list(qs.unbind(0))
        if isinstance(ps, torch.Tensor):
            ps = list(ps.unbind(0))

        if not qs:
            raise ValueError("No queries provided")
        if not ps:
            raise ValueError("No passages provided")

        score_rows: List[torch.Tensor] = []
        for q_start in range(0, len(qs), batch_size):
            # Pad this block of queries to a common token count.
            q_block = torch.nn.utils.rnn.pad_sequence(
                [q.to(device) for q in qs[q_start : q_start + batch_size]],
                batch_first=True,
                padding_value=0,
            )
            row_chunks: List[torch.Tensor] = []
            for p_start in range(0, len(ps), batch_size):
                p_block = torch.nn.utils.rnn.pad_sequence(
                    [p.to(device) for p in ps[p_start : p_start + batch_size]],
                    batch_first=True,
                    padding_value=0,
                )
                # (b queries, n q-tokens) x (c passages, s p-tokens):
                # full token-similarity grid, then MaxSim over passage tokens
                # and sum over query tokens.
                sims = torch.einsum("bnd,csd->bcns", q_block, p_block)
                row_chunks.append(sims.amax(dim=3).sum(dim=2))
            score_rows.append(torch.cat(row_chunks, dim=1))

        return torch.cat(score_rows, dim=0)

    def get_scores(
        self,
        query_embeddings: Union[torch.Tensor, List[torch.Tensor]],
        passage_embeddings: Union[torch.Tensor, List[torch.Tensor]],
        batch_size: Optional[int] = 128,
    ) -> torch.Tensor:
        """Compute ColBERT MaxSim scores between queries and passages.

        Accepts either stacked tensors or lists of per-item 2D/3D tensors;
        lists are padded to a common length before scoring.

        Args:
            query_embeddings: Query embeddings.
            passage_embeddings: Passage embeddings.
            batch_size: Block size for the scoring computation.

        Returns:
            Score matrix of shape (num_queries, num_passages).
        """
        if isinstance(query_embeddings, list):
            if query_embeddings[0].dim() == 2:
                query_embeddings = [q.unsqueeze(0) for q in query_embeddings]
            query_embeddings = self.padding_various_shape_tensor(query_embeddings)
        if isinstance(passage_embeddings, list):
            if passage_embeddings[0].dim() == 2:
                passage_embeddings = [p.unsqueeze(0) for p in passage_embeddings]
            passage_embeddings = self.padding_various_shape_tensor(passage_embeddings)

        return self.colbert_score(
            query_embeddings, passage_embeddings, batch_size or 128
        )
458
+
459
+
460
class Qwen3VLNemotronEmbedModel(
    # Mixins listed first so their methods take precedence in the MRO.
    EmbeddingMixin, ColBERTScoringMixin, Qwen3VLNemotronEmbedForConditionalGeneration
):
    """Qwen3VLNemotronEmbed: Vision-Language Embedding Model.

    A bidirectional vision-language model for document retrieval and embedding tasks.
    Based on Qwen3VL architecture with bidirectional attention for embedding extraction.

    Features:
    - ColBERT MaxSim scoring (get_scores, colbert_score)
    - High-level embedding methods (forward_queries, forward_documents, forward_images)
    - Automatic processor loading for query/document processing

    Example:
        >>> model = AutoModel.from_pretrained("nvidia/qwen3vl-nemotron-embed-4b", trust_remote_code=True)
        >>> query_embeddings = model.forward_queries(["What is machine learning?"])
        >>> doc_embeddings = model.forward_documents([{"image": img, "text": "ML explanation"}])
        >>> scores = model.get_scores(query_embeddings, doc_embeddings)
    """

    # Binds the custom config so AutoModel resolves it via auto_map.
    config_class = Qwen3VLNemotronEmbedConfig
mteb2_eval.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --------------------------------------------------------
2
+ # Copyright (c) 2026 NVIDIA
3
+ # Licensed under customized NSCLv1 [see LICENSE.md for details]
4
+ # --------------------------------------------------------
5
+
6
+ """
7
+ pip install "mteb>=2.6.5, <3.0.0"
8
+ python3 mteb2_eval.py --model_name nvidia/nemotron-colembed-4b-v2 --batch_size 16 --benchmark "ViDoRe(v3)" --task-list Vidore3ComputerScienceRetrieval
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import argparse
14
+ import os
15
+
16
+ import mteb
17
+
18
+
19
def main():
    """CLI entry point: evaluate a model on an MTEB/ViDoRe benchmark.

    Parses arguments, resolves the benchmark's tasks (optionally filtered by
    --task-list), runs mteb.evaluate, and writes a CSV of results.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", type=str, required=True)
    parser.add_argument("--batch_size", type=int, default=16, required=False)
    parser.add_argument(
        "--results_folder", type=str, default="results_csv", required=False
    )
    parser.add_argument("--predictions_folder", type=str, default=None, required=False)
    parser.add_argument(
        "--benchmark",
        type=str,
        required=False,
        default="ViDoRe(v3)",
        choices=[
            "ViDoRe(v3)",  # Vidore V3
            "VisualDocumentRetrieval",  # Vidore V1 & V2
        ],
    )
    parser.add_argument(
        "--task-list",
        type=str,
        nargs="+",  # Accept one or more space-separated string arguments
        default=None,  # Default to None if the argument is not provided
        help="Optional: A list of task class names to run. If not provided, all tasks will be run.",
    )
    args = parser.parse_args()

    print(f"Loading model: {args.model_name}")
    # NOTE(review): this returns model *metadata*; mteb.evaluate apparently
    # accepts a ModelMeta and instantiates it — confirm against mteb>=2.6.5.
    model = mteb.get_model_meta(args.model_name)

    # Loads all benchmark tasks
    all_tasks = mteb.get_benchmark(args.benchmark).tasks
    all_tasks_names = " ".join([task.__class__.__name__ for task in all_tasks])
    print(f"Available tasks in benchmark {args.benchmark}: {all_tasks_names}")

    # Filter tasks down to the requested subset, if any.
    if args.task_list:
        # If user provided a list, filter all_tasks by class name.
        print(f"Running evaluation on specified tasks: {args.task_list}")
        requested_task_names = set(args.task_list)
        tasks = [
            task
            for task in all_tasks
            if task.__class__.__name__ in requested_task_names
        ]

        # Warn (but continue) if a requested task was not found.
        found_names = {t.__class__.__name__ for t in tasks}
        missing = requested_task_names - found_names
        if missing:
            print(
                f"Warning: The following requested tasks were not found and will be skipped: {missing}"
            )
    else:
        # If --task-list was not provided, use all tasks.
        print("Running evaluation on all available tasks.")
        tasks = all_tasks

    tasks_names = " ".join([task.__class__.__name__ for task in tasks])
    print(f"Evaluating tasks: {tasks_names}")

    results = mteb.evaluate(
        model=model,
        tasks=tasks,
        encode_kwargs={
            "batch_size": args.batch_size,
        },
        prediction_folder=args.predictions_folder,
        overwrite_strategy="always",  # re-run even when cached results exist
    )

    print(results)

    print(f"Saving results to {args.results_folder}")
    os.makedirs(args.results_folder, exist_ok=True)
    # Sanitize the model name for use in a filename.
    model_name = args.model_name.replace("/", "_")
    # NOTE(review): with many tasks this filename can get very long —
    # consider hashing the task list if it exceeds filesystem limits.
    output_path = os.path.join(
        args.results_folder, f"{model_name}-{tasks_names.replace(' ', '-')}.csv"
    )
    df = results.to_dataframe()
    df.to_csv(output_path, index=False)


if __name__ == "__main__":
    main()
processing_qwen3_vl_nemotron_embed.py ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Qwen3VLNemotronEmbed Processor for query and document processing."""
2
+
3
+ import math
4
+ from typing import Any, Dict, List, Optional, Union
5
+
6
+ import torch
7
+ from PIL import Image
8
+ from transformers import Qwen3VLProcessor
9
+
10
+
11
class Qwen3VLNemotronEmbedProcessor(Qwen3VLProcessor):
    """Processor for Qwen3VLNemotronEmbed that handles query/document processing.

    Extends :class:`transformers.Qwen3VLProcessor` with retrieval-oriented
    helpers: :meth:`process_queries` wraps plain-text queries in the chat
    template, while :meth:`process_documents` pairs each document image with
    its text and budgets the text length so the image tokens fit inside
    ``p_max_length``.

    Args:
        image_processor: Image processor for vision inputs.
        tokenizer: Tokenizer for text inputs.
        chat_template: Optional chat template.
        q_max_length: Maximum length for query sequences (default: 512).
        p_max_length: Maximum length for passage/document sequences (default: 4096).
        query_prefix: Prefix to add to queries (default: "query:").
        passage_prefix: Prefix to add to passages (default: "passage:").
        reserve_tokens_for_images: Reserved tokens for image placeholders and
            template overhead (default: 100).
    """

    # Extra attributes persisted alongside the base processor config on
    # save/load (see processor_config.json).
    processor_attributes = [
        "q_max_length",
        "p_max_length",
        "query_prefix",
        "passage_prefix",
        "reserve_tokens_for_images",
    ]

    def __init__(
        self,
        image_processor=None,
        tokenizer=None,
        chat_template=None,
        q_max_length: int = 512,
        p_max_length: int = 4096,
        query_prefix: str = "query:",
        passage_prefix: str = "passage:",
        reserve_tokens_for_images: int = 100,
        **kwargs,
    ):
        # Forward chat_template by keyword: Qwen3VL-family processors place a
        # `video_processor` positional parameter before `chat_template`, so a
        # positional pass-through can bind the template to the wrong slot.
        # Passing chat_template=None simply falls back to the base default,
        # which also makes the previous `if chat_template is not None`
        # branching unnecessary.
        super().__init__(
            image_processor, tokenizer, chat_template=chat_template, **kwargs
        )

        self.q_max_length = q_max_length
        self.p_max_length = p_max_length
        self.query_prefix = query_prefix
        self.passage_prefix = passage_prefix
        self.reserve_tokens_for_images = reserve_tokens_for_images

        # Cache vision geometry used by calculate_image_tokens().
        self.patch_size = self.image_processor.patch_size
        self.merge_size = self.image_processor.merge_size

    def apply_chat_template(
        self,
        conversation,
        chat_template=None,
        **kwargs,
    ) -> str:
        """Apply the tokenizer's chat template to ``conversation``."""
        return self.tokenizer.apply_chat_template(
            conversation,
            chat_template=chat_template,
            **kwargs,
        )

    @property
    def min_pixels(self) -> int:
        """Minimum pixel budget, delegated to the image processor."""
        return self.image_processor.min_pixels

    @property
    def max_pixels(self) -> int:
        """Maximum pixel budget, delegated to the image processor."""
        return self.image_processor.max_pixels

    def calculate_image_tokens(
        self,
        image: Image.Image,
        min_pixels: Optional[int] = None,
        max_pixels: Optional[int] = None,
    ) -> int:
        """Calculate the number of tokens an image will use after processing.

        Mirrors the Qwen-VL "smart resize": each dimension is snapped to a
        multiple of ``patch_size * merge_size``, then both are rescaled so the
        total area stays within ``[min_pixels, max_pixels]``.

        Args:
            image: PIL Image to calculate tokens for.
            min_pixels: Minimum pixels for resizing (processor default if None).
            max_pixels: Maximum pixels for resizing (processor default if None).

        Returns:
            Number of tokens the image will consume after patch merging.
        """
        min_pixels = min_pixels or self.min_pixels
        max_pixels = max_pixels or self.max_pixels

        width, height = image.size
        # One merged token covers a (patch_size * merge_size)^2 pixel tile.
        factor = self.patch_size * self.merge_size

        # Snap each dimension to the nearest multiple of `factor`.
        h_bar = round(height / factor) * factor
        w_bar = round(width / factor) * factor

        if h_bar * w_bar > max_pixels:
            # Shrink: flooring keeps the resized area at or below max_pixels.
            beta = math.sqrt((height * width) / max_pixels)
            h_bar = max(factor, math.floor(height / beta / factor) * factor)
            w_bar = max(factor, math.floor(width / beta / factor) * factor)
        elif h_bar * w_bar < min_pixels:
            # Grow: ceiling keeps the resized area at or above min_pixels.
            beta = math.sqrt(min_pixels / (height * width))
            h_bar = math.ceil(height * beta / factor) * factor
            w_bar = math.ceil(width * beta / factor) * factor

        grid_h = h_bar // self.patch_size
        grid_w = w_bar // self.patch_size
        num_patches = grid_h * grid_w
        # merge_size^2 raw patches collapse into a single token.
        return num_patches // (self.merge_size**2)

    def process_queries(
        self,
        queries: List[Union[str, dict]],
        padding: bool = True,
        truncation: bool = True,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: str = "pt",
        **kwargs,
    ) -> Dict[str, torch.Tensor]:
        """Process text queries for retrieval.

        Args:
            queries: List of query strings or dicts with a "text" key.
            padding: Whether to pad sequences.
            truncation: Whether to truncate sequences (to ``q_max_length``).
            pad_to_multiple_of: Pad to a multiple of this value.
            return_tensors: Return tensor type ("pt" for PyTorch).

        Returns:
            Dictionary with input_ids, attention_mask, and other model inputs.
        """
        query_texts = []
        for query in queries:
            raw_text = query["text"] if isinstance(query, dict) else query

            prefixed = f"{self.query_prefix} {raw_text}" if self.query_prefix else raw_text
            # NOTE(review): with the default query_prefix this renders as
            # "Query: query: <text>" — presumably this matches the training
            # prompt format, but worth confirming.
            message = [
                {
                    "role": "user",
                    "content": [{"type": "text", "text": f"Query: {prefixed}"}],
                }
            ]
            query_texts.append(
                self.apply_chat_template(
                    message, tokenize=False, add_generation_prompt=True
                )
            )

        return self(
            text=query_texts,
            truncation=truncation,
            max_length=self.q_max_length,
            padding=padding,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            **kwargs,
        )

    def process_documents(
        self,
        documents: Union[Dict[str, List], List[Dict[str, Any]]],
        padding: bool = True,
        truncation: bool = True,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: str = "pt",
        **kwargs,
    ) -> Dict[str, torch.Tensor]:
        """Process image-text documents for retrieval.

        The per-document text budget is computed as ``p_max_length`` minus the
        worst-case image token count (over the batch) minus
        ``reserve_tokens_for_images``, so that image placeholders never push a
        sequence past ``p_max_length``.

        Args:
            documents: Either a dict with "image" and "text" keys containing
                parallel lists, or a list of dicts each with "image" and
                "text" keys.
            padding: Whether to pad sequences.
            truncation: Whether to truncate sequences.
            pad_to_multiple_of: Pad to a multiple of this value.
            return_tensors: Return tensor type ("pt" for PyTorch).

        Returns:
            Dictionary with input_ids, attention_mask, pixel_values, and other
            model inputs.

        Raises:
            ValueError: If ``documents`` is neither a dict nor a list.
            AssertionError: If text/image counts differ, or ``p_max_length``
                cannot accommodate the largest image plus the reserve.
        """
        if isinstance(documents, dict):
            images = documents["image"]
            texts = documents["text"]
            assert len(texts) == len(images), (
                "Number of texts must match number of images"
            )
        elif isinstance(documents, list):
            images = [d["image"] for d in documents]
            texts = [d["text"] for d in documents]
        else:
            raise ValueError("documents must be a dict or list of dicts")

        if self.passage_prefix:
            texts = [f"{self.passage_prefix} {t}" for t in texts]

        # Budget text length around the largest image in the batch.
        image_tokens_list = [self.calculate_image_tokens(img) for img in images]
        max_image_tokens = max(image_tokens_list) if image_tokens_list else 0

        assert self.p_max_length > max_image_tokens + self.reserve_tokens_for_images, (
            f"p_max_length ({self.p_max_length}) is too small for max_image_tokens "
            f"({max_image_tokens}) + reserve ({self.reserve_tokens_for_images})"
        )
        available_text_tokens = (
            self.p_max_length - max_image_tokens - self.reserve_tokens_for_images
        )

        # Round the budget down so padded lengths remain a clean multiple.
        if (
            pad_to_multiple_of is not None
            and available_text_tokens % pad_to_multiple_of != 0
        ):
            available_text_tokens = (
                available_text_tokens // pad_to_multiple_of
            ) * pad_to_multiple_of

        input_texts = []
        for text, image in zip(texts, images):
            message = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": image},
                        {"type": "text", "text": text},
                    ],
                }
            ]
            input_text = self.apply_chat_template(
                message, tokenize=False, add_generation_prompt=True
            )
            input_texts.append(input_text)

        return self(
            text=input_texts,
            images=images,
            truncation=truncation,
            padding=padding,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            max_length=available_text_tokens,
            **kwargs,
        )
processor_config.json ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_qwen3_vl_nemotron_embed.Qwen3VLNemotronEmbedProcessor"
4
+ },
5
+ "q_max_length": 512,
6
+ "p_max_length": 4096,
7
+ "query_prefix": "query:",
8
+ "passage_prefix": "passage:",
9
+ "reserve_tokens_for_images": 100,
10
+ "image_processor": {
11
+ "crop_size": null,
12
+ "data_format": "channels_first",
13
+ "device": null,
14
+ "disable_grouping": null,
15
+ "do_center_crop": null,
16
+ "do_convert_rgb": true,
17
+ "do_normalize": true,
18
+ "do_pad": null,
19
+ "do_rescale": true,
20
+ "do_resize": true,
21
+ "image_mean": [
22
+ 0.5,
23
+ 0.5,
24
+ 0.5
25
+ ],
26
+ "image_processor_type": "Qwen2VLImageProcessorFast",
27
+ "image_seq_length": null,
28
+ "image_std": [
29
+ 0.5,
30
+ 0.5,
31
+ 0.5
32
+ ],
33
+ "input_data_format": null,
34
+ "max_pixels": 802816,
35
+ "merge_size": 2,
36
+ "min_pixels": 2352,
37
+ "pad_size": null,
38
+ "patch_size": 16,
39
+ "processor_class": "Qwen3VLProcessor",
40
+ "resample": 3,
41
+ "rescale_factor": 0.00392156862745098,
42
+ "return_tensors": null,
43
+ "size": {
44
+ "longest_edge": 16777216,
45
+ "shortest_edge": 65536
46
+ },
47
+ "temporal_patch_size": 2
48
+ },
49
+ "processor_class": "Qwen3VLNemotronEmbedProcessor",
50
+ "video_processor": {
51
+ "crop_size": null,
52
+ "data_format": "channels_first",
53
+ "default_to_square": true,
54
+ "device": null,
55
+ "do_center_crop": null,
56
+ "do_convert_rgb": true,
57
+ "do_normalize": true,
58
+ "do_pad": null,
59
+ "do_rescale": true,
60
+ "do_resize": true,
61
+ "do_sample_frames": true,
62
+ "fps": 2,
63
+ "image_mean": [
64
+ 0.5,
65
+ 0.5,
66
+ 0.5
67
+ ],
68
+ "image_std": [
69
+ 0.5,
70
+ 0.5,
71
+ 0.5
72
+ ],
73
+ "input_data_format": null,
74
+ "max_frames": 768,
75
+ "merge_size": 2,
76
+ "min_frames": 4,
77
+ "num_frames": null,
78
+ "pad_size": null,
79
+ "patch_size": 16,
80
+ "processor_class": "Qwen3VLProcessor",
81
+ "resample": 3,
82
+ "rescale_factor": 0.00392156862745098,
83
+ "return_metadata": false,
84
+ "return_tensors": null,
85
+ "size": {
86
+ "longest_edge": 25165824,
87
+ "shortest_edge": 4096
88
+ },
89
+ "temporal_patch_size": 2,
90
+ "video_metadata": null,
91
+ "video_processor_type": "Qwen3VLVideoProcessor"
92
+ }
93
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
3
+ size 11422654
tokenizer_config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "additional_special_tokens": null,
5
+ "backend": "tokenizers",
6
+ "bos_token": null,
7
+ "clean_up_tokenization_spaces": false,
8
+ "eos_token": "<|im_end|>",
9
+ "errors": "replace",
10
+ "extra_special_tokens": [
11
+ "<|im_start|>",
12
+ "<|im_end|>",
13
+ "<|object_ref_start|>",
14
+ "<|object_ref_end|>",
15
+ "<|box_start|>",
16
+ "<|box_end|>",
17
+ "<|quad_start|>",
18
+ "<|quad_end|>",
19
+ "<|vision_start|>",
20
+ "<|vision_end|>",
21
+ "<|vision_pad|>",
22
+ "<|image_pad|>",
23
+ "<|video_pad|>"
24
+ ],
25
+ "is_local": true,
26
+ "model_max_length": 262144,
27
+ "pad_token": "<|endoftext|>",
28
+ "processor_class": "Qwen3VLProcessor",
29
+ "split_special_tokens": false,
30
+ "tokenizer_class": "Qwen2Tokenizer",
31
+ "unk_token": null
32
+ }