ronay-nv commited on
Commit
214d72a
·
verified ·
1 Parent(s): 7873b5f

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
.ipynb_checkpoints/config-checkpoint.json ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3VLNemotronEmbedModel"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "modeling_qwen3_vl_nemotron_embed.Qwen3VLNemotronEmbedConfig",
7
+ "AutoModel": "modeling_qwen3_vl_nemotron_embed.Qwen3VLNemotronEmbedModel",
8
+ "AutoProcessor": "processing_qwen3_vl_nemotron_embed.Qwen3VLNemotronEmbedProcessor"
9
+ },
10
+ "dtype": "bfloat16",
11
+ "image_token_id": 151655,
12
+ "model_type": "qwen3_vl_nemotron_embed",
13
+ "pooling": "colbert",
14
+ "text_config": {
15
+ "attention_bias": false,
16
+ "attention_dropout": 0.0,
17
+ "bos_token_id": 151643,
18
+ "dtype": "bfloat16",
19
+ "eos_token_id": 151645,
20
+ "head_dim": 128,
21
+ "hidden_act": "silu",
22
+ "hidden_size": 2560,
23
+ "initializer_range": 0.02,
24
+ "intermediate_size": 9728,
25
+ "max_position_embeddings": 262144,
26
+ "model_type": "qwen3_vl_text",
27
+ "num_attention_heads": 32,
28
+ "num_hidden_layers": 36,
29
+ "num_key_value_heads": 8,
30
+ "rms_norm_eps": 1e-06,
31
+ "rope_parameters": {
32
+ "mrope_interleaved": true,
33
+ "mrope_section": [
34
+ 24,
35
+ 20,
36
+ 20
37
+ ],
38
+ "rope_theta": 5000000,
39
+ "rope_type": "default"
40
+ },
41
+ "tie_word_embeddings": true,
42
+ "use_cache": true,
43
+ "vocab_size": 151936
44
+ },
45
+ "tie_word_embeddings": true,
46
+ "transformers_version": "5.0.0rc0",
47
+ "use_cache": false,
48
+ "video_token_id": 151656,
49
+ "vision_config": {
50
+ "deepstack_visual_indexes": [
51
+ 5,
52
+ 11,
53
+ 17
54
+ ],
55
+ "depth": 24,
56
+ "dtype": "bfloat16",
57
+ "hidden_act": "gelu_pytorch_tanh",
58
+ "hidden_size": 1024,
59
+ "in_channels": 3,
60
+ "initializer_range": 0.02,
61
+ "intermediate_size": 4096,
62
+ "model_type": "qwen3_vl",
63
+ "num_heads": 16,
64
+ "num_position_embeddings": 2304,
65
+ "out_hidden_size": 2560,
66
+ "patch_size": 16,
67
+ "spatial_merge_size": 2,
68
+ "temporal_patch_size": 2
69
+ },
70
+ "vision_end_token_id": 151653,
71
+ "vision_start_token_id": 151652
72
+ }
.ipynb_checkpoints/generation_config-checkpoint.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 151643,
4
+ "eos_token_id": 151645,
5
+ "transformers_version": "5.0.0rc0",
6
+ "use_cache": false
7
+ }
.ipynb_checkpoints/mteb2_eval-checkpoint.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --------------------------------------------------------
2
+ # Copyright (c) 2026 NVIDIA
3
+ # Licensed under customized NSCLv1 [see LICENSE.md for details]
4
+ # --------------------------------------------------------
5
+
6
+ """
7
+ pip install "mteb>=2.6.5, <3.0.0"
8
+ python3 mteb2_eval.py --model_name nvidia/nemotron-colembed-4b-v2 --batch_size 16 --benchmark "ViDoRe(v3)" --task-list Vidore3ComputerScienceRetrieval
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import argparse
14
+ import os
15
+
16
+ import mteb
17
+
18
+
19
+ def main():
20
+ parser = argparse.ArgumentParser()
21
+ parser.add_argument("--model_name", type=str, required=True)
22
+ parser.add_argument("--batch_size", type=int, default=16, required=False)
23
+ parser.add_argument(
24
+ "--results_folder", type=str, default="results_csv", required=False
25
+ )
26
+ parser.add_argument("--predictions_folder", type=str, default=None, required=False)
27
+ parser.add_argument(
28
+ "--benchmark",
29
+ type=str,
30
+ required=False,
31
+ default="ViDoRe(v3)",
32
+ choices=[
33
+ "ViDoRe(v3)", # Vidore V3
34
+ "VisualDocumentRetrieval", # Vidore V1 & V2
35
+ ],
36
+ )
37
+ parser.add_argument(
38
+ "--task-list",
39
+ type=str,
40
+ nargs="+", # Accept one or more space-separated string arguments
41
+ default=None, # Default to None if the argument is not provided
42
+ help="Optional: A list of task class names to run. If not provided, all tasks will be run.",
43
+ )
44
+ args = parser.parse_args()
45
+
46
+ print(f"Loading model: {args.model_name}")
47
+ model = mteb.get_model_meta(args.model_name)
48
+
49
+ # Loads all benchmark tasks
50
+ all_tasks = mteb.get_benchmark(args.benchmark).tasks
51
+ all_tasks_names = " ".join([task.__class__.__name__ for task in all_tasks])
52
+ print(f"Available tasks in benchmark {args.benchmark}: {all_tasks_names}")
53
+
54
+ # filter tasks
55
+ if args.task_list:
56
+ # If user provided a list, filter all_tasks
57
+ print(f"Running evaluation on specified tasks: {args.task_list}")
58
+ requested_task_names = set(args.task_list)
59
+ tasks = [
60
+ task
61
+ for task in all_tasks
62
+ if task.__class__.__name__ in requested_task_names
63
+ ]
64
+
65
+ # Optional: Warn if a requested task was not found
66
+ found_names = {t.__class__.__name__ for t in tasks}
67
+ missing = requested_task_names - found_names
68
+ if missing:
69
+ print(
70
+ f"Warning: The following requested tasks were not found and will be skipped: {missing}"
71
+ )
72
+ else:
73
+ # If --task-list was not provided, use all tasks
74
+ print("Running evaluation on all available tasks.")
75
+ tasks = all_tasks
76
+
77
+ tasks_names = " ".join([task.__class__.__name__ for task in tasks])
78
+ print(f"Evaluating tasks: {tasks_names}")
79
+
80
+ results = mteb.evaluate(
81
+ model=model,
82
+ tasks=tasks,
83
+ encode_kwargs={
84
+ "batch_size": args.batch_size,
85
+ },
86
+ prediction_folder=args.predictions_folder,
87
+ overwrite_strategy="always",
88
+ )
89
+
90
+ print(results)
91
+
92
+ print(f"Saving results to {args.results_folder}")
93
+ os.makedirs(args.results_folder, exist_ok=True)
94
+ model_name = args.model_name.replace("/", "_")
95
+ output_path = os.path.join(
96
+ args.results_folder, f"{model_name}-{tasks_names.replace(' ', '-')}.csv"
97
+ )
98
+ df = results.to_dataframe()
99
+ df.to_csv(output_path, index=False)
100
+
101
+
102
+ if __name__ == "__main__":
103
+ main()
LICENSE ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # NVIDIA License
2
+
3
+ ## 1. Definitions
4
+
5
+ “Licensor” means any person or entity that distributes its Work.
6
+ “Work” means (a) the original work of authorship made available under this license, which may include software, documentation, or other files, and (b) any additions to or derivative works thereof that are made available under this license.
7
+ The terms “reproduce,” “reproduction,” “derivative works,” and “distribution” have the meaning as provided under U.S. copyright law; provided, however, that for the purposes of this license, derivative works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work.
8
+ Works are “made available” under this license by including in or with the Work either (a) a copyright notice referencing the applicability of this license to the Work, or (b) a copy of this license.
9
+
10
+ ## 2. License Grant
11
+
12
+ 2.1 Copyright Grant. Subject to the terms and conditions of this license, each Licensor grants to you a perpetual, worldwide, non-exclusive, royalty-free, copyright license to use, reproduce, prepare derivative works of, publicly display, publicly perform, sublicense and distribute its Work and any resulting derivative works in any form.
13
+
14
+ ## 3. Limitations
15
+
16
+ 3.1 Redistribution. You may reproduce or distribute the Work only if (a) you do so under this license, (b) you include a complete copy of this license with your distribution, and (c) you retain without modification any copyright, patent, trademark, or attribution notices that are present in the Work.
17
+
18
+ 3.2 Derivative Works. You may specify that additional or different terms apply to the use, reproduction, and distribution of your derivative works of the Work (“Your Terms”) only if (a) Your Terms provide that the use limitation in Section 3.3 applies to your derivative works, and (b) you identify the specific derivative works that are subject to Your Terms. Notwithstanding Your Terms, this license (including the redistribution requirements in Section 3.1) will continue to apply to the Work itself.
19
+
20
+ 3.3 Use Limitation. The Work and any derivative works thereof only may be used or intended for use non-commercially. Notwithstanding the foregoing, NVIDIA Corporation and its affiliates may use the Work and any derivative works commercially. As used herein, “non-commercially” means for non-commercial research and educational purposes only.
21
+
22
+ 3.4 Patent Claims. If you bring or threaten to bring a patent claim against any Licensor (including any claim, cross-claim or counterclaim in a lawsuit) to enforce any patents that you allege are infringed by any Work, then your rights under this license from such Licensor (including the grant in Section 2.1) will terminate immediately.
23
+
24
+ 3.5 Trademarks. This license does not grant any rights to use any Licensor’s or its affiliates’ names, logos, or trademarks, except as necessary to reproduce the notices described in this license.
25
+
26
+ 3.6 Termination. If you violate any term of this license, then your rights under this license (including the grant in Section 2.1) will terminate immediately.
27
+
28
+ ## 4. Disclaimer of Warranty.
29
+
30
+ THE WORK IS PROVIDED “AS IS” WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF
31
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER THIS LICENSE.
32
+
33
+ ## 5. Limitation of Liability.
34
+
35
+ EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK (INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
README.md ADDED
@@ -0,0 +1,337 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: other
3
+ license_name: customized-nscl-v1
4
+ license_link: LICENSE
5
+ tags:
6
+ - text
7
+ - image
8
+ - vidore
9
+ - colpali
10
+ - multimodal-embedding
11
+ - multilingual-embedding
12
+ - Text-to-Visual Document (T→VD) retrieval
13
+ - feature-extraction
14
+ language:
15
+ - multilingual
16
+ inference: false
17
+ library_name: transformers
18
+ pipeline_tag: visual-document-retrieval
19
+ ---
20
+ # **Model Overview**
21
+
22
+ ## Description
23
+
24
+ The **nvidia/nemotron-colembed-4b-v2** is a state-of-the-art late-interaction embedding model that ranks No. 2 on ViDoRe V3 — a comprehensive benchmark evaluating retrieval for enterprise use cases — with a score of `61.74` on 8 public tasks (as of Jan 21, 2026). The model was fine-tuned for query-document retrieval. Users can input `queries`, which are text, or `documents`, which are page images, to the model. The model outputs ColBERT-style multi-vector numerical representations for input queries and documents.
25
+
26
+
27
+ ✨ **Key Improvements:**
28
+ * ⚗️ **Advanced Model Merging:** Utilizes post-training model merging to combine the strengths of multiple fine-tuned checkpoints. This delivers the accuracy stability of an ensemble without any additional inference latency.
29
+ * 🌍 **Enhanced Synthetic Data:** We significantly enriched our training mixture with diverse multilingual synthetic data, improving semantic alignment across languages and complex document types.
30
+
31
+ This model is for non-commercial/research use only.
32
+
33
+
34
+ ### Deployment Geography
35
+ Global
36
+
37
+ ### Use Case
38
+ `nemotron-colembed-4b-v2` is intended for researchers exploring applications that must understand or retrieve information across both text and image modalities. It is instrumental in multimodal RAG systems, where queries are in text format and documents are images, such as pages, text, charts, tables or infographics. Potential applications include multimedia search engines, cross-modal retrieval systems, and conversational AI with rich input understanding.
39
+
40
+ ### License/Terms of Use
41
+ The use of this model is governed by the [NVIDIA Non-Commercial License Agreement](https://huggingface.co/nvidia/nemotron-colembed-4b-v2/blob/main/LICENSE) and the use of the post-processing scripts are licensed under [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0.txt). Additional Information: Built with Qwen3-VL which is released under [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0.txt).
42
+
43
+ This project will download and install additional third-party open source software projects. Review the license terms of these open source projects before use.
44
+
45
+ ### Release Date
46
+ 01/21/2026 via [https://huggingface.co/nvidia/nemotron-colembed-4b-v2](https://huggingface.co/nvidia/nemotron-colembed-4b-v2)
47
+
48
+ ## Model Architecture
49
+
50
+ - **Architecture Type:** Transformer
51
+ - **Network Architecture:** [Qwen3-VL-4B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct) based encoder.
52
+
53
+ The `nemotron-colembed-4b-v2` is a transformer-based multimodal embedding model built from `Qwen3-VL-4B-Instruct`, which adopts a three-module architecture comprising a vision encoder (the [SigLIP-2 architecture](https://huggingface.co/google/siglip2-large-patch16-256)), an MLP-based vision–language merger, and a large language model (LLM) (see [technical report](https://arxiv.org/pdf/2511.21631) for details). It has approximately 4.8B parameters.
54
+
55
+
56
+ ## Input(s):
57
+ **Input Type(s):** Image, Text <br>
58
+
59
+ **Input Format(s):** <br>
60
+ - Image: List of images- Red, Green, Blue (RGB) <br>
61
+ - Text: List of Strings <br>
62
+
63
+ **Input Parameters:** <br>
64
+ - Image: Two-Dimensional (2D) <br>
65
+ - Text: One-Dimensional (1D) <br>
66
+
67
+
68
+ **Other Properties Related to Input:**
69
+ - The maximum context length at which we evaluated the model is 10240 tokens. <br>
70
+ - Each image tile consumes 256 tokens. We have tested this model extensively with these settings on config.json - `max_input_tiles = 8`, `use_thumbnails = True`, so that every image is split into maximum 8 tiles + 1 thumbnail (whole image at lower resolution). Images must be python PIL format. The model will scale the image into multiple tiles of 512x512.
71
+
72
+
73
+ ## Outputs
74
+
75
+ - **Output Type:** Floats
76
+ - **Output Format:** List of float arrays
77
+ - **Output Parameters:** A list of float arrays with shape [batch_size x seq_length x embedding_dim]
78
+ - **Other Properties Related to Output:** The model outputs one embedding vector per input token.
79
+
80
+ Our AI models are designed and/or optimized to run on NVIDIA GPU-accelerated systems. By leveraging NVIDIA’s hardware (e.g. GPU cores) and software frameworks (e.g., CUDA libraries), the model achieves faster training and inference times compared to CPU-only solutions.
81
+
82
+
83
+ ### Installation
84
+ The model requires transformers version 5.0.0rc0 and flash attention installed.
85
+
86
+ ```bash
87
+ pip install transformers==5.0.0rc0
88
+ pip install flash-attn==2.6.3 --no-build-isolation
89
+ ```
90
+ Depending on your environment you might need to upgrade polars and pydantic:
91
+
92
+ ```bash
93
+ pip install -U datasets polars
94
+ pip install -U pydantic
95
+ ```
96
+
97
+ ### Transformers Usage
98
+
99
+ ```python
100
+ import requests
101
+ from PIL import Image
102
+ from io import BytesIO
103
+ import torch
104
+ from transformers import AutoModel
105
+ from transformers.image_utils import load_image
106
+ # Load Model
107
+
108
+ model = AutoModel.from_pretrained(
109
+ 'nvidia/nemotron-colembed-4b-v2',
110
+ device_map='cuda',
111
+ trust_remote_code=True,
112
+ torch_dtype=torch.bfloat16,
113
+ attn_implementation="flash_attention_2"
114
+ ).eval()
115
+
116
+ # Queries
117
+ queries = [
118
+ 'How is AI improving the intelligence and capabilities of robots?',
119
+ 'Canary, a multilingual model that transcribes speech in English, Spanish, German, and French with punctuation and capitalization.',
120
+ 'Generative AI can generate DNA sequences that can be translated into proteins for bioengineering.'
121
+ ]
122
+
123
+ image_urls = [
124
+ "https://developer.download.nvidia.com/images/isaac/nvidia-isaac-lab-1920x1080.jpg",
125
+ "https://developer-blogs.nvidia.com/wp-content/uploads/2024/03/asr-nemo-canary-featured.jpg",
126
+ "https://blogs.nvidia.com/wp-content/uploads/2023/02/genome-sequencing-helix.jpg"
127
+ ]
128
+
129
+ # Load all images (load_image handles both local paths and URLs)
130
+ images = [load_image(img_path) for img_path in image_urls]
131
+
132
+ # Encoding
133
+ query_embeddings = model.forward_queries(queries, batch_size=8)
134
+ image_embeddings = model.forward_images(images, batch_size=8)
135
+
136
+ scores = model.get_scores(
137
+ query_embeddings,
138
+ image_embeddings
139
+ )
140
+ # Diagonal should have higher scores
141
+ print(scores)
142
+
143
+ # tensor([[21.5332, 21.1848, 20.9185],
144
+ # [32.4948, 33.2485, 32.5982],
145
+ # [26.0623, 26.1014, 26.5692]], device='cuda:0')
146
+ ```
147
+
148
+
149
+ ## Software Integration: <br>
150
+
151
+ Runtime Engine(s): TensorRT, Triton <br>
152
+ Supported Hardware Microarchitecture Compatibility: A100 40GB, A100 80GB, H100 80GB <br>
153
+ Supported Operating System(s): Linux
154
+
155
+ ## Model Version(s)
156
+ **nemotron-colembed-4b-v2**
157
+
158
+ # Training and Evaluation Datasets
159
+
160
+ ## Training Dataset
161
+
162
+ The model was trained on publicly available datasets, including [DocMatix-IR](https://huggingface.co/datasets/Tevatron/docmatix-ir), [VDR](https://huggingface.co/datasets/vdr-multilingual-train), [Vidore-ColPali-Training](https://huggingface.co/datasets/vidore/colpali_train_set), [VisRAG-Ret-Train-Synthetic-data](https://huggingface.co/datasets/openbmb/VisRAG-Ret-Train-Synthetic-data), [VisRAG-Ret-Train-In-domain-data](https://huggingface.co/datasets/openbmb/VisRAG-Ret-Train-In-domain-data), and [Wiki-SS-NQ](https://huggingface.co/datasets/Tevatron/wiki-ss-nq).
163
+
164
+ **Data Modality**: Image
165
+
166
+ **Image Training Data Size**
167
+ - Less than a Million Images
168
+
169
+ **Data Collection Method by dataset:** Hybrid: Automated, Human, Synthetic <br>
170
+ **Labeling Method by dataset:** Hybrid: Automated, Human, Synthetic <br>
171
+ **Properties:** Training: The vision embedding model was fine-tuned on approximately 500k image samples.
172
+
173
+ ## Evaluation Dataset
174
+
175
+ We evaluate the model on the datasets from [ViDoRe](https://huggingface.co/spaces/vidore/README) V1, V2 and V3 Visual Document Retrieval benchmarks.
176
+
177
+
178
+ [ViDoRe](https://huggingface.co/spaces/vidore/README) is a premier benchmark for Visual Document Retrieval and it is composed of various page-level retrieving tasks spanning multiple domains, languages, and settings. The latest version of the benchmark is [Vidore V3](https://huggingface.co/blog/QuentinJG/introducing-vidore-v3), a comprehensive evaluation of retrieval for enterprise use-cases.
179
+
180
+ We provide a [script](https://huggingface.co/nvidia/nemotron-colembed-4b-v2/blob/main/mteb2_eval.py) using [MTEB 2](https://github.com/embeddings-benchmark/mteb/tree/main/mteb) library to evaluate ColEmbed models on ViDoRe benchmarks.
181
+
182
+ - **Data Collection Method by dataset:** Hybrid: Automated, Human, Synthetic
183
+ - **Labeling Method by dataset:** Hybrid: Automated, Human, Synthetic
184
+ - **Properties:** More details on ViDoRe V1 and ViDoRe V2 can be found on their leaderboard. [Visual Document Retrieval Benchmark](https://huggingface.co/vidore),
185
+
186
+
187
+ ## Evaluation Results
188
+
189
+ ### ViDoRE V1&V2 and V3 on MTEB leaderboards
190
+
191
+ ```bash
192
+ pip install "mteb>=2.7.0, <3.0.0"
193
+ # Evaluates with Vidore V1 and V2
194
+ CUDA_VISIBLE_DEVICES=0; python3 mteb2_eval.py --model_name nvidia/nemotron-colembed-4b-v2 --batch_size 16 --benchmark "VisualDocumentRetrieval"
195
+ # Evaluates with Vidore V3
196
+ CUDA_VISIBLE_DEVICES=0; python3 mteb2_eval.py --model_name nvidia/nemotron-colembed-4b-v2 --batch_size 16 --benchmark "ViDoRe(v3)"
197
+ # Evaluates with a specific task/dataset of Vidore V3: Vidore3ComputerScienceRetrieval
198
+ CUDA_VISIBLE_DEVICES=0; python3 mteb2_eval.py --model_name nvidia/nemotron-colembed-4b-v2 --batch_size 16 --benchmark "ViDoRe(v3)" --task-list Vidore3ComputerScienceRetrieval
199
+ ```
200
+
201
+ In this section, we evaluate the performance of nemotron-colembed-4b-v2 against other models that previously achieved top-five rankings on the leaderboards.
202
+
203
+ We report results on the ViDoRe benchmark suite. The tables below summarize the image-modality accuracy of nemotron-colembed-4b-v2 on the ViDoRe V1, V2, and V3 benchmarks, alongside other NVIDIA nemotron-colembed models. Note that (M)MTEB leaderboards use Borda ranking. Each task acts like a voter that ranks models based on how well they perform. Models earn more points when they rank higher on a task. The model with the most total points across all tasks gets the top overall rank.
204
+
205
+
206
+ #### ViDoRe V3 (NDCG@10)
207
+
208
+ | Model | **Avg** | CompSci | Energy | FinanceEn | FinanceFr | HR | Industrial | Pharma | Physics |
209
+ | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- |
210
+ | **nemotron-colembed-8b**| 63.50 | 79.45 | 69.39 | 67.54 | 51.44 | 66.41 | 56.07 | 67.11 | 50.59 |
211
+ | nemotron-colembed-4b | 61.74 | 78.41 | 66.77 | 64.76 | 48.98 | 66.13 | 53.83 | 66.13 | 48.88 |
212
+ | tomoro-colqwen3-8b| 61.60 | 75.35 | 68.41 | 65.08 | 49.10 | 63.98 | 54.41 | 66.36 | 50.13|
213
+ | tomoro-colqwen3-4b| 60.16 | 75.44 | 66.43 | 63.84 | 46.83 | 60.09 | 53.58 | 65.74 | 49.32 |
214
+ | [nemotron-colembed-3b-v2](https://huggingface.co/nvidia/llama-nemotron-colembed-3b-v2) | 59.70 | 77.09 | 64.88 | 64.23 | 44.41 | 62.28 | 51.71 | 66.04 | 46.93 |
215
+ | nomic-ai/colnomic-embed-multimodal-7b | 57.64 | 76.20 | 63.58 | 56.57 | 45.46 | 58.67 | 50.13 | 62.26 | 48.25 |
216
+ | jinaai/jina-embeddings-v4 | 57.54 | 71.81 | 63.50 | 59.30 | 46.10 | 59.53 | 50.38 | 63.09 | 46.63 |
217
+
218
+
219
+
220
+ #### ViDoRe V2 (NDCG@10)
221
+
222
+ | Model | **Avg** | BioMedicalLectures | ESGReportsHL | ESGReports | EconomicsReports |
223
+ | :--- | :--- | :--- | :--- | :--- | :--- |
224
+ | **nemotron-colembed-8b** | 67.35 | 68.48 | 73.43 | 67.93 | 59.54 |
225
+ | nemotron-colembed-4b | 66.92 | 67.84 | 73.00 | 67.76 | 59.08 |
226
+ | [nemotron-colembed-3b-v2](https://huggingface.co/nvidia/llama-nemotron-colembed-3b-v2)| 66.75 | 68.23 | 77.91 | 64.71 | 56.15 |
227
+ | tomoro-colqwen3-8b| 65.40 | 65.47 | 75.98 | 60.71 | 59.46 |
228
+ | EvoQwen2.5-VL-Retriever-7B-v1 | 65.24 | 65.20 | 76.98 | 59.67 | 59.13 |
229
+ | tomoro-colqwen3-4b| 64.69 | 65.38 | 74.65 | 62.44 | 56.30 |
230
+ | [nemotron-colembed-3b-v1](https://huggingface.co/nvidia/llama-nemoretriever-colembed-3b-v1)| 63.32 | 62.70 | 75.38 | 57.38 | 57.84 |
231
+
232
+
233
+
234
+ #### ViDoRe V1 (NDCG@10)
235
+
236
+ | Model | **Avg** | ArxivQA | DocVQA | InfoVQA | Shift | Syn-AI | Syn-Energy | Syn-Gov | Syn-Health | TabFQuAD | Tatdqa |
237
+ | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- |
238
+ | [nemotron-colembed-3b-v2](https://huggingface.co/nvidia/llama-nemotron-colembed-3b-v2)| 92.16 | 91.12 | 68.46 | 94.86 | 92.30 | 100.00 | 98.02 | 97.95 | 98.89 | 97.48 | 82.49 |
239
+ | nemotron-colembed-8b | 92.06 | 92.23 | 68.61 | 94.11 | 92.66 | 100.00 | 96.26 | 98.02 | 97.79 | 98.03 | 82.90 |
240
+ | nemotron-colembed-4b | 92.01 | 92.56 | 69.18 | 93.89 | 91.88 | 99.26 | 96.19 | 98.39 | 98.16 | 98.16 | 82.46 |
241
+ | [nemotron-colembed-3b-v1](https://huggingface.co/nvidia/llama-nemoretriever-colembed-3b-v1)| 91.00 | 88.35 | 66.21 | 94.92 | 90.70 | 99.63 | 96.63 | 97.82 | 99.26 | 95.94 | 80.57 |
242
+ | tomoro-colqwen3-8b| 90.76 | 91.15 | 66.37 | 94.48 | 87.89 | 99.26 | 96.71 | 97.58 | 99.06 | 94.23 | 80.92 |
243
+ | EvoQwen2.5-VL-Retriever-7B-v1 | 90.68 | 91.49 | 65.07 | 94.11 | 88.80 | 99.63 | 96.63 | 96.29 | 98.89 | 93.63 | 82.26 |
244
+ | tomoro-colqwen3-4b| 90.57 | 90.58 | 66.30 | 94.31 | 87.39 | 99.26 | 96.91 | 97.17 | 99.63 | 94.33 | 79.87 |
245
+
246
+
247
+ ## Inference:
248
+ **Acceleration Engine:** Not Applicable <br>
249
+ **Test Hardware:** A100 40GB, A100 80GB, H100 80GB
250
+
251
+
252
+
253
+ ### Citation
254
+
255
+ ```
256
+ @misc{xu2025llamanemoretrievercolembedtopperforming,
257
+ title={Llama Nemoretriever Colembed: Top-Performing Text-Image Retrieval Model},
258
+ author={Mengyao Xu and Gabriel Moreira and Ronay Ak and Radek Osmulski and Yauhen Babakhin and Zhiding Yu and Benedikt Schifferer and Even Oldridge},
259
+ year={2025},
260
+ eprint={2507.05513},
261
+ archivePrefix={arXiv},
262
+ primaryClass={cs.CV},
263
+ url={https://arxiv.org/abs/2507.05513},
264
+ }
265
+
266
+ @misc{moreira2025nvretrieverimprovingtextembedding,
267
+ title={NV-Retriever: Improving text embedding models with effective hard-negative mining},
268
+ author={Gabriel de Souza P. Moreira and Radek Osmulski and Mengyao Xu and Ronay Ak and Benedikt Schifferer and Even Oldridge},
269
+ year={2025},
270
+ eprint={2407.15831},
271
+ archivePrefix={arXiv},
272
+ primaryClass={cs.IR},
273
+ url={https://arxiv.org/abs/2407.15831},
274
+ }
275
+
276
+ @article{Qwen3-VL,
277
+ title={Qwen3-VL Technical Report},
278
+ author={Shuai Bai and Yuxuan Cai and Ruizhe Chen and Keqin Chen and Xionghui Chen and Zesen Cheng and Lianghao Deng and Wei Ding and Chang Gao and Chunjiang Ge and Wenbin Ge and Zhifang Guo and Qidong Huang and Jie Huang and Fei Huang and Binyuan Hui and Shutong Jiang and Zhaohai Li and Mingsheng Li and Mei Li and Kaixin Li and Zicheng Lin and Junyang Lin and Xuejing Liu and Jiawei Liu and Chenglong Liu and Yang Liu and Dayiheng Liu and Shixuan Liu and Dunjie Lu and Ruilin Luo and Chenxu Lv and Rui Men and Lingchen Meng and Xuancheng Ren and Xingzhang Ren and Sibo Song and Yuchong Sun and Jun Tang and Jianhong Tu and Jianqiang Wan and Peng Wang and Pengfei Wang and Qiuyue Wang and Yuxuan Wang and Tianbao Xie and Yiheng Xu and Haiyang Xu and Jin Xu and Zhibo Yang and Mingkun Yang and Jianxin Yang and An Yang and Bowen Yu and Fei Zhang and Hang Zhang and Xi Zhang and Bo Zheng and Humen Zhong and Jingren Zhou and Fan Zhou and Jing Zhou and Yuanzhi Zhu and Ke Zhu},
279
+ journal={arXiv preprint arXiv:2511.21631},
280
+ year={2025}
281
+ }
282
+ ```
283
+
284
+ ## **Ethical Considerations**
285
+
286
+ NVIDIA believes Trustworthy AI is a shared responsibility and we have established policies and practices to enable development for a wide array of AI applications. When downloaded or used in accordance with our terms of service, developers should work with their supporting model team to ensure this model meets requirements for the relevant industry and use case and addresses unforeseen product misuse.
287
+
288
+ For more detailed information on ethical considerations for this model, please see the Explainability, Bias, Safety, and Privacy sections.
289
+
290
+ Please report security vulnerabilities or NVIDIA AI Concerns [here](https://www.nvidia.com/en-us/support/submit-security-vulnerability/).
291
+
292
+
293
+ ## Bias
294
+
295
+ | Field | Response |
296
+ | ----- | ----- |
297
+ | Participation considerations from adversely impacted groups [protected classes](https://www.senate.ca.gov/content/protected-classes) in model design and testing | None |
298
+ | Measures taken to mitigate against unwanted bias | None |
299
+
300
+ ## Explainability
301
+
302
+ | Field | Response |
303
+ | ----- | ----- |
304
+ | Intended Application & Domain: | Document and query embedding for question and answer retrieval.|
305
+ | Model Type: | Transformer encoder. |
306
+ | Intended User: | Generative AI creators working with conversational AI models. Users who want to build a question and answer application over a large corpus, leveraging the latest dense retrieval technologies. The corpus can be images of PDFs, such as text, tables, charts or infographics, or extracted plain text. |
307
+ | Output: | Array of float numbers (Dense Vector Representation for the input text). |
308
+ | Describe how the model works: | Model transforms the input into a dense vector representation. |
309
+ | Technical Limitations: | The model's max sequence length is 10240. Longer text inputs should be truncated. |
310
+ | Name the adversely impacted groups this has been tested to deliver comparable outcomes regardless of: | N/A |
311
+ | Verified to have met prescribed NVIDIA quality standards: | Yes |
312
+ | Performance Metrics: | Accuracy, Throughput, and Latency. |
313
+ | Potential Known Risks: | This model does not guarantee to always retrieve the correct passage(s) for a given query. |
314
+ | Licensing & Terms of Use: | The use of this model is governed by the [NVIDIA Non-Commercial License](https://huggingface.co/nvidia/nemotron-colembed-4b-v2/blob/main/LICENSE) and the use of the post-processing scripts are licensed under [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0.txt). Additional Information: Built with Qwen3-VL which is released under [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0.txt).|
315
+
316
+ ## Privacy
317
+
318
+ | Field | Response |
319
+ | ----- | ----- |
320
+ | Generatable or reverse engineerable personal data? | No |
321
+ | Personal data used to create this model? | None Known |
322
+ | How often is dataset reviewed? | Dataset is initially reviewed upon addition, and subsequent reviews are conducted as needed or upon request for changes. |
323
+ | Is there provenance for all datasets used in training? | Yes |
324
+ | Does data labeling (annotation, metadata) comply with privacy laws? | Yes |
325
+ | Is data compliant with data subject requests for data correction or removal, if such a request was made? | No, not possible with externally-sourced data. |
326
+ | Was data from user interactions with the AI model (e.g. user input and prompts) used to train the model? | No |
327
+ | Was consent obtained for any personal data used? | Not Applicable |
328
+ | Applicable Privacy Policy | https://www.nvidia.com/en-us/about-nvidia/privacy-policy/ |
329
+
330
+ ## Safety
331
+
332
+ | Field | Response |
333
+ | ----- | ----- |
334
+ | Model Application(s): | Document Embedding for Retrieval. User queries can be text and documents can be text, document page images, charts, tables, and infographics. |
335
+ | Describe the physical safety impact (if present). | Not Applicable |
336
+ | Use Case Restrictions: | The use of this model is governed by the [NVIDIA Non-Commercial License](https://huggingface.co/nvidia/nemotron-colembed-4b-v2/blob/main/LICENSE) and the use of the post-processing scripts are licensed under [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0.txt). Additional Information: Built with Qwen3-VL which is released under [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0.txt).|
337
+ | Model and dataset restrictions: | The Principle of least privilege (PoLP) is applied limiting access for dataset generation and model development. Restrictions enforce dataset access during training, and dataset license constraints adhered to. |
chat_template.jinja ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {%- if messages[0].content is string %}
5
+ {{- messages[0].content }}
6
+ {%- else %}
7
+ {%- for content in messages[0].content %}
8
+ {%- if 'text' in content %}
9
+ {{- content.text }}
10
+ {%- endif %}
11
+ {%- endfor %}
12
+ {%- endif %}
13
+ {{- '\n\n' }}
14
+ {%- endif %}
15
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
16
+ {%- for tool in tools %}
17
+ {{- "\n" }}
18
+ {{- tool | tojson }}
19
+ {%- endfor %}
20
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
21
+ {%- else %}
22
+ {%- if messages[0].role == 'system' %}
23
+ {{- '<|im_start|>system\n' }}
24
+ {%- if messages[0].content is string %}
25
+ {{- messages[0].content }}
26
+ {%- else %}
27
+ {%- for content in messages[0].content %}
28
+ {%- if 'text' in content %}
29
+ {{- content.text }}
30
+ {%- endif %}
31
+ {%- endfor %}
32
+ {%- endif %}
33
+ {{- '<|im_end|>\n' }}
34
+ {%- endif %}
35
+ {%- endif %}
36
+ {%- set image_count = namespace(value=0) %}
37
+ {%- set video_count = namespace(value=0) %}
38
+ {%- for message in messages %}
39
+ {%- if message.role == "user" %}
40
+ {{- '<|im_start|>' + message.role + '\n' }}
41
+ {%- if message.content is string %}
42
+ {{- message.content }}
43
+ {%- else %}
44
+ {%- for content in message.content %}
45
+ {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}
46
+ {%- set image_count.value = image_count.value + 1 %}
47
+ {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}
48
+ <|vision_start|><|image_pad|><|vision_end|>
49
+ {%- elif content.type == 'video' or 'video' in content %}
50
+ {%- set video_count.value = video_count.value + 1 %}
51
+ {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}
52
+ <|vision_start|><|video_pad|><|vision_end|>
53
+ {%- elif 'text' in content %}
54
+ {{- content.text }}
55
+ {%- endif %}
56
+ {%- endfor %}
57
+ {%- endif %}
58
+ {{- '<|im_end|>\n' }}
59
+ {%- elif message.role == "assistant" %}
60
+ {{- '<|im_start|>' + message.role + '\n' }}
61
+ {%- if message.content is string %}
62
+ {{- message.content }}
63
+ {%- else %}
64
+ {%- for content_item in message.content %}
65
+ {%- if 'text' in content_item %}
66
+ {{- content_item.text }}
67
+ {%- endif %}
68
+ {%- endfor %}
69
+ {%- endif %}
70
+ {%- if message.tool_calls %}
71
+ {%- for tool_call in message.tool_calls %}
72
+ {%- if (loop.first and message.content) or (not loop.first) %}
73
+ {{- '\n' }}
74
+ {%- endif %}
75
+ {%- if tool_call.function %}
76
+ {%- set tool_call = tool_call.function %}
77
+ {%- endif %}
78
+ {{- '<tool_call>\n{"name": "' }}
79
+ {{- tool_call.name }}
80
+ {{- '", "arguments": ' }}
81
+ {%- if tool_call.arguments is string %}
82
+ {{- tool_call.arguments }}
83
+ {%- else %}
84
+ {{- tool_call.arguments | tojson }}
85
+ {%- endif %}
86
+ {{- '}\n</tool_call>' }}
87
+ {%- endfor %}
88
+ {%- endif %}
89
+ {{- '<|im_end|>\n' }}
90
+ {%- elif message.role == "tool" %}
91
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
92
+ {{- '<|im_start|>user' }}
93
+ {%- endif %}
94
+ {{- '\n<tool_response>\n' }}
95
+ {%- if message.content is string %}
96
+ {{- message.content }}
97
+ {%- else %}
98
+ {%- for content in message.content %}
99
+ {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}
100
+ {%- set image_count.value = image_count.value + 1 %}
101
+ {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}
102
+ <|vision_start|><|image_pad|><|vision_end|>
103
+ {%- elif content.type == 'video' or 'video' in content %}
104
+ {%- set video_count.value = video_count.value + 1 %}
105
+ {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}
106
+ <|vision_start|><|video_pad|><|vision_end|>
107
+ {%- elif 'text' in content %}
108
+ {{- content.text }}
109
+ {%- endif %}
110
+ {%- endfor %}
111
+ {%- endif %}
112
+ {{- '\n</tool_response>' }}
113
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
114
+ {{- '<|im_end|>\n' }}
115
+ {%- endif %}
116
+ {%- endif %}
117
+ {%- endfor %}
118
+ {%- if add_generation_prompt %}
119
+ {{- '<|im_start|>assistant\n' }}
120
+ {%- endif %}
config.json ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3VLNemotronEmbedModel"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "modeling_qwen3_vl_nemotron_embed.Qwen3VLNemotronEmbedConfig",
7
+ "AutoModel": "modeling_qwen3_vl_nemotron_embed.Qwen3VLNemotronEmbedModel",
8
+ "AutoProcessor": "processing_qwen3_vl_nemotron_embed.Qwen3VLNemotronEmbedProcessor"
9
+ },
10
+ "dtype": "bfloat16",
11
+ "image_token_id": 151655,
12
+ "model_type": "qwen3_vl_nemotron_embed",
13
+ "pooling": "colbert",
14
+ "text_config": {
15
+ "attention_bias": false,
16
+ "attention_dropout": 0.0,
17
+ "bos_token_id": 151643,
18
+ "dtype": "bfloat16",
19
+ "eos_token_id": 151645,
20
+ "head_dim": 128,
21
+ "hidden_act": "silu",
22
+ "hidden_size": 2560,
23
+ "initializer_range": 0.02,
24
+ "intermediate_size": 9728,
25
+ "max_position_embeddings": 262144,
26
+ "model_type": "qwen3_vl_text",
27
+ "num_attention_heads": 32,
28
+ "num_hidden_layers": 36,
29
+ "num_key_value_heads": 8,
30
+ "rms_norm_eps": 1e-06,
31
+ "rope_parameters": {
32
+ "mrope_interleaved": true,
33
+ "mrope_section": [
34
+ 24,
35
+ 20,
36
+ 20
37
+ ],
38
+ "rope_theta": 5000000,
39
+ "rope_type": "default"
40
+ },
41
+ "tie_word_embeddings": true,
42
+ "use_cache": true,
43
+ "vocab_size": 151936
44
+ },
45
+ "tie_word_embeddings": true,
46
+ "transformers_version": "5.0.0rc0",
47
+ "use_cache": false,
48
+ "video_token_id": 151656,
49
+ "vision_config": {
50
+ "deepstack_visual_indexes": [
51
+ 5,
52
+ 11,
53
+ 17
54
+ ],
55
+ "depth": 24,
56
+ "dtype": "bfloat16",
57
+ "hidden_act": "gelu_pytorch_tanh",
58
+ "hidden_size": 1024,
59
+ "in_channels": 3,
60
+ "initializer_range": 0.02,
61
+ "intermediate_size": 4096,
62
+ "model_type": "qwen3_vl",
63
+ "num_heads": 16,
64
+ "num_position_embeddings": 2304,
65
+ "out_hidden_size": 2560,
66
+ "patch_size": 16,
67
+ "spatial_merge_size": 2,
68
+ "temporal_patch_size": 2
69
+ },
70
+ "vision_end_token_id": 151653,
71
+ "vision_start_token_id": 151652
72
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 151643,
4
+ "eos_token_id": 151645,
5
+ "transformers_version": "5.0.0rc0",
6
+ "use_cache": false
7
+ }
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6223655cd16c06e3e48f3d7a3089c4d677990b336acea814a3ba3c6400da7a2c
3
+ size 4990497880
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1bfa21ee16dafffd08a7fdf904cea5c2f503a47660bbcb08ade8fe7f42aa7719
3
+ size 4663133960
model.safetensors.index.json ADDED
@@ -0,0 +1,722 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 4826771968,
4
+ "total_size": 9653543936
5
+ },
6
+ "weight_map": {
7
+ "lm_head.weight": "model-00002-of-00002.safetensors",
8
+ "model.language_model.embed_tokens.weight": "model-00001-of-00002.safetensors",
9
+ "model.language_model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
10
+ "model.language_model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
11
+ "model.language_model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
12
+ "model.language_model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
13
+ "model.language_model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
14
+ "model.language_model.layers.0.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
15
+ "model.language_model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
16
+ "model.language_model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
17
+ "model.language_model.layers.0.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
18
+ "model.language_model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
19
+ "model.language_model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
20
+ "model.language_model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
21
+ "model.language_model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
22
+ "model.language_model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
23
+ "model.language_model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
24
+ "model.language_model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
25
+ "model.language_model.layers.1.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
26
+ "model.language_model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
27
+ "model.language_model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
28
+ "model.language_model.layers.1.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
29
+ "model.language_model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
30
+ "model.language_model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
31
+ "model.language_model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
32
+ "model.language_model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
33
+ "model.language_model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
34
+ "model.language_model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
35
+ "model.language_model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
36
+ "model.language_model.layers.10.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
37
+ "model.language_model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
38
+ "model.language_model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
39
+ "model.language_model.layers.10.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
40
+ "model.language_model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
41
+ "model.language_model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
42
+ "model.language_model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
43
+ "model.language_model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
44
+ "model.language_model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
45
+ "model.language_model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
46
+ "model.language_model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
47
+ "model.language_model.layers.11.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
48
+ "model.language_model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
49
+ "model.language_model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
50
+ "model.language_model.layers.11.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
51
+ "model.language_model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
52
+ "model.language_model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
53
+ "model.language_model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
54
+ "model.language_model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
55
+ "model.language_model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
56
+ "model.language_model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
57
+ "model.language_model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
58
+ "model.language_model.layers.12.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
59
+ "model.language_model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
60
+ "model.language_model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
61
+ "model.language_model.layers.12.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
62
+ "model.language_model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
63
+ "model.language_model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
64
+ "model.language_model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
65
+ "model.language_model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
66
+ "model.language_model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
67
+ "model.language_model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
68
+ "model.language_model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
69
+ "model.language_model.layers.13.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
70
+ "model.language_model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
71
+ "model.language_model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
72
+ "model.language_model.layers.13.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
73
+ "model.language_model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
74
+ "model.language_model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
75
+ "model.language_model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
76
+ "model.language_model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
77
+ "model.language_model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
78
+ "model.language_model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
79
+ "model.language_model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
80
+ "model.language_model.layers.14.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
81
+ "model.language_model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
82
+ "model.language_model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
83
+ "model.language_model.layers.14.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
84
+ "model.language_model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
85
+ "model.language_model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
86
+ "model.language_model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
87
+ "model.language_model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
88
+ "model.language_model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
89
+ "model.language_model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
90
+ "model.language_model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
91
+ "model.language_model.layers.15.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
92
+ "model.language_model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
93
+ "model.language_model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
94
+ "model.language_model.layers.15.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
95
+ "model.language_model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
96
+ "model.language_model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
97
+ "model.language_model.layers.16.input_layernorm.weight": "model-00002-of-00002.safetensors",
98
+ "model.language_model.layers.16.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
99
+ "model.language_model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
100
+ "model.language_model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
101
+ "model.language_model.layers.16.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
102
+ "model.language_model.layers.16.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
103
+ "model.language_model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
104
+ "model.language_model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
105
+ "model.language_model.layers.16.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
106
+ "model.language_model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
107
+ "model.language_model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
108
+ "model.language_model.layers.17.input_layernorm.weight": "model-00002-of-00002.safetensors",
109
+ "model.language_model.layers.17.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
110
+ "model.language_model.layers.17.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
111
+ "model.language_model.layers.17.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
112
+ "model.language_model.layers.17.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
113
+ "model.language_model.layers.17.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
114
+ "model.language_model.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
115
+ "model.language_model.layers.17.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
116
+ "model.language_model.layers.17.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
117
+ "model.language_model.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
118
+ "model.language_model.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
119
+ "model.language_model.layers.18.input_layernorm.weight": "model-00002-of-00002.safetensors",
120
+ "model.language_model.layers.18.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
121
+ "model.language_model.layers.18.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
122
+ "model.language_model.layers.18.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
123
+ "model.language_model.layers.18.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
124
+ "model.language_model.layers.18.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
125
+ "model.language_model.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
126
+ "model.language_model.layers.18.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
127
+ "model.language_model.layers.18.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
128
+ "model.language_model.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
129
+ "model.language_model.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
130
+ "model.language_model.layers.19.input_layernorm.weight": "model-00002-of-00002.safetensors",
131
+ "model.language_model.layers.19.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
132
+ "model.language_model.layers.19.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
133
+ "model.language_model.layers.19.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
134
+ "model.language_model.layers.19.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
135
+ "model.language_model.layers.19.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
136
+ "model.language_model.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
137
+ "model.language_model.layers.19.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
138
+ "model.language_model.layers.19.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
139
+ "model.language_model.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
140
+ "model.language_model.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
141
+ "model.language_model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
142
+ "model.language_model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
143
+ "model.language_model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
144
+ "model.language_model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
145
+ "model.language_model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
146
+ "model.language_model.layers.2.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
147
+ "model.language_model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
148
+ "model.language_model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
149
+ "model.language_model.layers.2.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
150
+ "model.language_model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
151
+ "model.language_model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
152
+ "model.language_model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors",
153
+ "model.language_model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
154
+ "model.language_model.layers.20.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
155
+ "model.language_model.layers.20.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
156
+ "model.language_model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
157
+ "model.language_model.layers.20.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
158
+ "model.language_model.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
159
+ "model.language_model.layers.20.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
160
+ "model.language_model.layers.20.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
161
+ "model.language_model.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
162
+ "model.language_model.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
163
+ "model.language_model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
164
+ "model.language_model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
165
+ "model.language_model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
166
+ "model.language_model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
167
+ "model.language_model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
168
+ "model.language_model.layers.21.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
169
+ "model.language_model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
170
+ "model.language_model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
171
+ "model.language_model.layers.21.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
172
+ "model.language_model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
173
+ "model.language_model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
174
+ "model.language_model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
175
+ "model.language_model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
176
+ "model.language_model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
177
+ "model.language_model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
178
+ "model.language_model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
179
+ "model.language_model.layers.22.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
180
+ "model.language_model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
181
+ "model.language_model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
182
+ "model.language_model.layers.22.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
183
+ "model.language_model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
184
+ "model.language_model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
185
+ "model.language_model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
186
+ "model.language_model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
187
+ "model.language_model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
188
+ "model.language_model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
189
+ "model.language_model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
190
+ "model.language_model.layers.23.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
191
+ "model.language_model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
192
+ "model.language_model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
193
+ "model.language_model.layers.23.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
194
+ "model.language_model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
195
+ "model.language_model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
196
+ "model.language_model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
197
+ "model.language_model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
198
+ "model.language_model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
199
+ "model.language_model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
200
+ "model.language_model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
201
+ "model.language_model.layers.24.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
202
+ "model.language_model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
203
+ "model.language_model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
204
+ "model.language_model.layers.24.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
205
+ "model.language_model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
206
+ "model.language_model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
207
+ "model.language_model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
208
+ "model.language_model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
209
+ "model.language_model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
210
+ "model.language_model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
211
+ "model.language_model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
212
+ "model.language_model.layers.25.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
213
+ "model.language_model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
214
+ "model.language_model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
215
+ "model.language_model.layers.25.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
216
+ "model.language_model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
217
+ "model.language_model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
218
+ "model.language_model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
219
+ "model.language_model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
220
+ "model.language_model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
221
+ "model.language_model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
222
+ "model.language_model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
223
+ "model.language_model.layers.26.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
224
+ "model.language_model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
225
+ "model.language_model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
226
+ "model.language_model.layers.26.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
227
+ "model.language_model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
228
+ "model.language_model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
229
+ "model.language_model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
230
+ "model.language_model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
231
+ "model.language_model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
232
+ "model.language_model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
233
+ "model.language_model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
234
+ "model.language_model.layers.27.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
235
+ "model.language_model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
236
+ "model.language_model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
237
+ "model.language_model.layers.27.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
238
+ "model.language_model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
239
+ "model.language_model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
240
+ "model.language_model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors",
241
+ "model.language_model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
242
+ "model.language_model.layers.28.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
243
+ "model.language_model.layers.28.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
244
+ "model.language_model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
245
+ "model.language_model.layers.28.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
246
+ "model.language_model.layers.28.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
247
+ "model.language_model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
248
+ "model.language_model.layers.28.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
249
+ "model.language_model.layers.28.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
250
+ "model.language_model.layers.28.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
251
+ "model.language_model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors",
252
+ "model.language_model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
253
+ "model.language_model.layers.29.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
254
+ "model.language_model.layers.29.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
255
+ "model.language_model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
256
+ "model.language_model.layers.29.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
257
+ "model.language_model.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
258
+ "model.language_model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
259
+ "model.language_model.layers.29.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
260
+ "model.language_model.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
261
+ "model.language_model.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
262
+ "model.language_model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
263
+ "model.language_model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
264
+ "model.language_model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
265
+ "model.language_model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
266
+ "model.language_model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
267
+ "model.language_model.layers.3.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
268
+ "model.language_model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
269
+ "model.language_model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
270
+ "model.language_model.layers.3.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
271
+ "model.language_model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
272
+ "model.language_model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
273
+ "model.language_model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
274
+ "model.language_model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
275
+ "model.language_model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
276
+ "model.language_model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
277
+ "model.language_model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
278
+ "model.language_model.layers.30.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
279
+ "model.language_model.layers.30.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
280
+ "model.language_model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
281
+ "model.language_model.layers.30.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
282
+ "model.language_model.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
283
+ "model.language_model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
284
+ "model.language_model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
285
+ "model.language_model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
286
+ "model.language_model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
287
+ "model.language_model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
288
+ "model.language_model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
289
+ "model.language_model.layers.31.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
290
+ "model.language_model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
291
+ "model.language_model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
292
+ "model.language_model.layers.31.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
293
+ "model.language_model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
294
+ "model.language_model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
295
+ "model.language_model.layers.32.input_layernorm.weight": "model-00002-of-00002.safetensors",
296
+ "model.language_model.layers.32.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
297
+ "model.language_model.layers.32.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
298
+ "model.language_model.layers.32.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
299
+ "model.language_model.layers.32.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
300
+ "model.language_model.layers.32.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
301
+ "model.language_model.layers.32.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
302
+ "model.language_model.layers.32.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
303
+ "model.language_model.layers.32.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
304
+ "model.language_model.layers.32.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
305
+ "model.language_model.layers.32.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
306
+ "model.language_model.layers.33.input_layernorm.weight": "model-00002-of-00002.safetensors",
307
+ "model.language_model.layers.33.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
308
+ "model.language_model.layers.33.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
309
+ "model.language_model.layers.33.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
310
+ "model.language_model.layers.33.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
311
+ "model.language_model.layers.33.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
312
+ "model.language_model.layers.33.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
313
+ "model.language_model.layers.33.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
314
+ "model.language_model.layers.33.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
315
+ "model.language_model.layers.33.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
316
+ "model.language_model.layers.33.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
317
+ "model.language_model.layers.34.input_layernorm.weight": "model-00002-of-00002.safetensors",
318
+ "model.language_model.layers.34.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
319
+ "model.language_model.layers.34.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
320
+ "model.language_model.layers.34.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
321
+ "model.language_model.layers.34.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
322
+ "model.language_model.layers.34.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
323
+ "model.language_model.layers.34.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
324
+ "model.language_model.layers.34.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
325
+ "model.language_model.layers.34.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
326
+ "model.language_model.layers.34.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
327
+ "model.language_model.layers.34.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
328
+ "model.language_model.layers.35.input_layernorm.weight": "model-00002-of-00002.safetensors",
329
+ "model.language_model.layers.35.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
330
+ "model.language_model.layers.35.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
331
+ "model.language_model.layers.35.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
332
+ "model.language_model.layers.35.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
333
+ "model.language_model.layers.35.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
334
+ "model.language_model.layers.35.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
335
+ "model.language_model.layers.35.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
336
+ "model.language_model.layers.35.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
337
+ "model.language_model.layers.35.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
338
+ "model.language_model.layers.35.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
339
+ "model.language_model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
340
+ "model.language_model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
341
+ "model.language_model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
342
+ "model.language_model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
343
+ "model.language_model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
344
+ "model.language_model.layers.4.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
345
+ "model.language_model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
346
+ "model.language_model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
347
+ "model.language_model.layers.4.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
348
+ "model.language_model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
349
+ "model.language_model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
350
+ "model.language_model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
351
+ "model.language_model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
352
+ "model.language_model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
353
+ "model.language_model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
354
+ "model.language_model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
355
+ "model.language_model.layers.5.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
356
+ "model.language_model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
357
+ "model.language_model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
358
+ "model.language_model.layers.5.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
359
+ "model.language_model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
360
+ "model.language_model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
361
+ "model.language_model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
362
+ "model.language_model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
363
+ "model.language_model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
364
+ "model.language_model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
365
+ "model.language_model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
366
+ "model.language_model.layers.6.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
367
+ "model.language_model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
368
+ "model.language_model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
369
+ "model.language_model.layers.6.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
370
+ "model.language_model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
371
+ "model.language_model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
372
+ "model.language_model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
373
+ "model.language_model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
374
+ "model.language_model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
375
+ "model.language_model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
376
+ "model.language_model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
377
+ "model.language_model.layers.7.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
378
+ "model.language_model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
379
+ "model.language_model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
380
+ "model.language_model.layers.7.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
381
+ "model.language_model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
382
+ "model.language_model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
383
+ "model.language_model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
384
+ "model.language_model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
385
+ "model.language_model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
386
+ "model.language_model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
387
+ "model.language_model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
388
+ "model.language_model.layers.8.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
389
+ "model.language_model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
390
+ "model.language_model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
391
+ "model.language_model.layers.8.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
392
+ "model.language_model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
393
+ "model.language_model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
394
+ "model.language_model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
395
+ "model.language_model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
396
+ "model.language_model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
397
+ "model.language_model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
398
+ "model.language_model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
399
+ "model.language_model.layers.9.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
400
+ "model.language_model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
401
+ "model.language_model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
402
+ "model.language_model.layers.9.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
403
+ "model.language_model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
404
+ "model.language_model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
405
+ "model.language_model.norm.weight": "model-00002-of-00002.safetensors",
406
+ "model.visual.blocks.0.attn.proj.bias": "model-00001-of-00002.safetensors",
407
+ "model.visual.blocks.0.attn.proj.weight": "model-00001-of-00002.safetensors",
408
+ "model.visual.blocks.0.attn.qkv.bias": "model-00001-of-00002.safetensors",
409
+ "model.visual.blocks.0.attn.qkv.weight": "model-00001-of-00002.safetensors",
410
+ "model.visual.blocks.0.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
411
+ "model.visual.blocks.0.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
412
+ "model.visual.blocks.0.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
413
+ "model.visual.blocks.0.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
414
+ "model.visual.blocks.0.norm1.bias": "model-00001-of-00002.safetensors",
415
+ "model.visual.blocks.0.norm1.weight": "model-00001-of-00002.safetensors",
416
+ "model.visual.blocks.0.norm2.bias": "model-00001-of-00002.safetensors",
417
+ "model.visual.blocks.0.norm2.weight": "model-00001-of-00002.safetensors",
418
+ "model.visual.blocks.1.attn.proj.bias": "model-00001-of-00002.safetensors",
419
+ "model.visual.blocks.1.attn.proj.weight": "model-00001-of-00002.safetensors",
420
+ "model.visual.blocks.1.attn.qkv.bias": "model-00001-of-00002.safetensors",
421
+ "model.visual.blocks.1.attn.qkv.weight": "model-00001-of-00002.safetensors",
422
+ "model.visual.blocks.1.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
423
+ "model.visual.blocks.1.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
424
+ "model.visual.blocks.1.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
425
+ "model.visual.blocks.1.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
426
+ "model.visual.blocks.1.norm1.bias": "model-00001-of-00002.safetensors",
427
+ "model.visual.blocks.1.norm1.weight": "model-00001-of-00002.safetensors",
428
+ "model.visual.blocks.1.norm2.bias": "model-00001-of-00002.safetensors",
429
+ "model.visual.blocks.1.norm2.weight": "model-00001-of-00002.safetensors",
430
+ "model.visual.blocks.10.attn.proj.bias": "model-00001-of-00002.safetensors",
431
+ "model.visual.blocks.10.attn.proj.weight": "model-00001-of-00002.safetensors",
432
+ "model.visual.blocks.10.attn.qkv.bias": "model-00001-of-00002.safetensors",
433
+ "model.visual.blocks.10.attn.qkv.weight": "model-00001-of-00002.safetensors",
434
+ "model.visual.blocks.10.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
435
+ "model.visual.blocks.10.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
436
+ "model.visual.blocks.10.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
437
+ "model.visual.blocks.10.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
438
+ "model.visual.blocks.10.norm1.bias": "model-00001-of-00002.safetensors",
439
+ "model.visual.blocks.10.norm1.weight": "model-00001-of-00002.safetensors",
440
+ "model.visual.blocks.10.norm2.bias": "model-00001-of-00002.safetensors",
441
+ "model.visual.blocks.10.norm2.weight": "model-00001-of-00002.safetensors",
442
+ "model.visual.blocks.11.attn.proj.bias": "model-00001-of-00002.safetensors",
443
+ "model.visual.blocks.11.attn.proj.weight": "model-00001-of-00002.safetensors",
444
+ "model.visual.blocks.11.attn.qkv.bias": "model-00001-of-00002.safetensors",
445
+ "model.visual.blocks.11.attn.qkv.weight": "model-00001-of-00002.safetensors",
446
+ "model.visual.blocks.11.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
447
+ "model.visual.blocks.11.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
448
+ "model.visual.blocks.11.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
449
+ "model.visual.blocks.11.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
450
+ "model.visual.blocks.11.norm1.bias": "model-00001-of-00002.safetensors",
451
+ "model.visual.blocks.11.norm1.weight": "model-00001-of-00002.safetensors",
452
+ "model.visual.blocks.11.norm2.bias": "model-00001-of-00002.safetensors",
453
+ "model.visual.blocks.11.norm2.weight": "model-00001-of-00002.safetensors",
454
+ "model.visual.blocks.12.attn.proj.bias": "model-00001-of-00002.safetensors",
455
+ "model.visual.blocks.12.attn.proj.weight": "model-00001-of-00002.safetensors",
456
+ "model.visual.blocks.12.attn.qkv.bias": "model-00001-of-00002.safetensors",
457
+ "model.visual.blocks.12.attn.qkv.weight": "model-00001-of-00002.safetensors",
458
+ "model.visual.blocks.12.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
459
+ "model.visual.blocks.12.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
460
+ "model.visual.blocks.12.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
461
+ "model.visual.blocks.12.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
462
+ "model.visual.blocks.12.norm1.bias": "model-00001-of-00002.safetensors",
463
+ "model.visual.blocks.12.norm1.weight": "model-00001-of-00002.safetensors",
464
+ "model.visual.blocks.12.norm2.bias": "model-00001-of-00002.safetensors",
465
+ "model.visual.blocks.12.norm2.weight": "model-00001-of-00002.safetensors",
466
+ "model.visual.blocks.13.attn.proj.bias": "model-00001-of-00002.safetensors",
467
+ "model.visual.blocks.13.attn.proj.weight": "model-00001-of-00002.safetensors",
468
+ "model.visual.blocks.13.attn.qkv.bias": "model-00001-of-00002.safetensors",
469
+ "model.visual.blocks.13.attn.qkv.weight": "model-00001-of-00002.safetensors",
470
+ "model.visual.blocks.13.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
471
+ "model.visual.blocks.13.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
472
+ "model.visual.blocks.13.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
473
+ "model.visual.blocks.13.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
474
+ "model.visual.blocks.13.norm1.bias": "model-00001-of-00002.safetensors",
475
+ "model.visual.blocks.13.norm1.weight": "model-00001-of-00002.safetensors",
476
+ "model.visual.blocks.13.norm2.bias": "model-00001-of-00002.safetensors",
477
+ "model.visual.blocks.13.norm2.weight": "model-00001-of-00002.safetensors",
478
+ "model.visual.blocks.14.attn.proj.bias": "model-00001-of-00002.safetensors",
479
+ "model.visual.blocks.14.attn.proj.weight": "model-00001-of-00002.safetensors",
480
+ "model.visual.blocks.14.attn.qkv.bias": "model-00001-of-00002.safetensors",
481
+ "model.visual.blocks.14.attn.qkv.weight": "model-00001-of-00002.safetensors",
482
+ "model.visual.blocks.14.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
483
+ "model.visual.blocks.14.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
484
+ "model.visual.blocks.14.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
485
+ "model.visual.blocks.14.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
486
+ "model.visual.blocks.14.norm1.bias": "model-00001-of-00002.safetensors",
487
+ "model.visual.blocks.14.norm1.weight": "model-00001-of-00002.safetensors",
488
+ "model.visual.blocks.14.norm2.bias": "model-00001-of-00002.safetensors",
489
+ "model.visual.blocks.14.norm2.weight": "model-00001-of-00002.safetensors",
490
+ "model.visual.blocks.15.attn.proj.bias": "model-00001-of-00002.safetensors",
491
+ "model.visual.blocks.15.attn.proj.weight": "model-00001-of-00002.safetensors",
492
+ "model.visual.blocks.15.attn.qkv.bias": "model-00001-of-00002.safetensors",
493
+ "model.visual.blocks.15.attn.qkv.weight": "model-00001-of-00002.safetensors",
494
+ "model.visual.blocks.15.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
495
+ "model.visual.blocks.15.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
496
+ "model.visual.blocks.15.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
497
+ "model.visual.blocks.15.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
498
+ "model.visual.blocks.15.norm1.bias": "model-00001-of-00002.safetensors",
499
+ "model.visual.blocks.15.norm1.weight": "model-00001-of-00002.safetensors",
500
+ "model.visual.blocks.15.norm2.bias": "model-00001-of-00002.safetensors",
501
+ "model.visual.blocks.15.norm2.weight": "model-00001-of-00002.safetensors",
502
+ "model.visual.blocks.16.attn.proj.bias": "model-00001-of-00002.safetensors",
503
+ "model.visual.blocks.16.attn.proj.weight": "model-00001-of-00002.safetensors",
504
+ "model.visual.blocks.16.attn.qkv.bias": "model-00001-of-00002.safetensors",
505
+ "model.visual.blocks.16.attn.qkv.weight": "model-00001-of-00002.safetensors",
506
+ "model.visual.blocks.16.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
507
+ "model.visual.blocks.16.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
508
+ "model.visual.blocks.16.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
509
+ "model.visual.blocks.16.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
510
+ "model.visual.blocks.16.norm1.bias": "model-00001-of-00002.safetensors",
511
+ "model.visual.blocks.16.norm1.weight": "model-00001-of-00002.safetensors",
512
+ "model.visual.blocks.16.norm2.bias": "model-00001-of-00002.safetensors",
513
+ "model.visual.blocks.16.norm2.weight": "model-00001-of-00002.safetensors",
514
+ "model.visual.blocks.17.attn.proj.bias": "model-00001-of-00002.safetensors",
515
+ "model.visual.blocks.17.attn.proj.weight": "model-00001-of-00002.safetensors",
516
+ "model.visual.blocks.17.attn.qkv.bias": "model-00001-of-00002.safetensors",
517
+ "model.visual.blocks.17.attn.qkv.weight": "model-00001-of-00002.safetensors",
518
+ "model.visual.blocks.17.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
519
+ "model.visual.blocks.17.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
520
+ "model.visual.blocks.17.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
521
+ "model.visual.blocks.17.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
522
+ "model.visual.blocks.17.norm1.bias": "model-00001-of-00002.safetensors",
523
+ "model.visual.blocks.17.norm1.weight": "model-00001-of-00002.safetensors",
524
+ "model.visual.blocks.17.norm2.bias": "model-00001-of-00002.safetensors",
525
+ "model.visual.blocks.17.norm2.weight": "model-00001-of-00002.safetensors",
526
+ "model.visual.blocks.18.attn.proj.bias": "model-00001-of-00002.safetensors",
527
+ "model.visual.blocks.18.attn.proj.weight": "model-00001-of-00002.safetensors",
528
+ "model.visual.blocks.18.attn.qkv.bias": "model-00001-of-00002.safetensors",
529
+ "model.visual.blocks.18.attn.qkv.weight": "model-00001-of-00002.safetensors",
530
+ "model.visual.blocks.18.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
531
+ "model.visual.blocks.18.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
532
+ "model.visual.blocks.18.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
533
+ "model.visual.blocks.18.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
534
+ "model.visual.blocks.18.norm1.bias": "model-00001-of-00002.safetensors",
535
+ "model.visual.blocks.18.norm1.weight": "model-00001-of-00002.safetensors",
536
+ "model.visual.blocks.18.norm2.bias": "model-00001-of-00002.safetensors",
537
+ "model.visual.blocks.18.norm2.weight": "model-00001-of-00002.safetensors",
538
+ "model.visual.blocks.19.attn.proj.bias": "model-00001-of-00002.safetensors",
539
+ "model.visual.blocks.19.attn.proj.weight": "model-00001-of-00002.safetensors",
540
+ "model.visual.blocks.19.attn.qkv.bias": "model-00001-of-00002.safetensors",
541
+ "model.visual.blocks.19.attn.qkv.weight": "model-00001-of-00002.safetensors",
542
+ "model.visual.blocks.19.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
543
+ "model.visual.blocks.19.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
544
+ "model.visual.blocks.19.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
545
+ "model.visual.blocks.19.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
546
+ "model.visual.blocks.19.norm1.bias": "model-00001-of-00002.safetensors",
547
+ "model.visual.blocks.19.norm1.weight": "model-00001-of-00002.safetensors",
548
+ "model.visual.blocks.19.norm2.bias": "model-00001-of-00002.safetensors",
549
+ "model.visual.blocks.19.norm2.weight": "model-00001-of-00002.safetensors",
550
+ "model.visual.blocks.2.attn.proj.bias": "model-00001-of-00002.safetensors",
551
+ "model.visual.blocks.2.attn.proj.weight": "model-00001-of-00002.safetensors",
552
+ "model.visual.blocks.2.attn.qkv.bias": "model-00001-of-00002.safetensors",
553
+ "model.visual.blocks.2.attn.qkv.weight": "model-00001-of-00002.safetensors",
554
+ "model.visual.blocks.2.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
555
+ "model.visual.blocks.2.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
556
+ "model.visual.blocks.2.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
557
+ "model.visual.blocks.2.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
558
+ "model.visual.blocks.2.norm1.bias": "model-00001-of-00002.safetensors",
559
+ "model.visual.blocks.2.norm1.weight": "model-00001-of-00002.safetensors",
560
+ "model.visual.blocks.2.norm2.bias": "model-00001-of-00002.safetensors",
561
+ "model.visual.blocks.2.norm2.weight": "model-00001-of-00002.safetensors",
562
+ "model.visual.blocks.20.attn.proj.bias": "model-00001-of-00002.safetensors",
563
+ "model.visual.blocks.20.attn.proj.weight": "model-00001-of-00002.safetensors",
564
+ "model.visual.blocks.20.attn.qkv.bias": "model-00001-of-00002.safetensors",
565
+ "model.visual.blocks.20.attn.qkv.weight": "model-00001-of-00002.safetensors",
566
+ "model.visual.blocks.20.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
567
+ "model.visual.blocks.20.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
568
+ "model.visual.blocks.20.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
569
+ "model.visual.blocks.20.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
570
+ "model.visual.blocks.20.norm1.bias": "model-00001-of-00002.safetensors",
571
+ "model.visual.blocks.20.norm1.weight": "model-00001-of-00002.safetensors",
572
+ "model.visual.blocks.20.norm2.bias": "model-00001-of-00002.safetensors",
573
+ "model.visual.blocks.20.norm2.weight": "model-00001-of-00002.safetensors",
574
+ "model.visual.blocks.21.attn.proj.bias": "model-00001-of-00002.safetensors",
575
+ "model.visual.blocks.21.attn.proj.weight": "model-00001-of-00002.safetensors",
576
+ "model.visual.blocks.21.attn.qkv.bias": "model-00001-of-00002.safetensors",
577
+ "model.visual.blocks.21.attn.qkv.weight": "model-00001-of-00002.safetensors",
578
+ "model.visual.blocks.21.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
579
+ "model.visual.blocks.21.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
580
+ "model.visual.blocks.21.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
581
+ "model.visual.blocks.21.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
582
+ "model.visual.blocks.21.norm1.bias": "model-00001-of-00002.safetensors",
583
+ "model.visual.blocks.21.norm1.weight": "model-00001-of-00002.safetensors",
584
+ "model.visual.blocks.21.norm2.bias": "model-00001-of-00002.safetensors",
585
+ "model.visual.blocks.21.norm2.weight": "model-00001-of-00002.safetensors",
586
+ "model.visual.blocks.22.attn.proj.bias": "model-00001-of-00002.safetensors",
587
+ "model.visual.blocks.22.attn.proj.weight": "model-00001-of-00002.safetensors",
588
+ "model.visual.blocks.22.attn.qkv.bias": "model-00001-of-00002.safetensors",
589
+ "model.visual.blocks.22.attn.qkv.weight": "model-00001-of-00002.safetensors",
590
+ "model.visual.blocks.22.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
591
+ "model.visual.blocks.22.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
592
+ "model.visual.blocks.22.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
593
+ "model.visual.blocks.22.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
594
+ "model.visual.blocks.22.norm1.bias": "model-00001-of-00002.safetensors",
595
+ "model.visual.blocks.22.norm1.weight": "model-00001-of-00002.safetensors",
596
+ "model.visual.blocks.22.norm2.bias": "model-00001-of-00002.safetensors",
597
+ "model.visual.blocks.22.norm2.weight": "model-00001-of-00002.safetensors",
598
+ "model.visual.blocks.23.attn.proj.bias": "model-00001-of-00002.safetensors",
599
+ "model.visual.blocks.23.attn.proj.weight": "model-00001-of-00002.safetensors",
600
+ "model.visual.blocks.23.attn.qkv.bias": "model-00001-of-00002.safetensors",
601
+ "model.visual.blocks.23.attn.qkv.weight": "model-00001-of-00002.safetensors",
602
+ "model.visual.blocks.23.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
603
+ "model.visual.blocks.23.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
604
+ "model.visual.blocks.23.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
605
+ "model.visual.blocks.23.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
606
+ "model.visual.blocks.23.norm1.bias": "model-00001-of-00002.safetensors",
607
+ "model.visual.blocks.23.norm1.weight": "model-00001-of-00002.safetensors",
608
+ "model.visual.blocks.23.norm2.bias": "model-00001-of-00002.safetensors",
609
+ "model.visual.blocks.23.norm2.weight": "model-00001-of-00002.safetensors",
610
+ "model.visual.blocks.3.attn.proj.bias": "model-00001-of-00002.safetensors",
611
+ "model.visual.blocks.3.attn.proj.weight": "model-00001-of-00002.safetensors",
612
+ "model.visual.blocks.3.attn.qkv.bias": "model-00001-of-00002.safetensors",
613
+ "model.visual.blocks.3.attn.qkv.weight": "model-00001-of-00002.safetensors",
614
+ "model.visual.blocks.3.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
615
+ "model.visual.blocks.3.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
616
+ "model.visual.blocks.3.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
617
+ "model.visual.blocks.3.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
618
+ "model.visual.blocks.3.norm1.bias": "model-00001-of-00002.safetensors",
619
+ "model.visual.blocks.3.norm1.weight": "model-00001-of-00002.safetensors",
620
+ "model.visual.blocks.3.norm2.bias": "model-00001-of-00002.safetensors",
621
+ "model.visual.blocks.3.norm2.weight": "model-00001-of-00002.safetensors",
622
+ "model.visual.blocks.4.attn.proj.bias": "model-00001-of-00002.safetensors",
623
+ "model.visual.blocks.4.attn.proj.weight": "model-00001-of-00002.safetensors",
624
+ "model.visual.blocks.4.attn.qkv.bias": "model-00001-of-00002.safetensors",
625
+ "model.visual.blocks.4.attn.qkv.weight": "model-00001-of-00002.safetensors",
626
+ "model.visual.blocks.4.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
627
+ "model.visual.blocks.4.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
628
+ "model.visual.blocks.4.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
629
+ "model.visual.blocks.4.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
630
+ "model.visual.blocks.4.norm1.bias": "model-00001-of-00002.safetensors",
631
+ "model.visual.blocks.4.norm1.weight": "model-00001-of-00002.safetensors",
632
+ "model.visual.blocks.4.norm2.bias": "model-00001-of-00002.safetensors",
633
+ "model.visual.blocks.4.norm2.weight": "model-00001-of-00002.safetensors",
634
+ "model.visual.blocks.5.attn.proj.bias": "model-00001-of-00002.safetensors",
635
+ "model.visual.blocks.5.attn.proj.weight": "model-00001-of-00002.safetensors",
636
+ "model.visual.blocks.5.attn.qkv.bias": "model-00001-of-00002.safetensors",
637
+ "model.visual.blocks.5.attn.qkv.weight": "model-00001-of-00002.safetensors",
638
+ "model.visual.blocks.5.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
639
+ "model.visual.blocks.5.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
640
+ "model.visual.blocks.5.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
641
+ "model.visual.blocks.5.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
642
+ "model.visual.blocks.5.norm1.bias": "model-00001-of-00002.safetensors",
643
+ "model.visual.blocks.5.norm1.weight": "model-00001-of-00002.safetensors",
644
+ "model.visual.blocks.5.norm2.bias": "model-00001-of-00002.safetensors",
645
+ "model.visual.blocks.5.norm2.weight": "model-00001-of-00002.safetensors",
646
+ "model.visual.blocks.6.attn.proj.bias": "model-00001-of-00002.safetensors",
647
+ "model.visual.blocks.6.attn.proj.weight": "model-00001-of-00002.safetensors",
648
+ "model.visual.blocks.6.attn.qkv.bias": "model-00001-of-00002.safetensors",
649
+ "model.visual.blocks.6.attn.qkv.weight": "model-00001-of-00002.safetensors",
650
+ "model.visual.blocks.6.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
651
+ "model.visual.blocks.6.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
652
+ "model.visual.blocks.6.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
653
+ "model.visual.blocks.6.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
654
+ "model.visual.blocks.6.norm1.bias": "model-00001-of-00002.safetensors",
655
+ "model.visual.blocks.6.norm1.weight": "model-00001-of-00002.safetensors",
656
+ "model.visual.blocks.6.norm2.bias": "model-00001-of-00002.safetensors",
657
+ "model.visual.blocks.6.norm2.weight": "model-00001-of-00002.safetensors",
658
+ "model.visual.blocks.7.attn.proj.bias": "model-00001-of-00002.safetensors",
659
+ "model.visual.blocks.7.attn.proj.weight": "model-00001-of-00002.safetensors",
660
+ "model.visual.blocks.7.attn.qkv.bias": "model-00001-of-00002.safetensors",
661
+ "model.visual.blocks.7.attn.qkv.weight": "model-00001-of-00002.safetensors",
662
+ "model.visual.blocks.7.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
663
+ "model.visual.blocks.7.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
664
+ "model.visual.blocks.7.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
665
+ "model.visual.blocks.7.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
666
+ "model.visual.blocks.7.norm1.bias": "model-00001-of-00002.safetensors",
667
+ "model.visual.blocks.7.norm1.weight": "model-00001-of-00002.safetensors",
668
+ "model.visual.blocks.7.norm2.bias": "model-00001-of-00002.safetensors",
669
+ "model.visual.blocks.7.norm2.weight": "model-00001-of-00002.safetensors",
670
+ "model.visual.blocks.8.attn.proj.bias": "model-00001-of-00002.safetensors",
671
+ "model.visual.blocks.8.attn.proj.weight": "model-00001-of-00002.safetensors",
672
+ "model.visual.blocks.8.attn.qkv.bias": "model-00001-of-00002.safetensors",
673
+ "model.visual.blocks.8.attn.qkv.weight": "model-00001-of-00002.safetensors",
674
+ "model.visual.blocks.8.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
675
+ "model.visual.blocks.8.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
676
+ "model.visual.blocks.8.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
677
+ "model.visual.blocks.8.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
678
+ "model.visual.blocks.8.norm1.bias": "model-00001-of-00002.safetensors",
679
+ "model.visual.blocks.8.norm1.weight": "model-00001-of-00002.safetensors",
680
+ "model.visual.blocks.8.norm2.bias": "model-00001-of-00002.safetensors",
681
+ "model.visual.blocks.8.norm2.weight": "model-00001-of-00002.safetensors",
682
+ "model.visual.blocks.9.attn.proj.bias": "model-00001-of-00002.safetensors",
683
+ "model.visual.blocks.9.attn.proj.weight": "model-00001-of-00002.safetensors",
684
+ "model.visual.blocks.9.attn.qkv.bias": "model-00001-of-00002.safetensors",
685
+ "model.visual.blocks.9.attn.qkv.weight": "model-00001-of-00002.safetensors",
686
+ "model.visual.blocks.9.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
687
+ "model.visual.blocks.9.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
688
+ "model.visual.blocks.9.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
689
+ "model.visual.blocks.9.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
690
+ "model.visual.blocks.9.norm1.bias": "model-00001-of-00002.safetensors",
691
+ "model.visual.blocks.9.norm1.weight": "model-00001-of-00002.safetensors",
692
+ "model.visual.blocks.9.norm2.bias": "model-00001-of-00002.safetensors",
693
+ "model.visual.blocks.9.norm2.weight": "model-00001-of-00002.safetensors",
694
+ "model.visual.deepstack_merger_list.0.linear_fc1.bias": "model-00001-of-00002.safetensors",
695
+ "model.visual.deepstack_merger_list.0.linear_fc1.weight": "model-00001-of-00002.safetensors",
696
+ "model.visual.deepstack_merger_list.0.linear_fc2.bias": "model-00001-of-00002.safetensors",
697
+ "model.visual.deepstack_merger_list.0.linear_fc2.weight": "model-00001-of-00002.safetensors",
698
+ "model.visual.deepstack_merger_list.0.norm.bias": "model-00001-of-00002.safetensors",
699
+ "model.visual.deepstack_merger_list.0.norm.weight": "model-00001-of-00002.safetensors",
700
+ "model.visual.deepstack_merger_list.1.linear_fc1.bias": "model-00001-of-00002.safetensors",
701
+ "model.visual.deepstack_merger_list.1.linear_fc1.weight": "model-00001-of-00002.safetensors",
702
+ "model.visual.deepstack_merger_list.1.linear_fc2.bias": "model-00001-of-00002.safetensors",
703
+ "model.visual.deepstack_merger_list.1.linear_fc2.weight": "model-00001-of-00002.safetensors",
704
+ "model.visual.deepstack_merger_list.1.norm.bias": "model-00001-of-00002.safetensors",
705
+ "model.visual.deepstack_merger_list.1.norm.weight": "model-00001-of-00002.safetensors",
706
+ "model.visual.deepstack_merger_list.2.linear_fc1.bias": "model-00001-of-00002.safetensors",
707
+ "model.visual.deepstack_merger_list.2.linear_fc1.weight": "model-00001-of-00002.safetensors",
708
+ "model.visual.deepstack_merger_list.2.linear_fc2.bias": "model-00001-of-00002.safetensors",
709
+ "model.visual.deepstack_merger_list.2.linear_fc2.weight": "model-00001-of-00002.safetensors",
710
+ "model.visual.deepstack_merger_list.2.norm.bias": "model-00001-of-00002.safetensors",
711
+ "model.visual.deepstack_merger_list.2.norm.weight": "model-00001-of-00002.safetensors",
712
+ "model.visual.merger.linear_fc1.bias": "model-00001-of-00002.safetensors",
713
+ "model.visual.merger.linear_fc1.weight": "model-00001-of-00002.safetensors",
714
+ "model.visual.merger.linear_fc2.bias": "model-00001-of-00002.safetensors",
715
+ "model.visual.merger.linear_fc2.weight": "model-00001-of-00002.safetensors",
716
+ "model.visual.merger.norm.bias": "model-00001-of-00002.safetensors",
717
+ "model.visual.merger.norm.weight": "model-00001-of-00002.safetensors",
718
+ "model.visual.patch_embed.proj.bias": "model-00001-of-00002.safetensors",
719
+ "model.visual.patch_embed.proj.weight": "model-00001-of-00002.safetensors",
720
+ "model.visual.pos_embed.weight": "model-00001-of-00002.safetensors"
721
+ }
722
+ }
modeling_qwen3_vl_nemotron_embed.py ADDED
@@ -0,0 +1,480 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Qwen3VLNemotronEmbed: Vision-Language Embedding Model with ColBERT-style scoring.
2
+
3
+ This module provides a bidirectional vision-language model for document retrieval
4
+ and embedding tasks, based on the Qwen3VL architecture with bidirectional attention.
5
+ """
6
+
7
+ from contextlib import nullcontext
8
+ from typing import Dict, List, Optional, TypeVar, Union
9
+
10
+ import torch
11
+ import torch.nn.functional as F
12
+ from datasets import Dataset
13
+ from torch.utils.data import DataLoader
14
+ from torch.utils.data import Dataset as TorchDataset
15
+ from tqdm import tqdm
16
+ from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
17
+ from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLConfig
18
+ from transformers.models.qwen3_vl.modeling_qwen3_vl import (
19
+ BaseModelOutputWithPast,
20
+ Cache,
21
+ FlashAttentionKwargs,
22
+ Qwen3VLModel,
23
+ Qwen3VLPreTrainedModel,
24
+ Qwen3VLTextModel,
25
+ Unpack,
26
+ auto_docstring,
27
+ check_model_inputs,
28
+ )
29
+ from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
30
+
31
+ TV = TypeVar("TV")
32
+
33
+
34
class ListDataset(TorchDataset[TV]):
    """A thin ``torch`` Dataset view over an in-memory Python list.

    Used so plain lists (queries, documents) can be fed to a
    ``torch.utils.data.DataLoader`` without copying.
    """

    def __init__(self, elements: List[TV]):
        # Keep a reference to the caller's list; no copy is made.
        self.elements = elements

    def __len__(self) -> int:
        """Number of items in the backing list."""
        return len(self.elements)

    def __getitem__(self, idx: int) -> TV:
        """Return the element stored at position ``idx``."""
        return self.elements[idx]
45
+
46
+
47
class Qwen3VLNemotronEmbedConfig(Qwen3VLConfig):
    """Configuration for Qwen3VLNemotronEmbed models.

    Extends ``Qwen3VLConfig`` with a single extra knob:

    Args:
        pooling: Pooling strategy used when producing embeddings
            (default ``"colbert"``).
    """

    model_type = "qwen3_vl_nemotron_embed"

    # Declared at class level for type checkers; assigned in __init__.
    pooling: str

    def __init__(self, pooling: str = "colbert", **kwargs):
        # Keep the original ordering: the attribute is set before the
        # base-class initializer runs.
        self.pooling = pooling
        super().__init__(**kwargs)
61
+
62
+
63
+ def _create_bidirectional_mask(
64
+ config,
65
+ input_embeds: torch.Tensor,
66
+ attention_mask: Optional[torch.Tensor],
67
+ cache_position: torch.Tensor,
68
+ past_key_values: Optional[Cache],
69
+ position_ids: Optional[torch.Tensor] = None,
70
+ **kwargs,
71
+ ) -> Optional[torch.Tensor]:
72
+ """Create bidirectional attention mask based on attention implementation."""
73
+ if config._attn_implementation == "flash_attention_2":
74
+ if attention_mask is not None and (attention_mask == 0.0).any():
75
+ return attention_mask
76
+ return None
77
+ elif config._attn_implementation == "eager":
78
+ if attention_mask is not None:
79
+ return _prepare_4d_attention_mask(
80
+ attention_mask,
81
+ dtype=input_embeds.dtype,
82
+ tgt_len=input_embeds.shape[1],
83
+ )
84
+ return None
85
+ else:
86
+ if attention_mask is not None:
87
+ return _prepare_4d_attention_mask(
88
+ attention_mask,
89
+ dtype=input_embeds.dtype,
90
+ tgt_len=input_embeds.shape[1],
91
+ )
92
+ return None
93
+
94
+
95
class Qwen3VLNemotronEmbedTextModel(Qwen3VLTextModel):
    """Bidirectional text model for Qwen3VLNemotronEmbed.

    Identical to ``Qwen3VLTextModel`` except that every self-attention layer
    is switched to non-causal, so each token can attend to the full sequence
    (appropriate for embedding extraction rather than generation).
    """

    def __init__(self, config):
        super().__init__(config)
        # Disable the causal mask in every decoder layer: embeddings are
        # extracted with full bidirectional attention.
        for layer in self.layers:
            layer.self_attn.is_causal = False

    @check_model_inputs()
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        visual_pos_masks: Optional[torch.Tensor] = None,
        deepstack_visual_embeds: Optional[list[torch.Tensor]] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[tuple, BaseModelOutputWithPast]:
        """
        visual_pos_masks (`torch.Tensor`, *optional*):
            Boolean mask indicating positions of visual tokens in the sequence.
            Used for deepstack processing to identify where to inject visual features.
        deepstack_visual_embeds (`list[torch.Tensor]`, *optional*):
            List of visual embeddings from intermediate vision encoder layers.
            Each tensor corresponds to a decoder layer where visual features are injected.
        """
        # Exactly one of input_ids / inputs_embeds must be provided.
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError(
                "You must specify exactly one of input_ids or inputs_embeds"
            )

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        # Expand position ids to the 3-row mrope layout (one row per rope
        # section) when they were given as plain per-token indices.
        if position_ids is None:
            position_ids = cache_position.view(1, 1, -1).expand(
                3, inputs_embeds.shape[0], -1
            )
        elif position_ids.ndim == 2:
            position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1)

        # A 4-row layout carries an extra leading row of plain text positions;
        # split it off and keep the remaining 3 rows for rotary embeddings.
        if position_ids.ndim == 3 and position_ids.shape[0] == 4:
            text_position_ids = position_ids[0]
            position_ids = position_ids[1:]
        else:
            text_position_ids = position_ids[0]

        # Replace whatever mask was passed in with a bidirectional one.
        attention_mask = _create_bidirectional_mask(
            config=self.config,
            input_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            past_key_values=past_key_values,
            position_ids=text_position_ids,
        )

        hidden_states = inputs_embeds
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        for layer_idx, decoder_layer in enumerate(self.layers):
            # NOTE(review): the decoder layer is assumed to return the hidden
            # states tensor directly (not a tuple) under this transformers
            # version — confirm against Qwen3VLTextModel.
            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=attention_mask,
                position_ids=text_position_ids,
                past_key_values=past_key_values,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
                **kwargs,
            )
            hidden_states = layer_outputs

            # Inject deepstack visual features into the first
            # len(deepstack_visual_embeds) decoder layers, one tensor each.
            if deepstack_visual_embeds is not None and layer_idx in range(
                len(deepstack_visual_embeds)
            ):
                hidden_states = self._deepstack_process(
                    hidden_states,
                    visual_pos_masks,
                    deepstack_visual_embeds[layer_idx],
                )

        hidden_states = self.norm(hidden_states)

        # No cache / attentions are returned: this model is embedding-only.
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
        )
185
+
186
+
187
class Qwen3VLNemotronEmbedVisionLanguageModel(Qwen3VLModel):
    """Vision-language model with bidirectional text attention.

    Rebuilds the Qwen3VL backbone but swaps the text tower for
    ``Qwen3VLNemotronEmbedTextModel`` (non-causal attention).
    """

    def __init__(self, config):
        # Deliberately skip Qwen3VLModel.__init__ (which would build the
        # causal text model) and initialize only the PreTrainedModel base.
        Qwen3VLPreTrainedModel.__init__(self, config)

        # Local import keeps the module-level import list identical to what
        # the rest of the file needs.
        from transformers.models.qwen3_vl.modeling_qwen3_vl import Qwen3VLVisionModel

        self.visual = Qwen3VLVisionModel._from_config(config.vision_config)
        self.language_model = Qwen3VLNemotronEmbedTextModel._from_config(
            config.text_config
        )
        # Mirrors the attribute set by the stock Qwen3VLModel constructor.
        self.rope_deltas = None

        self.post_init()
202
+
203
+
204
class Qwen3VLNemotronEmbedForConditionalGeneration(Qwen3VLForConditionalGeneration):
    """Qwen3VLNemotronEmbed for conditional generation (base class)."""

    # Narrow the config type for type checkers / auto-mapping.
    config: Qwen3VLNemotronEmbedConfig

    def __init__(self, config):
        super().__init__(config)
        # Replace the backbone with the bidirectional variant.
        # NOTE(review): the parent __init__ presumably already constructed a
        # stock backbone that is discarded here — confirm there is no cheaper
        # construction path in this transformers version.
        self.model = Qwen3VLNemotronEmbedVisionLanguageModel._from_config(config)
212
+
213
+
214
class EmbeddingMixin:
    """Mixin providing high-level embedding extraction methods.

    Expects the host class to be an ``nn.Module`` (uses ``self.parameters()``
    and ``self(...)``) and to also provide ``padding_various_shape_tensor``
    (supplied by ``ColBERTScoringMixin`` in this file).
    """

    def _get_processor(self):
        """Lazily load and cache the processor matching this checkpoint."""
        # Cached on first use; AutoProcessor resolves the custom processor
        # class via the auto_map in this repo's config.
        if not hasattr(self, "_processor") or self._processor is None:
            self._processor = AutoProcessor.from_pretrained(
                self.config._name_or_path, trust_remote_code=True
            )
        return self._processor

    def process_queries(self, queries: List[str], **kwargs) -> Dict[str, torch.Tensor]:
        """Tokenize text queries via the checkpoint's processor."""
        return self._get_processor().process_queries(queries, **kwargs)

    def process_documents(
        self, documents: Union[Dict, List[Dict]], **kwargs
    ) -> Dict[str, torch.Tensor]:
        """Process documents (image + text) via the checkpoint's processor."""
        return self._get_processor().process_documents(documents, **kwargs)

    def _extract_embeddings(
        self, dataloader: DataLoader, is_query: bool
    ) -> torch.Tensor:
        """Extract L2-normalized token embeddings from a dataloader.

        Args:
            dataloader: DataLoader yielding batches of processed inputs
                (dicts of tensors; a None "pixel_values" entry is dropped).
            is_query: Whether these are query embeddings (progress label only).

        Returns:
            Tensor of embeddings with shape (num_samples, max_seq_len, hidden_dim),
            right-padded with zeros across batches of different lengths.
        """
        device = next(self.parameters()).device
        qs = []
        message = "query" if is_query else "document"

        for batch in tqdm(dataloader, desc=f"Extracting {message} embeddings..."):
            with torch.inference_mode():
                # bf16 autocast only on CUDA; CPU runs in the model dtype.
                with (
                    torch.autocast(device_type="cuda", dtype=torch.bfloat16)
                    if device.type == "cuda"
                    else nullcontext()
                ):
                    # Text-only batches may carry pixel_values=None; remove it
                    # so the .to(device) sweep below does not crash.
                    if "pixel_values" in batch and batch["pixel_values"] is None:
                        batch.pop("pixel_values")
                    batch = {k: v.to(device) for k, v in batch.items()}
                    embeddings = self(**batch, output_hidden_states=True).hidden_states[
                        -1
                    ]
                    # Zero out padding positions, then normalize per token;
                    # zero vectors stay zero after F.normalize.
                    embeddings = embeddings * batch["attention_mask"].unsqueeze(-1)
                    embeddings = F.normalize(embeddings, dim=-1)

                    # Fail fast on numerical blow-ups rather than silently
                    # returning corrupt embeddings.
                    if not torch.isfinite(embeddings).all():
                        raise ValueError("Embeddings contain NaN or Inf values")

                    qs.append(embeddings.detach().cpu())

        # Provided by ColBERTScoringMixin: pads batches to a common seq len.
        all_embeddings_tensor = self.padding_various_shape_tensor(qs)
        return all_embeddings_tensor

    def forward_queries(self, queries: List[str], batch_size: int = 8) -> torch.Tensor:
        """Forward text queries and extract embeddings.

        Args:
            queries: List of query strings (a pre-built DataLoader is also
                accepted; its dataset is reused).
            batch_size: Batch size for processing.

        Returns:
            Tensor of query embeddings with shape (num_queries, max_seq_len, hidden_dim).
        """
        if isinstance(queries, DataLoader):
            dataset = queries.dataset
        else:
            dataset = ListDataset[str](queries)

        dataloader = DataLoader(
            dataset=dataset,
            batch_size=batch_size,
            collate_fn=self.process_queries,
            shuffle=False,
            drop_last=False,
        )
        return self._extract_embeddings(dataloader=dataloader, is_query=True)

    def forward_documents(
        self, corpus: List[Dict], batch_size: int = 8
    ) -> torch.Tensor:
        """Forward documents (image + text) and extract embeddings.

        Args:
            corpus: List of dicts with "image" (PIL image or None) and
                "text" keys; missing keys default to None / "".
            batch_size: Batch size for processing.

        Returns:
            Tensor of document embeddings with shape (num_docs, max_seq_len, hidden_dim).
        """
        images = []
        texts = []
        for doc in corpus:
            text = doc.get("text", "")
            image = doc.get("image")
            # The processor expects RGB; convert palettized/greyscale inputs.
            if image is not None and image.mode != "RGB":
                image = image.convert("RGB")
            images.append(image)
            texts.append(text)

        dataset = Dataset.from_dict({"image": images, "text": texts})
        dataloader = DataLoader(
            dataset=dataset,
            batch_size=batch_size,
            collate_fn=self.process_documents,
            shuffle=False,
            num_workers=8,  # parallel image decoding; tune to the host
            pin_memory=True,
            drop_last=False,
        )
        return self._extract_embeddings(dataloader=dataloader, is_query=False)

    def forward_images(
        self, images: List, batch_size: int = 8, **kwargs
    ) -> torch.Tensor:
        """Forward images as image-only documents (empty text).

        Args:
            images: List of PIL Images.
            batch_size: Batch size for processing.

        Returns:
            Tensor of image embeddings.
        """
        corpus = [{"image": image, "text": ""} for image in images]
        return self.forward_documents(corpus, batch_size)

    def forward_passages(
        self, images: List, batch_size: int = 8, **kwargs
    ) -> torch.Tensor:
        """Forward passages as image-only documents (alias for forward_images)."""
        return self.forward_images(images, batch_size, **kwargs)
353
+
354
+
355
class ColBERTScoringMixin:
    """Mixin implementing ColBERT-style late-interaction (MaxSim) scoring."""

    def padding_various_shape_tensor(self, tensors: List[torch.Tensor]) -> torch.Tensor:
        """Right-pad per-batch tensors to one sequence length and stack them.

        Args:
            tensors: List of tensors, each shaped (batch, seq_len, hidden_dim)
                with possibly different seq_len.

        Returns:
            A single tensor concatenated along dim 0, zero-padded on the
            sequence dimension up to the longest input.
        """
        target_len = max(t.shape[1] for t in tensors)
        padded = []
        for t in tensors:
            missing = target_len - t.shape[1]
            # F.pad's last-dim-first pair ordering: (dim -1 left/right, dim -2 left/right)
            padded.append(F.pad(t, (0, 0, 0, missing), mode="constant", value=0))
        return torch.cat(padded, dim=0)

    def colbert_score(
        self,
        qs: Union[torch.Tensor, List[torch.Tensor]],
        ps: Union[torch.Tensor, List[torch.Tensor]],
        batch_size: int = 128,
        device: Optional[Union[str, torch.device]] = None,
    ) -> torch.Tensor:
        """Compute ColBERT MaxSim scores between queries and passages.

        score(q, p) = sum over query tokens of the max similarity against
        any passage token. Computed block-wise to bound memory.

        Args:
            qs: Query embeddings - tensor or list of (tokens, dim) tensors.
            ps: Passage embeddings - tensor or list of (tokens, dim) tensors.
            batch_size: Block size for the pairwise computation.
            device: Device to run on; defaults to the model's device.

        Returns:
            Score matrix of shape (num_queries, num_passages).
        """
        if batch_size is None:
            batch_size = 128
        if device is None:
            device = next(self.parameters()).device

        # Normalize both inputs to lists of per-item tensors.
        if isinstance(qs, torch.Tensor):
            qs = list(qs.unbind(0))
        if isinstance(ps, torch.Tensor):
            ps = list(ps.unbind(0))

        if not qs:
            raise ValueError("No queries provided")
        if not ps:
            raise ValueError("No passages provided")

        score_rows: List[torch.Tensor] = []
        for q_start in range(0, len(qs), batch_size):
            # Pad this block of queries to a common token count.
            q_block = torch.nn.utils.rnn.pad_sequence(
                [q.to(device) for q in qs[q_start : q_start + batch_size]],
                batch_first=True,
                padding_value=0,
            )
            row_chunks: List[torch.Tensor] = []
            for p_start in range(0, len(ps), batch_size):
                p_block = torch.nn.utils.rnn.pad_sequence(
                    [p.to(device) for p in ps[p_start : p_start + batch_size]],
                    batch_first=True,
                    padding_value=0,
                )
                # (b queries, n q-tokens) x (c passages, s p-tokens):
                # full token-similarity grid, then MaxSim over passage tokens
                # and sum over query tokens.
                sims = torch.einsum("bnd,csd->bcns", q_block, p_block)
                row_chunks.append(sims.amax(dim=3).sum(dim=2))
            score_rows.append(torch.cat(row_chunks, dim=1))

        return torch.cat(score_rows, dim=0)

    def get_scores(
        self,
        query_embeddings: Union[torch.Tensor, List[torch.Tensor]],
        passage_embeddings: Union[torch.Tensor, List[torch.Tensor]],
        batch_size: Optional[int] = 128,
    ) -> torch.Tensor:
        """Compute ColBERT MaxSim scores between queries and passages.

        Accepts either stacked tensors or lists of per-item 2D/3D tensors;
        lists are padded to a common length before scoring.

        Args:
            query_embeddings: Query embeddings.
            passage_embeddings: Passage embeddings.
            batch_size: Block size for the scoring computation.

        Returns:
            Score matrix of shape (num_queries, num_passages).
        """
        if isinstance(query_embeddings, list):
            if query_embeddings[0].dim() == 2:
                query_embeddings = [q.unsqueeze(0) for q in query_embeddings]
            query_embeddings = self.padding_various_shape_tensor(query_embeddings)
        if isinstance(passage_embeddings, list):
            if passage_embeddings[0].dim() == 2:
                passage_embeddings = [p.unsqueeze(0) for p in passage_embeddings]
            passage_embeddings = self.padding_various_shape_tensor(passage_embeddings)

        return self.colbert_score(
            query_embeddings, passage_embeddings, batch_size or 128
        )
458
+
459
+
460
class Qwen3VLNemotronEmbedModel(
    # Mixins listed first so their methods take precedence in the MRO.
    EmbeddingMixin, ColBERTScoringMixin, Qwen3VLNemotronEmbedForConditionalGeneration
):
    """Qwen3VLNemotronEmbed: Vision-Language Embedding Model.

    A bidirectional vision-language model for document retrieval and embedding tasks.
    Based on Qwen3VL architecture with bidirectional attention for embedding extraction.

    Features:
    - ColBERT MaxSim scoring (get_scores, colbert_score)
    - High-level embedding methods (forward_queries, forward_documents, forward_images)
    - Automatic processor loading for query/document processing

    Example:
        >>> model = AutoModel.from_pretrained("nvidia/qwen3vl-nemotron-embed-4b", trust_remote_code=True)
        >>> query_embeddings = model.forward_queries(["What is machine learning?"])
        >>> doc_embeddings = model.forward_documents([{"image": img, "text": "ML explanation"}])
        >>> scores = model.get_scores(query_embeddings, doc_embeddings)
    """

    # Binds the custom config so AutoModel resolves it via auto_map.
    config_class = Qwen3VLNemotronEmbedConfig
mteb2_eval.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --------------------------------------------------------
2
+ # Copyright (c) 2026 NVIDIA
3
+ # Licensed under customized NSCLv1 [see LICENSE.md for details]
4
+ # --------------------------------------------------------
5
+
6
+ """
7
+ pip install "mteb>=2.6.5, <3.0.0"
8
+ python3 mteb2_eval.py --model_name nvidia/nemotron-colembed-4b-v2 --batch_size 16 --benchmark "ViDoRe(v3)" --task-list Vidore3ComputerScienceRetrieval
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import argparse
14
+ import os
15
+
16
+ import mteb
17
+
18
+
19
def main():
    """CLI entry point: evaluate a model on an MTEB/ViDoRe benchmark.

    Parses arguments, resolves the benchmark's tasks (optionally filtered by
    --task-list), runs mteb.evaluate, and writes a CSV of results.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", type=str, required=True)
    parser.add_argument("--batch_size", type=int, default=16, required=False)
    parser.add_argument(
        "--results_folder", type=str, default="results_csv", required=False
    )
    parser.add_argument("--predictions_folder", type=str, default=None, required=False)
    parser.add_argument(
        "--benchmark",
        type=str,
        required=False,
        default="ViDoRe(v3)",
        choices=[
            "ViDoRe(v3)",  # Vidore V3
            "VisualDocumentRetrieval",  # Vidore V1 & V2
        ],
    )
    parser.add_argument(
        "--task-list",
        type=str,
        nargs="+",  # Accept one or more space-separated string arguments
        default=None,  # Default to None if the argument is not provided
        help="Optional: A list of task class names to run. If not provided, all tasks will be run.",
    )
    args = parser.parse_args()

    print(f"Loading model: {args.model_name}")
    # NOTE(review): this returns model *metadata*; mteb.evaluate apparently
    # accepts a ModelMeta and instantiates it — confirm against mteb>=2.6.5.
    model = mteb.get_model_meta(args.model_name)

    # Loads all benchmark tasks
    all_tasks = mteb.get_benchmark(args.benchmark).tasks
    all_tasks_names = " ".join([task.__class__.__name__ for task in all_tasks])
    print(f"Available tasks in benchmark {args.benchmark}: {all_tasks_names}")

    # Filter tasks down to the requested subset, if any.
    if args.task_list:
        # If user provided a list, filter all_tasks by class name.
        print(f"Running evaluation on specified tasks: {args.task_list}")
        requested_task_names = set(args.task_list)
        tasks = [
            task
            for task in all_tasks
            if task.__class__.__name__ in requested_task_names
        ]

        # Warn (but continue) if a requested task was not found.
        found_names = {t.__class__.__name__ for t in tasks}
        missing = requested_task_names - found_names
        if missing:
            print(
                f"Warning: The following requested tasks were not found and will be skipped: {missing}"
            )
    else:
        # If --task-list was not provided, use all tasks.
        print("Running evaluation on all available tasks.")
        tasks = all_tasks

    tasks_names = " ".join([task.__class__.__name__ for task in tasks])
    print(f"Evaluating tasks: {tasks_names}")

    results = mteb.evaluate(
        model=model,
        tasks=tasks,
        encode_kwargs={
            "batch_size": args.batch_size,
        },
        prediction_folder=args.predictions_folder,
        overwrite_strategy="always",  # re-run even when cached results exist
    )

    print(results)

    print(f"Saving results to {args.results_folder}")
    os.makedirs(args.results_folder, exist_ok=True)
    # Sanitize the model name for use in a filename.
    model_name = args.model_name.replace("/", "_")
    # NOTE(review): with many tasks this filename can get very long —
    # consider hashing the task list if it exceeds filesystem limits.
    output_path = os.path.join(
        args.results_folder, f"{model_name}-{tasks_names.replace(' ', '-')}.csv"
    )
    df = results.to_dataframe()
    df.to_csv(output_path, index=False)


if __name__ == "__main__":
    main()
processing_qwen3_vl_nemotron_embed.py ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Qwen3VLNemotronEmbed Processor for query and document processing."""
2
+
3
+ import math
4
+ from typing import Any, Dict, List, Optional, Union
5
+
6
+ import torch
7
+ from PIL import Image
8
+ from transformers import Qwen3VLProcessor
9
+
10
+
11
class Qwen3VLNemotronEmbedProcessor(Qwen3VLProcessor):
    """Processor for Qwen3VLNemotronEmbed that handles query/document processing.

    Extends :class:`transformers.Qwen3VLProcessor` with retrieval-oriented
    helpers: :meth:`process_queries` wraps plain-text queries in the chat
    template, while :meth:`process_documents` pairs each document image with
    its text and budgets the text length so the image tokens fit inside
    ``p_max_length``.

    Args:
        image_processor: Image processor for vision inputs.
        tokenizer: Tokenizer for text inputs.
        chat_template: Optional chat template.
        q_max_length: Maximum length for query sequences (default: 512).
        p_max_length: Maximum length for passage/document sequences (default: 4096).
        query_prefix: Prefix to add to queries (default: "query:").
        passage_prefix: Prefix to add to passages (default: "passage:").
        reserve_tokens_for_images: Reserved tokens for image placeholders and
            template overhead (default: 100).
    """

    # Extra attributes persisted alongside the base processor config on
    # save/load (see processor_config.json).
    processor_attributes = [
        "q_max_length",
        "p_max_length",
        "query_prefix",
        "passage_prefix",
        "reserve_tokens_for_images",
    ]

    def __init__(
        self,
        image_processor=None,
        tokenizer=None,
        chat_template=None,
        q_max_length: int = 512,
        p_max_length: int = 4096,
        query_prefix: str = "query:",
        passage_prefix: str = "passage:",
        reserve_tokens_for_images: int = 100,
        **kwargs,
    ):
        # Forward chat_template by keyword: Qwen3VL-family processors place a
        # `video_processor` positional parameter before `chat_template`, so a
        # positional pass-through can bind the template to the wrong slot.
        # Passing chat_template=None simply falls back to the base default,
        # which also makes the previous `if chat_template is not None`
        # branching unnecessary.
        super().__init__(
            image_processor, tokenizer, chat_template=chat_template, **kwargs
        )

        self.q_max_length = q_max_length
        self.p_max_length = p_max_length
        self.query_prefix = query_prefix
        self.passage_prefix = passage_prefix
        self.reserve_tokens_for_images = reserve_tokens_for_images

        # Cache vision geometry used by calculate_image_tokens().
        self.patch_size = self.image_processor.patch_size
        self.merge_size = self.image_processor.merge_size

    def apply_chat_template(
        self,
        conversation,
        chat_template=None,
        **kwargs,
    ) -> str:
        """Apply the tokenizer's chat template to ``conversation``."""
        return self.tokenizer.apply_chat_template(
            conversation,
            chat_template=chat_template,
            **kwargs,
        )

    @property
    def min_pixels(self) -> int:
        """Minimum pixel budget, delegated to the image processor."""
        return self.image_processor.min_pixels

    @property
    def max_pixels(self) -> int:
        """Maximum pixel budget, delegated to the image processor."""
        return self.image_processor.max_pixels

    def calculate_image_tokens(
        self,
        image: Image.Image,
        min_pixels: Optional[int] = None,
        max_pixels: Optional[int] = None,
    ) -> int:
        """Calculate the number of tokens an image will use after processing.

        Mirrors the Qwen-VL "smart resize": each dimension is snapped to a
        multiple of ``patch_size * merge_size``, then both are rescaled so the
        total area stays within ``[min_pixels, max_pixels]``.

        Args:
            image: PIL Image to calculate tokens for.
            min_pixels: Minimum pixels for resizing (processor default if None).
            max_pixels: Maximum pixels for resizing (processor default if None).

        Returns:
            Number of tokens the image will consume after patch merging.
        """
        min_pixels = min_pixels or self.min_pixels
        max_pixels = max_pixels or self.max_pixels

        width, height = image.size
        # One merged token covers a (patch_size * merge_size)^2 pixel tile.
        factor = self.patch_size * self.merge_size

        # Snap each dimension to the nearest multiple of `factor`.
        h_bar = round(height / factor) * factor
        w_bar = round(width / factor) * factor

        if h_bar * w_bar > max_pixels:
            # Shrink: flooring keeps the resized area at or below max_pixels.
            beta = math.sqrt((height * width) / max_pixels)
            h_bar = max(factor, math.floor(height / beta / factor) * factor)
            w_bar = max(factor, math.floor(width / beta / factor) * factor)
        elif h_bar * w_bar < min_pixels:
            # Grow: ceiling keeps the resized area at or above min_pixels.
            beta = math.sqrt(min_pixels / (height * width))
            h_bar = math.ceil(height * beta / factor) * factor
            w_bar = math.ceil(width * beta / factor) * factor

        grid_h = h_bar // self.patch_size
        grid_w = w_bar // self.patch_size
        num_patches = grid_h * grid_w
        # merge_size^2 raw patches collapse into a single token.
        return num_patches // (self.merge_size**2)

    def process_queries(
        self,
        queries: List[Union[str, dict]],
        padding: bool = True,
        truncation: bool = True,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: str = "pt",
        **kwargs,
    ) -> Dict[str, torch.Tensor]:
        """Process text queries for retrieval.

        Args:
            queries: List of query strings or dicts with a "text" key.
            padding: Whether to pad sequences.
            truncation: Whether to truncate sequences (to ``q_max_length``).
            pad_to_multiple_of: Pad to a multiple of this value.
            return_tensors: Return tensor type ("pt" for PyTorch).

        Returns:
            Dictionary with input_ids, attention_mask, and other model inputs.
        """
        query_texts = []
        for query in queries:
            raw_text = query["text"] if isinstance(query, dict) else query

            prefixed = f"{self.query_prefix} {raw_text}" if self.query_prefix else raw_text
            # NOTE(review): with the default query_prefix this renders as
            # "Query: query: <text>" — presumably this matches the training
            # prompt format, but worth confirming.
            message = [
                {
                    "role": "user",
                    "content": [{"type": "text", "text": f"Query: {prefixed}"}],
                }
            ]
            query_texts.append(
                self.apply_chat_template(
                    message, tokenize=False, add_generation_prompt=True
                )
            )

        return self(
            text=query_texts,
            truncation=truncation,
            max_length=self.q_max_length,
            padding=padding,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            **kwargs,
        )

    def process_documents(
        self,
        documents: Union[Dict[str, List], List[Dict[str, Any]]],
        padding: bool = True,
        truncation: bool = True,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: str = "pt",
        **kwargs,
    ) -> Dict[str, torch.Tensor]:
        """Process image-text documents for retrieval.

        The per-document text budget is computed as ``p_max_length`` minus the
        worst-case image token count (over the batch) minus
        ``reserve_tokens_for_images``, so that image placeholders never push a
        sequence past ``p_max_length``.

        Args:
            documents: Either a dict with "image" and "text" keys containing
                parallel lists, or a list of dicts each with "image" and
                "text" keys.
            padding: Whether to pad sequences.
            truncation: Whether to truncate sequences.
            pad_to_multiple_of: Pad to a multiple of this value.
            return_tensors: Return tensor type ("pt" for PyTorch).

        Returns:
            Dictionary with input_ids, attention_mask, pixel_values, and other
            model inputs.

        Raises:
            ValueError: If ``documents`` is neither a dict nor a list.
            AssertionError: If text/image counts differ, or ``p_max_length``
                cannot accommodate the largest image plus the reserve.
        """
        if isinstance(documents, dict):
            images = documents["image"]
            texts = documents["text"]
            assert len(texts) == len(images), (
                "Number of texts must match number of images"
            )
        elif isinstance(documents, list):
            images = [d["image"] for d in documents]
            texts = [d["text"] for d in documents]
        else:
            raise ValueError("documents must be a dict or list of dicts")

        if self.passage_prefix:
            texts = [f"{self.passage_prefix} {t}" for t in texts]

        # Budget text length around the largest image in the batch.
        image_tokens_list = [self.calculate_image_tokens(img) for img in images]
        max_image_tokens = max(image_tokens_list) if image_tokens_list else 0

        assert self.p_max_length > max_image_tokens + self.reserve_tokens_for_images, (
            f"p_max_length ({self.p_max_length}) is too small for max_image_tokens "
            f"({max_image_tokens}) + reserve ({self.reserve_tokens_for_images})"
        )
        available_text_tokens = (
            self.p_max_length - max_image_tokens - self.reserve_tokens_for_images
        )

        # Round the budget down so padded lengths remain a clean multiple.
        if (
            pad_to_multiple_of is not None
            and available_text_tokens % pad_to_multiple_of != 0
        ):
            available_text_tokens = (
                available_text_tokens // pad_to_multiple_of
            ) * pad_to_multiple_of

        input_texts = []
        for text, image in zip(texts, images):
            message = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": image},
                        {"type": "text", "text": text},
                    ],
                }
            ]
            input_text = self.apply_chat_template(
                message, tokenize=False, add_generation_prompt=True
            )
            input_texts.append(input_text)

        return self(
            text=input_texts,
            images=images,
            truncation=truncation,
            padding=padding,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            max_length=available_text_tokens,
            **kwargs,
        )
processor_config.json ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_qwen3_vl_nemotron_embed.Qwen3VLNemotronEmbedProcessor"
4
+ },
5
+ "q_max_length": 512,
6
+ "p_max_length": 4096,
7
+ "query_prefix": "query:",
8
+ "passage_prefix": "passage:",
9
+ "reserve_tokens_for_images": 100,
10
+ "image_processor": {
11
+ "crop_size": null,
12
+ "data_format": "channels_first",
13
+ "device": null,
14
+ "disable_grouping": null,
15
+ "do_center_crop": null,
16
+ "do_convert_rgb": true,
17
+ "do_normalize": true,
18
+ "do_pad": null,
19
+ "do_rescale": true,
20
+ "do_resize": true,
21
+ "image_mean": [
22
+ 0.5,
23
+ 0.5,
24
+ 0.5
25
+ ],
26
+ "image_processor_type": "Qwen2VLImageProcessorFast",
27
+ "image_seq_length": null,
28
+ "image_std": [
29
+ 0.5,
30
+ 0.5,
31
+ 0.5
32
+ ],
33
+ "input_data_format": null,
34
+ "max_pixels": 802816,
35
+ "merge_size": 2,
36
+ "min_pixels": 2352,
37
+ "pad_size": null,
38
+ "patch_size": 16,
39
+ "processor_class": "Qwen3VLProcessor",
40
+ "resample": 3,
41
+ "rescale_factor": 0.00392156862745098,
42
+ "return_tensors": null,
43
+ "size": {
44
+ "longest_edge": 16777216,
45
+ "shortest_edge": 65536
46
+ },
47
+ "temporal_patch_size": 2
48
+ },
49
+ "processor_class": "Qwen3VLNemotronEmbedProcessor",
50
+ "video_processor": {
51
+ "crop_size": null,
52
+ "data_format": "channels_first",
53
+ "default_to_square": true,
54
+ "device": null,
55
+ "do_center_crop": null,
56
+ "do_convert_rgb": true,
57
+ "do_normalize": true,
58
+ "do_pad": null,
59
+ "do_rescale": true,
60
+ "do_resize": true,
61
+ "do_sample_frames": true,
62
+ "fps": 2,
63
+ "image_mean": [
64
+ 0.5,
65
+ 0.5,
66
+ 0.5
67
+ ],
68
+ "image_std": [
69
+ 0.5,
70
+ 0.5,
71
+ 0.5
72
+ ],
73
+ "input_data_format": null,
74
+ "max_frames": 768,
75
+ "merge_size": 2,
76
+ "min_frames": 4,
77
+ "num_frames": null,
78
+ "pad_size": null,
79
+ "patch_size": 16,
80
+ "processor_class": "Qwen3VLProcessor",
81
+ "resample": 3,
82
+ "rescale_factor": 0.00392156862745098,
83
+ "return_metadata": false,
84
+ "return_tensors": null,
85
+ "size": {
86
+ "longest_edge": 25165824,
87
+ "shortest_edge": 4096
88
+ },
89
+ "temporal_patch_size": 2,
90
+ "video_metadata": null,
91
+ "video_processor_type": "Qwen3VLVideoProcessor"
92
+ }
93
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
3
+ size 11422654
tokenizer_config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "additional_special_tokens": null,
5
+ "backend": "tokenizers",
6
+ "bos_token": null,
7
+ "clean_up_tokenization_spaces": false,
8
+ "eos_token": "<|im_end|>",
9
+ "errors": "replace",
10
+ "extra_special_tokens": [
11
+ "<|im_start|>",
12
+ "<|im_end|>",
13
+ "<|object_ref_start|>",
14
+ "<|object_ref_end|>",
15
+ "<|box_start|>",
16
+ "<|box_end|>",
17
+ "<|quad_start|>",
18
+ "<|quad_end|>",
19
+ "<|vision_start|>",
20
+ "<|vision_end|>",
21
+ "<|vision_pad|>",
22
+ "<|image_pad|>",
23
+ "<|video_pad|>"
24
+ ],
25
+ "is_local": true,
26
+ "model_max_length": 262144,
27
+ "pad_token": "<|endoftext|>",
28
+ "processor_class": "Qwen3VLProcessor",
29
+ "split_special_tokens": false,
30
+ "tokenizer_class": "Qwen2Tokenizer",
31
+ "unk_token": null
32
+ }