# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0. """ pip install "mteb>=2.6.5, <3.0.0" python3 mteb2_eval.py --model_name nvidia/nemotron-colembed-vl-4b-v2 --batch_size 16 --benchmark "ViDoRe(v3)" --task-list Vidore3ComputerScienceRetrieval """ from __future__ import annotations import argparse import os import mteb def main(): parser = argparse.ArgumentParser() parser.add_argument("--model_name", type=str, required=True) parser.add_argument("--batch_size", type=int, default=16, required=False) parser.add_argument( "--results_folder", type=str, default="results_csv", required=False ) parser.add_argument("--predictions_folder", type=str, default=None, required=False) parser.add_argument( "--benchmark", type=str, required=False, default="ViDoRe(v3)", choices=[ "ViDoRe(v3)", # Vidore V3 "VisualDocumentRetrieval", # Vidore V1 & V2 ], ) parser.add_argument( "--task-list", type=str, nargs="+", # Accept one or more space-separated string arguments default=None, # Default to None if the argument is not provided help="Optional: A list of task class names to run. If not provided, all tasks will be run.", ) args = parser.parse_args() print(f"Loading model: {args.model_name}") model = mteb.get_model_meta(args.model_name) # Loads all benchmark tasks all_tasks = mteb.get_benchmark(args.benchmark).tasks all_tasks_names = " ".join([task.__class__.__name__ for task in all_tasks]) print(f"Available tasks in benchmark {args.benchmark}: {all_tasks_names}") # filter tasks if args.task_list: # If user provided a list, filter all_tasks print(f"Running evaluation on specified tasks: {args.task_list}") requested_task_names = set(args.task_list) tasks = [ task for task in all_tasks if task.__class__.__name__ in requested_task_names ] # Optional: Warn if a requested task was not found found_names = {t.__class__.__name__ for t in tasks} missing = requested_task_names - found_names if missing: print( f"Warning: The following requested tasks were not found and will be skipped: {missing}" ) else: # If --task-list was not provided, use all tasks print("Running evaluation on all available tasks.") tasks = all_tasks tasks_names = " ".join([task.__class__.__name__ for task in tasks]) print(f"Evaluating tasks: {tasks_names}") results = mteb.evaluate( model=model, tasks=tasks, encode_kwargs={ "batch_size": args.batch_size, }, prediction_folder=args.predictions_folder, overwrite_strategy="always", ) print(results) print(f"Saving results to {args.results_folder}") os.makedirs(args.results_folder, exist_ok=True) model_name = args.model_name.replace("/", "_") output_path = os.path.join( args.results_folder, f"{model_name}-{tasks_names.replace(' ', '-')}.csv" ) df = results.to_dataframe() df.to_csv(output_path, index=False) if __name__ == "__main__": main()