# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
# How to use:
#
# 1. copy this file locally
# 2. (optional) uncomment limit_samples in the config file to run with 10 samples for quick testing
# 3. export your HF token in the terminal (some benchmark datasets might be gated)
# 4. run `nemo-evaluator-launcher run --config path/to/local_qwen3-30b-a3b-base.yaml`
#
# ⚠️ WARNING:
# Always run full evaluations (without limit_samples) for actual benchmark results.
# Using a subset of samples is solely for testing configuration and setup.
# Results from such test runs should NEVER be used to compare models or
# report benchmark performance.
# Evaluation run configuration for Qwen/Qwen3-30B-A3B-Base: a local executor,
# a vLLM deployment of the model, and the list of benchmark tasks to run.

# Config groups composed into this file; `_self_` is listed last so the values
# written below are applied after the selected group defaults.
# NOTE(review): assumes Hydra-style defaults-list semantics — confirm.
defaults:
  - execution: local
  - deployment: vllm
  - _self_

execution:
  output_dir: Qwen3-30B-A3B-Base
  # mode: sequential  # enables sequential execution

# specify deployment arguments
deployment:
  image: vllm/vllm-openai:v0.11.0
  checkpoint_path: null
  hf_model_handle: Qwen/Qwen3-30B-A3B-Base
  served_model_name: Qwen/Qwen3-30B-A3B-Base
  tensor_parallel_size: 1
  data_parallel_size: 1

# specify the benchmarks to evaluate
evaluation:
  # NOTE(review): presumably maps the variable name exposed to the evaluation
  # onto the host environment variable holding the token — confirm.
  env_vars:
    HF_TOKEN: HF_TOKEN
  nemo_evaluator_config:  # global config settings that apply to all tasks
    config:
      params:
        max_retries: 5  # number of retries for API requests
        request_timeout: 360  # timeout for API requests in seconds
        parallelism: 4  # number of parallel requests
        # limit_samples: 10  # uncomment to limit number of samples for quick testing
        extra:
          tokenizer: Qwen/Qwen3-30B-A3B-Base
          tokenizer_backend: huggingface
  tasks:
    - name: adlr_mmlu_pro_5_shot_base
    - name: adlr_mmlu
    - name: adlr_agieval_en_cot
    - name: adlr_humaneval_greedy
    - name: adlr_mbpp_sanitized_3_shot_greedy
    - name: adlr_gsm8k_cot_8_shot
    - name: adlr_minerva_math_nemo_4_shot
    - name: adlr_math_500_4_shot_sampled
    - name: adlr_arc_challenge_llama_25_shot
    - name: hellaswag
    - name: openbookqa
    - name: piqa
    - name: adlr_race
    - name: adlr_winogrande_5_shot
    - name: adlr_global_mmlu_lite_5_shot
    - name: adlr_mgsm_native_cot_8_shot