# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
# How to use:
#
# 1. copy this file locally
# 2. (optional) uncomment limit_samples in the config file to run with 10 samples for quick testing
# 3. export your Hugging Face token as HF_TOKEN in the terminal (some benchmark datasets might be gated)
# 4. run `nemo-evaluator-launcher run --config path/to/local_qwen3-30b-a3b-base.yaml`
#
# ⚠️  WARNING:
#     Always run full evaluations (without limit_samples) for actual benchmark results.
#     Using a subset of samples is solely for testing configuration and setup.
#     Results from such test runs should NEVER be used to compare models or
#     report benchmark performance.
# Config groups composed into this file.
# NOTE(review): the `defaults` list with a `_self_` entry looks like a Hydra
# defaults list — confirm that the launcher resolves it with Hydra.
defaults:
  - execution: local  # selects the `local` option of the `execution` group
  - deployment: vllm  # selects the `vllm` option of the `deployment` group
  # `_self_` stands for this file; it is listed last, which under Hydra rules
  # lets the values below override the selected groups.
  - _self_

# Overrides for the `execution` group selected in `defaults`.
execution:
  # Output directory for this run (named after the evaluated model).
  output_dir: 'Qwen3-30B-A3B-Base'
  # mode: sequential  # enables sequential execution

# Deployment arguments for the `deployment` group selected in `defaults`.
deployment:
  # Container image, pinned to an explicit version tag.
  image: 'vllm/vllm-openai:v0.11.0'
  # No local checkpoint is set here.
  # NOTE(review): presumably the model is then resolved from hf_model_handle — confirm.
  checkpoint_path: null
  hf_model_handle: 'Qwen/Qwen3-30B-A3B-Base'
  served_model_name: 'Qwen/Qwen3-30B-A3B-Base'
  # Parallelism settings; both are left at 1.
  tensor_parallel_size: 1
  data_parallel_size: 1

# specify the benchmarks to evaluate
evaluation:
  # NOTE(review): presumably each entry maps a variable name made available to
  # the evaluation onto the name of the host environment variable that holds
  # its value — confirm against the launcher documentation.
  env_vars:
    HF_TOKEN: HF_TOKEN
  nemo_evaluator_config:  # global config settings that apply to all tasks
    config:
      params:
        max_retries: 5  # number of retries for API requests
        request_timeout: 360  # timeout for API requests in seconds
        parallelism: 4  # number of parallel requests
        # limit_samples: 10 # uncomment to limit number of samples for quick testing
        extra:
          # same identifier as deployment.hf_model_handle
          tokenizer: Qwen/Qwen3-30B-A3B-Base
          tokenizer_backend: huggingface
  # Benchmarks to evaluate. The group headings below are inferred from the
  # task names only and carry no meaning for the launcher.
  tasks:
    # general knowledge and exams
    - name: adlr_mmlu_pro_5_shot_base
    - name: adlr_mmlu
    - name: adlr_agieval_en_cot
    # code generation
    - name: adlr_humaneval_greedy
    - name: adlr_mbpp_sanitized_3_shot_greedy
    # math
    - name: adlr_gsm8k_cot_8_shot
    - name: adlr_minerva_math_nemo_4_shot
    - name: adlr_math_500_4_shot_sampled
    # commonsense and reading comprehension
    - name: adlr_arc_challenge_llama_25_shot
    - name: hellaswag
    - name: openbookqa
    - name: piqa
    - name: adlr_race
    - name: adlr_winogrande_5_shot
    # multilingual
    - name: adlr_global_mmlu_lite_5_shot
    - name: adlr_mgsm_native_cot_8_shot