{
  "architectures": [
    "CohereAsrForConditionalGeneration"
  ],
  "auto_map": {
    "AutoConfig": "configuration_cohere_asr.CohereAsrConfig",
    "AutoFeatureExtractor": "processing_cohere_asr.CohereAsrFeatureExtractor",
    "AutoModel": "modeling_cohere_asr.CohereAsrModel",
    "AutoModelForSpeechSeq2Seq": "modeling_cohere_asr.CohereAsrForConditionalGeneration",
    "AutoProcessor": "processing_cohere_asr.CohereAsrProcessor",
    "AutoTokenizer": "tokenization_cohere_asr.CohereAsrTokenizer"
  },
  "batch_size": 128,
  "decoding": {
    "beam": {
      "beam_size": 1,
      "len_pen": 0.0,
      "max_generation_delta": 50
    },
    "return_best_hypothesis": true,
    "strategy": "beam"
  },
  "encoder": {
    "att_context_size": [
      -1,
      -1
    ],
    "causal_downsampling": false,
    "conv_context_size": null,
    "conv_kernel_size": 9,
    "conv_norm_type": "batch_norm",
    "d_model": 1280,
    "dropout": 0,
    "dropout_att": 0,
    "dropout_emb": 0,
    "dropout_pre_encoder": 0,
    "feat_in": 128,
    "feat_out": -1,
    "ff_expansion_factor": 4,
    "n_heads": 8,
    "n_layers": 48,
    "pos_emb_max_len": 5000,
    "reduction": null,
    "reduction_factor": 1,
    "reduction_position": null,
    "self_attention_model": "rel_pos",
    "subsampling": "dw_striding",
    "subsampling_conv_channels": 256,
    "subsampling_factor": 8,
    "untie_biases": true,
    "xscaling": false
  },
  "head": {
    "activation": "relu",
    "dropout": 0,
    "hidden_size": 1024,
    "log_softmax": true,
    "num_classes": 16384,
    "num_layers": 1,
    "use_transformer_init": true
  },
  "is_encoder_decoder": true,
  "log_batch_stats": false,
  "log_prediction": true,
  "max_audio_clip_s": 35,
  "max_seq_len": 1024,
  "model_defaults": {
    "asr_enc_hidden": 1280,
    "lm_dec_hidden": 1024,
    "lm_enc_hidden": 1024
  },
  "model_type": "cohere_asr",
  "multitask_metrics_cfg": {
    "log_predictions": true,
    "metrics": {
      "wer": {
        "constraint": ".source_lang==.target_lang"
      }
    }
  },
  "overlap_chunk_second": 5,
  "preprocessor": {
    "dither": 1e-05,
    "features": 128,
    "frame_splicing": 1,
    "log": true,
    "n_fft": 512,
    "normalize": "per_feature",
    "pad_to": 0,
    "pad_value": 0.0,
    "sample_rate": 16000,
    "window": "hann",
    "window_size": 0.025,
    "window_stride": 0.01
  },
  "prompt_defaults": [
    {
      "role": "user",
      "slots": {
        "decodercontext": "",
        "diarize": "<|nodiarize|>",
        "emotion": "<|emo:undefined|>",
        "itn": "<|noitn|>",
        "pnc": "<|pnc|>",
        "source_lang": "<|en|>",
        "target_lang": "<|en|>",
        "timestamp": "<|notimestamp|>"
      }
    },
    {
      "role": "user_partial",
      "slots": {
        "decodercontext": ""
      }
    }
  ],
  "prompt_format": "cohere_asr",
  "sample_rate": 16000,
  "supported_languages": [
    "en",
    "fr",
    "de",
    "es",
    "it",
    "pt",
    "nl",
    "pl",
    "el",
    "ar",
    "ja",
    "zh",
    "vi",
    "ko"
  ],
  "transf_decoder": {
    "config_dict": {
      "attn_layer_dropout": 0,
      "attn_score_dropout": 0,
      "embedding_dropout": 0,
      "ffn_dropout": 0,
      "hidden_act": "relu",
      "hidden_size": 1024,
      "inner_size": 4096,
      "learn_positional_encodings": false,
      "lm_dec_hidden": 1280,
      "max_sequence_length": 1024,
      "num_attention_heads": 8,
      "num_layers": 8,
      "num_token_types": 0,
      "pre_ln": true,
      "vocab_size": null
    },
    "encoder": null,
    "model_name": null,
    "pre_ln_final_layer_norm": true,
    "pretrained": false
  },
  "transf_encoder": {
    "attn_layer_dropout": 0,
    "attn_score_dropout": 0,
    "ffn_dropout": 0,
    "hidden_size": 1024,
    "inner_size": 4096,
    "mask_future": false,
    "num_attention_heads": 8,
    "num_layers": 0,
    "pre_ln": true,
    "pre_ln_final_layer_norm": true
  },
  "use_loss_mask_for_prompt": false,
  "vocab_size": 16384
}