{
  "architectures": [
    "CohereAsrForConditionalGeneration"
  ],
  "auto_map": {
    "AutoConfig": "configuration_cohere_asr.CohereAsrConfig",
    "AutoFeatureExtractor": "processing_cohere_asr.CohereAsrFeatureExtractor",
    "AutoModel": "modeling_cohere_asr.CohereAsrModel",
    "AutoModelForSpeechSeq2Seq": "modeling_cohere_asr.CohereAsrForConditionalGeneration",
    "AutoProcessor": "processing_cohere_asr.CohereAsrProcessor",
    "AutoTokenizer": "tokenization_cohere_asr.CohereAsrTokenizer"
  },
  "batch_size": 128,
  "decoding": {
    "beam": {
      "beam_size": 1,
      "len_pen": 0.0,
      "max_generation_delta": 50
    },
    "return_best_hypothesis": true,
    "strategy": "beam"
  },
  "encoder": {
    "att_context_size": [
      -1,
      -1
    ],
    "causal_downsampling": false,
    "conv_context_size": null,
    "conv_kernel_size": 9,
    "conv_norm_type": "batch_norm",
    "d_model": 1280,
    "dropout": 0,
    "dropout_att": 0,
    "dropout_emb": 0,
    "dropout_pre_encoder": 0,
    "feat_in": 128,
    "feat_out": -1,
    "ff_expansion_factor": 4,
    "n_heads": 8,
    "n_layers": 48,
    "pos_emb_max_len": 5000,
    "reduction": null,
    "reduction_factor": 1,
    "reduction_position": null,
    "self_attention_model": "rel_pos",
    "subsampling": "dw_striding",
    "subsampling_conv_channels": 256,
    "subsampling_factor": 8,
    "untie_biases": true,
    "xscaling": false
  },
  "head": {
    "activation": "relu",
    "dropout": 0,
    "hidden_size": 1024,
    "log_softmax": true,
    "num_classes": 16384,
    "num_layers": 1,
    "use_transformer_init": true
  },
  "is_encoder_decoder": true,
  "log_batch_stats": false,
  "log_prediction": true,
  "max_audio_clip_s": 35,
  "max_seq_len": 1024,
  "model_defaults": {
    "asr_enc_hidden": 1280,
    "lm_dec_hidden": 1024,
    "lm_enc_hidden": 1024
  },
  "model_type": "cohere_asr",
  "multitask_metrics_cfg": {
    "log_predictions": true,
    "metrics": {
      "wer": {
        "constraint": ".source_lang==.target_lang"
      }
    }
  },
  "overlap_chunk_second": 5,
  "preprocessor": {
    "dither": 1e-05,
    "features": 128,
    "frame_splicing": 1,
    "log": true,
    "n_fft": 512,
    "normalize": "per_feature",
    "pad_to": 0,
    "pad_value": 0.0,
    "sample_rate": 16000,
    "window": "hann",
    "window_size": 0.025,
    "window_stride": 0.01
  },
  "prompt_defaults": [
    {
      "role": "user",
      "slots": {
        "decodercontext": "",
        "diarize": "<|nodiarize|>",
        "emotion": "<|emo:undefined|>",
        "itn": "<|noitn|>",
        "pnc": "<|pnc|>",
        "source_lang": "<|en|>",
        "target_lang": "<|en|>",
        "timestamp": "<|notimestamp|>"
      }
    },
    {
      "role": "user_partial",
      "slots": {
        "decodercontext": ""
      }
    }
  ],
  "prompt_format": "cohere_asr",
  "sample_rate": 16000,
  "supported_languages": [
    "en",
    "fr",
    "de",
    "es",
    "it",
    "pt",
    "nl",
    "pl",
    "el",
    "ar",
    "ja",
    "zh",
    "vi",
    "ko"
  ],
  "transf_decoder": {
    "config_dict": {
      "attn_layer_dropout": 0,
      "attn_score_dropout": 0,
      "embedding_dropout": 0,
      "ffn_dropout": 0,
      "hidden_act": "relu",
      "hidden_size": 1024,
      "inner_size": 4096,
      "learn_positional_encodings": false,
      "lm_dec_hidden": 1280,
      "max_sequence_length": 1024,
      "num_attention_heads": 8,
      "num_layers": 8,
      "num_token_types": 0,
      "pre_ln": true,
      "vocab_size": "None"
    },
    "encoder": null,
    "model_name": null,
    "pre_ln_final_layer_norm": true,
    "pretrained": false
  },
  "transf_encoder": {
    "attn_layer_dropout": 0,
    "attn_score_dropout": 0,
    "ffn_dropout": 0,
    "hidden_size": 1024,
    "inner_size": 4096,
    "mask_future": false,
    "num_attention_heads": 8,
    "num_layers": 0,
    "pre_ln": true,
    "pre_ln_final_layer_norm": true
  },
  "use_loss_mask_for_prompt": false,
  "vocab_size": 16384
}