{ "model": { "type": "nemotron_speech", "vocab_size": 1025, "context_length": 1, "speech": { "num_mels": 128, "fft_size": 512, "hop_length": 160, "win_length": 400, "preemph": 0.97, "log_eps": 5.96046448e-08, "subsampling_factor": 8, "left_context": 70, "conv_context": 8, "pre_encode_cache_size": 9, "sample_rate": 16000, "chunk_samples": 8960, "blank_id": 1024, "max_symbols_per_step": 10, "enc_in_length": "length", "enc_in_cache_channel": "cache_last_channel", "enc_in_cache_time": "cache_last_time", "enc_in_cache_channel_len": "cache_last_channel_len", "enc_out_length": "encoded_lengths", "enc_out_cache_channel": "cache_last_channel_next", "enc_out_cache_time": "cache_last_time_next", "enc_out_cache_channel_len": "cache_last_channel_next_len" }, "encoder": { "filename": "encoder.int8.onnx", "hidden_size": 1024, "num_hidden_layers": 24, "inputs": { "audio_features": "audio_signal" }, "outputs": { "encoder_outputs": "outputs" } }, "decoder": { "filename": "decoder.int8.onnx", "hidden_size": 640, "num_hidden_layers": 2, "inputs": { "targets": "targets", "target_length": "target_length", "states_1": "states.1", "states_2": "onnx::Slice_3" }, "outputs": { "outputs": "outputs", "prednet_lengths": "prednet_lengths", "states_1": "states", "states_2": "162" } }, "joiner": { "filename": "joiner.int8.onnx", "inputs": { "encoder_outputs": "encoder_outputs", "decoder_outputs": "decoder_outputs" }, "outputs": { "logits": "outputs" } } }, "search": { "max_length": 1024, "num_beams": 1 } }