{ "_name_or_path": "/network/scratch/z/zhixuan.lin/linear-rnn-torch/release/transformer-pro-760m-longcrawl64-48b", "architectures": [ "ForgettingTransformerForCausalLM" ], "attention_bias": false, "bos_token_id": null, "decay_time_max": null, "decay_time_min": null, "elementwise_affine": true, "eos_token_id": null, "fgate_bias_init": false, "fgate_type": "none", "fuse_cross_entropy": true, "fuse_norm": true, "hidden_act": "swish", "hidden_ratio": 3.5, "hidden_size": 1536, "initializer_range": 0.02, "intermediate_size": null, "max_position_embeddings": null, "model_type": "forgetting_transformer-project_fox", "norm_eps": 1e-06, "num_heads": 12, "num_hidden_layers": 24, "num_kv_heads": null, "ogate_act": "sigmoid", "qk_norm": true, "qk_norm_share_param_across_head": true, "rope_base": 500000.0, "tie_word_embeddings": false, "torch_dtype": "float32", "transformers_version": "4.44.0", "use_cache": true, "use_k_shift": true, "use_output_gate": true, "use_output_norm": true, "use_rope": true, "use_v_shift": true, "vocab_size": 50257, "window_size": null }