Ctrl+K
- bimamba_stage_a
- bimamba_stage_a_128
- bimamba_stage_a_40k
- bimamba_stage_b
- bimamba_stage_b_joint_1024_40k
- bimamba_stage_b_joint_1024_40k_fixed
- bimamba_stage_b_joint_ckpt_40k
- bimamba_stage_b_joint_ckpt_40k_fixed
- bimamba_stage_b_joint_ckpt_40k_fixed_v2
- bimamba_stage_b_light_xattn_from_stage_a_256_retrain
- checkpoints
- dememwm_full_berzelius_8a100_bs16_global128_350k
- dememwm_full_berzelius_8a100_bs16_global128_350k_gate_off
- dememwm_sep_full_seq_base_8a100_bs16_global128_350k
- dememwm_sep_full_seq_gate_none_8a100_bs16_global128_350k
- dememwm_sep_full_seq_gate_none_nocomp_a2_8a100_bs16_global128_350k
- dememwm_sep_full_seq_gate_none_snr_shift_lr8e5_8a100_bs4_global32_750k
- dememwm_sep_full_seq_gate_none_snr_shift_ntok8_8a100_bs4_global32_750k
- dememwm_sep_full_seq_gate_none_snr_shift_plucker_mem_8a100_bs4_global32_750k
- dememwm_sep_full_seq_gate_none_snr_shift_rev1_8a100_bs4_global32_750k
- dememwm_sep_full_seq_gate_none_snr_shift_rev1cap_8a100_bs4_global32_750k
- dememwm_separate_memory_berzelius_8a100_bs16_global128_350k
- dememwm_separate_memory_full_hidden1024_all_layers_berzelius_8a100_bs16_global128_350k
- hierarchy_bimamba_stage_a_40k
- hierarchy_bimamba_stage_b_joint
- hierarchy_bimamba_stage_b_joint_fixed
- train_dememwm_sep_mem_fulldit_mem_base_a2d4r2_route_ditlr4e5_ntok8_8a100_bs8_global64_350k
- train_dememwm_sep_mem_fulldit_mem_base_a2d4r2_route_ntok8_8a100_bs8_global64_350k
- train_dememwm_sep_mem_fulldit_mem_base_a2d4r2_route_ntok8_8h200_bs16_global128_175k
- train_dememwm_sep_mem_fulldit_mem_no_anchor_d4r2_route_ntok8_8a100_bs8_global64_350k
- train_dememwm_sep_mem_fulldit_mem_no_dynamic_a2r2_route_ntok8_8a100_bs8_global64_350k
- train_dememwm_sep_mem_fulldit_mem_no_noise_route_a2d4r2_ntok8_8a100_bs8_global64_350k
- train_dememwm_sep_mem_fulldit_mem_no_noise_route_a2d4r2_ntok8_8h200_bs16_global128_175k
- train_dememwm_sep_mem_fulldit_mem_no_noise_route_no_dynamic_a2r2_ntok8_8h200_bs16_global128_175k
- train_dememwm_sep_mem_fulldit_mem_route_anchor_high_dynamic_revisit_low_a2d4r2_ntok8_8h200_bs16_global128_175k
- train_dememwm_separate_memory_gatefix_4layers512_valid500_berzelius_8a100_bs16_global128_350k
- train_dememwm_separate_memory_gatefix_alllayers1024_valid500_berzelius_8a100_bs16_global128_350k
- videos
- wandb
- wandb_128
- wandb_256
- 640 kB
- 72.2 kB
- 12.1 GB xet
- 7.4 MB
- 11.1 MB xet
- 15 MB xet
- 9.09 MB
- 8.99 MB
- 167 kB