# Configuration bindings in gin-config syntax:
#   `Name.param = value` binds a parameter, `%name` references a macro defined
#   under "Macros", and `@name` references another configurable.
# One binding per line; a trailing backslash continues a long value on the
# next line.

# Macros:
# ==============================================================================
batch_size = 384
new_freq = 24000

# Parameters for AllMPNetBaseV2:
# ==============================================================================
AllMPNetBaseV2.model_id = 'sentence-transformers/all-mpnet-base-v2'

# Parameters for AudioDataModule:
# ==============================================================================
AudioDataModule.num_workers = 0

# Parameters for AudioDataset:
# ==============================================================================
AudioDataset.half_precision = True
AudioDataset.mono = True
# Literal copy of the value held by the %new_freq macro; keep the two in sync.
AudioDataset.new_freq = 24000
AudioDataset.num_frames = 480000
AudioDataset.orig_freq = 16000

# Parameters for build_dev_datamodule:
# ==============================================================================
build_dev_datamodule.datamodule = @discotube_text_audio

# Parameters for build_module:
# ==============================================================================
build_module.ckpt_path = \
    '/home/jovyan/shared/palonso/data/logs/mtg-text-audio/dt86c2jx/checkpoints/epoch=287-step=148032.ckpt'
build_module.module = @modules.maskingmodel.MaskingModel
build_module.net = @nets.conformer.Conformer
build_module.representation = \
    [@nets.melspectrogram.MelSpectrogram,
     @nets.waveform.Waveform,
     @nets.cqt.CQT,
     @nets.encodec.EnCodec]

# Parameters for CLAP:
# ==============================================================================
CLAP.aggregation_type = 'mean'
CLAP.audio_encoder = @OMARRQ
CLAP.loss_type = 'info_nce_multimodal'
CLAP.lr = 0.0001
CLAP.n_pool_att_heads = 8
CLAP.proj_size = 512
CLAP.seed = 0
CLAP.temp = 0.1
CLAP.text_encoder = @AllMPNetBaseV2
CLAP.tokenizers_parallelism = False
CLAP.train_audio_encoder = True
CLAP.train_text_encoder = False
CLAP.weight_decay = 0.01

# Parameters for ConcatTextAudioDataModule:
# ==============================================================================
ConcatTextAudioDataModule.batch_size = %batch_size
ConcatTextAudioDataModule.datamodules = \
    [@DiscotubeTextAudioCleanDataModule,
     @MSDTextAudioDataModule,
     @FreesoundTextAudioDataModule,
     @PSETextAudioDataModule]
ConcatTextAudioDataModule.num_workers = 30
# Four ratios for the four datamodules listed above (they sum to 1.0);
# presumably matched to the datamodules by position -- TODO confirm.
ConcatTextAudioDataModule.ratios = [0.55, 0.3, 0.11, 0.04]

# Parameters for Conformer:
# ==============================================================================
Conformer.alpha_deepnorm = 2.21
Conformer.beta_deepnorm = 0.0026
Conformer.conv_kernel_size = 5
Conformer.depth = 12
Conformer.dropout = 0.2
Conformer.embed_dim = 512
Conformer.input_dropout = 0.0
Conformer.mlp_ratio = 4.0
Conformer.mlp_residual_factor = 4.0
Conformer.num_heads = 8
Conformer.num_patches = None
Conformer.use_deepnorm = True
Conformer.use_rope = True

# Parameters for CosineAnnealingCallback:
# ==============================================================================
CosineAnnealingCallback.eta_min = 1e-07
CosineAnnealingCallback.warmup_steps = 15000

# Parameters for CQT:
# ==============================================================================
CQT.bins_per_octave = 24
CQT.f_min = 32.703
CQT.hop_len = 320
CQT.logC = True
CQT.magnitude = True
CQT.n_bins = 188
CQT.norm_mean = 4.754879065310596
CQT.norm_std = 1.9055732535255916
CQT.patch_size = (188, 4)
CQT.power = 2
CQT.sr = %new_freq

# Parameters for def_module:
# ==============================================================================
def_module.module = @clap

# Parameters for DiscotubeTextAudioCleanDataModule:
# ==============================================================================
DiscotubeTextAudioCleanDataModule.batch_size = %batch_size
DiscotubeTextAudioCleanDataModule.data_dir = '/'
DiscotubeTextAudioCleanDataModule.filelist_train = \
    '/home/jovyan/shared/palonso/data/discotube/metadata/mmap_ids_train'
DiscotubeTextAudioCleanDataModule.filelist_val = \
    '/home/jovyan/shared/palonso/data/discotube/metadata/mmap_ids_val'
DiscotubeTextAudioCleanDataModule.max_sentences = 3
DiscotubeTextAudioCleanDataModule.num_workers = 30
DiscotubeTextAudioCleanDataModule.text_file = \
    '/home/jovyan/shared/palonso/data/discotube/metadata/Qwen_Qwen2.5-32B__chatgpt_v2__t0.5__1.1.jsonl'

# Parameters for DiscotubeTextAudioCleanDataset:
# ==============================================================================
DiscotubeTextAudioCleanDataset.num_frames = 160000

# Parameters for EnCodec:
# ==============================================================================
EnCodec.norm_type = 'global'
EnCodec.orig_sr = %new_freq
EnCodec.patch_size = (128, 4)
EnCodec.stats_path = None
# NOTE(review): this is the only path under /gpfs/scratch; every other path in
# this file is under /home/jovyan/shared -- confirm it is reachable from the
# environment this config runs in.
EnCodec.weights_path = '/gpfs/scratch/upf97/model_weights/encodec_24khz/'

# Parameters for FreesoundTextAudioDataModule:
# ==============================================================================
FreesoundTextAudioDataModule.batch_size = %batch_size
FreesoundTextAudioDataModule.data_dir = \
    '/home/jovyan/shared/palonso/data/freesound/mmaps/'
FreesoundTextAudioDataModule.description_prob = 0.2
FreesoundTextAudioDataModule.filelist_train = \
    '/home/jovyan/shared/palonso/data/freesound/filelist_full_train_mmap.txt'
FreesoundTextAudioDataModule.filelist_val = \
    '/home/jovyan/shared/palonso/data/freesound/filelist_full_val_mmap.txt'
FreesoundTextAudioDataModule.num_workers = 30
FreesoundTextAudioDataModule.text_file = \
    '/home/jovyan/shared/palonso/data/freesound/freesound_metadata.jsonl'

# Parameters for FreesoundTextAudioDataset:
# ==============================================================================
FreesoundTextAudioDataset.num_frames = 160000

# Parameters for MaskingModel:
# ==============================================================================
MaskingModel.codebook_dim = 16
# NOTE(review): 8196 is not a power of two (8192 is) -- confirm the value is
# intentional and matches the checkpoint in build_module.ckpt_path before
# changing it.
MaskingModel.codebook_size = 8196
MaskingModel.diff_input = False
MaskingModel.input_representation = @nets.waveform.Waveform
MaskingModel.lr = 0.0001
MaskingModel.mask_prob = 0.6
MaskingModel.mask_seconds = 0.4
MaskingModel.num_codebooks = 1
MaskingModel.plot_tokens = False
MaskingModel.seed = 0
MaskingModel.weight_decay = 0.01

# Parameters for MelSpectrogram:
# ==============================================================================
MelSpectrogram.freq_mask_param = 0
MelSpectrogram.hop_len = 320
MelSpectrogram.mel_scale = 'slaney'
MelSpectrogram.n_mel = 96
MelSpectrogram.norm = 'slaney'
MelSpectrogram.norm_mean = 2.06755686098554
MelSpectrogram.norm_std = 1.268292820667291
MelSpectrogram.patch_size = (96, 4)
MelSpectrogram.power = 2
MelSpectrogram.sr = %new_freq
MelSpectrogram.stretch_factor = 1
MelSpectrogram.time_mask_param = 0
MelSpectrogram.win_len = 512

# Parameters for MSDTextAudioDataModule:
# ==============================================================================
MSDTextAudioDataModule.batch_size = %batch_size
MSDTextAudioDataModule.data_dir = '/home/jovyan/shared/palonso/data/msd/mmaps/'
MSDTextAudioDataModule.description_prob = 0.5
MSDTextAudioDataModule.filelist_train = \
    '/home/jovyan/shared/palonso/data/msd/filelist_train_mmap.txt'
MSDTextAudioDataModule.filelist_val = \
    '/home/jovyan/shared/palonso/data/msd/filelist_val_mmap.txt'
MSDTextAudioDataModule.num_workers = 30

# Parameters for MSDTextAudioDataset:
# ==============================================================================
MSDTextAudioDataset.num_frames = 160000

# Parameters for OMARRQ:
# ==============================================================================
OMARRQ.model_id = 'mtg-upf/clap_omarrq_mp_small_music'

# Parameters for PSETextAudioDataModule:
# ==============================================================================
PSETextAudioDataModule.batch_size = %batch_size
PSETextAudioDataModule.data_dir = \
    '/home/jovyan/shared/palonso/data/pse/mmaps/pse_data_december_2024/data/'
PSETextAudioDataModule.filelist_train = \
    '/home/jovyan/shared/palonso/data/pse/filelist_train.txt'
PSETextAudioDataModule.filelist_val = \
    '/home/jovyan/shared/palonso/data/pse/filelist_val.txt'
PSETextAudioDataModule.num_workers = 30

# Parameters for PSETextAudioDataset:
# ==============================================================================
PSETextAudioDataset.num_frames = 160000

# Parameters for train:
# ==============================================================================
train.ckpt_save_every_n_epochs = 8
train.datamodule = @concat_text_audio
train.hf_ckpt = 'mtg-upf/clap_omarrq_mp_small_music'
train.params = \
    {'accelerator': 'gpu',
     'check_val_every_n_epoch': 8,
     'devices': 8,
     'log_every_n_steps': 50,
     'max_steps': 150000,
     'num_nodes': 1,
     'num_sanity_val_steps': -1,
     'precision': 'bf16-mixed',
     'strategy': 'ddp_find_unused_parameters_true'}
# NOTE(review): the run name below ends in 'lr_5e-6' while CLAP.lr is 0.0001 --
# confirm which of the two reflects the intended learning rate.
train.wandb_params = \
    {'entity': 'mtg-upf',
     'group': 'vanilla_clap',
     'name': 'config_clap_mpnet_base_v2_ssl_mp_10s_small_clap_dt_msd_fs_pse_lr_5e-6',
     'offline': False,
     'project': 'mtg-text-audio',
     'save_dir': '/home/jovyan/shared/palonso/data/logs'}

# Parameters for Waveform:
# ==============================================================================
Waveform.norm_mean = None
Waveform.norm_std = None
Waveform.patch_size = (1, 1280)
Waveform.sr = %new_freq