---
# Deepfake Audio Detection — Central Configuration
# All hyperparameters, paths, and settings live here.
# Notebooks and scripts read from this file rather than hardcoding values.
#
# Layout: 2-space block indentation, one key per line. Inline comments are
# documentation only; loaders drop them, so never store data in them.

project:
  name: "deepfake-audio-detection"
  seed: 42

# --- Paths (Colab + Drive) ---
# Every sub-path below repeats drive_root as a literal prefix; if drive_root
# changes, update the other entries to match.
paths:
  drive_root: "/content/drive/MyDrive/deepfake_audio"
  data_raw: "/content/drive/MyDrive/deepfake_audio/data/raw"
  data_processed: "/content/drive/MyDrive/deepfake_audio/data/processed"
  checkpoints: "/content/drive/MyDrive/deepfake_audio/checkpoints"
  logs: "/content/drive/MyDrive/deepfake_audio/logs"

# --- Audio preprocessing ---
audio:
  sample_rate: 16000  # Hz, must match Wav2Vec 2.0 pretraining
  channels: 1  # mono
  window_seconds: 4.0  # length of each training segment
  overlap_ratio: 0.5  # 50% overlap between consecutive windows
  pad_short_clips: true  # zero-pad clips shorter than window_seconds

# --- Model ---
model:
  base: "facebook/wav2vec2-base"  # 95M params, 12 transformer layers
  num_classes: 2  # bona fide vs spoof
  pooling: "mean"  # mean | max | attentive

# --- Training: Stage 1 (frozen backbone) ---
stage1:
  freeze_backbone: true
  # Keep the decimal point and the signed exponent: some YAML 1.1 loaders
  # only resolve this exact form to a float and read e.g. `1e-3` as a string.
  learning_rate: 1.0e-3
  batch_size: 32
  epochs: 5
  optimizer: "adamw"
  weight_decay: 0.01
  early_stopping_patience: 3

# --- Training: Stage 2 (partial fine-tuning) ---
stage2:
  freeze_backbone: false
  unfreeze_top_n_layers: 2  # of 12 transformer layers
  learning_rate: 1.0e-5  # 100x smaller than stage 1
  batch_size: 16  # smaller — full backprop through some layers
  epochs: 12
  optimizer: "adamw"
  weight_decay: 0.01
  warmup_ratio: 0.1
  scheduler: "linear"
  early_stopping_patience: 3
  mixed_precision: true

# --- Loss ---
loss:
  type: "weighted_cross_entropy"
  # Weights computed from class frequencies: 2580 bona fide vs 22800 spoof
  # weight = total / (n_classes * count) → roughly [4.92, 0.56]

# --- Augmentation (Stage 2 only) ---
augmentation:
  enabled: true
  noise_injection:
    enabled: true
    snr_range_db: [10, 30]
  rir_convolution:
    enabled: false  # dropped per cut priority

# --- Evaluation ---
evaluation:
  metrics: ["eer", "t_dcf", "auc_roc"]
  aggregation_across_windows: "mean"  # mean | median of per-window probs

# --- Datasets ---
datasets:
  asvspoof_2019_la:
    role: "primary"
    download_url: "https://datashare.ed.ac.uk/handle/10283/3336"
  asvspoof_2021_la:
    role: "secondary"
    download_url: "https://www.asvspoof.org/index2021.html"
  wavefake:
    role: "supplementary"
    sample_per_vocoder: 1500
    bona_fide_source: "ljspeech"

# --- Logging ---
wandb:
  project: "deepfake-audio-detection"
  entity: "sara-jaffrani17-dlp"
  log_freq: 10  # log every N training steps