{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "gpuType": "T4", "authorship_tag": "ABX9TyM2MNKf+Ku7E4aMujn8tFCo" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU" }, "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 107 }, "id": "ynFcJbOMDD-1", "executionInfo": { "status": "ok", "timestamp": 1777765038196, "user_tz": 420, "elapsed": 5378, "user": { "displayName": "Sara Jaffrani", "userId": "07677779715251349607" } }, "outputId": "43250049-951e-488c-bbae-710e135c076a" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Re-uploading kaggle.json (find it in your Downloads folder)...\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "\n", " \n", " \n", " Upload widget is only available when the cell has been executed in the\n", " current browser session. Please rerun this cell to enable.\n", " \n", " " ] }, "metadata": {} }, { "output_type": "stream", "name": "stdout", "text": [ "Saving kaggle.json to kaggle.json\n", "kaggle.json configured.\n" ] } ], "source": [ "from google.colab import files\n", "import os, shutil\n", "\n", "print(\"Re-uploading kaggle.json (find it in your Downloads folder)...\")\n", "uploaded = files.upload()\n", "\n", "os.makedirs('/root/.kaggle', exist_ok=True)\n", "shutil.move('kaggle.json', '/root/.kaggle/kaggle.json')\n", "os.chmod('/root/.kaggle/kaggle.json', 0o600)\n", "print(\"kaggle.json configured.\")" ] }, { "cell_type": "code", "source": [ "\"\"\"\n", "SESSION BOOTSTRAP \u2014 Phase 4 (Stage 2)\n", "Restores: Drive, repo, dataset, Python imports, wandb auth.\n", "\"\"\"\n", "import os, sys, time\n", "\n", "print(\"=\" * 60)\n", "print(\"Step 1/4: Mount Drive\")\n", "print(\"=\" * 60)\n", "DRIVE_ROOT = '/content/drive/MyDrive'\n", "if not os.path.exists(DRIVE_ROOT):\n", " from google.colab import drive\n", " drive.mount('/content/drive')\n", "print(\"Drive mounted.\\n\")\n", "\n", "os.makedirs('/content/drive/MyDrive/deepfake_audio/checkpoints', exist_ok=True)\n", "os.makedirs('/content/drive/MyDrive/deepfake_audio/logs', exist_ok=True)\n", "\n", "print(\"=\" * 60)\n", "print(\"Step 2/4: Clone/update repo\")\n", "print(\"=\" * 60)\n", "REPO_DIR = '/content/deepfake-audio-detection'\n", "if not os.path.exists(REPO_DIR):\n", " !git clone https://github.com/Saracasm/deepfake-audio-detection.git {REPO_DIR}\n", "else:\n", " !cd {REPO_DIR} && git pull --quiet\n", "print(f\"Repo at: {REPO_DIR}\\n\")\n", "\n", "print(\"=\" * 60)\n", "print(\"Step 3/4: Re-download dataset (~3-5 min)\")\n", "print(\"=\" * 60)\n", "LOCAL_LA = '/content/kaggle_download/LA'\n", "\n", "if os.path.exists(LOCAL_LA):\n", " print(\"Dataset already present.\")\n", "else:\n", " if not os.path.exists('/root/.kaggle/kaggle.json'):\n", " print(\"ERROR: kaggle.json not configured.\")\n", " print(\"Run the kaggle.json upload cell BEFORE this bootstrap.\")\n", " raise SystemExit(\"Need kaggle credentials\")\n", "\n", " !pip install -q kaggle\n", " os.makedirs('/content/kaggle_download', exist_ok=True)\n", " start = time.time()\n", " !kaggle datasets download -d anishsarkar22/asvpoof-2019-dataset-la \\\n", " -p /content/kaggle_download --unzip --force --quiet\n", " print(f\"Downloaded in {(time.time()-start)/60:.1f} minutes.\")\n", "\n", "print(f\"Dataset at: {LOCAL_LA}\\n\")\n", "\n", "print(\"=\" * 30)\n", "print(\"Step 4/4: Set up Python imports + wandb\")\n", "print(\"=\" * 30)\n", "sys.path.insert(0, REPO_DIR)\n", "LA_ROOT = LOCAL_LA\n", "\n", "# Wandb key from Colab Secrets (so we don't hit interactive prompt)\n", "try:\n", " from google.colab import userdata\n", " os.environ['WANDB_API_KEY'] = userdata.get('WANDB_API_KEY')\n", " print(\"Wandb API key loaded from Colab Secrets.\")\n", "except Exception as e:\n", " print(f\"WANDB_API_KEY not loaded: {e}\")\n", "\n", "print(f\"\\nLA_ROOT = {LA_ROOT}\")\n", "print(f\"REPO_DIR = {REPO_DIR}\")\n", "print(\"\\nBootstrap complete. Ready to work.\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "SkYfpBWcDi87", "executionInfo": { "status": "ok", "timestamp": 1777766107989, "user_tz": 420, "elapsed": 1384, "user": { "displayName": "Sara Jaffrani", "userId": "07677779715251349607" } }, "outputId": "9164c2ab-63da-4e52-b340-951649835210" }, "execution_count": 3, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "============================================================\n", "Step 1/4: Mount Drive\n", "============================================================\n", "Drive mounted.\n", "\n", "============================================================\n", "Step 2/4: Clone/update repo\n", "============================================================\n", "Repo at: /content/deepfake-audio-detection\n", "\n", "============================================================\n", "Step 3/4: Re-download dataset (~3-5 min)\n", "============================================================\n", "Dataset already present.\n", "Dataset at: /content/kaggle_download/LA\n", "\n", "==============================\n", "Step 4/4: Set up Python imports + wandb\n", "==============================\n", "Wandb API key loaded from Colab Secrets.\n", "\n", "LA_ROOT = /content/kaggle_download/LA\n", "REPO_DIR = /content/deepfake-audio-detection\n", "\n", "Bootstrap complete. Ready to work.\n" ] } ] }, { "cell_type": "code", "source": [ "# Search Kaggle for ASVspoof 2021 LA datasets\n", "print(\"Searching Kaggle for ASVspoof 2021 LA mirrors...\\n\")\n", "!kaggle datasets list -s \"asvspoof 2021\" --max-size 20000000000 2>&1 | head -30" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "AFVFbqnjHt5c", "executionInfo": { "status": "ok", "timestamp": 1777766166833, "user_tz": 420, "elapsed": 823, "user": { "displayName": "Sara Jaffrani", "userId": "07677779715251349607" } }, "outputId": "f7abf981-e798-4835-9e75-53dfa57d685c" }, "execution_count": 4, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Searching Kaggle for ASVspoof 2021 LA mirrors...\n", "\n", "ref title size lastUpdated downloadCount voteCount usabilityRating \n", "----------------------------------------------------- ----------------------------------- ----------- -------------------------- ------------- --------- --------------- \n", "abdallamohamed312/ready-to-input-for-training Balanced ASVspoof 2021 PA 5211923567 2024-06-01 05:38:52.957000 477 8 0.875 \n", "abdallamohamed312/asv-2021-pa-pt12-into-real-and-fake ASV 2021 (PA) Part 1,2 14176529532 2024-05-31 20:10:39.510000 147 7 0.6875 \n", "abdallamohamed312/asv-2021-pa-part-3 ASV 2021 (PA) part 3 7090016509 2024-05-31 23:59:09.450000 82 7 0.6875 \n", "chandajha04/asvspoof-2021 asvspoof-2021 4225 2025-11-08 13:54:33.450000 2 0 0.25 \n", "pratikjodgudri/asvspoof2021-df-audio-dataset ASVspoof2021_DF_Audio_Dataset 1279232433 2024-11-13 07:45:36.147000 195 4 0.375 \n", "eminkorkut/deepfakevoice-wac2vec-4datasets DeepFakeVoice-Wac2Vec-4Datasets 3309490475 2026-03-08 14:21:29.550000 12 1 0.7058824 \n", "eminkorkut/deepfakevoice-hubert-4datasets DeepFakeVoice-HuBERT-4Datasets 3456502600 2026-03-08 13:51:10.900000 2 1 0.7058824 \n", "eminkorkut/deepfakevoice-mfcc-4datasets DeepFakeVoice-MFCC-4Datasets 180800358 2026-03-08 14:27:46.507000 18 1 0.7058824 \n", "eminkorkut/deepfakevoice-google-hear-4datasets DeepFakeVoice-google-HeAR-4Datasets 4577806640 2026-03-08 15:04:23.780000 8 1 0.75 \n", "flarescen/asvspoof-2021-real-samples ASVSpoof 2021 real samples 964302088 2025-03-21 14:58:22.087000 5 0 0.125 \n" ] } ] }, { "cell_type": "code", "source": [ "print(\"Trying alternate search terms for 2021 LA...\\n\")\n", "print(\"--- Search 1: 'asvspoof2021 la' ---\")\n", "!kaggle datasets list -s \"asvspoof2021 la\" --max-size 20000000000 2>&1 | head -15\n", "\n", "print(\"\\n--- Search 2: 'asvspoof la 2021' ---\")\n", "!kaggle datasets list -s \"asvspoof la 2021\" --max-size 20000000000 2>&1 | head -15\n", "\n", "print(\"\\n--- Search 3: 'spoof 2021 logical access' ---\")\n", "!kaggle datasets list -s \"spoof 2021 logical access\" --max-size 20000000000 2>&1 | head -15\n", "\n", "print(\"\\n--- Search 4: 'asvspoof2021_la' (with underscore) ---\")\n", "!kaggle datasets list -s \"asvspoof2021_la\" --max-size 20000000000 2>&1 | head -15" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Mwk_PFN5IInP", "executionInfo": { "status": "ok", "timestamp": 1777766277028, "user_tz": 420, "elapsed": 2571, "user": { "displayName": "Sara Jaffrani", "userId": "07677779715251349607" } }, "outputId": "372eacb2-7369-4e37-e224-6250a80fce9a" }, "execution_count": 5, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Trying alternate search terms for 2021 LA...\n", "\n", "--- Search 1: 'asvspoof2021 la' ---\n", "ref title size lastUpdated downloadCount voteCount usabilityRating \n", "------------------------------- -------------------- ---------- -------------------------- ------------- --------- --------------- \n", "simontrann/asvspoof2021-la-key ASVSpoof2021_LA_Key 21237220 2025-10-09 08:22:47.333000 8 0 0.25 \n", "simontrann/asvspoof2021-la-eval ASVSpoof2021_LA_eval 7782355226 2025-10-03 02:33:41.490000 6 0 0.25 \n", "ajaysuryal/asvspoof2021-la ASVspoof2021_LA 7788165738 2025-05-22 01:57:12.537000 15 0 0.125 \n", "\n", "--- Search 2: 'asvspoof la 2021' ---\n", "ref title size lastUpdated downloadCount voteCount usabilityRating \n", "---------------------------------------------- ----------------------------------- ---------- -------------------------- ------------- --------- --------------- \n", "eminkorkut/deepfakevoice-wac2vec-4datasets DeepFakeVoice-Wac2Vec-4Datasets 3309490475 2026-03-08 14:21:29.550000 12 1 0.7058824 \n", "eminkorkut/deepfakevoice-mfcc-4datasets DeepFakeVoice-MFCC-4Datasets 180800358 2026-03-08 14:27:46.507000 18 1 0.7058824 \n", "eminkorkut/deepfakevoice-google-hear-4datasets DeepFakeVoice-google-HeAR-4Datasets 4577806640 2026-03-08 15:04:23.780000 8 1 0.75 \n", "eminkorkut/deepfakevoice-hubert-4datasets DeepFakeVoice-HuBERT-4Datasets 3456502600 2026-03-08 13:51:10.900000 2 1 0.7058824 \n", "\n", "--- Search 3: 'spoof 2021 logical access' ---\n", "No datasets found\n", "\n", "--- Search 4: 'asvspoof2021_la' (with underscore) ---\n", "ref title size lastUpdated downloadCount voteCount usabilityRating \n", "-------------------------- --------------- ---------- -------------------------- ------------- --------- --------------- \n", "ajaysuryal/asvspoof2021-la ASVspoof2021_LA 7788165738 2025-05-22 01:57:12.537000 15 0 0.125 \n" ] } ] }, { "cell_type": "code", "source": [ "print(\"Listing files in ajaysuryal/asvspoof2021-la...\\n\")\n", "!kaggle datasets files ajaysuryal/asvspoof2021-la 2>&1 | head -40" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "D5IStA6MIX5J", "executionInfo": { "status": "ok", "timestamp": 1777766337119, "user_tz": 420, "elapsed": 512, "user": { "displayName": "Sara Jaffrani", "userId": "07677779715251349607" } }, "outputId": "020efd4a-7b03-4c6a-f9da-d05e0cad885b" }, "execution_count": 6, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Listing files in ajaysuryal/asvspoof2021-la...\n", "\n", "Next Page Token = CfDJ8OqP5ZkTT9ZGj66XXRbxXyB45tjLm0BH9BDVPb3HnDuRXyp2J5C_xa5hhO_njOW_Qq6MqV4DD3FGJMSDjOinZcsGb3GB-LfTUWLikR7LpgTOmE56Pf8UG1X2Ece9gD53AgaY9SjdAvr42YBpHBQKVnezhTIy8VZJog5ZjECDb5R5-wV5vawtn-x-6UVDlqqStPcziA\n", "name size creationDate \n", "-------------------------------------------------------------------- ------- -------------------------- \n", "ASVspoof2021_LA/ASVspoof2021_LA_eval/ASVspoof2021.LA.cm.eval.trl.txt 2360358 2025-05-22 02:03:30.763000 \n", "ASVspoof2021_LA/ASVspoof2021_LA_eval/LICENSE.txt 19941 2025-05-22 01:58:39.159000 \n", "ASVspoof2021_LA/ASVspoof2021_LA_eval/README.LA.txt 2233 2025-05-22 02:03:30.737000 \n", "ASVspoof2021_LA/ASVspoof2021_LA_eval/flac/LA_E_1000048.flac 44411 2025-05-22 02:03:26.628000 \n", "ASVspoof2021_LA/ASVspoof2021_LA_eval/flac/LA_E_1000166.flac 160644 2025-05-22 02:00:13.297000 \n", "ASVspoof2021_LA/ASVspoof2021_LA_eval/flac/LA_E_1000174.flac 32835 2025-05-22 01:59:13.205000 \n", "ASVspoof2021_LA/ASVspoof2021_LA_eval/flac/LA_E_1000200.flac 31011 2025-05-22 01:58:46.589000 \n", "ASVspoof2021_LA/ASVspoof2021_LA_eval/flac/LA_E_1000345.flac 56980 2025-05-22 02:03:23.753000 \n", "ASVspoof2021_LA/ASVspoof2021_LA_eval/flac/LA_E_1000349.flac 56097 2025-05-22 02:01:09.378000 \n", "ASVspoof2021_LA/ASVspoof2021_LA_eval/flac/LA_E_1000381.flac 19968 2025-05-22 02:00:18.967000 \n", "ASVspoof2021_LA/ASVspoof2021_LA_eval/flac/LA_E_1000450.flac 40938 2025-05-22 02:02:19.194000 \n", "ASVspoof2021_LA/ASVspoof2021_LA_eval/flac/LA_E_1000479.flac 32567 2025-05-22 02:00:18.892000 \n", "ASVspoof2021_LA/ASVspoof2021_LA_eval/flac/LA_E_1000580.flac 32551 2025-05-22 02:02:50.197000 \n", "ASVspoof2021_LA/ASVspoof2021_LA_eval/flac/LA_E_1000624.flac 43105 2025-05-22 02:00:58.778000 \n", "ASVspoof2021_LA/ASVspoof2021_LA_eval/flac/LA_E_1000636.flac 64684 2025-05-22 01:59:18.951000 \n", "ASVspoof2021_LA/ASVspoof2021_LA_eval/flac/LA_E_1000657.flac 45448 2025-05-22 02:02:33.897000 \n", "ASVspoof2021_LA/ASVspoof2021_LA_eval/flac/LA_E_1000795.flac 28359 2025-05-22 02:03:09.419000 \n", "ASVspoof2021_LA/ASVspoof2021_LA_eval/flac/LA_E_1000854.flac 72252 2025-05-22 02:01:52.340000 \n", "ASVspoof2021_LA/ASVspoof2021_LA_eval/flac/LA_E_1000931.flac 44475 2025-05-22 02:00:18.632000 \n", "ASVspoof2021_LA/ASVspoof2021_LA_eval/flac/LA_E_1000986.flac 25770 2025-05-22 01:59:11.650000 \n" ] } ] }, { "cell_type": "code", "source": [ "import os, time\n", "\n", "DOWNLOAD_DIR = '/content/kaggle_2021'\n", "os.makedirs(DOWNLOAD_DIR, exist_ok=True)\n", "\n", "print(\"Downloading ASVspoof 2021 LA from Kaggle...\")\n", "print(\"Expected: ~7.8 GB, ~5-8 minutes\\n\")\n", "\n", "start = time.time()\n", "!kaggle datasets download -d ajaysuryal/asvspoof2021-la -p {DOWNLOAD_DIR} --unzip --force --quiet\n", "elapsed_min = (time.time() - start) / 60\n", "print(f\"\\nDownload complete in {elapsed_min:.1f} minutes.\")\n", "\n", "# Verify structure\n", "LA_2021_ROOT = f'{DOWNLOAD_DIR}/ASVspoof2021_LA/ASVspoof2021_LA_eval'\n", "print(f\"\\nDataset root: {LA_2021_ROOT}\")\n", "print(f\"Exists: {os.path.exists(LA_2021_ROOT)}\")\n", "if os.path.exists(LA_2021_ROOT):\n", " print(f\"Top-level contents: {sorted(os.listdir(LA_2021_ROOT))}\")\n", "\n", "# Count flac files\n", "import glob\n", "flac_dir = f'{LA_2021_ROOT}/flac'\n", "if os.path.exists(flac_dir):\n", " n_flac = len(glob.glob(f'{flac_dir}/*.flac'))\n", " print(f\"\\nFlac files: {n_flac:,}\")\n", "else:\n", " print(f\"\\nFlac dir not found at {flac_dir}\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "R2xdl-B5Iuh3", "executionInfo": { "status": "ok", "timestamp": 1777766605454, "user_tz": 420, "elapsed": 176068, "user": { "displayName": "Sara Jaffrani", "userId": "07677779715251349607" } }, "outputId": "78064c33-63e8-4d33-c972-e13e79f99e8a" }, "execution_count": 7, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Downloading ASVspoof 2021 LA from Kaggle...\n", "Expected: ~7.8 GB, ~5-8 minutes\n", "\n", "Dataset URL: https://www.kaggle.com/datasets/ajaysuryal/asvspoof2021-la\n", "License(s): unknown\n", "\n", "Download complete in 2.9 minutes.\n", "\n", "Dataset root: /content/kaggle_2021/ASVspoof2021_LA/ASVspoof2021_LA_eval\n", "Exists: True\n", "Top-level contents: ['ASVspoof2021.LA.cm.eval.trl.txt', 'LICENSE.txt', 'README.LA.txt', 'flac']\n", "\n", "Flac files: 181,566\n" ] } ] }, { "cell_type": "code", "source": [ "LA_2021_ROOT = '/content/kaggle_2021/ASVspoof2021_LA/ASVspoof2021_LA_eval'\n", "\n", "# Read the README first\n", "print(\"=\" * 70)\n", "print(\"README.LA.txt contents:\")\n", "print(\"=\" * 70)\n", "with open(f'{LA_2021_ROOT}/README.LA.txt') as f:\n", " print(f.read())" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Dbb0pBBUKEPU", "executionInfo": { "status": "ok", "timestamp": 1777766781057, "user_tz": 420, "elapsed": 15, "user": { "displayName": "Sara Jaffrani", "userId": "07677779715251349607" } }, "outputId": "1bd97085-b1ed-44d6-b49b-b5e24803e700" }, "execution_count": 8, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "======================================================================\n", "README.LA.txt contents:\n", "======================================================================\n", "=====================================================================================================\n", "\n", "ASVspoof 2021 Challenge - Logical Access Databas\n", "\n", "Copyright (c) 2021 \n", "\n", "National Institute of Informatics, Japan\n", "EURECOM, France\n", "Inria, France \n", "University of Eastern Finland, Finland\n", "Institute for Infocomm Research, Singapore\n", "\n", "=====================================================================================================\n", "\n", "\n", "1. Directory Structure\n", "_______________________\n", "\n", " ASVspoof2021_LA_eval/\n", " ASVspoof2021.LA.cm.eval.trl.txt list of evaluation data \n", " flac/ audio files\n", " README.LA.txt this file \n", " LICENSE.txt license file \n", "\n", "\n", "2. Audio file format\n", "_________________________________\n", "\n", " All ASVspoof2021_LA_eval audio files are distributed in flac format. \n", " All audio data is sampled at a rate of 16 kHz and stored in 16-bit.\n", "\n", " 3. Further details \n", "______________________\n", "\n", " Further details are available via the ASVspoof website (https://www.asvspoof.org)\n", "\n", "\n", " 4. Copying\n", "______________________\n", "\n", "This dataset is licensed under the Open Data Commons Attribution License (ODC-By). \n", "\n", "Regarding the Open Data Commons Attribution License (ODC-By), please see LICENSE.txt or \n", "https://opendatacommons.org/licenses/by/1.0/index.html\n", " \n", "THIS DATABASE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND \n", "ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED \n", "WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. \n", "IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, \n", "INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, \n", "BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, \n", "OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, \n", "WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) \n", "ARISING IN ANY WAY OUT OF THE USE OF THIS DATABASE, EVEN IF ADVISED OF THE \n", "POSSIBILITY OF SUCH DAMAGE\n", "\n", "5. Acknowledgements \n", "______________________\n", "\n", "A part of this database is based on the ASVspoof 2019 database (https://doi.org/10.7488/ds/2555). \n", " \n", " \n", "\n" ] } ] }, { "cell_type": "code", "source": [ "import os\n", "\n", "LA_2021_ROOT = '/content/kaggle_2021/ASVspoof2021_LA/ASVspoof2021_LA_eval'\n", "PROTO_PATH = f'{LA_2021_ROOT}/ASVspoof2021.LA.cm.eval.trl.txt'\n", "\n", "print(f\"Protocol file: {PROTO_PATH}\")\n", "print(f\"Size: {os.path.getsize(PROTO_PATH) / 1024:.1f} KB\\n\")\n", "\n", "print(\"First 10 lines:\")\n", "print(\"-\" * 70)\n", "with open(PROTO_PATH) as f:\n", " for i, line in enumerate(f):\n", " if i >= 10:\n", " break\n", " print(repr(line))\n", "print(\"-\" * 70)\n", "\n", "with open(PROTO_PATH) as f:\n", " n_lines = sum(1 for _ in f)\n", "print(f\"\\nTotal lines: {n_lines:,}\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "DL6EaOl4KQVT", "executionInfo": { "status": "ok", "timestamp": 1777766830927, "user_tz": 420, "elapsed": 28, "user": { "displayName": "Sara Jaffrani", "userId": "07677779715251349607" } }, "outputId": "d9bb5ce0-c15f-48bc-a913-2139d0c867e6" }, "execution_count": 9, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Protocol file: /content/kaggle_2021/ASVspoof2021_LA/ASVspoof2021_LA_eval/ASVspoof2021.LA.cm.eval.trl.txt\n", "Size: 2305.0 KB\n", "\n", "First 10 lines:\n", "----------------------------------------------------------------------\n", "'LA_E_9332881\\n'\n", "'LA_E_6866159\\n'\n", "'LA_E_5464494\\n'\n", "'LA_E_4759417\\n'\n", "'LA_E_2667748\\n'\n", "'LA_E_8589971\\n'\n", "'LA_E_1911364\\n'\n", "'LA_E_5298786\\n'\n", "'LA_E_2042719\\n'\n", "'LA_E_5449181\\n'\n", "----------------------------------------------------------------------\n", "\n", "Total lines: 181,566\n" ] } ] }, { "cell_type": "code", "source": [ "import os, time\n", "\n", "KEY_DIR = '/content/kaggle_2021_key'\n", "os.makedirs(KEY_DIR, exist_ok=True)\n", "\n", "print(\"Downloading ASVspoof 2021 LA key file (~21 MB)...\\n\")\n", "start = time.time()\n", "!kaggle datasets download -d simontrann/asvspoof2021-la-key -p {KEY_DIR} --unzip --force --quiet\n", "print(f\"Downloaded in {(time.time()-start):.1f} seconds.\\n\")\n", "\n", "# Inspect what's inside\n", "print(\"Contents of key download:\")\n", "for root, dirs, files in os.walk(KEY_DIR):\n", " level = root.replace(KEY_DIR, '').count('/')\n", " indent = ' ' * level\n", " print(f\"{indent}{os.path.basename(root) or 'root'}/\")\n", " for f in files:\n", " size = os.path.getsize(os.path.join(root, f))\n", " size_str = f\"{size/1e6:.1f} MB\" if size > 1e6 else f\"{size/1024:.1f} KB\"\n", " print(f\"{indent} - {f} ({size_str})\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "9Ny0ulDtKaAP", "executionInfo": { "status": "ok", "timestamp": 1777766872373, "user_tz": 420, "elapsed": 2155, "user": { "displayName": "Sara Jaffrani", "userId": "07677779715251349607" } }, "outputId": "e074a55a-7fbe-4aa7-87fc-2eb7dd3f341a" }, "execution_count": 10, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Downloading ASVspoof 2021 LA key file (~21 MB)...\n", "\n", "Dataset URL: https://www.kaggle.com/datasets/simontrann/asvspoof2021-la-key\n", "License(s): apache-2.0\n", "Downloaded in 2.1 seconds.\n", "\n", "Contents of key download:\n", "kaggle_2021_key/\n", " keys/\n", " LA/\n", " - LA-C012-eval.npy (19.5 KB)\n", " - README.txt (0.7 KB)\n", " - LA-C012-prog.npy (14.0 KB)\n", " - LA-C012-hidden.npy (19.4 KB)\n", " CM/\n", " - trial_metadata.txt (10.2 MB)\n", " LFCC-LCNN/\n", " - score.txt (4.2 MB)\n", " CQCC-GMM/\n", " - score.txt (4.0 MB)\n", " RawNet2/\n", " - score.txt (5.7 MB)\n", " LFCC-GMM/\n", " - score.txt (4.0 MB)\n", " ASV/\n", " - trial_metadata.txt (65.8 MB)\n", " ASVTorch_Kaldi/\n", " - score.txt (40.9 MB)\n" ] } ] }, { "cell_type": "code", "source": [ "KEY_FILE = '/content/kaggle_2021_key/keys/LA/CM/trial_metadata.txt'\n", "\n", "print(f\"Key file: {KEY_FILE}\")\n", "print(f\"Size: {os.path.getsize(KEY_FILE) / 1e6:.2f} MB\\n\")\n", "\n", "print(\"First 10 lines:\")\n", "print(\"-\" * 70)\n", "with open(KEY_FILE) as f:\n", " for i, line in enumerate(f):\n", " if i >= 10:\n", " break\n", " print(repr(line))\n", "print(\"-\" * 70)\n", "\n", "with open(KEY_FILE) as f:\n", " n_lines = sum(1 for _ in f)\n", "print(f\"\\nTotal lines: {n_lines:,}\")\n", "\n", "# Also check the README\n", "print(\"\\nREADME contents:\")\n", "print(\"-\" * 70)\n", "with open('/content/kaggle_2021_key/keys/LA/README.txt') as f:\n", " print(f.read())" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "KmJ212hNKnIS", "executionInfo": { "status": "ok", "timestamp": 1777766924288, "user_tz": 420, "elapsed": 9, "user": { "displayName": "Sara Jaffrani", "userId": "07677779715251349607" } }, "outputId": "358394d8-58ac-4492-8121-19043f823d75" }, "execution_count": 11, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Key file: /content/kaggle_2021_key/keys/LA/CM/trial_metadata.txt\n", "Size: 10.16 MB\n", "\n", "First 10 lines:\n", "----------------------------------------------------------------------\n", "'LA_0009 LA_E_9332881 alaw ita_tx A07 spoof notrim eval\\n'\n", "'LA_0009 LA_E_6866159 alaw ita_tx A07 spoof notrim eval\\n'\n", "'LA_0009 LA_E_5464494 alaw sin_tx A07 spoof notrim eval\\n'\n", "'LA_0009 LA_E_4759417 alaw sin_tx A07 spoof notrim eval\\n'\n", "'LA_0009 LA_E_2667748 alaw loc_tx A07 spoof notrim eval\\n'\n", "'LA_0009 LA_E_8589971 alaw loc_tx A07 spoof notrim progress\\n'\n", "'LA_0009 LA_E_1911364 alaw loc_tx A07 spoof notrim progress\\n'\n", "'LA_0009 LA_E_5298786 alaw loc_tx A07 spoof notrim progress\\n'\n", "'LA_0009 LA_E_2042719 ulaw ita_tx A07 spoof notrim eval\\n'\n", "'LA_0009 LA_E_5449181 ulaw ita_tx A07 spoof notrim eval\\n'\n", "----------------------------------------------------------------------\n", "\n", "Total lines: 181,566\n", "\n", "README contents:\n", "----------------------------------------------------------------------\n", "\n", "================================\n", "ASVspoof2021 key and meta label\n", "================================\n", "\n", "This folder contains keys & meta data for ASVspoof2021 evaluation data.\n", "\n", "./\n", "|- CM \n", "| |- trial_metadata.txt CM protocol with keys and meta labels\n", "| |- LFCC-GMM \n", "| | |- score.txt Baseline LFCC-GMM CM score\n", "| |- ...\n", "|\n", "|- ASV (optional)\n", "| |- trial_metadata.txt ASV protocl with keys and meta labels\n", "| |- ASVtorch_kaldi \n", "| |- score.txt Baseline ASV score\n", "|\n", "|- *-C012-*.npy (optional) Pre-computed C012 cofficients\n", "\n", "__author__ = \"ASVspoof consortium\"\n", "__copyright__ = \"Copyright 2022, ASVspoof consortium\"\n", "\n" ] } ] }, { "cell_type": "markdown", "source": [ "Schema decoded\n", "Each line has 8 space-separated fields:\n", "\n", "LA_0009 LA_E_9332881 alaw ita_tx A07 spoof notrim eval\n", "\n", "\u2193 \u2193 \u2193 \u2193 \u2193 \u2193 \u2193 \u2193\n", "\n", "speaker utterance_id codec channel attack label trim partition\n" ], "metadata": { "id": "SeLFqHvXK3_A" } }, { "cell_type": "code", "source": [ "PROTOCOLS_2021_PY = '''\"\"\"\n", "ASVspoof 2021 LA protocol parser.\n", "\n", "Format (8 space-separated columns):\n", " speaker_id utterance_id codec channel attack_id label trim partition\n", "\n", " speaker_id : anonymized speaker\n", " utterance_id : filename without extension (e.g., \"LA_E_9332881\")\n", " codec : audio codec applied (alaw, ulaw, g722, mp3, pcm, ...)\n", " channel : transmission channel (ita_tx, sin_tx, loc_tx, ...)\n", " attack_id : \"-\" for bonafide, \"A07\"-\"A19\" for spoof\n", " label : \"bonafide\" or \"spoof\"\n", " trim : \"trim\" or \"notrim\"\n", " partition : \"eval\", \"progress\", or \"hidden\"\n", "\"\"\"\n", "\n", "from dataclasses import dataclass\n", "from typing import List, Optional\n", "import os\n", "\n", "\n", "@dataclass\n", "class Utterance2021:\n", " \"\"\"One row from an ASVspoof 2021 LA cm protocol file.\"\"\"\n", " speaker_id: str\n", " utterance_id: str\n", " codec: str\n", " channel: str\n", " attack_id: str\n", " label: str\n", " label_int: int\n", " trim: str\n", " partition: str\n", " flac_path: str\n", "\n", "\n", "def parse_protocol_2021(\n", " protocol_path: str,\n", " audio_root: str,\n", " partition_filter: Optional[str] = \"eval\",\n", ") -> List[Utterance2021]:\n", " \"\"\"Parse the 2021 LA cm protocol with keys.\n", "\n", " Args:\n", " protocol_path: full path to trial_metadata.txt\n", " audio_root: full path to the flac/ folder\n", " partition_filter: only return rows matching this partition.\n", " Valid: \"eval\", \"progress\", \"hidden\", or None for all.\n", "\n", " Returns:\n", " List of Utterance2021 objects.\n", " \"\"\"\n", " utterances: List[Utterance2021] = []\n", " with open(protocol_path, \"r\") as f:\n", " for line in f:\n", " parts = line.strip().split()\n", " if len(parts) != 8:\n", " continue\n", " speaker_id, utt_id, codec, channel, attack_id, label, trim, partition = parts\n", "\n", " if partition_filter is not None and partition != partition_filter:\n", " continue\n", "\n", " label_int = 0 if label == \"bonafide\" else 1\n", " flac_path = os.path.join(audio_root, f\"{utt_id}.flac\")\n", "\n", " utterances.append(Utterance2021(\n", " speaker_id=speaker_id,\n", " utterance_id=utt_id,\n", " codec=codec,\n", " channel=channel,\n", " attack_id=attack_id,\n", " label=label,\n", " label_int=label_int,\n", " trim=trim,\n", " partition=partition,\n", " flac_path=flac_path,\n", " ))\n", " return utterances\n", "'''\n", "\n", "PATH = '/content/deepfake-audio-detection/src/data/protocols_2021.py'\n", "import os\n", "os.makedirs(os.path.dirname(PATH), exist_ok=True)\n", "\n", "with open(PATH, 'w') as f:\n", " f.write(PROTOCOLS_2021_PY)\n", "print(f\"Wrote {PATH} ({len(PROTOCOLS_2021_PY)} bytes)\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ViHz0PbOK41Y", "executionInfo": { "status": "ok", "timestamp": 1777767150761, "user_tz": 420, "elapsed": 58, "user": { "displayName": "Sara Jaffrani", "userId": "07677779715251349607" } }, "outputId": "c1b65b6a-23a7-43c4-bb8e-6b9a24a3e72d" }, "execution_count": 12, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Wrote /content/deepfake-audio-detection/src/data/protocols_2021.py (2415 bytes)\n" ] } ] }, { "cell_type": "code", "source": [ "import sys, importlib\n", "\n", "# Reload in case\n", "sys.path.insert(0, '/content/deepfake-audio-detection')\n", "if 'src.data.protocols_2021' in sys.modules:\n", " importlib.reload(sys.modules['src.data.protocols_2021'])\n", "from src.data.protocols_2021 import parse_protocol_2021\n", "\n", "KEY_FILE = '/content/kaggle_2021_key/keys/LA/CM/trial_metadata.txt'\n", "AUDIO_ROOT = '/content/kaggle_2021/ASVspoof2021_LA/ASVspoof2021_LA_eval/flac'\n", "\n", "# Parse the eval partition only\n", "print(\"Parsing 2021 LA protocol (eval partition)...\")\n", "utts_eval = parse_protocol_2021(KEY_FILE, AUDIO_ROOT, partition_filter=\"eval\")\n", "print(f\"Eval utterances: {len(utts_eval):,}\\n\")\n", "\n", "# Class distribution\n", "from collections import Counter\n", "labels = Counter(u.label for u in utts_eval)\n", "print(f\"Class distribution:\")\n", "for k, v in labels.most_common():\n", " print(f\" {k}: {v:,}\")\n", "ratio = labels['spoof'] / labels['bonafide'] if labels['bonafide'] > 0 else 0\n", "print(f\" Imbalance: 1 bonafide : {ratio:.1f} spoof\\n\")\n", "\n", "# Codec distribution\n", "codecs = Counter(u.codec for u in utts_eval)\n", "print(f\"Codec distribution:\")\n", "for k, v in codecs.most_common():\n", " print(f\" {k}: {v:,}\")\n", "\n", "# Channel distribution\n", "channels = Counter(u.channel for u in utts_eval)\n", "print(f\"\\nChannel distribution:\")\n", "for k, v in channels.most_common():\n", " print(f\" {k}: {v:,}\")\n", "\n", "# Attack distribution\n", "attacks = Counter(u.attack_id for u in utts_eval)\n", "print(f\"\\nAttack distribution:\")\n", "for k, v in sorted(attacks.items()):\n", " label = \"bonafide\" if k == \"-\" else \"spoof\"\n", " print(f\" {k}: {v:>6,} ({label})\")\n", "\n", "# Sanity check: verify a few audio files actually exist\n", "import os\n", "print(f\"\\nAudio file existence check (first 5):\")\n", "for u in utts_eval[:5]:\n", " exists = os.path.exists(u.flac_path)\n", " status = \"OK\" if exists else \"MISSING\"\n", " print(f\" [{status}] {u.utterance_id}.flac codec={u.codec} attack={u.attack_id}\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "FFTdQ8dlLpTn", "executionInfo": { "status": "ok", "timestamp": 1777767195756, "user_tz": 420, "elapsed": 512, "user": { "displayName": "Sara Jaffrani", "userId": "07677779715251349607" } }, "outputId": "7f455c66-aae3-4c23-8499-446bf4d3ca30" }, "execution_count": 13, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Parsing 2021 LA protocol (eval partition)...\n", "Eval utterances: 148,176\n", "\n", "Class distribution:\n", " spoof: 133,360\n", " bonafide: 14,816\n", " Imbalance: 1 bonafide : 9.0 spoof\n", "\n", "Codec distribution:\n", " ulaw: 23,520\n", " gsm: 23,520\n", " opus: 23,520\n", " alaw: 19,436\n", " none: 19,421\n", " pstn: 19,384\n", " g722: 19,375\n", "\n", "Channel distribution:\n", " loc_tx: 62,425\n", " ita_tx: 23,508\n", " sin_tx: 23,438\n", " -: 19,421\n", " mad_tx: 19,384\n", "\n", "Attack distribution:\n", " A07: 10,238 (spoof)\n", " A08: 10,368 (spoof)\n", " A09: 10,152 (spoof)\n", " A10: 10,318 (spoof)\n", " A11: 10,276 (spoof)\n", " A12: 10,259 (spoof)\n", " A13: 10,302 (spoof)\n", " A14: 10,234 (spoof)\n", " A15: 10,235 (spoof)\n", " A16: 10,390 (spoof)\n", " A17: 10,239 (spoof)\n", " A18: 10,148 (spoof)\n", " A19: 10,201 (spoof)\n", " bonafide: 14,816 (spoof)\n", "\n", "Audio file existence check (first 5):\n", " [OK] LA_E_9332881.flac codec=alaw attack=A07\n", " [OK] LA_E_6866159.flac codec=alaw attack=A07\n", " [OK] LA_E_5464494.flac codec=alaw attack=A07\n", " [OK] LA_E_4759417.flac codec=alaw attack=A07\n", " [OK] LA_E_2667748.flac codec=alaw attack=A07\n" ] } ] }, { "cell_type": "code", "source": [ "import torch\n", "import torchaudio\n", "from tqdm import tqdm\n", "\n", "# Reload modules in case\n", "for mod in ['src.data.preprocessing', 'src.data.dataset',\n", " 'src.models.wav2vec_classifier']:\n", " if mod in sys.modules:\n", " importlib.reload(sys.modules[mod])\n", "\n", "from src.data.dataset import ASVspoofDataset\n", "from src.models.wav2vec_classifier import Wav2VecClassifier\n", "\n", "# We need utts_eval to look like the 2019 Utterance class for ASVspoofDataset.\n", "# The fields ASVspoofDataset uses: flac_path, label_int, utterance_id\n", "# All present in Utterance2021. So we can pass them directly.\n", "\n", "# Step 1: Measure all durations\n", "print(f\"Measuring durations on {len(utts_eval):,} 2021 LA eval utterances...\")\n", "print(\"Expected: ~7-10 min\\n\")\n", "\n", "eval_durs_2021 = []\n", "for u in tqdm(utts_eval, desc=\"2021 durations\"):\n", " w, _ = torchaudio.load(u.flac_path)\n", " eval_durs_2021.append(w.shape[1])\n", "\n", "# Step 2: Build dataset and loader\n", "eval_ds_2021 = ASVspoofDataset(utts_eval, durations_samples=eval_durs_2021)\n", "print(f\"\\n2021 eval dataset: {len(eval_ds_2021):,} windows from {len(utts_eval):,} utterances\")\n", "inflation = len(eval_ds_2021) / len(utts_eval)\n", "print(f\"Inflation factor: {inflation:.2f}x\")\n", "\n", "from torch.utils.data import DataLoader\n", "eval_loader_2021 = DataLoader(\n", " eval_ds_2021, batch_size=16, shuffle=False, num_workers=2, pin_memory=True\n", ")\n", "\n", "# Step 3: Load Stage 2 model\n", "print(\"\\nLoading Stage 2 best checkpoint...\")\n", "device = 'cuda'\n", "model = Wav2VecClassifier(\n", " backbone_name=\"facebook/wav2vec2-base\",\n", " num_classes=2,\n", " freeze_backbone=True,\n", ")\n", "ckpt = torch.load(\n", " '/content/drive/MyDrive/deepfake_audio/checkpoints/stage2_best.pt',\n", " map_location=device, weights_only=False,\n", ")\n", "model.load_state_dict(ckpt['model_state_dict'])\n", "model = model.to(device)\n", "model.eval()\n", "print(f\"Model loaded (epoch {ckpt['epoch']}, dev EER {ckpt['best_eer']*100:.4f}%)\")\n", "print(\"\\nReady for 2021 LA inference. Run the next cell.\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 512 }, "id": "uk15VjpCL6A1", "executionInfo": { "status": "error", "timestamp": 1777767592588, "user_tz": 420, "elapsed": 328927, "user": { "displayName": "Sara Jaffrani", "userId": "07677779715251349607" } }, "outputId": "b453bc62-5bf6-4675-fe37-2a0cc83ce1bd" }, "execution_count": 14, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Measuring durations on 148,176 2021 LA eval utterances...\n", "Expected: ~7-10 min\n", "\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "2021 durations: 70%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588 | 104397/148176 [05:08<02:09, 338.37it/s]\n" ] }, { "output_type": "error", "ename": "RuntimeError", "evalue": "Failed to decode audio samples: Could not flush decoder: Invalid data found when processing input", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/torchaudio/_torchcodec.py\u001b[0m in \u001b[0;36mload_with_torchcodec\u001b[0;34m(uri, frame_offset, num_frames, normalize, channels_first, format, buffer_size, backend)\u001b[0m\n\u001b[1;32m 127\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 128\u001b[0;31m \u001b[0maudio_samples\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdecoder\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_all_samples\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 129\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/torchcodec/decoders/_audio_decoder.py\u001b[0m in \u001b[0;36mget_all_samples\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 107\u001b[0m \"\"\"\n\u001b[0;32m--> 108\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_samples_played_in_range\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 109\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/torchcodec/decoders/_audio_decoder.py\u001b[0m in \u001b[0;36mget_samples_played_in_range\u001b[0;34m(self, start_seconds, stop_seconds)\u001b[0m\n\u001b[1;32m 136\u001b[0m )\n\u001b[0;32m--> 137\u001b[0;31m frames, first_pts = core.get_frames_by_pts_in_range_audio(\n\u001b[0m\u001b[1;32m 138\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_decoder\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/torch/_ops.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 818\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__call__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m/\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0m_P\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0m_P\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0m_T\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 819\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_op\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 820\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mRuntimeError\u001b[0m: Could not flush decoder: Invalid data found when processing input", "\nThe above exception was the direct cause of the following exception:\n", "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m/tmp/ipykernel_13317/263687011.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[0meval_durs_2021\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 23\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mu\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mutts_eval\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdesc\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"2021 durations\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 24\u001b[0;31m \u001b[0mw\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtorchaudio\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mu\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mflac_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 25\u001b[0m \u001b[0meval_durs_2021\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mw\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/torchaudio/__init__.py\u001b[0m in \u001b[0;36mload\u001b[0;34m(uri, frame_offset, num_frames, normalize, channels_first, format, buffer_size, backend)\u001b[0m\n\u001b[1;32m 84\u001b[0m \u001b[0mby\u001b[0m \u001b[0mTorchCodec\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 85\u001b[0m \"\"\"\n\u001b[0;32m---> 86\u001b[0;31m return load_with_torchcodec(\n\u001b[0m\u001b[1;32m 87\u001b[0m \u001b[0muri\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[0mframe_offset\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mframe_offset\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/torchaudio/_torchcodec.py\u001b[0m in \u001b[0;36mload_with_torchcodec\u001b[0;34m(uri, frame_offset, num_frames, normalize, channels_first, format, buffer_size, backend)\u001b[0m\n\u001b[1;32m 128\u001b[0m \u001b[0maudio_samples\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdecoder\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_all_samples\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 129\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 130\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mRuntimeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"Failed to decode audio samples: {e}\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 131\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 132\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0maudio_samples\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mRuntimeError\u001b[0m: Failed to decode audio samples: Could not flush decoder: Invalid data found when processing input" ] } ] }, { "cell_type": "code", "source": [ "# Find which file failed\n", "bad_idx = len(eval_durs_2021) # this is how many succeeded before crash\n", "print(f\"Files successfully measured: {bad_idx:,}\")\n", "print(f\"Failing file index: {bad_idx}\")\n", "\n", "# Show the bad file\n", "bad_utt = utts_eval[bad_idx]\n", "print(f\"\\nBad utterance:\")\n", "print(f\" ID: {bad_utt.utterance_id}\")\n", "print(f\" Path: {bad_utt.flac_path}\")\n", "print(f\" Codec: {bad_utt.codec}\")\n", "print(f\" Channel: {bad_utt.channel}\")\n", "print(f\" Attack: {bad_utt.attack_id}\")\n", "\n", "import os\n", "print(f\"\\nFile size: {os.path.getsize(bad_utt.flac_path)} bytes\")\n", "\n", "# Try to load it directly to confirm the error\n", "import torchaudio\n", "try:\n", " w, sr = torchaudio.load(bad_utt.flac_path)\n", " print(f\"Loaded successfully: {w.shape}, sr={sr}\")\n", "except Exception as e:\n", " print(f\"\\nConfirmed unreadable: {type(e).__name__}: {e}\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "CVUn9pKXOyqT", "executionInfo": { "status": "ok", "timestamp": 1777768020095, "user_tz": 420, "elapsed": 18, "user": { "displayName": "Sara Jaffrani", "userId": "07677779715251349607" } }, "outputId": "1ea07858-dced-4494-adaf-7d2b4ff1a8e5" }, "execution_count": 15, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Files successfully measured: 104,397\n", "Failing file index: 104397\n", "\n", "Bad utterance:\n", " ID: LA_E_1759547\n", " Path: /content/kaggle_2021/ASVspoof2021_LA/ASVspoof2021_LA_eval/flac/LA_E_1759547.flac\n", " Codec: pstn\n", " Channel: mad_tx\n", " Attack: A13\n", "\n", "File size: 28608 bytes\n", "\n", "Confirmed unreadable: RuntimeError: Failed to decode audio samples: Could not flush decoder: Invalid data found when processing input\n" ] } ] }, { "cell_type": "code", "source": [ "import torchaudio\n", "import os\n", "from tqdm import tqdm\n", "\n", "# Sample 50 random pstn files and try to load each\n", "import random\n", "random.seed(42)\n", "\n", "pstn_utts = [u for u in utts_eval if u.codec == 'pstn']\n", "print(f\"Total pstn files in eval: {len(pstn_utts):,}\")\n", "\n", "sample = random.sample(pstn_utts, min(50, len(pstn_utts)))\n", "print(f\"Testing 50 random pstn files...\\n\")\n", "\n", "failed = []\n", "for u in tqdm(sample, desc=\"pstn check\"):\n", " try:\n", " w, _ = torchaudio.load(u.flac_path)\n", " except Exception:\n", " failed.append(u.utterance_id)\n", "\n", "print(f\"\\npstn files tested: {len(sample)}\")\n", "print(f\"Failed to load: {len(failed)} ({100*len(failed)/len(sample):.1f}%)\")\n", "\n", "if failed:\n", " print(f\"\\nSample of failed IDs:\")\n", " for f in failed[:5]:\n", " print(f\" - {f}\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "rHwjB5-nO7Js", "executionInfo": { "status": "ok", "timestamp": 1777768056155, "user_tz": 420, "elapsed": 37, "user": { "displayName": "Sara Jaffrani", "userId": "07677779715251349607" } }, "outputId": "93b2c649-c4b1-4e18-ab4e-e9673c33f699" }, "execution_count": 16, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Total pstn files in eval: 19,384\n", "Testing 50 random pstn files...\n", "\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "pstn check: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 50/50 [00:00<00:00, 453.50it/s]" ] }, { "output_type": "stream", "name": "stdout", "text": [ "\n", "pstn files tested: 50\n", "Failed to load: 0 (0.0%)\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "\n" ] } ] }, { "cell_type": "code", "source": [ "import torchaudio\n", "from tqdm import tqdm\n", "\n", "# Track failures so we can analyze them later\n", "failed_ids = []\n", "\n", "# Continue from where we crashed\n", "print(f\"Already measured: {len(eval_durs_2021):,}\")\n", "print(f\"Resuming from index {len(eval_durs_2021)}...\\n\")\n", "\n", "remaining = utts_eval[len(eval_durs_2021):]\n", "\n", "for u in tqdm(remaining, desc=\"2021 durations (resume)\"):\n", " try:\n", " w, _ = torchaudio.load(u.flac_path)\n", " eval_durs_2021.append(w.shape[1])\n", " except Exception as e:\n", " # Use a sentinel value that we'll filter out later\n", " eval_durs_2021.append(None)\n", " failed_ids.append(u.utterance_id)\n", "\n", "print(f\"\\nMeasurement complete.\")\n", "print(f\"Total durations recorded: {len(eval_durs_2021):,}\")\n", "print(f\"Failed to read: {len(failed_ids):,}\")\n", "if failed_ids:\n", " print(f\"\\nFirst 10 failed IDs:\")\n", " for f in failed_ids[:10]:\n", " print(f\" - {f}\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "3wSfLHpEPGnr", "executionInfo": { "status": "ok", "timestamp": 1777768219664, "user_tz": 420, "elapsed": 117977, "user": { "displayName": "Sara Jaffrani", "userId": "07677779715251349607" } }, "outputId": "288b9971-c7e2-416f-9417-9a6add06677a" }, "execution_count": 17, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Already measured: 104,397\n", "Resuming from index 104397...\n", "\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "2021 durations (resume): 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 43779/43779 [01:58<00:00, 370.57it/s]" ] }, { "output_type": "stream", "name": "stdout", "text": [ "\n", "Measurement complete.\n", "Total durations recorded: 148,176\n", "Failed to read: 1\n", "\n", "First 10 failed IDs:\n", " - LA_E_1759547\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "\n" ] } ] }, { "cell_type": "code", "source": [ "# Filter: keep only utterances with valid durations\n", "valid_pairs = [\n", " (u, d) for u, d in zip(utts_eval, eval_durs_2021)\n", " if d is not None\n", "]\n", "utts_eval_clean = [p[0] for p in valid_pairs]\n", "durs_eval_clean = [p[1] for p in valid_pairs]\n", "\n", "print(f\"Original utterances: {len(utts_eval):,}\")\n", "print(f\"Valid utterances: {len(utts_eval_clean):,}\")\n", "print(f\"Removed: {len(utts_eval) - len(utts_eval_clean):,}\")\n", "\n", "# Rebuild dataset and loader with the clean data\n", "from src.data.dataset import ASVspoofDataset\n", "from torch.utils.data import DataLoader\n", "\n", "eval_ds_2021 = ASVspoofDataset(utts_eval_clean, durations_samples=durs_eval_clean)\n", "eval_loader_2021 = DataLoader(\n", " eval_ds_2021, batch_size=16, shuffle=False, num_workers=2, pin_memory=True\n", ")\n", "\n", "print(f\"\\nFinal eval dataset: {len(eval_ds_2021):,} windows from {len(utts_eval_clean):,} utterances\")\n", "inflation = len(eval_ds_2021) / len(utts_eval_clean)\n", "print(f\"Inflation factor: {inflation:.2f}x\")\n", "print(f\"\\nReady for inference.\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "uEFGs2R0TTao", "executionInfo": { "status": "ok", "timestamp": 1777769203406, "user_tz": 420, "elapsed": 520, "user": { "displayName": "Sara Jaffrani", "userId": "07677779715251349607" } }, "outputId": "487635f2-e321-4fc1-b4e9-9e1b42358f0b" }, "execution_count": 18, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Original utterances: 148,176\n", "Valid utterances: 148,175\n", "Removed: 1\n", "\n", "Final eval dataset: 173,149 windows from 148,175 utterances\n", "Inflation factor: 1.17x\n", "\n", "Ready for inference.\n" ] } ] }, { "cell_type": "code", "source": [ "import torch\n", "import importlib, sys\n", "\n", "# Reload modules\n", "for mod in ['src.models.wav2vec_classifier']:\n", " if mod in sys.modules:\n", " importlib.reload(sys.modules[mod])\n", "from src.models.wav2vec_classifier import Wav2VecClassifier\n", "\n", "# Build model and load Stage 2 checkpoint\n", "device = 'cuda'\n", "model = Wav2VecClassifier(\n", " backbone_name=\"facebook/wav2vec2-base\",\n", " num_classes=2,\n", " freeze_backbone=True,\n", ")\n", "ckpt_path = '/content/drive/MyDrive/deepfake_audio/checkpoints/stage2_best.pt'\n", "ckpt = torch.load(ckpt_path, map_location=device, weights_only=False)\n", "model.load_state_dict(ckpt['model_state_dict'])\n", "model = model.to(device)\n", "model.eval()\n", "print(f\"Model reloaded.\")\n", "print(f\" Checkpoint: epoch {ckpt['epoch']}, dev EER {ckpt['best_eer']*100:.4f}%\")\n", "print(f\" Device: {next(model.parameters()).device}\")\n", "print(f\" Mode: {'eval' if not model.training else 'train'}\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 599, "referenced_widgets": [ "53a720ab601240aeb4c6648242691f4b", "93c12098d7fc4534a6f55b5eee59b7d3", "dea9573389fe48ab9a70ba1a90816835", "7b1f1033c90d4aa69d43ab54ab0ffcb7", "6b7a97f8077344598300c9a3e9e7a502", "fe09d65683654145a1ae72cc4e47742a", "e891e562bba24bbeaabfe5a0a3bf60ff", "a275540d556d496fbd3e4a59f080e9ad", "963d086a1d4e4cb28aaf44272f9d58c6", "51a07670c9ad4bd085b840d3376b625a", "888a4264f5da4eb6988a59cb44ae94a4", "8361b5ae95d2412b98be9501f0c01caa", "26d29aa19cb64c37a6a025d5d5bd2b63", "23ae2a90462e4fd3b8f74244a71011cc", "271798236b394a97866fa3f66dbd933c", "412c7582608449a8b2b15e8be81e582c", "93a870cd79614edca379807b0e8c25b9", "7c487c1214b74db5a0419d894ed505a9", "9874bc54cca84bd989a223834a02f339", "19ea860465544e668fe40350be897677", "5ad4fa282b974391879c72c10ee7607f", "60b47b5c6f3e4369b53cc5fd60963be5", "685ad97214e948b48d03205d08b0a451", "7f02b1a2a6394b33a91ccf3b42cf679c", "a366903e51fd45bda050931eeba15270", "f6b2aa7a2ffb4830bc60d94c4bf7fd53", "de591daea11c4f3182692dc9d82db075", "de74f36f94bc4b399fdf482cd5964f42", "9d413beb9aba41cb97e07398e37b4940", "4976869e69254e659a07d2e3e12fa10f", "cb579ee62db04846a0f7a7cead5244dc", "aabeea06a76b4663b2eb83b50702ccdc", "19726602d830446ebcc5ae15249c8fbf", "635c36f8d6a74a32adfda89bb432f70d", "9548badf7f4f4ccb893eee3f9b8dcca7", "6c75e712042c41708367a1dc87fc28d3", "35d1917781354f21bd5b86bbdd7e5cdf", "50bf7c89c2d24479a7c4757e2edc97d3", "aaeacea9bc7a415ea23ad2cefc99de43", "a3709411901e46f8b6e6c62735d64ee9", "ef6743f2d246487fb03f51ee4dbfef83", "df509c08c6fe4b0b86b8372b20b097e0", "86f74258ca2448e591b466f6cb1a2abb", "4d99c78d5ba24241b2d3684be1bd1a66" ] }, "id": "e6BTFl6UVGIa", "executionInfo": { "status": "ok", "timestamp": 1777769689288, "user_tz": 420, "elapsed": 16921, "user": { "displayName": "Sara Jaffrani", "userId": "07677779715251349607" } }, "outputId": "b73b617b-f8e7-4a41-cb14-36852c034f96" }, "execution_count": 20, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_auth.py:93: UserWarning: \n", "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", "You will be able to reuse this secret in all of your notebooks.\n", "Please note that authentication is recommended but still optional to access public models or datasets.\n", " warnings.warn(\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n", "WARNING:huggingface_hub.utils._http:Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "Wav2Vec2Model LOAD REPORT from: facebook/wav2vec2-base\n", "Key | Status | | \n", "-----------------------------+------------+--+-\n", "quantizer.weight_proj.weight | UNEXPECTED | | \n", "project_q.weight | UNEXPECTED | | \n", "project_hid.weight | UNEXPECTED | | \n", "quantizer.codevectors | UNEXPECTED | | \n", "quantizer.weight_proj.bias | UNEXPECTED | | \n", "project_hid.bias | UNEXPECTED | | \n", "project_q.bias | UNEXPECTED | | \n", "\n", "Notes:\n", "- UNEXPECTED\t:can be ignored when loading from different task/architecture; not ok if you expect identical arch.\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "Model reloaded.\n", " Checkpoint: epoch 9, dev EER 0.6941%\n", " Device: cuda:0\n", " Mode: eval\n" ] } ] }, { "cell_type": "code", "source": [ "import torch\n", "import numpy as np\n", "from tqdm import tqdm\n", "from collections import defaultdict\n", "import time\n", "\n", "from src.evaluation.metrics import compute_eer, compute_auc, aggregate_window_scores_to_utterance\n", "\n", "# Build lookup tables for breakdown analysis (codec/channel/attack per utterance)\n", "utt_codec_map = {u.utterance_id: u.codec for u in utts_eval_clean}\n", "utt_channel_map = {u.utterance_id: u.channel for u in utts_eval_clean}\n", "utt_attack_map = {u.utterance_id: u.attack_id for u in utts_eval_clean}\n", "\n", "# Run inference\n", "print(\"Running inference on 2021 LA eval set (mixed precision, batch=16)...\")\n", "print(\"Expected: ~25-35 min on T4\\n\")\n", "\n", "model.eval()\n", "all_window_scores = []\n", "all_window_labels = []\n", "all_window_utts = []\n", "\n", "start = time.time()\n", "with torch.no_grad():\n", " autocast_ctx = torch.amp.autocast(device_type='cuda', enabled=True)\n", " for waveforms, labels, utt_ids in tqdm(eval_loader_2021, desc=\"2021 inference\"):\n", " waveforms = waveforms.to('cuda', non_blocking=True)\n", " with autocast_ctx:\n", " logits = model(waveforms)\n", " probs = torch.softmax(logits.float(), dim=-1)\n", " spoof_probs = probs[:, 1].detach().cpu().numpy()\n", "\n", " all_window_scores.extend(spoof_probs.tolist())\n", " all_window_labels.extend(labels.tolist())\n", " all_window_utts.extend(list(utt_ids))\n", "\n", "inference_minutes = (time.time() - start) / 60\n", "print(f\"\\nInference complete in {inference_minutes:.1f} min over {len(all_window_scores):,} windows.\")\n", "\n", "# Aggregate to per-utterance\n", "print(\"\\nAggregating window scores to utterance scores (mean)...\")\n", "utt_scores, utt_ids_sorted = aggregate_window_scores_to_utterance(\n", " np.array(all_window_scores), all_window_utts, method=\"mean\",\n", ")\n", "\n", "# Build per-utterance label arrays\n", "utt_label_map = {}\n", "for s, l, u in zip(all_window_scores, all_window_labels, all_window_utts):\n", " if u not in utt_label_map:\n", " utt_label_map[u] = l\n", "\n", "utt_labels = np.array([utt_label_map[u] for u in utt_ids_sorted])\n", "utt_codecs = np.array([utt_codec_map[u] for u in utt_ids_sorted])\n", "utt_channels = np.array([utt_channel_map[u] for u in utt_ids_sorted])\n", "utt_attacks = np.array([utt_attack_map[u] for u in utt_ids_sorted])\n", "\n", "# ---- Overall metrics ----\n", "print(f\"\\n{'='*70}\")\n", "print(f\" SECONDARY EVALUATION \u2014 ASVspoof 2021 LA Eval Partition\")\n", "print(f\"{'='*70}\")\n", "n_bona = int((utt_labels == 0).sum())\n", "n_spoof = int((utt_labels == 1).sum())\n", "print(f\"Utterances: {len(utt_scores):,}\")\n", "print(f\"Bonafide: {n_bona:,}\")\n", "print(f\"Spoof: {n_spoof:,}\")\n", "\n", "eer_2021, threshold_2021 = compute_eer(utt_scores, utt_labels)\n", "auc_2021 = compute_auc(utt_scores, utt_labels)\n", "preds_2021 = (utt_scores > threshold_2021).astype(int)\n", "acc_2021 = float((preds_2021 == utt_labels).mean())\n", "\n", "print(f\"\\nOverall results (Stage 2 model on 2021 LA):\")\n", "print(f\" EER: {eer_2021*100:.4f}%\")\n", "print(f\" AUC: {auc_2021:.4f}\")\n", "print(f\" Accuracy: {acc_2021*100:.2f}%\")\n", "print(f\" Threshold: {threshold_2021:.4f}\")\n", "\n", "# ---- Cross-dataset comparison ----\n", "print(f\"\\nCross-dataset comparison:\")\n", "print(f\" Stage 2 dev EER (2019 LA, seen attacks): 0.69%\")\n", "print(f\" Stage 2 eval EER (2019 LA, unseen attacks): 5.55%\")\n", "print(f\" Stage 2 eval EER (2021 LA, unseen + codecs): {eer_2021*100:.2f}%\")\n", "gap_2019_to_2021 = (eer_2021 - 0.0555) * 100\n", "print(f\" Cross-dataset gap (2019 \u2192 2021): {gap_2019_to_2021:+.2f} pp\")\n", "\n", "# ---- Per-codec EER ----\n", "print(f\"\\n{'='*70}\")\n", "print(f\" PER-CODEC EER BREAKDOWN\")\n", "print(f\"{'='*70}\")\n", "bonafide_scores_all = utt_scores[utt_labels == 0]\n", "codecs_unique = sorted(set(utt_codecs))\n", "per_codec_results = {}\n", "\n", "for codec in codecs_unique:\n", " mask = (utt_codecs == codec)\n", " codec_scores = utt_scores[mask]\n", " codec_labels = utt_labels[mask]\n", " if len(np.unique(codec_labels)) < 2:\n", " # Only one class \u2014 can't compute EER\n", " per_codec_results[codec] = {\"n\": int(mask.sum()), \"eer\": None, \"note\": \"one class only\"}\n", " print(f\" {codec:<6}: n={mask.sum():>6,} (single class, skipping EER)\")\n", " continue\n", " c_eer, _ = compute_eer(codec_scores, codec_labels)\n", " per_codec_results[codec] = {\"n\": int(mask.sum()), \"eer\": float(c_eer)}\n", " print(f\" {codec:<6}: n={mask.sum():>6,} EER={c_eer*100:>6.2f}%\")\n", "\n", "# ---- Per-channel EER ----\n", "print(f\"\\n{'='*70}\")\n", "print(f\" PER-CHANNEL EER BREAKDOWN\")\n", "print(f\"{'='*70}\")\n", "channels_unique = sorted(set(utt_channels))\n", "per_channel_results = {}\n", "\n", "for ch in channels_unique:\n", " mask = (utt_channels == ch)\n", " ch_scores = utt_scores[mask]\n", " ch_labels = utt_labels[mask]\n", " if len(np.unique(ch_labels)) < 2:\n", " per_channel_results[ch] = {\"n\": int(mask.sum()), \"eer\": None, \"note\": \"one class only\"}\n", " print(f\" {ch:<10}: n={mask.sum():>6,} (single class, skipping EER)\")\n", " continue\n", " ch_eer, _ = compute_eer(ch_scores, ch_labels)\n", " per_channel_results[ch] = {\"n\": int(mask.sum()), \"eer\": float(ch_eer)}\n", " print(f\" {ch:<10}: n={mask.sum():>6,} EER={ch_eer*100:>6.2f}%\")\n", "\n", "# ---- Per-attack EER ----\n", "print(f\"\\n{'='*70}\")\n", "print(f\" PER-ATTACK EER BREAKDOWN (vs all bonafide)\")\n", "print(f\"{'='*70}\")\n", "attack_ids_eval = sorted(a for a in set(utt_attacks) if a != '-')\n", "per_attack_results = {}\n", "\n", "for attack in attack_ids_eval:\n", " mask = (utt_attacks == attack)\n", " attack_scores = utt_scores[mask]\n", " n = int(mask.sum())\n", " combined_scores = np.concatenate([bonafide_scores_all, attack_scores])\n", " combined_labels = np.concatenate([\n", " np.zeros(len(bonafide_scores_all)),\n", " np.ones(n),\n", " ])\n", " a_eer, _ = compute_eer(combined_scores, combined_labels)\n", " per_attack_results[attack] = {\"n\": n, \"eer\": float(a_eer)}\n", " print(f\" {attack}: n={n:>6,} EER={a_eer*100:>7.2f}%\")\n", "\n", "# Save raw scores\n", "import os\n", "SCORES_PATH = '/content/deepfake-audio-detection/results/scores/stage2_eval2021.npz'\n", "os.makedirs(os.path.dirname(SCORES_PATH), exist_ok=True)\n", "np.savez(\n", " SCORES_PATH,\n", " utt_ids=np.array(utt_ids_sorted),\n", " utt_scores=utt_scores,\n", " utt_labels=utt_labels,\n", " utt_codecs=utt_codecs,\n", " utt_channels=utt_channels,\n", " utt_attacks=utt_attacks,\n", ")\n", "print(f\"\\nRaw scores saved to {SCORES_PATH}\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4DkUUvQzUyqU", "executionInfo": { "status": "ok", "timestamp": 1777771160731, "user_tz": 420, "elapsed": 1308116, "user": { "displayName": "Sara Jaffrani", "userId": "07677779715251349607" } }, "outputId": "e29318d3-9420-4c11-aff6-6365a887d5a6" }, "execution_count": 21, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Running inference on 2021 LA eval set (mixed precision, batch=16)...\n", "Expected: ~25-35 min on T4\n", "\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "2021 inference: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 10822/10822 [21:46<00:00, 8.29it/s]\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "\n", "Inference complete in 21.8 min over 173,149 windows.\n", "\n", "Aggregating window scores to utterance scores (mean)...\n", "\n", "======================================================================\n", " SECONDARY EVALUATION \u2014 ASVspoof 2021 LA Eval Partition\n", "======================================================================\n", "Utterances: 148,175\n", "Bonafide: 14,816\n", "Spoof: 133,359\n", "\n", "Overall results (Stage 2 model on 2021 LA):\n", " EER: 9.0850%\n", " AUC: 0.9629\n", " Accuracy: 90.91%\n", " Threshold: 0.5148\n", "\n", "Cross-dataset comparison:\n", " Stage 2 dev EER (2019 LA, seen attacks): 0.69%\n", " Stage 2 eval EER (2019 LA, unseen attacks): 5.55%\n", " Stage 2 eval EER (2021 LA, unseen + codecs): 9.09%\n", " Cross-dataset gap (2019 \u2192 2021): +3.54 pp\n", "\n", "======================================================================\n", " PER-CODEC EER BREAKDOWN\n", "======================================================================\n", " alaw : n=19,436 EER= 8.37%\n", " g722 : n=19,375 EER= 5.42%\n", " gsm : n=23,520 EER= 11.53%\n", " none : n=19,421 EER= 5.24%\n", " opus : n=23,520 EER= 5.30%\n", " pstn : n=19,383 EER= 11.14%\n", " ulaw : n=23,520 EER= 7.81%\n", "\n", "======================================================================\n", " PER-CHANNEL EER BREAKDOWN\n", "======================================================================\n", " - : n=19,421 EER= 5.24%\n", " ita_tx : n=23,508 EER= 9.27%\n", " loc_tx : n=62,425 EER= 8.75%\n", " mad_tx : n=19,383 EER= 11.14%\n", " sin_tx : n=23,438 EER= 9.00%\n", "\n", "======================================================================\n", " PER-ATTACK EER BREAKDOWN (vs all bonafide)\n", "======================================================================\n", " A07: n=10,238 EER= 9.53%\n", " A08: n=10,368 EER= 5.56%\n", " A09: n=10,152 EER= 3.32%\n", " A10: n=10,318 EER= 20.37%\n", " A11: n=10,276 EER= 3.97%\n", " A12: n=10,259 EER= 5.11%\n", " A13: n=10,301 EER= 1.20%\n", " A14: n=10,234 EER= 14.75%\n", " A15: n=10,235 EER= 16.89%\n", " A16: n=10,390 EER= 5.22%\n", " A17: n=10,239 EER= 8.31%\n", " A18: n=10,148 EER= 9.31%\n", " A19: n=10,201 EER= 7.17%\n", " bonafide: n=14,816 EER= 50.00%\n", "\n", "Raw scores saved to /content/deepfake-audio-detection/results/scores/stage2_eval2021.npz\n" ] } ] }, { "cell_type": "markdown", "source": [ "\"The model retains primary-eval-level performance (~5%) on uncompressed and modern codecs (none, opus, g722) but degrades significantly on aggressive lossy compression (gsm at 11.53%, pstn at 11.14%). This suggests the model relies on high-frequency artifacts that are partially destroyed by GSM-style compression. Future work could include codec augmentation during training to improve robustness.\"" ], "metadata": { "id": "Azth7nL_bPFh" } }, { "cell_type": "code", "source": [ "import json, os\n", "from datetime import datetime\n", "\n", "results_2021 = {\n", " \"phase\": \"Phase 5b \u2014 Secondary Evaluation on ASVspoof 2021 LA\",\n", " \"completed_at\": datetime.now().isoformat(),\n", " \"model_checkpoint\": \"/content/drive/MyDrive/deepfake_audio/checkpoints/stage2_best.pt\",\n", " \"model_dev_eer\": 0.0069,\n", " \"evaluation_dataset\": {\n", " \"name\": \"ASVspoof 2021 LA \u2014 eval partition only\",\n", " \"kaggle_source\": \"ajaysuryal/asvspoof2021-la (audio) + simontrann/asvspoof2021-la-key (labels)\",\n", " \"utterances_total_in_partition\": 148176,\n", " \"utterances_evaluated\": 148175,\n", " \"utterances_skipped_corrupt\": 1,\n", " \"windows\": 173149,\n", " \"bonafide_count\": 14816,\n", " \"spoof_count\": 133359,\n", " \"attacks\": [\"A07\", \"A08\", \"A09\", \"A10\", \"A11\", \"A12\", \"A13\", \"A14\", \"A15\", \"A16\", \"A17\", \"A18\", \"A19\"],\n", " \"codecs\": [\"none\", \"alaw\", \"ulaw\", \"g722\", \"gsm\", \"opus\", \"pstn\"],\n", " \"channels\": [\"-\", \"ita_tx\", \"loc_tx\", \"mad_tx\", \"sin_tx\"],\n", " },\n", " \"inference\": {\n", " \"batch_size\": 16,\n", " \"mixed_precision\": True,\n", " \"wall_clock_minutes\": 21.8,\n", " \"windows_per_second\": 132,\n", " },\n", " \"overall_results\": {\n", " \"eer\": 0.0909,\n", " \"auc\": 0.9629,\n", " \"accuracy\": 0.9091,\n", " \"threshold\": 0.5148,\n", " },\n", " \"cross_dataset_comparison\": {\n", " \"stage2_dev_2019_seen_attacks\": 0.0069,\n", " \"stage2_eval_2019_unseen_attacks\": 0.0555,\n", " \"stage2_eval_2021_unseen_attacks_plus_codecs\": 0.0909,\n", " \"gap_2019_eval_to_2021_eval_pp\": 3.54,\n", " \"interpretation\": \"Real-world codec degradation adds ~3.5 percentage points of error on top of 2019 unseen-attack eval.\",\n", " },\n", " \"per_codec_eer\": {\n", " \"none\": 0.0524, \"opus\": 0.0530, \"g722\": 0.0542,\n", " \"ulaw\": 0.0781, \"alaw\": 0.0837,\n", " \"pstn\": 0.1114, \"gsm\": 0.1153,\n", " },\n", " \"per_codec_summary\": {\n", " \"best_codec\": {\"id\": \"none\", \"eer\": 0.0524},\n", " \"worst_codec\": {\"id\": \"gsm\", \"eer\": 0.1153},\n", " \"interpretation\": \"Aggressive lossy compression (gsm, pstn) degrades performance by ~6 pp vs uncompressed. Modern codecs (opus, g722) preserve detection signal well.\",\n", " },\n", " \"per_channel_eer\": {\n", " \"-\": 0.0524, \"ita_tx\": 0.0927, \"loc_tx\": 0.0875,\n", " \"mad_tx\": 0.1114, \"sin_tx\": 0.0900,\n", " },\n", " \"per_attack_eer\": {\n", " \"A07\": 0.0953, \"A08\": 0.0556, \"A09\": 0.0332, \"A10\": 0.2037,\n", " \"A11\": 0.0397, \"A12\": 0.0511, \"A13\": 0.0120, \"A14\": 0.1475,\n", " \"A15\": 0.1689, \"A16\": 0.0522, \"A17\": 0.0831, \"A18\": 0.0931,\n", " \"A19\": 0.0717,\n", " },\n", " \"per_attack_summary\": {\n", " \"n_attacks\": 13,\n", " \"mean_eer_across_attacks\": 0.0890,\n", " \"median_eer_across_attacks\": 0.0717,\n", " \"worst_attack\": {\"id\": \"A10\", \"eer\": 0.2037, \"consistent_with_2019\": True},\n", " \"best_attack\": {\"id\": \"A13\", \"eer\": 0.0120, \"consistent_with_2019\": True},\n", " },\n", " \"comparisons_to_published_baselines_2021\": {\n", " \"lfcc_gmm_eer\": 0.2556,\n", " \"cqcc_gmm_eer\": 0.1930,\n", " \"lfcc_lcnn_eer\": 0.0926,\n", " \"rawnet2_eer\": 0.0950,\n", " \"our_eer\": 0.0909,\n", " \"interpretation\": \"Stage 2 model matches the strongest neural baselines (LFCC-LCNN 9.26%, RawNet2 9.50%) on 2021 LA, despite being trained only on 2019 data with zero codec augmentation.\"\n", " },\n", " \"raw_scores_path\": \"/content/deepfake-audio-detection/results/scores/stage2_eval2021.npz\",\n", " \"wandb_run_training\": \"https://wandb.ai/sara-jaffrani17-dlp/deepfake-audio-detection/runs/l1q4dvsx\",\n", " \"notes\": [\n", " \"Cross-dataset evaluation on ASVspoof 2021 LA. Model was trained on 2019 LA only.\",\n", " \"9.09% EER overall is competitive with strong published 2021 LA baselines (LFCC-LCNN 9.26%, RawNet2 9.50%).\",\n", " \"Per-codec analysis reveals model's vulnerability to aggressive lossy compression (gsm 11.53%, pstn 11.14%).\",\n", " \"Per-attack rankings consistent with 2019: A10/A14/A15 hardest, A13/A09 easiest.\",\n", " \"Phase 5c next: cross-dataset evaluation on WaveFake (vocoder-only synthesis).\",\n", " ]\n", "}\n", "\n", "OUTPUT = '/content/deepfake-audio-detection/results/metrics/stage2_eval2021_results.json'\n", "os.makedirs(os.path.dirname(OUTPUT), exist_ok=True)\n", "with open(OUTPUT, 'w') as f:\n", " json.dump(results_2021, f, indent=2)\n", "\n", "print(f\"Wrote {OUTPUT}\")\n", "print(f\"Size: {os.path.getsize(OUTPUT)} bytes\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "DWmsb3ypbgVG", "executionInfo": { "status": "ok", "timestamp": 1777771358190, "user_tz": 420, "elapsed": 12, "user": { "displayName": "Sara Jaffrani", "userId": "07677779715251349607" } }, "outputId": "80741e73-062e-45d7-a88f-17a0af5b2a3b" }, "execution_count": 22, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Wrote /content/deepfake-audio-detection/results/metrics/stage2_eval2021_results.json\n", "Size: 3954 bytes\n" ] } ] }, { "cell_type": "code", "source": [ "from google.colab import userdata\n", "import os\n", "\n", "GITHUB_TOKEN = userdata.get('GITHUB_TOKEN')\n", "os.chdir('/content/deepfake-audio-detection')\n", "\n", "!git config user.email \"95262824+Saracasm@users.noreply.github.com\"\n", "!git config user.name \"Sara Iqbal\"\n", "\n", "# Stage all the new files\n", "!git add results/metrics/stage2_eval2021_results.json\n", "!git add results/scores/stage2_eval2021.npz\n", "!git add src/data/protocols_2021.py\n", "!git status\n", "\n", "!git commit -m \"Phase 5b: 2021 LA cross-dataset eval \u2014 9.09% EER, matches strongest baseline\"\n", "\n", "push_url = f\"https://Saracasm:{GITHUB_TOKEN}@github.com/Saracasm/deepfake-audio-detection.git\"\n", "!git push {push_url} main 2>&1 | grep -v {GITHUB_TOKEN}" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "NELdErV0bo5L", "executionInfo": { "status": "ok", "timestamp": 1777771394847, "user_tz": 420, "elapsed": 7177, "user": { "displayName": "Sara Jaffrani", "userId": "07677779715251349607" } }, "outputId": "aa07955a-1a74-4dc7-ce4f-5c54ef2269af" }, "execution_count": 23, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "On branch main\n", "Your branch is up to date with 'origin/main'.\n", "\n", "Changes to be committed:\n", " (use \"git restore --staged ...\" to unstage)\n", "\t\u001b[32mnew file: results/metrics/stage2_eval2021_results.json\u001b[m\n", "\t\u001b[32mnew file: results/scores/stage2_eval2021.npz\u001b[m\n", "\t\u001b[32mnew file: src/data/protocols_2021.py\u001b[m\n", "\n", "[main 6b144b9] Phase 5b: 2021 LA cross-dataset eval \u2014 9.09% EER, matches strongest baseline\n", " 3 files changed, 219 insertions(+)\n", " create mode 100644 results/metrics/stage2_eval2021_results.json\n", " create mode 100644 results/scores/stage2_eval2021.npz\n", " create mode 100644 src/data/protocols_2021.py\n", "To https://github.com/Saracasm/deepfake-audio-detection.git\n", " 888b5b5..6b144b9 main -> main\n" ] } ] }, { "cell_type": "code", "source": [ "print(\"Searching Kaggle for WaveFake datasets...\\n\")\n", "print(\"--- Search 1: 'wavefake' ---\")\n", "!kaggle datasets list -s \"wavefake\" --max-size 100000000000 2>&1 | head -15\n", "\n", "print(\"\\n--- Search 2: 'wave fake audio' ---\")\n", "!kaggle datasets list -s \"wave fake audio\" --max-size 100000000000 2>&1 | head -10\n", "\n", "print(\"\\n--- Search 3: 'audio deepfake vocoder' ---\")\n", "!kaggle datasets list -s \"audio deepfake vocoder\" --max-size 100000000000 2>&1 | head -10" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "WWAMPtyRcLj0", "executionInfo": { "status": "ok", "timestamp": 1777771531983, "user_tz": 420, "elapsed": 2797, "user": { "displayName": "Sara Jaffrani", "userId": "07677779715251349607" } }, "outputId": "b28eb80a-b0c4-47b0-99dc-7e32896a2e4b" }, "execution_count": 24, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Searching Kaggle for WaveFake datasets...\n", "\n", "--- Search 1: 'wavefake' ---\n", "ref title size lastUpdated downloadCount voteCount usabilityRating \n", "------------------------------------------ ------------------------------------------ ----------- -------------------------- ------------- --------- --------------- \n", "andreadiubaldo/wavefake-test wavefake 28915177091 2023-04-03 18:48:36.313000 1222 4 0.3125 \n", "walimuhammadahmad/fakeaudio WaveFake: DeepFake Audio Detection Dataset 28915177091 2023-08-09 04:55:59.317000 4935 8 0.875 \n", "dinaahmed11/wavefake wavefake 56767983528 2026-02-05 18:12:47.467000 1 0 0.125 \n", "gustavovrr/mel-image-ljspeech-and-wavefake Mel Image LJspeech and WaveFake 8481190515 2024-08-06 05:45:02.543000 14 0 0.3125 \n", "rohan576/wavefake-vocoders-subset wavefake-vocoders-subset 3415496664 2026-04-10 17:33:32.973000 0 0 0.125 \n", "utsavavaiya/wavefake-jsut-25 WaveFake_Jsut_25 71174763 2024-10-08 09:00:09.790000 4 0 0.25 \n", "utsavavaiya/wavefake-1500 wavefake_1500 71387688 2024-10-08 09:37:51.230000 4 0 0.25 \n", "maryamkhan2025/wavefakedatasetformodel wavefakedataset 33905505462 2026-02-03 18:47:16.080000 20 0 0.23529412 \n", "\n", "--- Search 2: 'wave fake audio' ---\n", "ref title size lastUpdated downloadCount voteCount usabilityRating \n", "--------------------------- ------------------------------------------ ----------- -------------------------- ------------- --------- --------------- \n", "walimuhammadahmad/fakeaudio WaveFake: DeepFake Audio Detection Dataset 28915177091 2023-08-09 04:55:59.317000 4935 8 0.875 \n", "\n", "--- Search 3: 'audio deepfake vocoder' ---\n", "ref title size lastUpdated downloadCount voteCount usabilityRating \n", "-------------------------- --------------------------------------------- ---------- -------------------------- ------------- --------- --------------- \n", "ameythakur20/deepfakeaudio Neural Voice Cloning: Deepfake Audio & Models 576855751 2026-01-29 08:24:10.073000 140 3 1 \n" ] } ] }, { "cell_type": "code", "source": [ "print(\"Inspecting utsavavaiya/wavefake-1500 file structure...\\n\")\n", "!kaggle datasets files utsavavaiya/wavefake-1500 2>&1 | head -50" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "IEYwB49ecfsm", "executionInfo": { "status": "ok", "timestamp": 1777771612339, "user_tz": 420, "elapsed": 666, "user": { "displayName": "Sara Jaffrani", "userId": "07677779715251349607" } }, "outputId": "6954152c-6d67-4a47-8243-23b0cbcf7b00" }, "execution_count": 25, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Inspecting utsavavaiya/wavefake-1500 file structure...\n", "\n", "Next Page Token = CfDJ8OqP5ZkTT9ZGj66XXRbxXyAzyWwK6AN5v2GabJ7gDPZhzZu1UH1lHISiRxNputpyoIfveTiCWiZVesQJrRHwU68t9LwFE7s-3dajvU47VG_qaGbYeI0hVUviZY6SlsvWa396g_c0vT9mBY9jNKyz8KCpuOKSDkyfBQcFyKlYO7uyHuVLY8p2xnoMYhi4snd2s-Wc\n", "name size creationDate \n", "----------------------------------------------------------- ----- -------------------------- \n", "wavefake_sample/fake/BASIC5000_0001_gen_mel_spectrogram.png 24024 2024-10-08 09:37:53.275000 \n", "wavefake_sample/fake/BASIC5000_0002_gen_mel_spectrogram.png 22988 2024-10-08 09:37:53.251000 \n", "wavefake_sample/fake/BASIC5000_0003_gen_mel_spectrogram.png 24395 2024-10-08 09:37:53.232000 \n", "wavefake_sample/fake/BASIC5000_0004_gen_mel_spectrogram.png 23114 2024-10-08 09:37:53.239000 \n", "wavefake_sample/fake/BASIC5000_0005_gen_mel_spectrogram.png 24481 2024-10-08 09:37:53.232000 \n", "wavefake_sample/fake/BASIC5000_0006_gen_mel_spectrogram.png 24173 2024-10-08 09:37:53.239000 \n", "wavefake_sample/fake/BASIC5000_0007_gen_mel_spectrogram.png 24986 2024-10-08 09:37:53.232000 \n", "wavefake_sample/fake/BASIC5000_0008_gen_mel_spectrogram.png 23405 2024-10-08 09:37:53.245000 \n", "wavefake_sample/fake/BASIC5000_0009_gen_mel_spectrogram.png 24403 2024-10-08 09:37:53.239000 \n", "wavefake_sample/fake/BASIC5000_0010_gen_mel_spectrogram.png 23255 2024-10-08 09:37:53.245000 \n", "wavefake_sample/fake/BASIC5000_0011_gen_mel_spectrogram.png 24027 2024-10-08 09:37:53.245000 \n", "wavefake_sample/fake/BASIC5000_0012_gen_mel_spectrogram.png 24173 2024-10-08 09:37:53.257000 \n", "wavefake_sample/fake/BASIC5000_0013_gen_mel_spectrogram.png 23707 2024-10-08 09:37:53.263000 \n", "wavefake_sample/fake/BASIC5000_0014_gen_mel_spectrogram.png 23346 2024-10-08 09:37:53.275000 \n", "wavefake_sample/fake/BASIC5000_0015_gen_mel_spectrogram.png 23602 2024-10-08 09:37:53.263000 \n", "wavefake_sample/fake/BASIC5000_0016_gen_mel_spectrogram.png 22401 2024-10-08 09:37:53.275000 \n", "wavefake_sample/fake/BASIC5000_0017_gen_mel_spectrogram.png 23905 2024-10-08 09:37:53.322000 \n", "wavefake_sample/fake/BASIC5000_0018_gen_mel_spectrogram.png 24210 2024-10-08 09:37:53.251000 \n", "wavefake_sample/fake/BASIC5000_0019_gen_mel_spectrogram.png 23930 2024-10-08 09:37:53.257000 \n", "wavefake_sample/fake/BASIC5000_0020_gen_mel_spectrogram.png 21199 2024-10-08 09:37:53.275000 \n" ] } ] }, { "cell_type": "code", "source": [ "print(\"Inspecting walimuhammadahmad/fakeaudio structure...\\n\")\n", "!kaggle datasets files walimuhammadahmad/fakeaudio 2>&1 | head -50" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ojQA28G3crCR", "executionInfo": { "status": "ok", "timestamp": 1777771658821, "user_tz": 420, "elapsed": 712, "user": { "displayName": "Sara Jaffrani", "userId": "07677779715251349607" } }, "outputId": "75862e71-f640-4431-b26f-11ba7daf42af" }, "execution_count": 26, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Inspecting walimuhammadahmad/fakeaudio structure...\n", "\n", "Next Page Token = CfDJ8OqP5ZkTT9ZGj66XXRbxXyBSiZQWZWX3OETQuaaRtanntVBQnEzjHglWqjEUOP4aGsbr3DtRgwuw5CNK9bjBUbiAazu9dC7w1TKd1UJTnkjMCgtCx0xULSeC50Vq2BdMQqJZ3JgqJ8kBUjPXeKKJ2qa9wZv7hGoxjWQX-iOB8M8K9MPSUnAfE79U6cU4E9QFNjnKp2KjiEXC_wYZoOGpu9VXg4AFw8v_Ozkytcvpf8tXjpQ\n", "name size creationDate \n", "------------------------------------------------------------------------------------------- ------ -------------------------- \n", "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_0.wav 160812 2023-08-09 09:26:04.307000 \n", "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_1.wav 54316 2023-08-09 09:26:07.753000 \n", "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10.wav 313900 2023-08-09 09:26:13.366000 \n", "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_100.wav 106540 2023-08-09 09:26:06.029000 \n", "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_1000.wav 186924 2023-08-09 09:26:21.576000 \n", "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10000.wav 91180 2023-08-09 09:26:40.292000 \n", "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10001.wav 231980 2023-08-09 09:26:37.497000 \n", "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10002.wav 178732 2023-08-09 09:26:34.107000 \n", "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10003.wav 238636 2023-08-09 09:26:36.553000 \n", "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10004.wav 217644 2023-08-09 09:26:26.943000 \n", "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10005.wav 291884 2023-08-09 09:26:29.391000 \n", "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10006.wav 179244 2023-08-09 09:26:32.823000 \n", "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10007.wav 88620 2023-08-09 09:26:30.645000 \n", "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10008.wav 237612 2023-08-09 09:26:23.381000 \n", "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10009.wav 227884 2023-08-09 09:26:25.659000 \n", "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_1001.wav 90156 2023-08-09 09:26:19.720000 \n", "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10010.wav 137260 2023-08-09 09:26:26.951000 \n", "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10011.wav 134700 2023-08-09 09:26:29.294000 \n", "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10012.wav 233516 2023-08-09 09:26:32.976000 \n", "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10013.wav 192556 2023-08-09 09:26:30.592000 \n" ] } ] }, { "cell_type": "code", "source": [ "import subprocess\n", "\n", "print(\"Getting full file listing for walimuhammadahmad/fakeaudio...\")\n", "print(\"(this might take a few seconds \u2014 large dataset)\\n\")\n", "\n", "# Run kaggle datasets files and capture all output\n", "result = subprocess.run(\n", " ['kaggle', 'datasets', 'files', 'walimuhammadahmad/fakeaudio', '-v'],\n", " capture_output=True, text=True, timeout=60\n", ")\n", "output = result.stdout\n", "\n", "# Extract unique folder paths\n", "import re\n", "folders = set()\n", "for line in output.split('\\n'):\n", " parts = line.split('/')\n", " if len(parts) >= 2 and parts[0] in ['generated_audio', 'real_audio', 'training_audio', 'fake', 'real']:\n", " # First two levels of path\n", " folder = f\"{parts[0]}/{parts[1]}\"\n", " folders.add(folder)\n", " elif len(parts) >= 1 and parts[0] not in ['name', '-' * 5, '']:\n", " folders.add(parts[0])\n", "\n", "print(\"Top-level folders detected:\")\n", "for f in sorted(folders):\n", " if '/' in f:\n", " print(f\" {f}/\")\n", " else:\n", " print(f\" {f}/ (top-level)\")\n", "\n", "# Also try paginated listing\n", "print(\"\\nFirst 200 file paths (sample to identify all vocoder folders):\")\n", "print(output[:5000]) # show first 5KB of listing\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "5F9o0D4NdZ-H", "executionInfo": { "status": "ok", "timestamp": 1777771853274, "user_tz": 420, "elapsed": 716, "user": { "displayName": "Sara Jaffrani", "userId": "07677779715251349607" } }, "outputId": "788e03f4-dc05-4892-e6a5-7422b4517741" }, "execution_count": 27, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Getting full file listing for walimuhammadahmad/fakeaudio...\n", "(this might take a few seconds \u2014 large dataset)\n", "\n", "Top-level folders detected:\n", " Next Page Token = CfDJ8OqP5ZkTT9ZGj66XXRbxXyBoI2XKwzEDNs4rPWYF8XEvV6A6AtAEPlyA_QntgBaZqDQYTQaKhZmMUri3lcPgEmSU2njgsXOqWqsgAl3aWs9EbSRQNuAVNKK0t8kEbK4wW5Qn-1MDdSUwQUnKW8hBUrJ41dSnVvxoGlSB7kvFgUkEDkcCiccTnTrluynAMyDO3z4xgov5QyCiVhHKLNbx02wFRatYdcCLiVwy-GuwgSFaxJs/ (top-level)\n", " generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/\n", " name,size,creationDate/ (top-level)\n", "\n", "First 200 file paths (sample to identify all vocoder folders):\n", "Next Page Token = CfDJ8OqP5ZkTT9ZGj66XXRbxXyBoI2XKwzEDNs4rPWYF8XEvV6A6AtAEPlyA_QntgBaZqDQYTQaKhZmMUri3lcPgEmSU2njgsXOqWqsgAl3aWs9EbSRQNuAVNKK0t8kEbK4wW5Qn-1MDdSUwQUnKW8hBUrJ41dSnVvxoGlSB7kvFgUkEDkcCiccTnTrluynAMyDO3z4xgov5QyCiVhHKLNbx02wFRatYdcCLiVwy-GuwgSFaxJs\n", "name,size,creationDate\n", "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_0.wav,160812,2023-08-09 09:26:04.307000\n", "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_1.wav,54316,2023-08-09 09:26:07.753000\n", "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10.wav,313900,2023-08-09 09:26:13.366000\n", "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_100.wav,106540,2023-08-09 09:26:06.029000\n", "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_1000.wav,186924,2023-08-09 09:26:21.576000\n", "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10000.wav,91180,2023-08-09 09:26:40.292000\n", "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10001.wav,231980,2023-08-09 09:26:37.497000\n", "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10002.wav,178732,2023-08-09 09:26:34.107000\n", "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10003.wav,238636,2023-08-09 09:26:36.553000\n", "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10004.wav,217644,2023-08-09 09:26:26.943000\n", "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10005.wav,291884,2023-08-09 09:26:29.391000\n", "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10006.wav,179244,2023-08-09 09:26:32.823000\n", "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10007.wav,88620,2023-08-09 09:26:30.645000\n", "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10008.wav,237612,2023-08-09 09:26:23.381000\n", "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10009.wav,227884,2023-08-09 09:26:25.659000\n", "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_1001.wav,90156,2023-08-09 09:26:19.720000\n", "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10010.wav,137260,2023-08-09 09:26:26.951000\n", "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10011.wav,134700,2023-08-09 09:26:29.294000\n", "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10012.wav,233516,2023-08-09 09:26:32.976000\n", "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10013.wav,192556,2023-08-09 09:26:30.592000\n", "\n" ] } ] }, { "cell_type": "code", "source": [ "import os, time\n", "\n", "DOWNLOAD_DIR = '/content/wavefake'\n", "os.makedirs(DOWNLOAD_DIR, exist_ok=True)\n", "\n", "# Check disk space first\n", "print(\"Checking disk space...\")\n", "!df -h /content | tail -1\n", "print()\n", "\n", "print(\"Downloading WaveFake (~29 GB)...\")\n", "print(\"Expected: 8-15 min on Colab\\n\")\n", "\n", "start = time.time()\n", "!kaggle datasets download -d walimuhammadahmad/fakeaudio -p {DOWNLOAD_DIR} --unzip --force --quiet\n", "elapsed_min = (time.time() - start) / 60\n", "print(f\"\\nDownload+unzip done in {elapsed_min:.1f} min.\")\n", "\n", "# Show actual structure\n", "print(\"\\nTop-level structure:\")\n", "!ls -la {DOWNLOAD_DIR}/\n", "\n", "print(\"\\nIf there's a 'generated_audio' folder, list its subfolders:\")\n", "gen_dir = f'{DOWNLOAD_DIR}/generated_audio'\n", "if os.path.exists(gen_dir):\n", " !ls -la {gen_dir}/" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "pYZZ0MbNeKu1", "executionInfo": { "status": "ok", "timestamp": 1777772913411, "user_tz": 420, "elapsed": 863123, "user": { "displayName": "Sara Jaffrani", "userId": "07677779715251349607" } }, "outputId": "12869aa4-1f10-4213-9765-b27b929a3b81" }, "execution_count": 28, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Checking disk space...\n", "overlay 236G 60G 177G 26% /\n", "\n", "Downloading WaveFake (~29 GB)...\n", "Expected: 8-15 min on Colab\n", "\n", "Dataset URL: https://www.kaggle.com/datasets/walimuhammadahmad/fakeaudio\n", "License(s): ODC Public Domain Dedication and Licence (PDDL)\n", "\n", "Download+unzip done in 14.4 min.\n", "\n", "Top-level structure:\n", "total 12\n", "drwxr-xr-x 3 root root 4096 May 3 01:48 .\n", "drwxr-xr-x 1 root root 4096 May 3 01:34 ..\n", "drwxr-xr-x 12 root root 4096 May 3 01:47 generated_audio\n", "\n", "If there's a 'generated_audio' folder, list its subfolders:\n", "total 4700\n", "drwxr-xr-x 12 root root 4096 May 3 01:47 .\n", "drwxr-xr-x 3 root root 4096 May 3 01:48 ..\n", "drwxr-xr-x 3 root root 520192 May 3 01:41 common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech\n", "drwxr-xr-x 2 root root 233472 May 3 01:42 jsut_multi_band_melgan\n", "drwxr-xr-x 2 root root 233472 May 3 01:42 jsut_parallel_wavegan\n", "drwxr-xr-x 2 root root 536576 May 3 01:43 ljspeech_full_band_melgan\n", "drwxr-xr-x 2 root root 589824 May 3 01:44 ljspeech_hifiGAN\n", "drwxr-xr-x 2 root root 536576 May 3 01:45 ljspeech_melgan\n", "drwxr-xr-x 2 root root 536576 May 3 01:45 ljspeech_melgan_large\n", "drwxr-xr-x 2 root root 536576 May 3 01:46 ljspeech_multi_band_melgan\n", "drwxr-xr-x 2 root root 536576 May 3 01:47 ljspeech_parallel_wavegan\n", "drwxr-xr-x 2 root root 503808 May 3 01:48 ljspeech_waveglow\n" ] } ] }, { "cell_type": "code", "source": [ "print(\"Searching Kaggle for LJSpeech...\\n\")\n", "!kaggle datasets list -s \"ljspeech\" --max-size 10000000000 2>&1 | head -20" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "b51Ym3gxiBuZ", "executionInfo": { "status": "ok", "timestamp": 1777773063295, "user_tz": 420, "elapsed": 1008, "user": { "displayName": "Sara Jaffrani", "userId": "07677779715251349607" } }, "outputId": "0157f260-2d98-4e34-ec1a-cf3b0fa8125e" }, "execution_count": 29, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Searching Kaggle for LJSpeech...\n", "\n", "ref title size lastUpdated downloadCount voteCount usabilityRating \n", "--------------------------------------------------------- -------------------------- ---------- -------------------------- ------------- --------- --------------- \n", "dromosys/ljspeech LJSpeech 6422684440 2018-09-17 00:20:03.623000 972 18 0.1764706 \n", "awsaf49/ljspeech-sr16k-dataset LJSpeech sr16k Dataset 2342225987 2023-09-13 21:16:44.393000 1260 7 0.5294118 \n", "maxbr0wn/culledjane-eyre-ljspeech Cleaned Jane Eyre LJSpeech 1597594099 2024-04-22 12:54:41.707000 264 4 0.9411765 \n", "mathurinache/the-lj-speech-dataset The LJ Speech Dataset 3211137023 2021-02-15 09:19:54.243000 11034 172 1 \n", "juliocaquino/ljspeech LJSpeech 354243568 2024-04-10 13:07:40.557000 17 1 0.3125 \n", "victorling/ljspeech LJSpeech 3211137023 2021-10-25 11:42:35.830000 21 2 0.3529412 \n", "ryanrudes/ljspeech LJSpeech 3212652913 2022-04-10 13:36:24.367000 16 2 0.23529412 \n", "prasannakasar/ljspeech-1-1-with-mel-and-mag-of-each-audio LJSpeech-1.1 0 2024-10-08 04:00:06.253000 5 1 0.4117647 \n", "rahulbhalley/ljspeech11 LJSpeech-1.1 3211137023 2021-12-05 10:32:25.580000 306 2 0.1764706 \n", "maxbr0wn/janeeyre Jane Eyre LJSpeech 1615805879 2024-01-29 16:37:04.650000 62 0 0.9411765 \n", "tttzof351/ljspeech-meta ljspeech_meta 613174 2023-06-20 21:33:07.487000 575 2 0.1764706 \n", "phhasian0710/ljspeech LJSpeech 6422684440 2019-06-15 07:30:23.807000 76 1 0.11764706 \n", "awsaf49/ljspeech-dataset LJSpeech: Dataset 3211137023 2022-02-06 15:15:30.893000 337 0 0.5294118 \n", "mobassir/comprehensive-bangla-tts comprehensive bangla tts 5860311293 2023-08-24 07:49:28.087000 557 7 0.6875 \n", "fredrelec/ljspeech-indian Ljspeech Indian 2198467078 2025-12-10 03:48:47.830000 0 1 0.3125 \n", "fag9897/ljspeech LJSpeech 3211137023 2022-01-10 17:43:02.657000 4 1 0.11764706 \n", "ashokneupane/ljspeecj ljspeech 1514409347 2024-05-29 20:21:59.727000 2 0 0.1764706 \n", "saramedhat38/ljspeech LJSPEECH 3211137023 2025-04-20 22:49:51.600000 7 0 0.1764706 \n" ] } ] }, { "cell_type": "code", "source": [ "import os, time\n", "\n", "LJSPEECH_DIR = '/content/ljspeech'\n", "os.makedirs(LJSPEECH_DIR, exist_ok=True)\n", "\n", "print(\"Downloading LJSpeech (~3.2 GB)...\")\n", "print(\"Expected: 2-5 min\\n\")\n", "\n", "start = time.time()\n", "!kaggle datasets download -d mathurinache/the-lj-speech-dataset -p {LJSPEECH_DIR} --unzip --force --quiet\n", "elapsed_min = (time.time() - start) / 60\n", "print(f\"\\nDownload+unzip done in {elapsed_min:.1f} min.\")\n", "\n", "# Show structure\n", "print(\"\\nTop-level structure:\")\n", "!ls -la {LJSPEECH_DIR}/\n", "\n", "# Look for the wavs folder\n", "import glob\n", "wav_candidates = []\n", "for root, dirs, files in os.walk(LJSPEECH_DIR):\n", " if 'wav' in root.lower() and any(f.endswith('.wav') for f in files):\n", " n_wavs = len([f for f in files if f.endswith('.wav')])\n", " wav_candidates.append((root, n_wavs))\n", "\n", "print(\"\\nWAV folders found:\")\n", "for path, n in wav_candidates:\n", " print(f\" {path}: {n:,} files\")\n", "\n", "# Check disk space after\n", "print(\"\\nDisk space:\")\n", "!df -h /content | tail -1" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "JiLh9UuciNQT", "executionInfo": { "status": "ok", "timestamp": 1777773202398, "user_tz": 420, "elapsed": 93056, "user": { "displayName": "Sara Jaffrani", "userId": "07677779715251349607" } }, "outputId": "3fc161c4-7cfa-47a8-97e6-c75f0b6cf897" }, "execution_count": 30, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Downloading LJSpeech (~3.2 GB)...\n", "Expected: 2-5 min\n", "\n", "Dataset URL: https://www.kaggle.com/datasets/mathurinache/the-lj-speech-dataset\n", "License(s): CC0-1.0\n", "\n", "Download+unzip done in 1.5 min.\n", "\n", "Top-level structure:\n", "total 12\n", "drwxr-xr-x 3 root root 4096 May 3 01:53 .\n", "drwxr-xr-x 1 root root 4096 May 3 01:51 ..\n", "drwxr-xr-x 3 root root 4096 May 3 01:52 LJSpeech-1.1\n", "\n", "WAV folders found:\n", " /content/ljspeech/LJSpeech-1.1/wavs: 13,100 files\n", "\n", "Disk space:\n", "overlay 236G 96G 141G 41% /\n" ] } ] }, { "cell_type": "code", "source": [ "import os, random, glob\n", "\n", "random.seed(42) # reproducibility\n", "\n", "# ---- Bonafide pool: LJSpeech ----\n", "LJ_WAV_DIR = '/content/ljspeech/LJSpeech-1.1/wavs'\n", "all_lj_files = sorted(glob.glob(f'{LJ_WAV_DIR}/*.wav'))\n", "print(f\"LJSpeech total: {len(all_lj_files):,} files\")\n", "\n", "# Sample 1,500 bonafide\n", "sampled_bonafide = random.sample(all_lj_files, 1500)\n", "print(f\"Sampled bonafide: {len(sampled_bonafide)}\")\n", "\n", "# ---- Spoof pool: 9 vocoders ----\n", "WAVEFAKE_DIR = '/content/wavefake/generated_audio'\n", "target_vocoders = [\n", " 'ljspeech_melgan',\n", " 'ljspeech_melgan_large',\n", " 'ljspeech_multi_band_melgan',\n", " 'ljspeech_full_band_melgan',\n", " 'ljspeech_parallel_wavegan',\n", " 'ljspeech_waveglow',\n", " 'ljspeech_hifiGAN',\n", " 'jsut_multi_band_melgan',\n", " 'jsut_parallel_wavegan',\n", "]\n", "\n", "vocoder_samples = {}\n", "print(\"\\nSampling spoofed audio per vocoder:\")\n", "for vocoder in target_vocoders:\n", " folder = f'{WAVEFAKE_DIR}/{vocoder}'\n", " files = sorted(glob.glob(f'{folder}/*.wav'))\n", " if len(files) == 0:\n", " print(f\" WARNING: {vocoder} \u2014 no .wav files found, checking other extensions\")\n", " files = sorted(glob.glob(f'{folder}/*'))\n", " n_avail = len(files)\n", " n_target = min(1000, n_avail)\n", " sampled = random.sample(files, n_target) if n_avail >= 1000 else files\n", " vocoder_samples[vocoder] = sampled\n", " print(f\" {vocoder}: {n_avail:,} available \u2192 sampled {len(sampled):,}\")\n", "\n", "# ---- Build the unified utterance list ----\n", "# Each entry: (file_path, label_int, vocoder_or_bonafide_id, utterance_id)\n", "class Utt:\n", " \"\"\"Lightweight utterance record for WaveFake eval.\"\"\"\n", " def __init__(self, flac_path, label_int, vocoder, utterance_id):\n", " self.flac_path = flac_path # named flac_path for compat with ASVspoofDataset\n", " self.label_int = label_int\n", " self.vocoder = vocoder # custom field for breakdown\n", " self.utterance_id = utterance_id\n", " self.label = 'bonafide' if label_int == 0 else 'spoof'\n", "\n", "utts_wavefake = []\n", "\n", "# Add bonafide\n", "for path in sampled_bonafide:\n", " uid = os.path.basename(path).replace('.wav', '') # e.g. LJ001-0001\n", " utts_wavefake.append(Utt(path, 0, 'bonafide_LJSpeech', uid))\n", "\n", "# Add spoof per vocoder\n", "for vocoder, files in vocoder_samples.items():\n", " for path in files:\n", " uid = f\"{vocoder}_{os.path.basename(path).replace('.wav', '')}\"\n", " utts_wavefake.append(Utt(path, 1, vocoder, uid))\n", "\n", "# Shuffle for randomness in batching\n", "random.shuffle(utts_wavefake)\n", "\n", "# Summary\n", "from collections import Counter\n", "print(f\"\\nTotal utterances: {len(utts_wavefake):,}\")\n", "print(f\"Bonafide: {sum(1 for u in utts_wavefake if u.label_int == 0):,}\")\n", "print(f\"Spoof: {sum(1 for u in utts_wavefake if u.label_int == 1):,}\")\n", "print(f\"\\nVocoder distribution:\")\n", "for vocoder, n in Counter(u.vocoder for u in utts_wavefake).most_common():\n", " print(f\" {vocoder}: {n:,}\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "u-s9T5-ulgIG", "executionInfo": { "status": "ok", "timestamp": 1777773974319, "user_tz": 420, "elapsed": 1726, "user": { "displayName": "Sara Jaffrani", "userId": "07677779715251349607" } }, "outputId": "125a4910-b5f3-40e5-a40f-e32d154cce6d" }, "execution_count": 31, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "LJSpeech total: 13,100 files\n", "Sampled bonafide: 1500\n", "\n", "Sampling spoofed audio per vocoder:\n", " ljspeech_melgan: 13,100 available \u2192 sampled 1,000\n", " ljspeech_melgan_large: 13,100 available \u2192 sampled 1,000\n", " ljspeech_multi_band_melgan: 13,100 available \u2192 sampled 1,000\n", " ljspeech_full_band_melgan: 13,100 available \u2192 sampled 1,000\n", " ljspeech_parallel_wavegan: 13,100 available \u2192 sampled 1,000\n", " ljspeech_waveglow: 13,100 available \u2192 sampled 1,000\n", " ljspeech_hifiGAN: 13,100 available \u2192 sampled 1,000\n", " jsut_multi_band_melgan: 5,000 available \u2192 sampled 1,000\n", " jsut_parallel_wavegan: 5,000 available \u2192 sampled 1,000\n", "\n", "Total utterances: 10,500\n", "Bonafide: 1,500\n", "Spoof: 9,000\n", "\n", "Vocoder distribution:\n", " bonafide_LJSpeech: 1,500\n", " ljspeech_melgan: 1,000\n", " ljspeech_full_band_melgan: 1,000\n", " ljspeech_parallel_wavegan: 1,000\n", " ljspeech_waveglow: 1,000\n", " jsut_parallel_wavegan: 1,000\n", " ljspeech_melgan_large: 1,000\n", " ljspeech_multi_band_melgan: 1,000\n", " jsut_multi_band_melgan: 1,000\n", " ljspeech_hifiGAN: 1,000\n" ] } ] }, { "cell_type": "code", "source": [ "import torchaudio\n", "import torch\n", "\n", "print(\"Sanity-checking a few files from each category...\\n\")\n", "\n", "samples_to_test = [\n", " (\"LJSpeech bonafide\", utts_wavefake[0]),\n", "]\n", "\n", "seen_vocoders = {'bonafide_LJSpeech'}\n", "for u in utts_wavefake:\n", " if u.vocoder not in seen_vocoders:\n", " samples_to_test.append((u.vocoder, u))\n", " seen_vocoders.add(u.vocoder)\n", " if len(samples_to_test) >= 10:\n", " break\n", "\n", "for label, u in samples_to_test:\n", " try:\n", " w, sr = torchaudio.load(u.flac_path)\n", " duration = w.shape[1] / sr\n", " print(f\" [{label:<35}] sr={sr}, shape={tuple(w.shape)}, duration={duration:.2f}s\")\n", " except Exception as e:\n", " print(f\" [{label:<35}] FAILED: {type(e).__name__}: {e}\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "DO-omUogmPUM", "executionInfo": { "status": "ok", "timestamp": 1777774165833, "user_tz": 420, "elapsed": 175, "user": { "displayName": "Sara Jaffrani", "userId": "07677779715251349607" } }, "outputId": "00f969a5-800b-4583-d2ce-7d3193a68ae7" }, "execution_count": 32, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Sanity-checking a few files from each category...\n", "\n", " [LJSpeech bonafide ] sr=22050, shape=(1, 84224), duration=3.82s\n", " [ljspeech_melgan ] sr=22050, shape=(1, 84224), duration=3.82s\n", " [ljspeech_full_band_melgan ] sr=22050, shape=(1, 42496), duration=1.93s\n", " [ljspeech_parallel_wavegan ] sr=22050, shape=(1, 94976), duration=4.31s\n", " [ljspeech_waveglow ] sr=22050, shape=(1, 154880), duration=7.02s\n", " [jsut_parallel_wavegan ] sr=24000, shape=(1, 123900), duration=5.16s\n", " [ljspeech_melgan_large ] sr=22050, shape=(1, 122880), duration=5.57s\n", " [ljspeech_multi_band_melgan ] sr=22050, shape=(1, 87040), duration=3.95s\n", " [jsut_multi_band_melgan ] sr=24000, shape=(1, 87900), duration=3.66s\n", " [ljspeech_hifiGAN ] sr=22050, shape=(1, 208896), duration=9.47s\n" ] } ] }, { "cell_type": "code", "source": [ "import sys, importlib\n", "\n", "# Make sure we have the latest preprocessing module\n", "if 'src.data.preprocessing' in sys.modules:\n", " importlib.reload(sys.modules['src.data.preprocessing'])\n", "from src.data.preprocessing import load_audio, WINDOW_SAMPLES, SAMPLE_RATE\n", "\n", "print(f\"Pipeline target SR: {SAMPLE_RATE} Hz\")\n", "print(f\"Window samples (4 sec at target SR): {WINDOW_SAMPLES}\\n\")\n", "\n", "# Test on one LJSpeech file (22050 Hz source)\n", "test_lj = utts_wavefake[0] # first one is bonafide LJSpeech (we shuffled but bonafide are most common at start of list)\n", "# Find a bonafide one explicitly\n", "test_lj = next(u for u in utts_wavefake if u.vocoder == 'bonafide_LJSpeech')\n", "print(f\"Testing LJSpeech file: {test_lj.utterance_id}\")\n", "\n", "w_loaded = load_audio(test_lj.flac_path)\n", "print(f\" Loaded shape: {w_loaded.shape}\")\n", "print(f\" Implied duration at 16 kHz: {w_loaded.shape[0] / 16000:.2f}s\")\n", "print(f\" Min/max: {w_loaded.min():.3f} / {w_loaded.max():.3f}\")\n", "\n", "# Test on a JSUT file (24000 Hz source)\n", "test_jsut = next(u for u in utts_wavefake if u.vocoder == 'jsut_parallel_wavegan')\n", "print(f\"\\nTesting JSUT file: {test_jsut.utterance_id}\")\n", "w_loaded = load_audio(test_jsut.flac_path)\n", "print(f\" Loaded shape: {w_loaded.shape}\")\n", "print(f\" Implied duration at 16 kHz: {w_loaded.shape[0] / 16000:.2f}s\")\n", "print(f\" Min/max: {w_loaded.min():.3f} / {w_loaded.max():.3f}\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "L8SQKEj6mdYi", "executionInfo": { "status": "ok", "timestamp": 1777774224630, "user_tz": 420, "elapsed": 508, "user": { "displayName": "Sara Jaffrani", "userId": "07677779715251349607" } }, "outputId": "357b042e-971b-416c-f010-2a2289efead2" }, "execution_count": 33, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Pipeline target SR: 16000 Hz\n", "Window samples (4 sec at target SR): 64000\n", "\n", "Testing LJSpeech file: LJ016-0377\n", " Loaded shape: torch.Size([155967])\n", " Implied duration at 16 kHz: 9.75s\n", " Min/max: -0.502 / 0.559\n", "\n", "Testing JSUT file: jsut_parallel_wavegan_BASIC5000_4802_gen\n", " Loaded shape: torch.Size([82600])\n", " Implied duration at 16 kHz: 5.16s\n", " Min/max: -0.315 / 0.284\n" ] } ] }, { "cell_type": "code", "source": [ "import torchaudio\n", "from tqdm import tqdm\n", "\n", "print(\"Measuring durations on full WaveFake eval set (10,500 utterances)...\")\n", "print(\"Expected: ~3-4 min (resampling overhead included)\\n\")\n", "\n", "eval_durs_wf = []\n", "failed_ids_wf = []\n", "\n", "for u in tqdm(utts_wavefake, desc=\"WaveFake durations\"):\n", " try:\n", " # Use load_audio which resamples to 16 kHz\n", " from src.data.preprocessing import load_audio\n", " w = load_audio(u.flac_path)\n", " eval_durs_wf.append(w.shape[0])\n", " except Exception as e:\n", " eval_durs_wf.append(None)\n", " failed_ids_wf.append((u.utterance_id, str(e)))\n", "\n", "n_valid = sum(1 for d in eval_durs_wf if d is not None)\n", "print(f\"\\nMeasurement complete.\")\n", "print(f\"Total recorded: {len(eval_durs_wf):,}\")\n", "print(f\"Valid: {n_valid:,}\")\n", "print(f\"Failed: {len(failed_ids_wf):,}\")\n", "\n", "if failed_ids_wf:\n", " print(f\"\\nFirst 10 failures:\")\n", " for uid, err in failed_ids_wf[:10]:\n", " print(f\" {uid}: {err[:80]}\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "EQ50sl-PmnYy", "executionInfo": { "status": "ok", "timestamp": 1777774473102, "user_tz": 420, "elapsed": 205818, "user": { "displayName": "Sara Jaffrani", "userId": "07677779715251349607" } }, "outputId": "937ea57a-f775-42e1-8fcb-33600945747c" }, "execution_count": 34, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Measuring durations on full WaveFake eval set (10,500 utterances)...\n", "Expected: ~3-4 min (resampling overhead included)\n", "\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "WaveFake durations: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 10500/10500 [03:25<00:00, 51.02it/s]" ] }, { "output_type": "stream", "name": "stdout", "text": [ "\n", "Measurement complete.\n", "Total recorded: 10,500\n", "Valid: 10,500\n", "Failed: 0\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "\n" ] } ] }, { "cell_type": "code", "source": [ "from src.data.dataset import ASVspoofDataset\n", "from torch.utils.data import DataLoader\n", "import torch\n", "import sys, importlib\n", "\n", "# Make sure model class is fresh\n", "if 'src.models.wav2vec_classifier' in sys.modules:\n", " importlib.reload(sys.modules['src.models.wav2vec_classifier'])\n", "from src.models.wav2vec_classifier import Wav2VecClassifier\n", "\n", "# Build dataset (ASVspoofDataset works because Utt has the same fields it needs)\n", "eval_ds_wf = ASVspoofDataset(utts_wavefake, durations_samples=eval_durs_wf)\n", "eval_loader_wf = DataLoader(\n", " eval_ds_wf, batch_size=16, shuffle=False, num_workers=2, pin_memory=True\n", ")\n", "\n", "print(f\"WaveFake dataset: {len(eval_ds_wf):,} windows from {len(utts_wavefake):,} utterances\")\n", "inflation = len(eval_ds_wf) / len(utts_wavefake)\n", "print(f\"Inflation factor: {inflation:.2f}x\")\n", "\n", "# Reload Stage 2 model (in case it got cleared)\n", "print(\"\\nLoading Stage 2 best checkpoint...\")\n", "device = 'cuda'\n", "model = Wav2VecClassifier(\n", " backbone_name=\"facebook/wav2vec2-base\",\n", " num_classes=2,\n", " freeze_backbone=True,\n", ")\n", "ckpt = torch.load(\n", " '/content/drive/MyDrive/deepfake_audio/checkpoints/stage2_best.pt',\n", " map_location=device, weights_only=False,\n", ")\n", "model.load_state_dict(ckpt['model_state_dict'])\n", "model = model.to(device)\n", "model.eval()\n", "print(f\"Model loaded (epoch {ckpt['epoch']}, dev EER {ckpt['best_eer']*100:.4f}%)\")\n", "\n", "# Run inference\n", "import numpy as np\n", "import time\n", "from tqdm import tqdm\n", "from src.evaluation.metrics import compute_eer, compute_auc, aggregate_window_scores_to_utterance\n", "\n", "# Build lookup for vocoder per utterance\n", "utt_vocoder_map = {u.utterance_id: u.vocoder for u in utts_wavefake}\n", "\n", "print(f\"\\nRunning inference (mixed precision, batch=16)...\")\n", "print(f\"Expected: ~3-5 min\\n\")\n", "\n", "all_window_scores = []\n", "all_window_labels = []\n", "all_window_utts = []\n", "\n", "start = time.time()\n", "with torch.no_grad():\n", " autocast_ctx = torch.amp.autocast(device_type='cuda', enabled=True)\n", " for waveforms, labels, utt_ids in tqdm(eval_loader_wf, desc=\"WaveFake inference\"):\n", " waveforms = waveforms.to('cuda', non_blocking=True)\n", " with autocast_ctx:\n", " logits = model(waveforms)\n", " probs = torch.softmax(logits.float(), dim=-1)\n", " spoof_probs = probs[:, 1].detach().cpu().numpy()\n", "\n", " all_window_scores.extend(spoof_probs.tolist())\n", " all_window_labels.extend(labels.tolist())\n", " all_window_utts.extend(list(utt_ids))\n", "\n", "inference_minutes = (time.time() - start) / 60\n", "print(f\"\\nInference complete in {inference_minutes:.1f} min over {len(all_window_scores):,} windows.\")\n", "\n", "# Aggregate to per-utterance\n", "print(\"\\nAggregating window scores to utterance scores (mean)...\")\n", "utt_scores, utt_ids_sorted = aggregate_window_scores_to_utterance(\n", " np.array(all_window_scores), all_window_utts, method=\"mean\",\n", ")\n", "\n", "# Build per-utterance label and vocoder arrays\n", "utt_label_map = {}\n", "for s, l, u in zip(all_window_scores, all_window_labels, all_window_utts):\n", " if u not in utt_label_map:\n", " utt_label_map[u] = l\n", "\n", "utt_labels = np.array([utt_label_map[u] for u in utt_ids_sorted])\n", "utt_vocoders = np.array([utt_vocoder_map[u] for u in utt_ids_sorted])\n", "\n", "# ---- Overall metrics ----\n", "print(f\"\\n{'='*70}\")\n", "print(f\" SUPPLEMENTARY EVALUATION \u2014 WaveFake (LJSpeech + JSUT)\")\n", "print(f\"{'='*70}\")\n", "n_bona = int((utt_labels == 0).sum())\n", "n_spoof = int((utt_labels == 1).sum())\n", "print(f\"Utterances: {len(utt_scores):,}\")\n", "print(f\"Bonafide: {n_bona:,}\")\n", "print(f\"Spoof: {n_spoof:,}\")\n", "\n", "eer_wf, threshold_wf = compute_eer(utt_scores, utt_labels)\n", "auc_wf = compute_auc(utt_scores, utt_labels)\n", "preds_wf = (utt_scores > threshold_wf).astype(int)\n", "acc_wf = float((preds_wf == utt_labels).mean())\n", "\n", "print(f\"\\nOverall results (Stage 2 model on WaveFake):\")\n", "print(f\" EER: {eer_wf*100:.4f}%\")\n", "print(f\" AUC: {auc_wf:.4f}\")\n", "print(f\" Accuracy: {acc_wf*100:.2f}%\")\n", "print(f\" Threshold: {threshold_wf:.4f}\")\n", "\n", "# ---- Cross-dataset comparison ----\n", "print(f\"\\nCross-dataset comparison:\")\n", "print(f\" Stage 2 dev EER (2019 LA, seen attacks): 0.69%\")\n", "print(f\" Stage 2 eval EER (2019 LA, unseen attacks): 5.55%\")\n", "print(f\" Stage 2 eval EER (2021 LA, codec degraded): 9.09%\")\n", "print(f\" Stage 2 eval EER (WaveFake, novel vocoders): {eer_wf*100:.2f}%\")\n", "\n", "# ---- Per-vocoder EER ----\n", "print(f\"\\n{'='*70}\")\n", "print(f\" PER-VOCODER EER BREAKDOWN (vs LJSpeech bonafide)\")\n", "print(f\"{'='*70}\")\n", "bonafide_scores_all = utt_scores[utt_labels == 0]\n", "spoof_vocoders = sorted(set(v for v in utt_vocoders if v != 'bonafide_LJSpeech'))\n", "\n", "per_vocoder_results = {}\n", "for vocoder in spoof_vocoders:\n", " mask = (utt_vocoders == vocoder)\n", " voc_scores = utt_scores[mask]\n", " n = int(mask.sum())\n", " combined_scores = np.concatenate([bonafide_scores_all, voc_scores])\n", " combined_labels = np.concatenate([\n", " np.zeros(len(bonafide_scores_all)),\n", " np.ones(n),\n", " ])\n", " v_eer, _ = compute_eer(combined_scores, combined_labels)\n", " per_vocoder_results[vocoder] = {\"n\": n, \"eer\": float(v_eer)}\n", " print(f\" {vocoder:<35}: n={n:>5,} EER={v_eer*100:>6.2f}%\")\n", "\n", "# Save raw scores\n", "import os\n", "SCORES_PATH = '/content/deepfake-audio-detection/results/scores/stage2_eval_wavefake.npz'\n", "os.makedirs(os.path.dirname(SCORES_PATH), exist_ok=True)\n", "np.savez(\n", " SCORES_PATH,\n", " utt_ids=np.array(utt_ids_sorted),\n", " utt_scores=utt_scores,\n", " utt_labels=utt_labels,\n", " utt_vocoders=utt_vocoders,\n", ")\n", "print(f\"\\nRaw scores saved to {SCORES_PATH}\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 1000, "referenced_widgets": [ "bb896bf13a8a48c7b31ee6f97ff9c387", "104dd6aba61b4ad4ac0f520328cc1fb4", "5cbd77bc7a3f48c98423dd04c79b22a6", "447f31f2587a418c9133d3d139e8ba4d", "2bfeb4a9753e48fc854c0f1a64a61318", "69b8155b46784de887c14f710d5de5d0", "2426e9f09a0e47c5afc37980c0f43a87", "778e478f9a114526b094a713275b6a3e", "63205811a7904068a7b3a69a7ae7cd89", "59d6be54ba9344f7b9fd3a6b4257a2fb", "1110ef40cbb24e9aaa1bd6cfea6c0f2a" ] }, "id": "WmOcBoRXn8Cm", "executionInfo": { "status": "ok", "timestamp": 1777775090716, "user_tz": 420, "elapsed": 479257, "user": { "displayName": "Sara Jaffrani", "userId": "07677779715251349607" } }, "outputId": "d53a3e7a-4286-4a48-db2e-c56ab04bf918" }, "execution_count": 35, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "WaveFake dataset: 27,483 windows from 10,500 utterances\n", "Inflation factor: 2.62x\n", "\n", "Loading Stage 2 best checkpoint...\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "Wav2Vec2Model LOAD REPORT from: facebook/wav2vec2-base\n", "Key | Status | | \n", "-----------------------------+------------+--+-\n", "quantizer.weight_proj.weight | UNEXPECTED | | \n", "project_q.weight | UNEXPECTED | | \n", "project_hid.weight | UNEXPECTED | | \n", "quantizer.codevectors | UNEXPECTED | | \n", "quantizer.weight_proj.bias | UNEXPECTED | | \n", "project_hid.bias | UNEXPECTED | | \n", "project_q.bias | UNEXPECTED | | \n", "\n", "Notes:\n", "- UNEXPECTED\t:can be ignored when loading from different task/architecture; not ok if you expect identical arch.\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "Model loaded (epoch 9, dev EER 0.6941%)\n", "\n", "Running inference (mixed precision, batch=16)...\n", "Expected: ~3-5 min\n", "\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "WaveFake inference: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 1718/1718 [07:52<00:00, 3.63it/s]\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "\n", "Inference complete in 7.9 min over 27,483 windows.\n", "\n", "Aggregating window scores to utterance scores (mean)...\n", "\n", "======================================================================\n", " SUPPLEMENTARY EVALUATION \u2014 WaveFake (LJSpeech + JSUT)\n", "======================================================================\n", "Utterances: 10,500\n", "Bonafide: 1,500\n", "Spoof: 9,000\n", "\n", "Overall results (Stage 2 model on WaveFake):\n", " EER: 26.3333%\n", " AUC: 0.8250\n", " Accuracy: 73.68%\n", " Threshold: 0.0000\n", "\n", "Cross-dataset comparison:\n", " Stage 2 dev EER (2019 LA, seen attacks): 0.69%\n", " Stage 2 eval EER (2019 LA, unseen attacks): 5.55%\n", " Stage 2 eval EER (2021 LA, codec degraded): 9.09%\n", " Stage 2 eval EER (WaveFake, novel vocoders): 26.33%\n", "\n", "======================================================================\n", " PER-VOCODER EER BREAKDOWN (vs LJSpeech bonafide)\n", "======================================================================\n", " jsut_multi_band_melgan : n=1,000 EER= 1.13%\n", " jsut_parallel_wavegan : n=1,000 EER= 0.83%\n", " ljspeech_full_band_melgan : n=1,000 EER= 30.60%\n", " ljspeech_hifiGAN : n=1,000 EER= 33.23%\n", " ljspeech_melgan : n=1,000 EER= 31.12%\n", " ljspeech_melgan_large : n=1,000 EER= 33.85%\n", " ljspeech_multi_band_melgan : n=1,000 EER= 21.92%\n", " ljspeech_parallel_wavegan : n=1,000 EER= 26.12%\n", " ljspeech_waveglow : n=1,000 EER= 29.60%\n", "\n", "Raw scores saved to /content/deepfake-audio-detection/results/scores/stage2_eval_wavefake.npz\n" ] } ] }, { "cell_type": "code", "source": [ "import json, os\n", "from datetime import datetime\n", "\n", "results_wf = {\n", " \"phase\": \"Phase 5c \u2014 Supplementary Evaluation on WaveFake\",\n", " \"completed_at\": datetime.now().isoformat(),\n", " \"model_checkpoint\": \"/content/drive/MyDrive/deepfake_audio/checkpoints/stage2_best.pt\",\n", " \"model_dev_eer\": 0.0069,\n", " \"evaluation_dataset\": {\n", " \"name\": \"WaveFake (Frank et al., 2021) \u2014 sampled subset\",\n", " \"kaggle_source_spoof\": \"walimuhammadahmad/fakeaudio\",\n", " \"kaggle_source_bonafide\": \"mathurinache/the-lj-speech-dataset\",\n", " \"sampling_strategy\": \"Random sample of 1,500 LJSpeech bonafide + 1,000 spoof per vocoder \u00d7 9 vocoders\",\n", " \"utterances_total\": 10500,\n", " \"windows\": 27483,\n", " \"bonafide_count\": 1500,\n", " \"spoof_count\": 9000,\n", " \"vocoders\": [\n", " \"ljspeech_melgan\", \"ljspeech_melgan_large\", \"ljspeech_multi_band_melgan\",\n", " \"ljspeech_full_band_melgan\", \"ljspeech_parallel_wavegan\",\n", " \"ljspeech_waveglow\", \"ljspeech_hifiGAN\",\n", " \"jsut_multi_band_melgan\", \"jsut_parallel_wavegan\"\n", " ],\n", " },\n", " \"inference\": {\n", " \"batch_size\": 16,\n", " \"mixed_precision\": True,\n", " \"wall_clock_minutes\": 7.9,\n", " \"windows_per_second\": 58,\n", " \"note\": \"Slower windows/sec than ASVspoof because of resampling 22050/24000 \u2192 16000\",\n", " },\n", " \"overall_results\": {\n", " \"eer\": 0.2633,\n", " \"auc\": 0.8250,\n", " \"accuracy\": 0.7368,\n", " \"threshold\": 0.0000,\n", " },\n", " \"cross_dataset_comparison\": {\n", " \"stage2_dev_2019_seen_attacks\": 0.0069,\n", " \"stage2_eval_2019_unseen_attacks\": 0.0555,\n", " \"stage2_eval_2021_unseen_attacks_plus_codecs\": 0.0909,\n", " \"stage2_eval_wavefake_novel_vocoders\": 0.2633,\n", " \"interpretation\": \"Largest cross-dataset gap. Model trained on ASVspoof attacks generalizes only weakly to standalone neural vocoder pipelines.\",\n", " },\n", " \"per_vocoder_eer\": {\n", " \"ljspeech_melgan\": 0.3112,\n", " \"ljspeech_melgan_large\": 0.3385,\n", " \"ljspeech_multi_band_melgan\": 0.2192,\n", " \"ljspeech_full_band_melgan\": 0.3060,\n", " \"ljspeech_parallel_wavegan\": 0.2612,\n", " \"ljspeech_waveglow\": 0.2960,\n", " \"ljspeech_hifiGAN\": 0.3323,\n", " \"jsut_multi_band_melgan\": 0.0113,\n", " \"jsut_parallel_wavegan\": 0.0083,\n", " },\n", " \"methodological_caveats\": [\n", " \"JSUT vocoder EERs (~1%) are likely inflated by domain shortcuts: bonafide is English LJSpeech, JSUT spoofs are Japanese audio at different sample rate (24 kHz vs 22 kHz). Model may be classifying language/speaker rather than detecting spoofing.\",\n", " \"The LJSpeech-based vocoder EERs (22-34%) are the methodologically meaningful results: same speaker, same content, same recording quality as bonafide; only the synthesis differs.\",\n", " \"High EERs on LJSpeech vocoders (mean 29.4%) reveal that ASVspoof-trained models generalize poorly to clean neural vocoder pipelines. This matches the original WaveFake paper's observations.\",\n", " \"Model has not been adapted to WaveFake \u2014 pure cross-dataset evaluation.\",\n", " ],\n", " \"key_findings\": [\n", " \"Cross-dataset robustness varies substantially by distribution shift type:\",\n", " \" - Unseen attack types in same dataset: +4.86 pp (0.69% \u2192 5.55%)\",\n", " \" - Real-world codec degradation: +3.54 pp (5.55% \u2192 9.09%)\",\n", " \" - Novel vocoder pipelines on different domain: +17.24 pp (9.09% \u2192 26.33%)\",\n", " \"Model has learned to detect ASVspoof-specific synthesis artifacts but not pure vocoder artifacts.\",\n", " \"Future work direction: include vocoder-only spoofing data during training to improve cross-dataset generalization.\",\n", " ],\n", " \"raw_scores_path\": \"/content/deepfake-audio-detection/results/scores/stage2_eval_wavefake.npz\",\n", "}\n", "\n", "OUTPUT = '/content/deepfake-audio-detection/results/metrics/stage2_eval_wavefake_results.json'\n", "os.makedirs(os.path.dirname(OUTPUT), exist_ok=True)\n", "with open(OUTPUT, 'w') as f:\n", " json.dump(results_wf, f, indent=2)\n", "\n", "print(f\"Wrote {OUTPUT}\")\n", "print(f\"Size: {os.path.getsize(OUTPUT)} bytes\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "IeogEOWqrFxv", "executionInfo": { "status": "ok", "timestamp": 1777775437818, "user_tz": 420, "elapsed": 75, "user": { "displayName": "Sara Jaffrani", "userId": "07677779715251349607" } }, "outputId": "45839087-0591-4039-fb45-7a854ffc6508" }, "execution_count": 36, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Wrote /content/deepfake-audio-detection/results/metrics/stage2_eval_wavefake_results.json\n", "Size: 3479 bytes\n" ] } ] }, { "cell_type": "code", "source": [ "from google.colab import userdata\n", "import os\n", "\n", "GITHUB_TOKEN = userdata.get('GITHUB_TOKEN')\n", "os.chdir('/content/deepfake-audio-detection')\n", "\n", "!git config user.email \"95262824+Saracasm@users.noreply.github.com\"\n", "!git config user.name \"Sara Iqbal\"\n", "\n", "!git add results/metrics/stage2_eval_wavefake_results.json\n", "!git add results/scores/stage2_eval_wavefake.npz\n", "!git status\n", "\n", "!git commit -m \"Phase 5c: WaveFake eval \u2014 26.33% EER, reveals ASVspoof-specific overfitting\"\n", "\n", "push_url = f\"https://Saracasm:{GITHUB_TOKEN}@github.com/Saracasm/deepfake-audio-detection.git\"\n", "!git push {push_url} main 2>&1 | grep -v {GITHUB_TOKEN}" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "o6M26soOrUYq", "executionInfo": { "status": "ok", "timestamp": 1777775499812, "user_tz": 420, "elapsed": 2350, "user": { "displayName": "Sara Jaffrani", "userId": "07677779715251349607" } }, "outputId": "3b4afe21-70c1-463a-d4c8-d58453f645dc" }, "execution_count": 37, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "On branch main\n", "Your branch is ahead of 'origin/main' by 1 commit.\n", " (use \"git push\" to publish your local commits)\n", "\n", "Changes to be committed:\n", " (use \"git restore --staged ...\" to unstage)\n", "\t\u001b[32mnew file: results/metrics/stage2_eval_wavefake_results.json\u001b[m\n", "\t\u001b[32mnew file: results/scores/stage2_eval_wavefake.npz\u001b[m\n", "\n", "[main 258c630] Phase 5c: WaveFake eval \u2014 26.33% EER, reveals ASVspoof-specific overfitting\n", " 2 files changed, 73 insertions(+)\n", " create mode 100644 results/metrics/stage2_eval_wavefake_results.json\n", " create mode 100644 results/scores/stage2_eval_wavefake.npz\n", "To https://github.com/Saracasm/deepfake-audio-detection.git\n", " 6b144b9..258c630 main -> main\n" ] } ] }, { "cell_type": "markdown", "source": [ "We fine-tune Wav2Vec 2.0 for synthetic speech detection on ASVspoof 2019 LA and characterize its cross-dataset robustness across three distribution shift types: (1) unseen attacks in the original dataset (5.55% EER), (2) real-world codec degradation in ASVspoof 2021 LA (9.09% EER, matching the strongest published baselines), and (3) novel vocoder pipelines on a different domain (WaveFake, 26.33% EER). We show that fine-tuned Wav2Vec features generalize well to attack and codec variations but degrade significantly on out-of-distribution vocoder synthesis, suggesting the model has learned ASVspoof-specific synthesis artifacts rather than universal spoofing detection." ], "metadata": { "id": "Di-Vv52nrupT" } }, { "cell_type": "code", "source": [ "PREDICT_PY = '''\"\"\"\n", "Inference module for deepfake audio detection.\n", "\n", "Wraps the Stage 2 Wav2Vec 2.0 classifier with a clean public API.\n", "\n", "Usage:\n", " from src.inference.predict import DeepfakeDetector\n", " detector = DeepfakeDetector(checkpoint_path=\"path/to/stage2_best.pt\")\n", " result = detector.predict(\"path/to/audio.wav\")\n", " print(result)\n", " # {\"spoof_probability\": 0.84, \"prediction\": \"spoof\", \"confidence\": 0.84,\n", " # \"utterance_duration_sec\": 3.42, \"n_windows\": 1, \"model_version\": \"stage2\"}\n", "\"\"\"\n", "\n", "import os\n", "from typing import Dict, Optional, Union\n", "import torch\n", "import torch.nn.functional as F\n", "import numpy as np\n", "\n", "from src.models.wav2vec_classifier import Wav2VecClassifier\n", "from src.data.preprocessing import load_audio, segment_waveform, WINDOW_SAMPLES\n", "\n", "\n", "# Default classifier threshold. 0.5 is naive; we tuned it during eval.\n", "# Values closer to 0.5 = balanced; lower = more sensitive (more false alarms);\n", "# higher = more conservative (more misses).\n", "DEFAULT_THRESHOLD = 0.5\n", "\n", "\n", "class DeepfakeDetector:\n", " \"\"\"Anti-spoofing classifier wrapper for one-shot inference.\"\"\"\n", "\n", " def __init__(\n", " self,\n", " checkpoint_path: str,\n", " device: Optional[str] = None,\n", " backbone_name: str = \"facebook/wav2vec2-base\",\n", " threshold: float = DEFAULT_THRESHOLD,\n", " use_mixed_precision: bool = True,\n", " ):\n", " \"\"\"\n", " Args:\n", " checkpoint_path: path to a Stage 2 .pt checkpoint\n", " device: 'cuda', 'cpu', or None (auto-detect)\n", " backbone_name: HuggingFace model name for Wav2Vec backbone\n", " threshold: probability threshold above which we predict \"spoof\"\n", " use_mixed_precision: use fp16 inference (faster on GPU)\n", " \"\"\"\n", " if device is None:\n", " device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", " self.device = device\n", " self.threshold = threshold\n", " self.use_mixed_precision = use_mixed_precision and (device == \"cuda\")\n", "\n", " # Build model and load weights\n", " self.model = Wav2VecClassifier(\n", " backbone_name=backbone_name,\n", " num_classes=2,\n", " freeze_backbone=True,\n", " )\n", " ckpt = torch.load(checkpoint_path, map_location=device, weights_only=False)\n", " self.model.load_state_dict(ckpt[\"model_state_dict\"])\n", " self.model = self.model.to(device)\n", " self.model.eval()\n", "\n", " # Store metadata for transparency\n", " self.checkpoint_metadata = {\n", " \"epoch\": ckpt.get(\"epoch\"),\n", " \"best_eer\": ckpt.get(\"best_eer\"),\n", " \"checkpoint_path\": checkpoint_path,\n", " }\n", "\n", " @torch.no_grad()\n", " def predict(\n", " self,\n", " audio_input: Union[str, torch.Tensor, np.ndarray],\n", " return_per_window: bool = False,\n", " ) -> Dict:\n", " \"\"\"Predict bonafide vs spoof for a single audio input.\n", "\n", " Args:\n", " audio_input: either a file path (str), a 1-D Tensor at 16 kHz, or\n", " a 1-D numpy array at 16 kHz.\n", " return_per_window: if True, include per-window probabilities in\n", " the result for debugging.\n", "\n", " Returns:\n", " Dict with keys:\n", " spoof_probability: float in [0, 1]\n", " bonafide_probability: float in [0, 1]\n", " prediction: \"bonafide\" or \"spoof\"\n", " confidence: float in [0, 1] (probability of the predicted class)\n", " utterance_duration_sec: total audio length\n", " n_windows: number of 4-sec windows the audio was split into\n", " window_scores: (only if return_per_window=True) list of per-window spoof probs\n", " \"\"\"\n", " # Step 1: Load and resample audio if needed\n", " if isinstance(audio_input, str):\n", " waveform = load_audio(audio_input) # returns 1-D tensor at 16 kHz\n", " elif isinstance(audio_input, np.ndarray):\n", " waveform = torch.from_numpy(audio_input.astype(np.float32))\n", " elif isinstance(audio_input, torch.Tensor):\n", " waveform = audio_input.float()\n", " if waveform.dim() > 1:\n", " waveform = waveform.squeeze()\n", " else:\n", " raise ValueError(\n", " f\"audio_input must be str, np.ndarray, or torch.Tensor; got {type(audio_input)}\"\n", " )\n", "\n", " duration_sec = float(waveform.shape[0] / 16000)\n", "\n", " # Step 2: Segment into 4-sec windows\n", " windows = segment_waveform(waveform) # list of 1-D tensors of length 64000\n", " n_windows = len(windows)\n", "\n", " # Step 3: Stack into a batch and run inference\n", " batch = torch.stack(windows, dim=0).to(self.device, non_blocking=True)\n", "\n", " if self.use_mixed_precision:\n", " with torch.amp.autocast(device_type=\"cuda\", enabled=True):\n", " logits = self.model(batch)\n", " else:\n", " logits = self.model(batch)\n", "\n", " # Step 4: Compute per-window probabilities, then aggregate (mean)\n", " probs = torch.softmax(logits.float(), dim=-1).cpu().numpy() # (n_windows, 2)\n", " window_spoof_probs = probs[:, 1].tolist()\n", " utt_spoof_prob = float(np.mean(window_spoof_probs))\n", " utt_bonafide_prob = 1.0 - utt_spoof_prob\n", "\n", " # Step 5: Apply threshold for hard prediction\n", " prediction = \"spoof\" if utt_spoof_prob > self.threshold else \"bonafide\"\n", " confidence = utt_spoof_prob if prediction == \"spoof\" else utt_bonafide_prob\n", "\n", " result = {\n", " \"spoof_probability\": utt_spoof_prob,\n", " \"bonafide_probability\": utt_bonafide_prob,\n", " \"prediction\": prediction,\n", " \"confidence\": confidence,\n", " \"utterance_duration_sec\": duration_sec,\n", " \"n_windows\": n_windows,\n", " \"threshold_used\": self.threshold,\n", " }\n", " if return_per_window:\n", " result[\"window_scores\"] = window_spoof_probs\n", " return result\n", "\n", " def info(self) -> Dict:\n", " \"\"\"Return metadata about this model checkpoint.\"\"\"\n", " return {\n", " **self.checkpoint_metadata,\n", " \"device\": self.device,\n", " \"threshold\": self.threshold,\n", " \"mixed_precision\": self.use_mixed_precision,\n", " }\n", "'''\n", "\n", "PATH = '/content/deepfake-audio-detection/src/inference/predict.py'\n", "import os\n", "os.makedirs(os.path.dirname(PATH), exist_ok=True)\n", "\n", "# Also create __init__.py for the module\n", "init_path = '/content/deepfake-audio-detection/src/inference/__init__.py'\n", "if not os.path.exists(init_path):\n", " open(init_path, 'w').close()\n", "\n", "with open(PATH, 'w') as f:\n", " f.write(PREDICT_PY)\n", "\n", "print(f\"Wrote {PATH} ({len(PREDICT_PY)} bytes)\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "1FmHjpqjtUqO", "executionInfo": { "status": "ok", "timestamp": 1777776024694, "user_tz": 420, "elapsed": 56, "user": { "displayName": "Sara Jaffrani", "userId": "07677779715251349607" } }, "outputId": "9ebc3240-cad4-4127-9848-43d4fda0bf6c" }, "execution_count": 38, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Wrote /content/deepfake-audio-detection/src/inference/predict.py (6047 bytes)\n" ] } ] }, { "cell_type": "code", "source": [ "import sys\n", "sys.path.insert(0, '/content/deepfake-audio-detection')\n", "from src.data.protocols import parse_all_partitions\n", "\n", "LA_ROOT = '/content/kaggle_download/LA'\n", "splits = parse_all_partitions(LA_ROOT)\n", "print(f\"Re-parsed:\")\n", "for name, utts in splits.items():\n", " n_bonafide = sum(1 for u in utts if u.label == 'bonafide')\n", " print(f\" {name}: {len(utts):,} (bonafide: {n_bonafide:,})\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "qu09Ku0at_Sf", "executionInfo": { "status": "ok", "timestamp": 1777776197587, "user_tz": 420, "elapsed": 387, "user": { "displayName": "Sara Jaffrani", "userId": "07677779715251349607" } }, "outputId": "f9acc39e-0f22-431e-ac0e-6a8a00610578" }, "execution_count": 40, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Re-parsed:\n", " train: 25,380 (bonafide: 2,580)\n", " dev: 24,844 (bonafide: 2,548)\n", " eval: 71,237 (bonafide: 7,355)\n" ] } ] }, { "cell_type": "code", "source": [ "import sys, importlib\n", "\n", "# Reload modules\n", "for mod in ['src.inference.predict']:\n", " if mod in sys.modules:\n", " importlib.reload(sys.modules[mod])\n", "from src.inference.predict import DeepfakeDetector\n", "\n", "# Build the detector once\n", "print(\"Loading detector...\")\n", "CKPT = '/content/drive/MyDrive/deepfake_audio/checkpoints/stage2_best.pt'\n", "detector = DeepfakeDetector(checkpoint_path=CKPT)\n", "print(f\"\\nDetector loaded. Info:\")\n", "for k, v in detector.info().items():\n", " print(f\" {k}: {v}\")\n", "\n", "# Pick test samples from 2019 LA eval set\n", "print(\"\\n\" + \"=\" * 70)\n", "print(\" TESTING ON REAL AUDIO\")\n", "print(\"=\" * 70)\n", "\n", "test_cases = []\n", "\n", "# 1. Bonafide from 2019 eval\n", "bonafide_eval = [u for u in splits['eval'] if u.label == 'bonafide']\n", "test_cases.append((\"2019 eval bonafide\", bonafide_eval[0]))\n", "\n", "# 2. Easy attack (A13)\n", "attack_a13 = [u for u in splits['eval'] if u.attack_id == 'A13']\n", "test_cases.append((\"2019 eval spoof (A13, easy)\", attack_a13[0]))\n", "\n", "# 3. Hard attack (A10)\n", "attack_a10 = [u for u in splits['eval'] if u.attack_id == 'A10']\n", "test_cases.append((\"2019 eval spoof (A10, hard)\", attack_a10[0]))\n", "\n", "# 4. Medium attack (A07)\n", "attack_a07 = [u for u in splits['eval'] if u.attack_id == 'A07']\n", "test_cases.append((\"2019 eval spoof (A07, medium)\", attack_a07[0]))\n", "\n", "# 5. WaveFake spoof (LJSpeech-based, model struggles here)\n", "import glob\n", "wf_files = sorted(glob.glob('/content/wavefake/generated_audio/ljspeech_hifiGAN/*.wav'))\n", "if wf_files:\n", " class _LightUtt:\n", " def __init__(self, path, uid):\n", " self.flac_path = path\n", " self.utterance_id = uid\n", " self.label = 'spoof' # WaveFake is all spoof\n", " test_cases.append((\"WaveFake spoof (HiFi-GAN)\", _LightUtt(wf_files[0], 'wavefake_hifigan_0')))\n", "\n", "# 6. Real LJSpeech (bonafide, but the model wasn't trained on this domain)\n", "lj_files = sorted(glob.glob('/content/ljspeech/LJSpeech-1.1/wavs/*.wav'))\n", "if lj_files:\n", " test_cases.append((\"LJSpeech bonafide (out-of-domain)\", _LightUtt(lj_files[0], 'lj_bonafide_0')))\n", "\n", "# Run predictions\n", "import time\n", "for label, utt in test_cases:\n", " start = time.time()\n", " result = detector.predict(utt.flac_path)\n", " elapsed_ms = (time.time() - start) * 1000\n", "\n", " expected = utt.label\n", " actual = result['prediction']\n", " correct = \"\u2713\" if expected == actual else \"\u2717\"\n", "\n", " print(f\"\\n [{label}]\")\n", " print(f\" File: {utt.utterance_id}\")\n", " print(f\" Expected: {expected}\")\n", " print(f\" Predicted: {actual} {correct}\")\n", " print(f\" Spoof probability: {result['spoof_probability']:.4f}\")\n", " print(f\" Confidence: {result['confidence']:.4f}\")\n", " print(f\" Duration: {result['utterance_duration_sec']:.2f}s ({result['n_windows']} window{'s' if result['n_windows'] != 1 else ''})\")\n", " print(f\" Inference time: {elapsed_ms:.0f}ms\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 1000, "referenced_widgets": [ "154d0f8dc23e4b8fb191661027beb270", "e983a97e0345486c9c7b8c767ab0b40e", "69fba5a3128e4877b6ac264e7ed954c2", "9c48d6067614434e8ac17be30d67faeb", "835875abf0e049a38d7e7355b3f2341a", "5ce1e1c41b47497cb9a087c1ff0b1704", "6fcb3d97485a4d719e7b3b310526ff07", "8993ebc97b3d4621a74a08a3df02eaf5", "1c5f39a4a4b3436c9f6bf91e67ae5796", "9c444522fbcd43fb8bcfdd2c3c8c79da", "0b029b9c45c4447d907b1276be045cd0" ] }, "id": "h0DqZhbAtix3", "executionInfo": { "status": "ok", "timestamp": 1777776221825, "user_tz": 420, "elapsed": 4169, "user": { "displayName": "Sara Jaffrani", "userId": "07677779715251349607" } }, "outputId": "0f3c8ba9-195b-41d1-b0b0-446ecfec0b1c" }, "execution_count": 41, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Loading detector...\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "Wav2Vec2Model LOAD REPORT from: facebook/wav2vec2-base\n", "Key | Status | | \n", "-----------------------------+------------+--+-\n", "quantizer.weight_proj.weight | UNEXPECTED | | \n", "project_q.weight | UNEXPECTED | | \n", "project_hid.weight | UNEXPECTED | | \n", "quantizer.codevectors | UNEXPECTED | | \n", "quantizer.weight_proj.bias | UNEXPECTED | | \n", "project_hid.bias | UNEXPECTED | | \n", "project_q.bias | UNEXPECTED | | \n", "\n", "Notes:\n", "- UNEXPECTED\t:can be ignored when loading from different task/architecture; not ok if you expect identical arch.\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "\n", "Detector loaded. Info:\n", " epoch: 9\n", " best_eer: 0.006940865275480051\n", " checkpoint_path: /content/drive/MyDrive/deepfake_audio/checkpoints/stage2_best.pt\n", " device: cuda\n", " threshold: 0.5\n", " mixed_precision: True\n", "\n", "======================================================================\n", " TESTING ON REAL AUDIO\n", "======================================================================\n", "\n", " [2019 eval bonafide]\n", " File: LA_E_5849185\n", " Expected: bonafide\n", " Predicted: bonafide \u2713\n", " Spoof probability: 0.0000\n", " Confidence: 1.0000\n", " Duration: 4.39s (2 windows)\n", " Inference time: 240ms\n", "\n", " [2019 eval spoof (A13, easy)]\n", " File: LA_E_5932896\n", " Expected: spoof\n", " Predicted: spoof \u2713\n", " Spoof probability: 1.0000\n", " Confidence: 1.0000\n", " Duration: 5.80s (2 windows)\n", " Inference time: 73ms\n", "\n", " [2019 eval spoof (A10, hard)]\n", " File: LA_E_8339197\n", " Expected: spoof\n", " Predicted: bonafide \u2717\n", " Spoof probability: 0.0001\n", " Confidence: 0.9999\n", " Duration: 1.46s (1 window)\n", " Inference time: 93ms\n", "\n", " [2019 eval spoof (A07, medium)]\n", " File: LA_E_8844552\n", " Expected: spoof\n", " Predicted: spoof \u2713\n", " Spoof probability: 0.6621\n", " Confidence: 0.6621\n", " Duration: 4.12s (2 windows)\n", " Inference time: 59ms\n", "\n", " [WaveFake spoof (HiFi-GAN)]\n", " File: wavefake_hifigan_0\n", " Expected: spoof\n", " Predicted: bonafide \u2717\n", " Spoof probability: 0.2500\n", " Confidence: 0.7500\n", " Duration: 9.65s (4 windows)\n", " Inference time: 85ms\n", "\n", " [LJSpeech bonafide (out-of-domain)]\n", " File: lj_bonafide_0\n", " Expected: spoof\n", " Predicted: bonafide \u2717\n", " Spoof probability: 0.0000\n", " Confidence: 1.0000\n", " Duration: 9.66s (4 windows)\n", " Inference time: 83ms\n" ] } ] }, { "cell_type": "code", "source": [ "from google.colab import userdata\n", "import os\n", "\n", "GITHUB_TOKEN = userdata.get('GITHUB_TOKEN')\n", "os.chdir('/content/deepfake-audio-detection')\n", "\n", "!git config user.email \"95262824+Saracasm@users.noreply.github.com\"\n", "!git config user.name \"Sara Iqbal\"\n", "\n", "!git add src/inference/__init__.py src/inference/predict.py\n", "!git status\n", "!git commit -m \"Phase 6: add production inference module (DeepfakeDetector wrapper)\"\n", "\n", "push_url = f\"https://Saracasm:{GITHUB_TOKEN}@github.com/Saracasm/deepfake-audio-detection.git\"\n", "!git push {push_url} main 2>&1 | grep -v {GITHUB_TOKEN}" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "y5K__qKRuVyl", "executionInfo": { "status": "ok", "timestamp": 1777776291704, "user_tz": 420, "elapsed": 1898, "user": { "displayName": "Sara Jaffrani", "userId": "07677779715251349607" } }, "outputId": "1ae864ee-974e-48a6-944e-6b5af6895cad" }, "execution_count": 42, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "On branch main\n", "Your branch is ahead of 'origin/main' by 2 commits.\n", " (use \"git push\" to publish your local commits)\n", "\n", "Changes to be committed:\n", " (use \"git restore --staged ...\" to unstage)\n", "\t\u001b[32mnew file: src/inference/__init__.py\u001b[m\n", "\t\u001b[32mnew file: src/inference/predict.py\u001b[m\n", "\n", "[main 0e975e7] Phase 6: add production inference module (DeepfakeDetector wrapper)\n", " 2 files changed, 157 insertions(+)\n", " create mode 100644 src/inference/__init__.py\n", " create mode 100644 src/inference/predict.py\n", "To https://github.com/Saracasm/deepfake-audio-detection.git\n", " 258c630..0e975e7 main -> main\n" ] } ] } ] }