{
 "nbformat": 4,
 "nbformat_minor": 0,
 "metadata": {
  "colab": {
   "provenance": [],
   "gpuType": "T4",
   "authorship_tag": "ABX9TyM2MNKf+Ku7E4aMujn8tFCo"
  },
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3"
  },
  "language_info": {
   "name": "python"
  },
  "accelerator": "GPU"
 },
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 107
    },
    "id": "ynFcJbOMDD-1",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1777765038196,
     "user_tz": 420,
     "elapsed": 5378,
     "user": {
      "displayName": "Sara Jaffrani",
      "userId": "07677779715251349607"
     }
    },
    "outputId": "43250049-951e-488c-bbae-710e135c076a"
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Re-uploading kaggle.json (find it in your Downloads folder)...\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ],
      "text/html": [
       "\n",
       "     <input type=\"file\" id=\"files-2a3a830b-5357-40b9-825d-10090b0a1b51\" name=\"files[]\" multiple disabled\n",
       "        style=\"border:none\" />\n",
       "     <output id=\"result-2a3a830b-5357-40b9-825d-10090b0a1b51\">\n",
       "      Upload widget is only available when the cell has been executed in the\n",
       "      current browser session. Please rerun this cell to enable.\n",
       "      </output>\n",
       "      <script>// Copyright 2017 Google LLC\n",
       "//\n",
       "// Licensed under the Apache License, Version 2.0 (the \"License\");\n",
       "// you may not use this file except in compliance with the License.\n",
       "// You may obtain a copy of the License at\n",
       "//\n",
       "//      http://www.apache.org/licenses/LICENSE-2.0\n",
       "//\n",
       "// Unless required by applicable law or agreed to in writing, software\n",
       "// distributed under the License is distributed on an \"AS IS\" BASIS,\n",
       "// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
       "// See the License for the specific language governing permissions and\n",
       "// limitations under the License.\n",
       "\n",
       "/**\n",
       " * @fileoverview Helpers for google.colab Python module.\n",
       " */\n",
       "(function(scope) {\n",
       "function span(text, styleAttributes = {}) {\n",
       "  const element = document.createElement('span');\n",
       "  element.textContent = text;\n",
       "  for (const key of Object.keys(styleAttributes)) {\n",
       "    element.style[key] = styleAttributes[key];\n",
       "  }\n",
       "  return element;\n",
       "}\n",
       "\n",
       "// Max number of bytes which will be uploaded at a time.\n",
       "const MAX_PAYLOAD_SIZE = 100 * 1024;\n",
       "\n",
       "function _uploadFiles(inputId, outputId) {\n",
       "  const steps = uploadFilesStep(inputId, outputId);\n",
       "  const outputElement = document.getElementById(outputId);\n",
       "  // Cache steps on the outputElement to make it available for the next call\n",
       "  // to uploadFilesContinue from Python.\n",
       "  outputElement.steps = steps;\n",
       "\n",
       "  return _uploadFilesContinue(outputId);\n",
       "}\n",
       "\n",
       "// This is roughly an async generator (not supported in the browser yet),\n",
       "// where there are multiple asynchronous steps and the Python side is going\n",
       "// to poll for completion of each step.\n",
       "// This uses a Promise to block the python side on completion of each step,\n",
       "// then passes the result of the previous step as the input to the next step.\n",
       "function _uploadFilesContinue(outputId) {\n",
       "  const outputElement = document.getElementById(outputId);\n",
       "  const steps = outputElement.steps;\n",
       "\n",
       "  const next = steps.next(outputElement.lastPromiseValue);\n",
       "  return Promise.resolve(next.value.promise).then((value) => {\n",
       "    // Cache the last promise value to make it available to the next\n",
       "    // step of the generator.\n",
       "    outputElement.lastPromiseValue = value;\n",
       "    return next.value.response;\n",
       "  });\n",
       "}\n",
       "\n",
       "/**\n",
       " * Generator function which is called between each async step of the upload\n",
       " * process.\n",
       " * @param {string} inputId Element ID of the input file picker element.\n",
       " * @param {string} outputId Element ID of the output display.\n",
       " * @return {!Iterable<!Object>} Iterable of next steps.\n",
       " */\n",
       "function* uploadFilesStep(inputId, outputId) {\n",
       "  const inputElement = document.getElementById(inputId);\n",
       "  inputElement.disabled = false;\n",
       "\n",
       "  const outputElement = document.getElementById(outputId);\n",
       "  outputElement.innerHTML = '';\n",
       "\n",
       "  const pickedPromise = new Promise((resolve) => {\n",
       "    inputElement.addEventListener('change', (e) => {\n",
       "      resolve(e.target.files);\n",
       "    });\n",
       "  });\n",
       "\n",
       "  const cancel = document.createElement('button');\n",
       "  inputElement.parentElement.appendChild(cancel);\n",
       "  cancel.textContent = 'Cancel upload';\n",
       "  const cancelPromise = new Promise((resolve) => {\n",
       "    cancel.onclick = () => {\n",
       "      resolve(null);\n",
       "    };\n",
       "  });\n",
       "\n",
       "  // Wait for the user to pick the files.\n",
       "  const files = yield {\n",
       "    promise: Promise.race([pickedPromise, cancelPromise]),\n",
       "    response: {\n",
       "      action: 'starting',\n",
       "    }\n",
       "  };\n",
       "\n",
       "  cancel.remove();\n",
       "\n",
       "  // Disable the input element since further picks are not allowed.\n",
       "  inputElement.disabled = true;\n",
       "\n",
       "  if (!files) {\n",
       "    return {\n",
       "      response: {\n",
       "        action: 'complete',\n",
       "      }\n",
       "    };\n",
       "  }\n",
       "\n",
       "  for (const file of files) {\n",
       "    const li = document.createElement('li');\n",
       "    li.append(span(file.name, {fontWeight: 'bold'}));\n",
       "    li.append(span(\n",
       "        `(${file.type || 'n/a'}) - ${file.size} bytes, ` +\n",
       "        `last modified: ${\n",
       "            file.lastModifiedDate ? file.lastModifiedDate.toLocaleDateString() :\n",
       "                                    'n/a'} - `));\n",
       "    const percent = span('0% done');\n",
       "    li.appendChild(percent);\n",
       "\n",
       "    outputElement.appendChild(li);\n",
       "\n",
       "    const fileDataPromise = new Promise((resolve) => {\n",
       "      const reader = new FileReader();\n",
       "      reader.onload = (e) => {\n",
       "        resolve(e.target.result);\n",
       "      };\n",
       "      reader.readAsArrayBuffer(file);\n",
       "    });\n",
       "    // Wait for the data to be ready.\n",
       "    let fileData = yield {\n",
       "      promise: fileDataPromise,\n",
       "      response: {\n",
       "        action: 'continue',\n",
       "      }\n",
       "    };\n",
       "\n",
       "    // Use a chunked sending to avoid message size limits. See b/62115660.\n",
       "    let position = 0;\n",
       "    do {\n",
       "      const length = Math.min(fileData.byteLength - position, MAX_PAYLOAD_SIZE);\n",
       "      const chunk = new Uint8Array(fileData, position, length);\n",
       "      position += length;\n",
       "\n",
       "      const base64 = btoa(String.fromCharCode.apply(null, chunk));\n",
       "      yield {\n",
       "        response: {\n",
       "          action: 'append',\n",
       "          file: file.name,\n",
       "          data: base64,\n",
       "        },\n",
       "      };\n",
       "\n",
       "      let percentDone = fileData.byteLength === 0 ?\n",
       "          100 :\n",
       "          Math.round((position / fileData.byteLength) * 100);\n",
       "      percent.textContent = `${percentDone}% done`;\n",
       "\n",
       "    } while (position < fileData.byteLength);\n",
       "  }\n",
       "\n",
       "  // All done.\n",
       "  yield {\n",
       "    response: {\n",
       "      action: 'complete',\n",
       "    }\n",
       "  };\n",
       "}\n",
       "\n",
       "scope.google = scope.google || {};\n",
       "scope.google.colab = scope.google.colab || {};\n",
       "scope.google.colab._files = {\n",
       "  _uploadFiles,\n",
       "  _uploadFilesContinue,\n",
       "};\n",
       "})(self);\n",
       "</script> "
      ]
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Saving kaggle.json to kaggle.json\n",
      "kaggle.json configured.\n"
     ]
    }
   ],
   "source": [
    "from google.colab import files\n",
    "import os, shutil\n",
    "\n",
    "# Destination for the Kaggle API token.\n",
    "KAGGLE_DIR = '/root/.kaggle'\n",
    "KAGGLE_JSON = os.path.join(KAGGLE_DIR, 'kaggle.json')\n",
    "\n",
    "print(\"Re-uploading kaggle.json (find it in your Downloads folder)...\")\n",
    "uploaded = files.upload()\n",
    "\n",
    "# files.upload() returns {filename: bytes}. It is empty when the upload is\n",
    "# cancelled, and the picked file is not necessarily named 'kaggle.json'\n",
    "# (browsers save repeated downloads as e.g. 'kaggle (1).json'), so use the\n",
    "# returned bytes instead of assuming a fixed filename on disk.\n",
    "if not uploaded:\n",
    "    raise SystemExit(\"No file uploaded. Re-run this cell and select kaggle.json.\")\n",
    "uploaded_name, token_bytes = next(iter(uploaded.items()))\n",
    "\n",
    "os.makedirs(KAGGLE_DIR, exist_ok=True)\n",
    "# Create the file owner-only from the start so the token is never group/world readable.\n",
    "fd = os.open(KAGGLE_JSON, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)\n",
    "with os.fdopen(fd, 'wb') as token_file:\n",
    "    token_file.write(token_bytes)\n",
    "os.chmod(KAGGLE_JSON, 0o600)  # also tightens the mode of a pre-existing file\n",
    "\n",
    "# Best-effort cleanup: do not leave a copy of the token in the working directory.\n",
    "if os.path.isfile(uploaded_name):\n",
    "    os.remove(uploaded_name)\n",
    "print(\"kaggle.json configured.\")"
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "\"\"\"\n",
    "SESSION BOOTSTRAP \u2014 Phase 4 (Stage 2)\n",
    "Restores: Drive, repo, dataset, Python imports, wandb auth.\n",
    "\n",
    "Safe to re-run: Drive is mounted only if missing, the repo is cloned or pulled,\n",
    "the dataset is downloaded only if absent, and sys.path is not duplicated.\n",
    "Needs /root/.kaggle/kaggle.json only when the dataset has to be downloaded.\n",
    "\"\"\"\n",
    "import os, sys, time\n",
    "\n",
    "BANNER = \"=\" * 60  # single width for every step header\n",
    "\n",
    "print(BANNER)\n",
    "print(\"Step 1/4: Mount Drive\")\n",
    "print(BANNER)\n",
    "DRIVE_ROOT = '/content/drive/MyDrive'\n",
    "if not os.path.exists(DRIVE_ROOT):\n",
    "    from google.colab import drive\n",
    "    drive.mount('/content/drive')\n",
    "print(\"Drive mounted.\\n\")\n",
    "\n",
    "# Checkpoint and log directories on Drive.\n",
    "os.makedirs(f'{DRIVE_ROOT}/deepfake_audio/checkpoints', exist_ok=True)\n",
    "os.makedirs(f'{DRIVE_ROOT}/deepfake_audio/logs', exist_ok=True)\n",
    "\n",
    "print(BANNER)\n",
    "print(\"Step 2/4: Clone/update repo\")\n",
    "print(BANNER)\n",
    "REPO_DIR = '/content/deepfake-audio-detection'\n",
    "if not os.path.exists(REPO_DIR):\n",
    "    !git clone https://github.com/Saracasm/deepfake-audio-detection.git {REPO_DIR}\n",
    "else:\n",
    "    !cd {REPO_DIR} && git pull --quiet\n",
    "print(f\"Repo at: {REPO_DIR}\\n\")\n",
    "\n",
    "print(BANNER)\n",
    "print(\"Step 3/4: Re-download dataset (~3-5 min)\")\n",
    "print(BANNER)\n",
    "LOCAL_LA = '/content/kaggle_download/LA'\n",
    "\n",
    "if os.path.exists(LOCAL_LA):\n",
    "    print(\"Dataset already present.\")\n",
    "else:\n",
    "    if not os.path.exists('/root/.kaggle/kaggle.json'):\n",
    "        print(\"ERROR: kaggle.json not configured.\")\n",
    "        print(\"Run the kaggle.json upload cell BEFORE this bootstrap.\")\n",
    "        raise SystemExit(\"Need kaggle credentials\")\n",
    "\n",
    "    !pip install -q kaggle\n",
    "    os.makedirs('/content/kaggle_download', exist_ok=True)\n",
    "    start = time.time()\n",
    "    !kaggle datasets download -d anishsarkar22/asvpoof-2019-dataset-la \\\n",
    "        -p /content/kaggle_download --unzip --force --quiet\n",
    "    # A failing `!` shell command does not raise in Python, so confirm the\n",
    "    # expected directory exists instead of reporting success unconditionally.\n",
    "    if not os.path.isdir(LOCAL_LA):\n",
    "        raise SystemExit(f\"Dataset download failed: {LOCAL_LA} not found.\")\n",
    "    print(f\"Downloaded in {(time.time()-start)/60:.1f} minutes.\")\n",
    "\n",
    "print(f\"Dataset at: {LOCAL_LA}\\n\")\n",
    "\n",
    "print(BANNER)\n",
    "print(\"Step 4/4: Set up Python imports + wandb\")\n",
    "print(BANNER)\n",
    "# Put the repo first on sys.path, without stacking duplicates on re-runs.\n",
    "if REPO_DIR not in sys.path:\n",
    "    sys.path.insert(0, REPO_DIR)\n",
    "LA_ROOT = LOCAL_LA\n",
    "\n",
    "# Wandb key from Colab Secrets (so we don't hit interactive prompt)\n",
    "try:\n",
    "    from google.colab import userdata\n",
    "    os.environ['WANDB_API_KEY'] = userdata.get('WANDB_API_KEY')\n",
    "    print(\"Wandb API key loaded from Colab Secrets.\")\n",
    "except Exception as e:\n",
    "    # Deliberately non-fatal: the rest of the bootstrap is still usable without wandb.\n",
    "    print(f\"WANDB_API_KEY not loaded: {e}\")\n",
    "\n",
    "print(f\"\\nLA_ROOT = {LA_ROOT}\")\n",
    "print(f\"REPO_DIR = {REPO_DIR}\")\n",
    "print(\"\\nBootstrap complete. Ready to work.\")"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "SkYfpBWcDi87",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1777766107989,
     "user_tz": 420,
     "elapsed": 1384,
     "user": {
      "displayName": "Sara Jaffrani",
      "userId": "07677779715251349607"
     }
    },
    "outputId": "9164c2ab-63da-4e52-b340-951649835210"
   },
   "execution_count": 3,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "============================================================\n",
      "Step 1/4: Mount Drive\n",
      "============================================================\n",
      "Drive mounted.\n",
      "\n",
      "============================================================\n",
      "Step 2/4: Clone/update repo\n",
      "============================================================\n",
      "Repo at: /content/deepfake-audio-detection\n",
      "\n",
      "============================================================\n",
      "Step 3/4: Re-download dataset (~3-5 min)\n",
      "============================================================\n",
      "Dataset already present.\n",
      "Dataset at: /content/kaggle_download/LA\n",
      "\n",
      "==============================\n",
      "Step 4/4: Set up Python imports + wandb\n",
      "==============================\n",
      "Wandb API key loaded from Colab Secrets.\n",
      "\n",
      "LA_ROOT = /content/kaggle_download/LA\n",
      "REPO_DIR = /content/deepfake-audio-detection\n",
      "\n",
      "Bootstrap complete. Ready to work.\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "# Query Kaggle's dataset search for \"asvspoof 2021\" and show the first 30\n",
    "# lines of the CLI output (stderr merged into stdout).\n",
    "search_msg = \"Searching Kaggle for ASVspoof 2021 LA mirrors...\\n\"\n",
    "print(search_msg)\n",
    "!kaggle datasets list -s \"asvspoof 2021\" --max-size 20000000000 2>&1 | head -30"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "AFVFbqnjHt5c",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1777766166833,
     "user_tz": 420,
     "elapsed": 823,
     "user": {
      "displayName": "Sara Jaffrani",
      "userId": "07677779715251349607"
     }
    },
    "outputId": "f7abf981-e798-4835-9e75-53dfa57d685c"
   },
   "execution_count": 4,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Searching Kaggle for ASVspoof 2021 LA mirrors...\n",
      "\n",
      "ref                                                    title                                       size  lastUpdated                 downloadCount  voteCount  usabilityRating  \n",
      "-----------------------------------------------------  -----------------------------------  -----------  --------------------------  -------------  ---------  ---------------  \n",
      "abdallamohamed312/ready-to-input-for-training          Balanced ASVspoof 2021 PA             5211923567  2024-06-01 05:38:52.957000            477          8  0.875            \n",
      "abdallamohamed312/asv-2021-pa-pt12-into-real-and-fake  ASV 2021 (PA) Part 1,2               14176529532  2024-05-31 20:10:39.510000            147          7  0.6875           \n",
      "abdallamohamed312/asv-2021-pa-part-3                   ASV 2021 (PA) part 3                  7090016509  2024-05-31 23:59:09.450000             82          7  0.6875           \n",
      "chandajha04/asvspoof-2021                              asvspoof-2021                               4225  2025-11-08 13:54:33.450000              2          0  0.25             \n",
      "pratikjodgudri/asvspoof2021-df-audio-dataset           ASVspoof2021_DF_Audio_Dataset         1279232433  2024-11-13 07:45:36.147000            195          4  0.375            \n",
      "eminkorkut/deepfakevoice-wac2vec-4datasets             DeepFakeVoice-Wac2Vec-4Datasets       3309490475  2026-03-08 14:21:29.550000             12          1  0.7058824        \n",
      "eminkorkut/deepfakevoice-hubert-4datasets              DeepFakeVoice-HuBERT-4Datasets        3456502600  2026-03-08 13:51:10.900000              2          1  0.7058824        \n",
      "eminkorkut/deepfakevoice-mfcc-4datasets                DeepFakeVoice-MFCC-4Datasets           180800358  2026-03-08 14:27:46.507000             18          1  0.7058824        \n",
      "eminkorkut/deepfakevoice-google-hear-4datasets         DeepFakeVoice-google-HeAR-4Datasets   4577806640  2026-03-08 15:04:23.780000              8          1  0.75             \n",
      "flarescen/asvspoof-2021-real-samples                   ASVSpoof 2021 real samples             964302088  2025-03-21 14:58:22.087000              5          0  0.125            \n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "# Each entry: (term passed to `kaggle datasets list -s`, optional label suffix).\n",
    "SEARCH_TERMS = [\n",
    "    (\"asvspoof2021 la\", \"\"),\n",
    "    (\"asvspoof la 2021\", \"\"),\n",
    "    (\"spoof 2021 logical access\", \"\"),\n",
    "    (\"asvspoof2021_la\", \" (with underscore)\"),\n",
    "]\n",
    "\n",
    "print(\"Trying alternate search terms for 2021 LA...\\n\")\n",
    "for search_no, (term, label_suffix) in enumerate(SEARCH_TERMS, start=1):\n",
    "    # Blank line between searches, but not before the first one\n",
    "    # (the intro line above already ends with a blank line).\n",
    "    separator = \"\" if search_no == 1 else \"\\n\"\n",
    "    print(f\"{separator}--- Search {search_no}: '{term}'{label_suffix} ---\")\n",
    "    # IPython expands {term} from the notebook namespace before running the command.\n",
    "    !kaggle datasets list -s \"{term}\" --max-size 20000000000 2>&1 | head -15"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "Mwk_PFN5IInP",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1777766277028,
     "user_tz": 420,
     "elapsed": 2571,
     "user": {
      "displayName": "Sara Jaffrani",
      "userId": "07677779715251349607"
     }
    },
    "outputId": "372eacb2-7369-4e37-e224-6250a80fce9a"
   },
   "execution_count": 5,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Trying alternate search terms for 2021 LA...\n",
      "\n",
      "--- Search 1: 'asvspoof2021 la' ---\n",
      "ref                              title                       size  lastUpdated                 downloadCount  voteCount  usabilityRating  \n",
      "-------------------------------  --------------------  ----------  --------------------------  -------------  ---------  ---------------  \n",
      "simontrann/asvspoof2021-la-key   ASVSpoof2021_LA_Key     21237220  2025-10-09 08:22:47.333000              8          0  0.25             \n",
      "simontrann/asvspoof2021-la-eval  ASVSpoof2021_LA_eval  7782355226  2025-10-03 02:33:41.490000              6          0  0.25             \n",
      "ajaysuryal/asvspoof2021-la       ASVspoof2021_LA       7788165738  2025-05-22 01:57:12.537000             15          0  0.125            \n",
      "\n",
      "--- Search 2: 'asvspoof la 2021' ---\n",
      "ref                                             title                                      size  lastUpdated                 downloadCount  voteCount  usabilityRating  \n",
      "----------------------------------------------  -----------------------------------  ----------  --------------------------  -------------  ---------  ---------------  \n",
      "eminkorkut/deepfakevoice-wac2vec-4datasets      DeepFakeVoice-Wac2Vec-4Datasets      3309490475  2026-03-08 14:21:29.550000             12          1  0.7058824        \n",
      "eminkorkut/deepfakevoice-mfcc-4datasets         DeepFakeVoice-MFCC-4Datasets          180800358  2026-03-08 14:27:46.507000             18          1  0.7058824        \n",
      "eminkorkut/deepfakevoice-google-hear-4datasets  DeepFakeVoice-google-HeAR-4Datasets  4577806640  2026-03-08 15:04:23.780000              8          1  0.75             \n",
      "eminkorkut/deepfakevoice-hubert-4datasets       DeepFakeVoice-HuBERT-4Datasets       3456502600  2026-03-08 13:51:10.900000              2          1  0.7058824        \n",
      "\n",
      "--- Search 3: 'spoof 2021 logical access' ---\n",
      "No datasets found\n",
      "\n",
      "--- Search 4: 'asvspoof2021_la' (with underscore) ---\n",
      "ref                         title                  size  lastUpdated                 downloadCount  voteCount  usabilityRating  \n",
      "--------------------------  ---------------  ----------  --------------------------  -------------  ---------  ---------------  \n",
      "ajaysuryal/asvspoof2021-la  ASVspoof2021_LA  7788165738  2025-05-22 01:57:12.537000             15          0  0.125            \n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "# Show the first 40 lines of the file listing for the candidate dataset\n",
    "# (stderr merged into stdout).\n",
    "listing_msg = \"Listing files in ajaysuryal/asvspoof2021-la...\\n\"\n",
    "print(listing_msg)\n",
    "!kaggle datasets files ajaysuryal/asvspoof2021-la 2>&1 | head -40"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "D5IStA6MIX5J",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1777766337119,
     "user_tz": 420,
     "elapsed": 512,
     "user": {
      "displayName": "Sara Jaffrani",
      "userId": "07677779715251349607"
     }
    },
    "outputId": "020efd4a-7b03-4c6a-f9da-d05e0cad885b"
   },
   "execution_count": 6,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Listing files in ajaysuryal/asvspoof2021-la...\n",
      "\n",
      "Next Page Token = CfDJ8OqP5ZkTT9ZGj66XXRbxXyB45tjLm0BH9BDVPb3HnDuRXyp2J5C_xa5hhO_njOW_Qq6MqV4DD3FGJMSDjOinZcsGb3GB-LfTUWLikR7LpgTOmE56Pf8UG1X2Ece9gD53AgaY9SjdAvr42YBpHBQKVnezhTIy8VZJog5ZjECDb5R5-wV5vawtn-x-6UVDlqqStPcziA\n",
      "name                                                                     size  creationDate                \n",
      "--------------------------------------------------------------------  -------  --------------------------  \n",
      "ASVspoof2021_LA/ASVspoof2021_LA_eval/ASVspoof2021.LA.cm.eval.trl.txt  2360358  2025-05-22 02:03:30.763000  \n",
      "ASVspoof2021_LA/ASVspoof2021_LA_eval/LICENSE.txt                        19941  2025-05-22 01:58:39.159000  \n",
      "ASVspoof2021_LA/ASVspoof2021_LA_eval/README.LA.txt                       2233  2025-05-22 02:03:30.737000  \n",
      "ASVspoof2021_LA/ASVspoof2021_LA_eval/flac/LA_E_1000048.flac             44411  2025-05-22 02:03:26.628000  \n",
      "ASVspoof2021_LA/ASVspoof2021_LA_eval/flac/LA_E_1000166.flac            160644  2025-05-22 02:00:13.297000  \n",
      "ASVspoof2021_LA/ASVspoof2021_LA_eval/flac/LA_E_1000174.flac             32835  2025-05-22 01:59:13.205000  \n",
      "ASVspoof2021_LA/ASVspoof2021_LA_eval/flac/LA_E_1000200.flac             31011  2025-05-22 01:58:46.589000  \n",
      "ASVspoof2021_LA/ASVspoof2021_LA_eval/flac/LA_E_1000345.flac             56980  2025-05-22 02:03:23.753000  \n",
      "ASVspoof2021_LA/ASVspoof2021_LA_eval/flac/LA_E_1000349.flac             56097  2025-05-22 02:01:09.378000  \n",
      "ASVspoof2021_LA/ASVspoof2021_LA_eval/flac/LA_E_1000381.flac             19968  2025-05-22 02:00:18.967000  \n",
      "ASVspoof2021_LA/ASVspoof2021_LA_eval/flac/LA_E_1000450.flac             40938  2025-05-22 02:02:19.194000  \n",
      "ASVspoof2021_LA/ASVspoof2021_LA_eval/flac/LA_E_1000479.flac             32567  2025-05-22 02:00:18.892000  \n",
      "ASVspoof2021_LA/ASVspoof2021_LA_eval/flac/LA_E_1000580.flac             32551  2025-05-22 02:02:50.197000  \n",
      "ASVspoof2021_LA/ASVspoof2021_LA_eval/flac/LA_E_1000624.flac             43105  2025-05-22 02:00:58.778000  \n",
      "ASVspoof2021_LA/ASVspoof2021_LA_eval/flac/LA_E_1000636.flac             64684  2025-05-22 01:59:18.951000  \n",
      "ASVspoof2021_LA/ASVspoof2021_LA_eval/flac/LA_E_1000657.flac             45448  2025-05-22 02:02:33.897000  \n",
      "ASVspoof2021_LA/ASVspoof2021_LA_eval/flac/LA_E_1000795.flac             28359  2025-05-22 02:03:09.419000  \n",
      "ASVspoof2021_LA/ASVspoof2021_LA_eval/flac/LA_E_1000854.flac             72252  2025-05-22 02:01:52.340000  \n",
      "ASVspoof2021_LA/ASVspoof2021_LA_eval/flac/LA_E_1000931.flac             44475  2025-05-22 02:00:18.632000  \n",
      "ASVspoof2021_LA/ASVspoof2021_LA_eval/flac/LA_E_1000986.flac             25770  2025-05-22 01:59:11.650000  \n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "import glob\n",
    "import os, time\n",
    "\n",
    "DOWNLOAD_DIR = '/content/kaggle_2021'\n",
    "LA_2021_ROOT = f'{DOWNLOAD_DIR}/ASVspoof2021_LA/ASVspoof2021_LA_eval'\n",
    "flac_dir = f'{LA_2021_ROOT}/flac'\n",
    "os.makedirs(DOWNLOAD_DIR, exist_ok=True)\n",
    "\n",
    "# Skip the ~7.8 GB transfer when the extracted audio directory is already on\n",
    "# disk, so re-running this cell is cheap. Delete DOWNLOAD_DIR to force a fresh\n",
    "# download (e.g. if the file count reported below looks incomplete).\n",
    "if os.path.isdir(flac_dir):\n",
    "    print(\"ASVspoof 2021 LA already present; skipping download.\")\n",
    "else:\n",
    "    print(\"Downloading ASVspoof 2021 LA from Kaggle...\")\n",
    "    print(\"Expected: ~7.8 GB, ~5-8 minutes\\n\")\n",
    "\n",
    "    start = time.time()\n",
    "    !kaggle datasets download -d ajaysuryal/asvspoof2021-la -p {DOWNLOAD_DIR} --unzip --force --quiet\n",
    "    elapsed_min = (time.time() - start) / 60\n",
    "    print(f\"\\nDownload complete in {elapsed_min:.1f} minutes.\")\n",
    "\n",
    "# Verify structure (diagnostic only: reports what is on disk, does not raise)\n",
    "root_exists = os.path.exists(LA_2021_ROOT)\n",
    "print(f\"\\nDataset root: {LA_2021_ROOT}\")\n",
    "print(f\"Exists: {root_exists}\")\n",
    "if root_exists:\n",
    "    print(f\"Top-level contents: {sorted(os.listdir(LA_2021_ROOT))}\")\n",
    "\n",
    "# Count flac files\n",
    "if os.path.exists(flac_dir):\n",
    "    n_flac = len(glob.glob(f'{flac_dir}/*.flac'))\n",
    "    print(f\"\\nFlac files: {n_flac:,}\")\n",
    "else:\n",
    "    print(f\"\\nFlac dir not found at {flac_dir}\")"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "R2xdl-B5Iuh3",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1777766605454,
     "user_tz": 420,
     "elapsed": 176068,
     "user": {
      "displayName": "Sara Jaffrani",
      "userId": "07677779715251349607"
     }
    },
    "outputId": "78064c33-63e8-4d33-c972-e13e79f99e8a"
   },
   "execution_count": 7,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Downloading ASVspoof 2021 LA from Kaggle...\n",
      "Expected: ~7.8 GB, ~5-8 minutes\n",
      "\n",
      "Dataset URL: https://www.kaggle.com/datasets/ajaysuryal/asvspoof2021-la\n",
      "License(s): unknown\n",
      "\n",
      "Download complete in 2.9 minutes.\n",
      "\n",
      "Dataset root: /content/kaggle_2021/ASVspoof2021_LA/ASVspoof2021_LA_eval\n",
      "Exists: True\n",
      "Top-level contents: ['ASVspoof2021.LA.cm.eval.trl.txt', 'LICENSE.txt', 'README.LA.txt', 'flac']\n",
      "\n",
      "Flac files: 181,566\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "# Print the dataset's bundled README in full, framed by a 70-character rule.\n",
    "LA_2021_ROOT = '/content/kaggle_2021/ASVspoof2021_LA/ASVspoof2021_LA_eval'\n",
    "readme_path = f'{LA_2021_ROOT}/README.LA.txt'\n",
    "\n",
    "rule = \"=\" * 70\n",
    "print(rule)\n",
    "print(\"README.LA.txt contents:\")\n",
    "print(rule)\n",
    "with open(readme_path) as readme:\n",
    "    readme_text = readme.read()\n",
    "print(readme_text)"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "Dbb0pBBUKEPU",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1777766781057,
     "user_tz": 420,
     "elapsed": 15,
     "user": {
      "displayName": "Sara Jaffrani",
      "userId": "07677779715251349607"
     }
    },
    "outputId": "1bd97085-b1ed-44d6-b49b-b5e24803e700"
   },
   "execution_count": 8,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "======================================================================\n",
      "README.LA.txt contents:\n",
      "======================================================================\n",
      "=====================================================================================================\n",
      "\n",
      "ASVspoof 2021 Challenge - Logical Access Databas\n",
      "\n",
      "Copyright (c) 2021  \n",
      "\n",
      "National Institute of Informatics, Japan\n",
      "EURECOM, France\n",
      "Inria, France \n",
      "University of Eastern Finland, Finland\n",
      "Institute for Infocomm Research, Singapore\n",
      "\n",
      "=====================================================================================================\n",
      "\n",
      "\n",
      "1. Directory Structure\n",
      "_______________________\n",
      "\n",
      "  ASVspoof2021_LA_eval/\n",
      "      ASVspoof2021.LA.cm.eval.trl.txt list of evaluation data \n",
      "      flac/                           audio files\n",
      "      README.LA.txt                   this file \n",
      "      LICENSE.txt                     license file \n",
      "\n",
      "\n",
      "2. Audio file format\n",
      "_________________________________\n",
      "\n",
      "  All ASVspoof2021_LA_eval audio files are distributed in flac format. \n",
      "  All audio data is sampled at a rate of 16 kHz and stored in 16-bit.\n",
      "\n",
      " 3. Further details \n",
      "______________________\n",
      "\n",
      "  Further details are available via the ASVspoof website (https://www.asvspoof.org)\n",
      "\n",
      "\n",
      " 4. Copying\n",
      "______________________\n",
      "\n",
      "This dataset is licensed under the Open Data Commons Attribution License (ODC-By). \n",
      "\n",
      "Regarding the Open Data Commons Attribution License (ODC-By), please see LICENSE.txt or \n",
      "https://opendatacommons.org/licenses/by/1.0/index.html\n",
      " \n",
      "THIS DATABASE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND \n",
      "ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED \n",
      "WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. \n",
      "IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, \n",
      "INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, \n",
      "BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, \n",
      "OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, \n",
      "WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) \n",
      "ARISING IN ANY WAY OUT OF THE USE OF THIS DATABASE, EVEN IF ADVISED OF THE \n",
      "POSSIBILITY OF SUCH DAMAGE\n",
      "\n",
      "5. Acknowledgements  \n",
      "______________________\n",
      "\n",
      "A part of this database is based on the ASVspoof 2019 database (https://doi.org/10.7488/ds/2555). \n",
      " \n",
      " \n",
      "\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "import os\n",
    "\n",
    "LA_2021_ROOT = '/content/kaggle_2021/ASVspoof2021_LA/ASVspoof2021_LA_eval'\n",
    "PROTO_PATH = f'{LA_2021_ROOT}/ASVspoof2021.LA.cm.eval.trl.txt'\n",
    "\n",
    "# Report the protocol file's path and size, echo the raw repr of its first\n",
    "# 10 lines (so line endings and separators are visible), then the line total.\n",
    "print(f\"Protocol file: {PROTO_PATH}\")\n",
    "print(f\"Size: {os.path.getsize(PROTO_PATH) / 1024:.1f} KB\\n\")\n",
    "\n",
    "divider = \"-\" * 70\n",
    "print(\"First 10 lines:\")\n",
    "print(divider)\n",
    "n_lines = 0  # stays 0 for an empty file\n",
    "with open(PROTO_PATH) as proto:\n",
    "    # Single pass: preview the head while counting every line.\n",
    "    for n_lines, raw_line in enumerate(proto, start=1):\n",
    "        if n_lines <= 10:\n",
    "            print(repr(raw_line))\n",
    "print(divider)\n",
    "\n",
    "print(f\"\\nTotal lines: {n_lines:,}\")"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "DL6EaOl4KQVT",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1777766830927,
     "user_tz": 420,
     "elapsed": 28,
     "user": {
      "displayName": "Sara Jaffrani",
      "userId": "07677779715251349607"
     }
    },
    "outputId": "d9bb5ce0-c15f-48bc-a913-2139d0c867e6"
   },
   "execution_count": 9,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Protocol file: /content/kaggle_2021/ASVspoof2021_LA/ASVspoof2021_LA_eval/ASVspoof2021.LA.cm.eval.trl.txt\n",
      "Size: 2305.0 KB\n",
      "\n",
      "First 10 lines:\n",
      "----------------------------------------------------------------------\n",
      "'LA_E_9332881\\n'\n",
      "'LA_E_6866159\\n'\n",
      "'LA_E_5464494\\n'\n",
      "'LA_E_4759417\\n'\n",
      "'LA_E_2667748\\n'\n",
      "'LA_E_8589971\\n'\n",
      "'LA_E_1911364\\n'\n",
      "'LA_E_5298786\\n'\n",
      "'LA_E_2042719\\n'\n",
      "'LA_E_5449181\\n'\n",
      "----------------------------------------------------------------------\n",
      "\n",
      "Total lines: 181,566\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "import os, time\n",
    "\n",
    "# Destination folder for the ASVspoof 2021 LA key / metadata bundle.\n",
    "KEY_DIR = '/content/kaggle_2021_key'\n",
    "os.makedirs(KEY_DIR, exist_ok=True)\n",
    "\n",
    "print(\"Downloading ASVspoof 2021 LA key file (~21 MB)...\\n\")\n",
    "start = time.time()\n",
    "!kaggle datasets download -d simontrann/asvspoof2021-la-key -p {KEY_DIR} --unzip --force --quiet\n",
    "elapsed_s = time.time() - start\n",
    "print(f\"Downloaded in {elapsed_s:.1f} seconds.\\n\")\n",
    "\n",
    "# Print the downloaded tree, indenting each directory by its depth below KEY_DIR.\n",
    "print(\"Contents of key download:\")\n",
    "for dirpath, _subdirs, filenames in os.walk(KEY_DIR):\n",
    "    depth = dirpath.replace(KEY_DIR, '').count('/')\n",
    "    pad = '  ' * depth\n",
    "    dir_label = os.path.basename(dirpath) or 'root'\n",
    "    print(f\"{pad}{dir_label}/\")\n",
    "    for fname in filenames:\n",
    "        n_bytes = os.path.getsize(os.path.join(dirpath, fname))\n",
    "        # Files above one million bytes are shown in MB, everything else in KB.\n",
    "        if n_bytes > 1e6:\n",
    "            human_size = f\"{n_bytes/1e6:.1f} MB\"\n",
    "        else:\n",
    "            human_size = f\"{n_bytes/1024:.1f} KB\"\n",
    "        print(f\"{pad}  - {fname}  ({human_size})\")"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "9Ny0ulDtKaAP",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1777766872373,
     "user_tz": 420,
     "elapsed": 2155,
     "user": {
      "displayName": "Sara Jaffrani",
      "userId": "07677779715251349607"
     }
    },
    "outputId": "e074a55a-7fbe-4aa7-87fc-2eb7dd3f341a"
   },
   "execution_count": 10,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Downloading ASVspoof 2021 LA key file (~21 MB)...\n",
      "\n",
      "Dataset URL: https://www.kaggle.com/datasets/simontrann/asvspoof2021-la-key\n",
      "License(s): apache-2.0\n",
      "Downloaded in 2.1 seconds.\n",
      "\n",
      "Contents of key download:\n",
      "kaggle_2021_key/\n",
      "  keys/\n",
      "    LA/\n",
      "      - LA-C012-eval.npy  (19.5 KB)\n",
      "      - README.txt  (0.7 KB)\n",
      "      - LA-C012-prog.npy  (14.0 KB)\n",
      "      - LA-C012-hidden.npy  (19.4 KB)\n",
      "      CM/\n",
      "        - trial_metadata.txt  (10.2 MB)\n",
      "        LFCC-LCNN/\n",
      "          - score.txt  (4.2 MB)\n",
      "        CQCC-GMM/\n",
      "          - score.txt  (4.0 MB)\n",
      "        RawNet2/\n",
      "          - score.txt  (5.7 MB)\n",
      "        LFCC-GMM/\n",
      "          - score.txt  (4.0 MB)\n",
      "      ASV/\n",
      "        - trial_metadata.txt  (65.8 MB)\n",
      "        ASVTorch_Kaldi/\n",
      "          - score.txt  (40.9 MB)\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "# CM trial metadata: the protocol lines together with their ground-truth keys.\n",
    "KEY_FILE = '/content/kaggle_2021_key/keys/LA/CM/trial_metadata.txt'\n",
    "\n",
    "print(f\"Key file: {KEY_FILE}\")\n",
    "key_mb = os.path.getsize(KEY_FILE) / 1e6\n",
    "print(f\"Size: {key_mb:.2f} MB\\n\")\n",
    "\n",
    "# Preview the head of the file; repr() keeps separators and newlines visible.\n",
    "rule = \"-\" * 70\n",
    "print(\"First 10 lines:\")\n",
    "print(rule)\n",
    "with open(KEY_FILE) as key_file:\n",
    "    for _idx, raw_line in zip(range(10), key_file):\n",
    "        print(repr(raw_line))\n",
    "print(rule)\n",
    "\n",
    "# Separate pass over the whole file to count its lines.\n",
    "n_lines = 0\n",
    "with open(KEY_FILE) as key_file:\n",
    "    for _line in key_file:\n",
    "        n_lines += 1\n",
    "print(f\"\\nTotal lines: {n_lines:,}\")\n",
    "\n",
    "# Also check the README\n",
    "print(\"\\nREADME contents:\")\n",
    "print(rule)\n",
    "with open('/content/kaggle_2021_key/keys/LA/README.txt') as readme:\n",
    "    readme_text = readme.read()\n",
    "print(readme_text)"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "KmJ212hNKnIS",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1777766924288,
     "user_tz": 420,
     "elapsed": 9,
     "user": {
      "displayName": "Sara Jaffrani",
      "userId": "07677779715251349607"
     }
    },
    "outputId": "358394d8-58ac-4492-8121-19043f823d75"
   },
   "execution_count": 11,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Key file: /content/kaggle_2021_key/keys/LA/CM/trial_metadata.txt\n",
      "Size: 10.16 MB\n",
      "\n",
      "First 10 lines:\n",
      "----------------------------------------------------------------------\n",
      "'LA_0009 LA_E_9332881 alaw ita_tx A07 spoof notrim eval\\n'\n",
      "'LA_0009 LA_E_6866159 alaw ita_tx A07 spoof notrim eval\\n'\n",
      "'LA_0009 LA_E_5464494 alaw sin_tx A07 spoof notrim eval\\n'\n",
      "'LA_0009 LA_E_4759417 alaw sin_tx A07 spoof notrim eval\\n'\n",
      "'LA_0009 LA_E_2667748 alaw loc_tx A07 spoof notrim eval\\n'\n",
      "'LA_0009 LA_E_8589971 alaw loc_tx A07 spoof notrim progress\\n'\n",
      "'LA_0009 LA_E_1911364 alaw loc_tx A07 spoof notrim progress\\n'\n",
      "'LA_0009 LA_E_5298786 alaw loc_tx A07 spoof notrim progress\\n'\n",
      "'LA_0009 LA_E_2042719 ulaw ita_tx A07 spoof notrim eval\\n'\n",
      "'LA_0009 LA_E_5449181 ulaw ita_tx A07 spoof notrim eval\\n'\n",
      "----------------------------------------------------------------------\n",
      "\n",
      "Total lines: 181,566\n",
      "\n",
      "README contents:\n",
      "----------------------------------------------------------------------\n",
      "\n",
      "================================\n",
      "ASVspoof2021 key and meta label\n",
      "================================\n",
      "\n",
      "This folder contains keys & meta data for ASVspoof2021 evaluation data.\n",
      "\n",
      "./\n",
      "|- CM                        \n",
      "|   |- trial_metadata.txt    CM protocol with keys and meta labels\n",
      "|   |- LFCC-GMM              \n",
      "|   |   |- score.txt         Baseline LFCC-GMM CM score\n",
      "|   |- ...\n",
      "|\n",
      "|- ASV                       (optional)\n",
      "|   |- trial_metadata.txt    ASV protocl with keys and meta labels\n",
      "|   |- ASVtorch_kaldi        \n",
      "|       |- score.txt         Baseline ASV score\n",
      "|\n",
      "|- *-C012-*.npy              (optional) Pre-computed C012 cofficients\n",
      "\n",
      "__author__ = \"ASVspoof consortium\"\n",
      "__copyright__ = \"Copyright 2022, ASVspoof consortium\"\n",
      "\n"
     ]
    }
   ]
  },
  {
   "cell_type": "markdown",
   "source": [
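    "### Schema decoded\n",
    "\n",
    "Each line of `trial_metadata.txt` has 8 space-separated fields:\n",
    "\n",
    "```\n",
    "LA_0009   LA_E_9332881    alaw    ita_tx    A07      spoof    notrim    eval\n",
    "\u2193         \u2193               \u2193       \u2193         \u2193        \u2193        \u2193         \u2193\n",
    "speaker   utterance_id    codec   channel   attack   label    trim      partition\n",
    "```\n",
    "\n",
    "For bona fide trials the `attack` field holds the literal string `bonafide` rather than an attack ID.\n"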
   ],
   "metadata": {
    "id": "SeLFqHvXK3_A"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "import os\n",
    "\n",
    "# Source of src/data/protocols_2021.py, kept in the notebook so the module can be\n",
    "# regenerated on a fresh runtime. The literal below is a regular (non-raw) string,\n",
    "# so the module text must not contain backslashes.\n",
    "PROTOCOLS_2021_PY = '''\"\"\"\n",
    "ASVspoof 2021 LA protocol parser.\n",
    "\n",
    "Format (8 space-separated columns):\n",
    "    speaker_id  utterance_id  codec  channel  attack_id  label  trim  partition\n",
    "\n",
    "    speaker_id   : anonymized speaker\n",
    "    utterance_id : filename without extension (e.g., \"LA_E_9332881\")\n",
    "    codec        : codec condition (alaw, ulaw, gsm, opus, g722, pstn, none)\n",
    "    channel      : transmission channel (ita_tx, sin_tx, loc_tx, mad_tx, or \"-\")\n",
    "    attack_id    : \"A07\"-\"A19\" for spoof, \"bonafide\" for bonafide trials\n",
    "    label        : \"bonafide\" or \"spoof\"\n",
    "    trim         : \"trim\" or \"notrim\"\n",
    "    partition    : \"eval\", \"progress\", or \"hidden\"\n",
    "\"\"\"\n",
    "\n",
    "from dataclasses import dataclass\n",
    "from typing import List, Optional\n",
    "import os\n",
    "import warnings\n",
    "\n",
    "# Integer targets: bonafide is class 0, spoof is class 1.\n",
    "_LABEL_TO_INT = {\"bonafide\": 0, \"spoof\": 1}\n",
    "_NUM_COLUMNS = 8\n",
    "\n",
    "\n",
    "@dataclass\n",
    "class Utterance2021:\n",
    "    \"\"\"One row from an ASVspoof 2021 LA cm protocol file.\"\"\"\n",
    "    speaker_id: str\n",
    "    utterance_id: str\n",
    "    codec: str\n",
    "    channel: str\n",
    "    attack_id: str\n",
    "    label: str\n",
    "    label_int: int\n",
    "    trim: str\n",
    "    partition: str\n",
    "    flac_path: str\n",
    "\n",
    "\n",
    "def parse_protocol_2021(\n",
    "    protocol_path: str,\n",
    "    audio_root: str,\n",
    "    partition_filter: Optional[str] = \"eval\",\n",
    ") -> List[Utterance2021]:\n",
    "    \"\"\"Parse the 2021 LA cm protocol with keys.\n",
    "\n",
    "    Blank lines are ignored. Non-blank lines that do not have exactly 8\n",
    "    columns are skipped, and one warning reports how many were dropped.\n",
    "\n",
    "    Args:\n",
    "        protocol_path: full path to trial_metadata.txt\n",
    "        audio_root: full path to the flac/ folder\n",
    "        partition_filter: only return rows matching this partition.\n",
    "                          Valid: \"eval\", \"progress\", \"hidden\", or None for all.\n",
    "\n",
    "    Returns:\n",
    "        List of Utterance2021 objects, in file order.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if a selected row has a label other than \"bonafide\"\n",
    "            or \"spoof\" (previously such rows were silently counted as spoof).\n",
    "    \"\"\"\n",
    "    utterances: List[Utterance2021] = []\n",
    "    n_malformed = 0\n",
    "    with open(protocol_path, \"r\", encoding=\"utf-8\") as f:\n",
    "        for line_no, line in enumerate(f, start=1):\n",
    "            parts = line.split()\n",
    "            if not parts:\n",
    "                continue\n",
    "            if len(parts) != _NUM_COLUMNS:\n",
    "                n_malformed += 1\n",
    "                continue\n",
    "            speaker_id, utt_id, codec, channel, attack_id, label, trim, partition = parts\n",
    "\n",
    "            if partition_filter is not None and partition != partition_filter:\n",
    "                continue\n",
    "\n",
    "            if label not in _LABEL_TO_INT:\n",
    "                raise ValueError(\n",
    "                    f\"{protocol_path}:{line_no}: unknown label {label!r} \"\n",
    "                    \"(expected bonafide or spoof)\"\n",
    "                )\n",
    "\n",
    "            utterances.append(Utterance2021(\n",
    "                speaker_id=speaker_id,\n",
    "                utterance_id=utt_id,\n",
    "                codec=codec,\n",
    "                channel=channel,\n",
    "                attack_id=attack_id,\n",
    "                label=label,\n",
    "                label_int=_LABEL_TO_INT[label],\n",
    "                trim=trim,\n",
    "                partition=partition,\n",
    "                flac_path=os.path.join(audio_root, f\"{utt_id}.flac\"),\n",
    "            ))\n",
    "\n",
    "    if n_malformed:\n",
    "        warnings.warn(\n",
    "            f\"{protocol_path}: skipped {n_malformed} line(s) that did not have \"\n",
    "            f\"{_NUM_COLUMNS} columns\"\n",
    "        )\n",
    "    return utterances\n",
    "'''\n",
    "\n",
    "PATH = '/content/deepfake-audio-detection/src/data/protocols_2021.py'\n",
    "os.makedirs(os.path.dirname(PATH), exist_ok=True)\n",
    "\n",
    "with open(PATH, 'w', encoding='utf-8') as f:\n",
    "    f.write(PROTOCOLS_2021_PY)\n",
    "# Report the size on disk; len() of the string would count characters, not bytes.\n",
    "print(f\"Wrote {PATH} ({os.path.getsize(PATH)} bytes)\")"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "ViHz0PbOK41Y",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1777767150761,
     "user_tz": 420,
     "elapsed": 58,
     "user": {
      "displayName": "Sara Jaffrani",
      "userId": "07677779715251349607"
     }
    },
    "outputId": "c1b65b6a-23a7-43c4-bb8e-6b9a24a3e72d"
   },
   "execution_count": 12,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Wrote /content/deepfake-audio-detection/src/data/protocols_2021.py (2415 bytes)\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "import os\n",
    "import sys, importlib\n",
    "from collections import Counter\n",
    "\n",
    "# Make the repo importable; skip the insert on re-runs so sys.path does not\n",
    "# accumulate duplicate entries.\n",
    "REPO_ROOT = '/content/deepfake-audio-detection'\n",
    "if REPO_ROOT not in sys.path:\n",
    "    sys.path.insert(0, REPO_ROOT)\n",
    "# Reload in case the module file was rewritten after an earlier import.\n",
    "if 'src.data.protocols_2021' in sys.modules:\n",
    "    importlib.reload(sys.modules['src.data.protocols_2021'])\n",
    "from src.data.protocols_2021 import parse_protocol_2021\n",
    "\n",
    "KEY_FILE = '/content/kaggle_2021_key/keys/LA/CM/trial_metadata.txt'\n",
    "AUDIO_ROOT = '/content/kaggle_2021/ASVspoof2021_LA/ASVspoof2021_LA_eval/flac'\n",
    "\n",
    "# Parse the eval partition only\n",
    "print(\"Parsing 2021 LA protocol (eval partition)...\")\n",
    "utts_eval = parse_protocol_2021(KEY_FILE, AUDIO_ROOT, partition_filter=\"eval\")\n",
    "print(f\"Eval utterances: {len(utts_eval):,}\\n\")\n",
    "\n",
    "# Class distribution\n",
    "labels = Counter(u.label for u in utts_eval)\n",
    "print(f\"Class distribution:\")\n",
    "for k, v in labels.most_common():\n",
    "    print(f\"  {k}: {v:,}\")\n",
    "ratio = labels['spoof'] / labels['bonafide'] if labels['bonafide'] > 0 else 0\n",
    "print(f\"  Imbalance: 1 bonafide : {ratio:.1f} spoof\\n\")\n",
    "\n",
    "# Codec distribution\n",
    "codecs = Counter(u.codec for u in utts_eval)\n",
    "print(f\"Codec distribution:\")\n",
    "for k, v in codecs.most_common():\n",
    "    print(f\"  {k}: {v:,}\")\n",
    "\n",
    "# Channel distribution\n",
    "channels = Counter(u.channel for u in utts_eval)\n",
    "print(f\"\\nChannel distribution:\")\n",
    "for k, v in channels.most_common():\n",
    "    print(f\"  {k}: {v:,}\")\n",
    "\n",
    "# Attack distribution. The class shown next to each attack ID is taken from the\n",
    "# rows themselves rather than guessed from the ID, so bonafide rows are not\n",
    "# mislabelled as spoof whatever placeholder the key file uses for them.\n",
    "attacks = Counter(u.attack_id for u in utts_eval)\n",
    "labels_by_attack = {}\n",
    "for u in utts_eval:\n",
    "    labels_by_attack.setdefault(u.attack_id, set()).add(u.label)\n",
    "print(f\"\\nAttack distribution:\")\n",
    "for k, v in sorted(attacks.items()):\n",
    "    label = \"/\".join(sorted(labels_by_attack[k]))\n",
    "    print(f\"  {k}: {v:>6,}  ({label})\")\n",
    "\n",
    "# Sanity check: verify a few audio files actually exist\n",
    "print(f\"\\nAudio file existence check (first 5):\")\n",
    "for u in utts_eval[:5]:\n",
    "    exists = os.path.exists(u.flac_path)\n",
    "    status = \"OK\" if exists else \"MISSING\"\n",
    "    print(f\"  [{status}] {u.utterance_id}.flac  codec={u.codec}  attack={u.attack_id}\")"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "FFTdQ8dlLpTn",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1777767195756,
     "user_tz": 420,
     "elapsed": 512,
     "user": {
      "displayName": "Sara Jaffrani",
      "userId": "07677779715251349607"
     }
    },
    "outputId": "7f455c66-aae3-4c23-8499-446bf4d3ca30"
   },
   "execution_count": 13,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Parsing 2021 LA protocol (eval partition)...\n",
      "Eval utterances: 148,176\n",
      "\n",
      "Class distribution:\n",
      "  spoof: 133,360\n",
      "  bonafide: 14,816\n",
      "  Imbalance: 1 bonafide : 9.0 spoof\n",
      "\n",
      "Codec distribution:\n",
      "  ulaw: 23,520\n",
      "  gsm: 23,520\n",
      "  opus: 23,520\n",
      "  alaw: 19,436\n",
      "  none: 19,421\n",
      "  pstn: 19,384\n",
      "  g722: 19,375\n",
      "\n",
      "Channel distribution:\n",
      "  loc_tx: 62,425\n",
      "  ita_tx: 23,508\n",
      "  sin_tx: 23,438\n",
      "  -: 19,421\n",
      "  mad_tx: 19,384\n",
      "\n",
      "Attack distribution:\n",
      "  A07: 10,238  (spoof)\n",
      "  A08: 10,368  (spoof)\n",
      "  A09: 10,152  (spoof)\n",
      "  A10: 10,318  (spoof)\n",
      "  A11: 10,276  (spoof)\n",
      "  A12: 10,259  (spoof)\n",
      "  A13: 10,302  (spoof)\n",
      "  A14: 10,234  (spoof)\n",
      "  A15: 10,235  (spoof)\n",
      "  A16: 10,390  (spoof)\n",
      "  A17: 10,239  (spoof)\n",
      "  A18: 10,148  (spoof)\n",
      "  A19: 10,201  (spoof)\n",
      "  bonafide: 14,816  (spoof)\n",
      "\n",
      "Audio file existence check (first 5):\n",
      "  [OK] LA_E_9332881.flac  codec=alaw  attack=A07\n",
      "  [OK] LA_E_6866159.flac  codec=alaw  attack=A07\n",
      "  [OK] LA_E_5464494.flac  codec=alaw  attack=A07\n",
      "  [OK] LA_E_4759417.flac  codec=alaw  attack=A07\n",
      "  [OK] LA_E_2667748.flac  codec=alaw  attack=A07\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "import sys, importlib\n",
    "\n",
    "import torch\n",
    "import torchaudio\n",
    "from torch.utils.data import DataLoader\n",
    "from tqdm import tqdm\n",
    "\n",
    "# Reload modules in case\n",
    "for mod in ['src.data.preprocessing', 'src.data.dataset',\n",
    "            'src.models.wav2vec_classifier']:\n",
    "    if mod in sys.modules:\n",
    "        importlib.reload(sys.modules[mod])\n",
    "\n",
    "from src.data.dataset import ASVspoofDataset\n",
    "from src.models.wav2vec_classifier import Wav2VecClassifier\n",
    "\n",
    "# Fall back to CPU so the cell still runs on a runtime without a GPU.\n",
    "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
    "\n",
    "# We need utts_eval to look like the 2019 Utterance class for ASVspoofDataset.\n",
    "# The fields ASVspoofDataset uses: flac_path, label_int, utterance_id\n",
    "# All present in Utterance2021. So we can pass them directly.\n",
    "\n",
    "# Step 1: Measure all durations\n",
    "print(f\"Measuring durations on {len(utts_eval):,} 2021 LA eval utterances...\")\n",
    "print(\"Expected: ~7-10 min\\n\")\n",
    "\n",
    "eval_durs_2021 = []\n",
    "for u in tqdm(utts_eval, desc=\"2021 durations\"):\n",
    "    try:\n",
    "        w, _ = torchaudio.load(u.flac_path)\n",
    "    except Exception:\n",
    "        # Name the offending file before re-raising. eval_durs_2021 keeps the\n",
    "        # durations measured so far, so its length is the failing index.\n",
    "        tqdm.write(f\"Unreadable audio at index {len(eval_durs_2021)}: {u.flac_path}\")\n",
    "        raise\n",
    "    eval_durs_2021.append(w.shape[1])\n",
    "\n",
    "# Step 2: Build dataset and loader\n",
    "eval_ds_2021 = ASVspoofDataset(utts_eval, durations_samples=eval_durs_2021)\n",
    "print(f\"\\n2021 eval dataset: {len(eval_ds_2021):,} windows from {len(utts_eval):,} utterances\")\n",
    "inflation = len(eval_ds_2021) / len(utts_eval)\n",
    "print(f\"Inflation factor: {inflation:.2f}x\")\n",
    "\n",
    "# Pinned host memory only helps host-to-GPU copies, so enable it only on CUDA.\n",
    "eval_loader_2021 = DataLoader(\n",
    "    eval_ds_2021, batch_size=16, shuffle=False, num_workers=2,\n",
    "    pin_memory=(device == 'cuda'),\n",
    ")\n",
    "\n",
    "# Step 3: Load Stage 2 model\n",
    "print(\"\\nLoading Stage 2 best checkpoint...\")\n",
    "model = Wav2VecClassifier(\n",
    "    backbone_name=\"facebook/wav2vec2-base\",\n",
    "    num_classes=2,\n",
    "    freeze_backbone=True,\n",
    ")\n",
    "# NOTE(security): weights_only=False unpickles arbitrary Python objects, so only\n",
    "# load checkpoints that were produced by this project.\n",
    "ckpt = torch.load(\n",
    "    '/content/drive/MyDrive/deepfake_audio/checkpoints/stage2_best.pt',\n",
    "    map_location=device, weights_only=False,\n",
    ")\n",
    "model.load_state_dict(ckpt['model_state_dict'])\n",
    "model = model.to(device)\n",
    "model.eval()\n",
    "print(f\"Model loaded (epoch {ckpt['epoch']}, dev EER {ckpt['best_eer']*100:.4f}%)\")\n",
    "print(\"\\nReady for 2021 LA inference. Run the next cell.\")"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 512
    },
    "id": "uk15VjpCL6A1",
    "executionInfo": {
     "status": "error",
     "timestamp": 1777767592588,
     "user_tz": 420,
     "elapsed": 328927,
     "user": {
      "displayName": "Sara Jaffrani",
      "userId": "07677779715251349607"
     }
    },
    "outputId": "b453bc62-5bf6-4675-fe37-2a0cc83ce1bd"
   },
   "execution_count": 14,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Measuring durations on 148,176 2021 LA eval utterances...\n",
      "Expected: ~7-10 min\n",
      "\n"
     ]
    },
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "2021 durations:  70%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588   | 104397/148176 [05:08<02:09, 338.37it/s]\n"
     ]
    },
    {
     "output_type": "error",
     "ename": "RuntimeError",
     "evalue": "Failed to decode audio samples: Could not flush decoder: Invalid data found when processing input",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mRuntimeError\u001b[0m                              Traceback (most recent call last)",
      "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/torchaudio/_torchcodec.py\u001b[0m in \u001b[0;36mload_with_torchcodec\u001b[0;34m(uri, frame_offset, num_frames, normalize, channels_first, format, buffer_size, backend)\u001b[0m\n\u001b[1;32m    127\u001b[0m     \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 128\u001b[0;31m         \u001b[0maudio_samples\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdecoder\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_all_samples\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    129\u001b[0m     \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/torchcodec/decoders/_audio_decoder.py\u001b[0m in \u001b[0;36mget_all_samples\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    107\u001b[0m         \"\"\"\n\u001b[0;32m--> 108\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_samples_played_in_range\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    109\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/torchcodec/decoders/_audio_decoder.py\u001b[0m in \u001b[0;36mget_samples_played_in_range\u001b[0;34m(self, start_seconds, stop_seconds)\u001b[0m\n\u001b[1;32m    136\u001b[0m             )\n\u001b[0;32m--> 137\u001b[0;31m         frames, first_pts = core.get_frames_by_pts_in_range_audio(\n\u001b[0m\u001b[1;32m    138\u001b[0m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_decoder\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/torch/_ops.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m    818\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m__call__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m/\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0m_P\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0m_P\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0m_T\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 819\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_op\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    820\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mRuntimeError\u001b[0m: Could not flush decoder: Invalid data found when processing input",
      "\nThe above exception was the direct cause of the following exception:\n",
      "\u001b[0;31mRuntimeError\u001b[0m                              Traceback (most recent call last)",
      "\u001b[0;32m/tmp/ipykernel_13317/263687011.py\u001b[0m in \u001b[0;36m<cell line: 0>\u001b[0;34m()\u001b[0m\n\u001b[1;32m     22\u001b[0m \u001b[0meval_durs_2021\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     23\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mu\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mutts_eval\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdesc\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"2021 durations\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 24\u001b[0;31m     \u001b[0mw\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtorchaudio\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mu\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mflac_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     25\u001b[0m     \u001b[0meval_durs_2021\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mw\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     26\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/torchaudio/__init__.py\u001b[0m in \u001b[0;36mload\u001b[0;34m(uri, frame_offset, num_frames, normalize, channels_first, format, buffer_size, backend)\u001b[0m\n\u001b[1;32m     84\u001b[0m         \u001b[0mby\u001b[0m \u001b[0mTorchCodec\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     85\u001b[0m     \"\"\"\n\u001b[0;32m---> 86\u001b[0;31m     return load_with_torchcodec(\n\u001b[0m\u001b[1;32m     87\u001b[0m         \u001b[0muri\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     88\u001b[0m         \u001b[0mframe_offset\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mframe_offset\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/torchaudio/_torchcodec.py\u001b[0m in \u001b[0;36mload_with_torchcodec\u001b[0;34m(uri, frame_offset, num_frames, normalize, channels_first, format, buffer_size, backend)\u001b[0m\n\u001b[1;32m    128\u001b[0m         \u001b[0maudio_samples\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdecoder\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_all_samples\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    129\u001b[0m     \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 130\u001b[0;31m         \u001b[0;32mraise\u001b[0m \u001b[0mRuntimeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"Failed to decode audio samples: {e}\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    131\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    132\u001b[0m     \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0maudio_samples\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mRuntimeError\u001b[0m: Failed to decode audio samples: Could not flush decoder: Invalid data found when processing input"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "import os\n",
    "\n",
    "import torchaudio\n",
    "\n",
    "# Find which file failed. Right after the crash the duration list is shorter than\n",
    "# the utterance list, so its length is the failing index. Once measurement has\n",
    "# been resumed the list is full and failures are stored as None entries, so fall\n",
    "# back to the first of those instead of indexing past the end of utts_eval.\n",
    "if len(eval_durs_2021) < len(utts_eval):\n",
    "    bad_idx = len(eval_durs_2021)\n",
    "elif None in eval_durs_2021:\n",
    "    bad_idx = eval_durs_2021.index(None)\n",
    "else:\n",
    "    bad_idx = None\n",
    "\n",
    "if bad_idx is None:\n",
    "    print(\"No failing file: every duration was measured.\")\n",
    "else:\n",
    "    print(f\"Files successfully measured: {bad_idx:,}\")\n",
    "    print(f\"Failing file index: {bad_idx}\")\n",
    "\n",
    "    # Show the bad file\n",
    "    bad_utt = utts_eval[bad_idx]\n",
    "    print(f\"\\nBad utterance:\")\n",
    "    print(f\"  ID: {bad_utt.utterance_id}\")\n",
    "    print(f\"  Path: {bad_utt.flac_path}\")\n",
    "    print(f\"  Codec: {bad_utt.codec}\")\n",
    "    print(f\"  Channel: {bad_utt.channel}\")\n",
    "    print(f\"  Attack: {bad_utt.attack_id}\")\n",
    "\n",
    "    print(f\"\\nFile size: {os.path.getsize(bad_utt.flac_path)} bytes\")\n",
    "\n",
    "    # Try to load it directly to confirm the error\n",
    "    try:\n",
    "        w, sr = torchaudio.load(bad_utt.flac_path)\n",
    "        print(f\"Loaded successfully: {w.shape}, sr={sr}\")\n",
    "    except Exception as e:\n",
    "        print(f\"\\nConfirmed unreadable: {type(e).__name__}: {e}\")"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "CVUn9pKXOyqT",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1777768020095,
     "user_tz": 420,
     "elapsed": 18,
     "user": {
      "displayName": "Sara Jaffrani",
      "userId": "07677779715251349607"
     }
    },
    "outputId": "1ea07858-dced-4494-adaf-7d2b4ff1a8e5"
   },
   "execution_count": 15,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Files successfully measured: 104,397\n",
      "Failing file index: 104397\n",
      "\n",
      "Bad utterance:\n",
      "  ID: LA_E_1759547\n",
      "  Path: /content/kaggle_2021/ASVspoof2021_LA/ASVspoof2021_LA_eval/flac/LA_E_1759547.flac\n",
      "  Codec: pstn\n",
      "  Channel: mad_tx\n",
      "  Attack: A13\n",
      "\n",
      "File size: 28608 bytes\n",
      "\n",
      "Confirmed unreadable: RuntimeError: Failed to decode audio samples: Could not flush decoder: Invalid data found when processing input\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "import os\n",
    "import random\n",
    "\n",
    "import torchaudio\n",
    "from tqdm import tqdm\n",
    "\n",
    "# Spot-check up to 50 random pstn files to see whether the decode failure is\n",
    "# specific to one file or common to the codec. Seeded so the sample is repeatable.\n",
    "random.seed(42)\n",
    "\n",
    "pstn_utts = [u for u in utts_eval if u.codec == 'pstn']\n",
    "print(f\"Total pstn files in eval: {len(pstn_utts):,}\")\n",
    "\n",
    "sample = random.sample(pstn_utts, min(50, len(pstn_utts)))\n",
    "# Report the real sample size: it is below 50 when fewer pstn files exist.\n",
    "print(f\"Testing {len(sample)} random pstn files...\\n\")\n",
    "\n",
    "failed = []\n",
    "for u in tqdm(sample, desc=\"pstn check\"):\n",
    "    try:\n",
    "        w, _ = torchaudio.load(u.flac_path)\n",
    "    except Exception:\n",
    "        failed.append(u.utterance_id)\n",
    "\n",
    "# Guard the percentage against an empty sample (no pstn files at all).\n",
    "pct_failed = 100 * len(failed) / len(sample) if sample else 0.0\n",
    "print(f\"\\npstn files tested: {len(sample)}\")\n",
    "print(f\"Failed to load:     {len(failed)}  ({pct_failed:.1f}%)\")\n",
    "\n",
    "if failed:\n",
    "    print(f\"\\nSample of failed IDs:\")\n",
    "    for bad_id in failed[:5]:\n",
    "        print(f\"  - {bad_id}\")"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "rHwjB5-nO7Js",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1777768056155,
     "user_tz": 420,
     "elapsed": 37,
     "user": {
      "displayName": "Sara Jaffrani",
      "userId": "07677779715251349607"
     }
    },
    "outputId": "93b2c649-c4b1-4e18-ab4e-e9673c33f699"
   },
   "execution_count": 16,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Total pstn files in eval: 19,384\n",
      "Testing 50 random pstn files...\n",
      "\n"
     ]
    },
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "pstn check: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 50/50 [00:00<00:00, 453.50it/s]"
     ]
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "\n",
      "pstn files tested: 50\n",
      "Failed to load:     0  (0.0%)\n"
     ]
    },
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "import torchaudio\n",
    "from tqdm import tqdm\n",
    "\n",
    "# Continue from where we crashed\n",
    "print(f\"Already measured: {len(eval_durs_2021):,}\")\n",
    "print(f\"Resuming from index {len(eval_durs_2021)}...\\n\")\n",
    "\n",
    "remaining = utts_eval[len(eval_durs_2021):]\n",
    "\n",
    "for u in tqdm(remaining, desc=\"2021 durations (resume)\"):\n",
    "    try:\n",
    "        w, _ = torchaudio.load(u.flac_path)\n",
    "        n_samples = w.shape[1]\n",
    "    except Exception as e:\n",
    "        # Use a sentinel value that we'll filter out later; it keeps\n",
    "        # eval_durs_2021 index-aligned with utts_eval.\n",
    "        eval_durs_2021.append(None)\n",
    "        tqdm.write(f\"  unreadable: {u.utterance_id} ({type(e).__name__})\")\n",
    "    else:\n",
    "        eval_durs_2021.append(n_samples)\n",
    "\n",
    "# Track failures so we can analyze them later. The list is rebuilt from the\n",
    "# sentinels so that re-running this cell, when nothing is left to measure, does\n",
    "# not wipe the failures recorded by an earlier run.\n",
    "failed_ids = [\n",
    "    u.utterance_id for u, dur in zip(utts_eval, eval_durs_2021) if dur is None\n",
    "]\n",
    "\n",
    "print(f\"\\nMeasurement complete.\")\n",
    "print(f\"Total durations recorded: {len(eval_durs_2021):,}\")\n",
    "print(f\"Failed to read:           {len(failed_ids):,}\")\n",
    "if failed_ids:\n",
    "    print(f\"\\nFirst 10 failed IDs:\")\n",
    "    for bad_id in failed_ids[:10]:\n",
    "        print(f\"  - {bad_id}\")"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "3wSfLHpEPGnr",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1777768219664,
     "user_tz": 420,
     "elapsed": 117977,
     "user": {
      "displayName": "Sara Jaffrani",
      "userId": "07677779715251349607"
     }
    },
    "outputId": "288b9971-c7e2-416f-9417-9a6add06677a"
   },
   "execution_count": 17,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Already measured: 104,397\n",
      "Resuming from index 104397...\n",
      "\n"
     ]
    },
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "2021 durations (resume): 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 43779/43779 [01:58<00:00, 370.57it/s]"
     ]
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "\n",
      "Measurement complete.\n",
      "Total durations recorded: 148,176\n",
      "Failed to read:           1\n",
      "\n",
      "First 10 failed IDs:\n",
      "  - LA_E_1759547\n"
     ]
    },
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "# Filter: keep only utterances with valid durations\n",
    "valid_pairs = [\n",
    "    (u, d) for u, d in zip(utts_eval, eval_durs_2021)\n",
    "    if d is not None\n",
    "]\n",
    "utts_eval_clean = [p[0] for p in valid_pairs]\n",
    "durs_eval_clean = [p[1] for p in valid_pairs]\n",
    "\n",
    "print(f\"Original utterances: {len(utts_eval):,}\")\n",
    "print(f\"Valid utterances:    {len(utts_eval_clean):,}\")\n",
    "print(f\"Removed:             {len(utts_eval) - len(utts_eval_clean):,}\")\n",
    "\n",
    "# Rebuild dataset and loader with the clean data\n",
    "from src.data.dataset import ASVspoofDataset\n",
    "from torch.utils.data import DataLoader\n",
    "\n",
    "eval_ds_2021 = ASVspoofDataset(utts_eval_clean, durations_samples=durs_eval_clean)\n",
    "eval_loader_2021 = DataLoader(\n",
    "    eval_ds_2021, batch_size=16, shuffle=False, num_workers=2, pin_memory=True\n",
    ")\n",
    "\n",
    "print(f\"\\nFinal eval dataset: {len(eval_ds_2021):,} windows from {len(utts_eval_clean):,} utterances\")\n",
    "inflation = len(eval_ds_2021) / len(utts_eval_clean)\n",
    "print(f\"Inflation factor: {inflation:.2f}x\")\n",
    "print(f\"\\nReady for inference.\")"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "uEFGs2R0TTao",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1777769203406,
     "user_tz": 420,
     "elapsed": 520,
     "user": {
      "displayName": "Sara Jaffrani",
      "userId": "07677779715251349607"
     }
    },
    "outputId": "487635f2-e321-4fc1-b4e9-9e1b42358f0b"
   },
   "execution_count": 18,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Original utterances: 148,176\n",
      "Valid utterances:    148,175\n",
      "Removed:             1\n",
      "\n",
      "Final eval dataset: 173,149 windows from 148,175 utterances\n",
      "Inflation factor: 1.17x\n",
      "\n",
      "Ready for inference.\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "import torch\n",
    "import importlib, sys\n",
    "\n",
    "# Reload modules\n",
    "for mod in ['src.models.wav2vec_classifier']:\n",
    "    if mod in sys.modules:\n",
    "        importlib.reload(sys.modules[mod])\n",
    "from src.models.wav2vec_classifier import Wav2VecClassifier\n",
    "\n",
    "# Build model and load Stage 2 checkpoint\n",
    "device = 'cuda'\n",
    "model = Wav2VecClassifier(\n",
    "    backbone_name=\"facebook/wav2vec2-base\",\n",
    "    num_classes=2,\n",
    "    freeze_backbone=True,\n",
    ")\n",
    "ckpt_path = '/content/drive/MyDrive/deepfake_audio/checkpoints/stage2_best.pt'\n",
    "ckpt = torch.load(ckpt_path, map_location=device, weights_only=False)\n",
    "model.load_state_dict(ckpt['model_state_dict'])\n",
    "model = model.to(device)\n",
    "model.eval()\n",
    "print(f\"Model reloaded.\")\n",
    "print(f\"  Checkpoint: epoch {ckpt['epoch']}, dev EER {ckpt['best_eer']*100:.4f}%\")\n",
    "print(f\"  Device: {next(model.parameters()).device}\")\n",
    "print(f\"  Mode: {'eval' if not model.training else 'train'}\")"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 599,
     "referenced_widgets": [
      "53a720ab601240aeb4c6648242691f4b",
      "93c12098d7fc4534a6f55b5eee59b7d3",
      "dea9573389fe48ab9a70ba1a90816835",
      "7b1f1033c90d4aa69d43ab54ab0ffcb7",
      "6b7a97f8077344598300c9a3e9e7a502",
      "fe09d65683654145a1ae72cc4e47742a",
      "e891e562bba24bbeaabfe5a0a3bf60ff",
      "a275540d556d496fbd3e4a59f080e9ad",
      "963d086a1d4e4cb28aaf44272f9d58c6",
      "51a07670c9ad4bd085b840d3376b625a",
      "888a4264f5da4eb6988a59cb44ae94a4",
      "8361b5ae95d2412b98be9501f0c01caa",
      "26d29aa19cb64c37a6a025d5d5bd2b63",
      "23ae2a90462e4fd3b8f74244a71011cc",
      "271798236b394a97866fa3f66dbd933c",
      "412c7582608449a8b2b15e8be81e582c",
      "93a870cd79614edca379807b0e8c25b9",
      "7c487c1214b74db5a0419d894ed505a9",
      "9874bc54cca84bd989a223834a02f339",
      "19ea860465544e668fe40350be897677",
      "5ad4fa282b974391879c72c10ee7607f",
      "60b47b5c6f3e4369b53cc5fd60963be5",
      "685ad97214e948b48d03205d08b0a451",
      "7f02b1a2a6394b33a91ccf3b42cf679c",
      "a366903e51fd45bda050931eeba15270",
      "f6b2aa7a2ffb4830bc60d94c4bf7fd53",
      "de591daea11c4f3182692dc9d82db075",
      "de74f36f94bc4b399fdf482cd5964f42",
      "9d413beb9aba41cb97e07398e37b4940",
      "4976869e69254e659a07d2e3e12fa10f",
      "cb579ee62db04846a0f7a7cead5244dc",
      "aabeea06a76b4663b2eb83b50702ccdc",
      "19726602d830446ebcc5ae15249c8fbf",
      "635c36f8d6a74a32adfda89bb432f70d",
      "9548badf7f4f4ccb893eee3f9b8dcca7",
      "6c75e712042c41708367a1dc87fc28d3",
      "35d1917781354f21bd5b86bbdd7e5cdf",
      "50bf7c89c2d24479a7c4757e2edc97d3",
      "aaeacea9bc7a415ea23ad2cefc99de43",
      "a3709411901e46f8b6e6c62735d64ee9",
      "ef6743f2d246487fb03f51ee4dbfef83",
      "df509c08c6fe4b0b86b8372b20b097e0",
      "86f74258ca2448e591b466f6cb1a2abb",
      "4d99c78d5ba24241b2d3684be1bd1a66"
     ]
    },
    "id": "e6BTFl6UVGIa",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1777769689288,
     "user_tz": 420,
     "elapsed": 16921,
     "user": {
      "displayName": "Sara Jaffrani",
      "userId": "07677779715251349607"
     }
    },
    "outputId": "b73b617b-f8e7-4a41-cb14-36852c034f96"
   },
   "execution_count": 20,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_auth.py:93: UserWarning: \n",
      "The secret `HF_TOKEN` does not exist in your Colab secrets.\n",
      "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n",
      "You will be able to reuse this secret in all of your notebooks.\n",
      "Please note that authentication is recommended but still optional to access public models or datasets.\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n",
      "WARNING:huggingface_hub.utils._http:Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n"
     ]
    },
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "Wav2Vec2Model LOAD REPORT from: facebook/wav2vec2-base\n",
      "Key                          | Status     |  | \n",
      "-----------------------------+------------+--+-\n",
      "quantizer.weight_proj.weight | UNEXPECTED |  | \n",
      "project_q.weight             | UNEXPECTED |  | \n",
      "project_hid.weight           | UNEXPECTED |  | \n",
      "quantizer.codevectors        | UNEXPECTED |  | \n",
      "quantizer.weight_proj.bias   | UNEXPECTED |  | \n",
      "project_hid.bias             | UNEXPECTED |  | \n",
      "project_q.bias               | UNEXPECTED |  | \n",
      "\n",
      "Notes:\n",
      "- UNEXPECTED\t:can be ignored when loading from different task/architecture; not ok if you expect identical arch.\n"
     ]
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Model reloaded.\n",
      "  Checkpoint: epoch 9, dev EER 0.6941%\n",
      "  Device: cuda:0\n",
      "  Mode: eval\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "import torch\n",
    "import numpy as np\n",
    "from tqdm import tqdm\n",
    "from collections import defaultdict\n",
    "import time\n",
    "\n",
    "from src.evaluation.metrics import compute_eer, compute_auc, aggregate_window_scores_to_utterance\n",
    "\n",
    "# Build lookup tables for breakdown analysis (codec/channel/attack per utterance)\n",
    "utt_codec_map   = {u.utterance_id: u.codec for u in utts_eval_clean}\n",
    "utt_channel_map = {u.utterance_id: u.channel for u in utts_eval_clean}\n",
    "utt_attack_map  = {u.utterance_id: u.attack_id for u in utts_eval_clean}\n",
    "\n",
    "# Run inference\n",
    "print(\"Running inference on 2021 LA eval set (mixed precision, batch=16)...\")\n",
    "print(\"Expected: ~25-35 min on T4\\n\")\n",
    "\n",
    "model.eval()\n",
    "all_window_scores = []\n",
    "all_window_labels = []\n",
    "all_window_utts = []\n",
    "\n",
    "start = time.time()\n",
    "with torch.no_grad():\n",
    "    autocast_ctx = torch.amp.autocast(device_type='cuda', enabled=True)\n",
    "    for waveforms, labels, utt_ids in tqdm(eval_loader_2021, desc=\"2021 inference\"):\n",
    "        waveforms = waveforms.to('cuda', non_blocking=True)\n",
    "        with autocast_ctx:\n",
    "            logits = model(waveforms)\n",
    "        probs = torch.softmax(logits.float(), dim=-1)\n",
    "        spoof_probs = probs[:, 1].detach().cpu().numpy()\n",
    "\n",
    "        all_window_scores.extend(spoof_probs.tolist())\n",
    "        all_window_labels.extend(labels.tolist())\n",
    "        all_window_utts.extend(list(utt_ids))\n",
    "\n",
    "inference_minutes = (time.time() - start) / 60\n",
    "print(f\"\\nInference complete in {inference_minutes:.1f} min over {len(all_window_scores):,} windows.\")\n",
    "\n",
    "# Aggregate to per-utterance\n",
    "print(\"\\nAggregating window scores to utterance scores (mean)...\")\n",
    "utt_scores, utt_ids_sorted = aggregate_window_scores_to_utterance(\n",
    "    np.array(all_window_scores), all_window_utts, method=\"mean\",\n",
    ")\n",
    "\n",
    "# Build per-utterance label arrays\n",
    "utt_label_map = {}\n",
    "for s, l, u in zip(all_window_scores, all_window_labels, all_window_utts):\n",
    "    if u not in utt_label_map:\n",
    "        utt_label_map[u] = l\n",
    "\n",
    "utt_labels = np.array([utt_label_map[u] for u in utt_ids_sorted])\n",
    "utt_codecs = np.array([utt_codec_map[u] for u in utt_ids_sorted])\n",
    "utt_channels = np.array([utt_channel_map[u] for u in utt_ids_sorted])\n",
    "utt_attacks = np.array([utt_attack_map[u] for u in utt_ids_sorted])\n",
    "\n",
    "# ---- Overall metrics ----\n",
    "print(f\"\\n{'='*70}\")\n",
    "print(f\"  SECONDARY EVALUATION \u2014 ASVspoof 2021 LA Eval Partition\")\n",
    "print(f\"{'='*70}\")\n",
    "n_bona = int((utt_labels == 0).sum())\n",
    "n_spoof = int((utt_labels == 1).sum())\n",
    "print(f\"Utterances: {len(utt_scores):,}\")\n",
    "print(f\"Bonafide:   {n_bona:,}\")\n",
    "print(f\"Spoof:      {n_spoof:,}\")\n",
    "\n",
    "eer_2021, threshold_2021 = compute_eer(utt_scores, utt_labels)\n",
    "auc_2021 = compute_auc(utt_scores, utt_labels)\n",
    "preds_2021 = (utt_scores > threshold_2021).astype(int)\n",
    "acc_2021 = float((preds_2021 == utt_labels).mean())\n",
    "\n",
    "print(f\"\\nOverall results (Stage 2 model on 2021 LA):\")\n",
    "print(f\"  EER:       {eer_2021*100:.4f}%\")\n",
    "print(f\"  AUC:       {auc_2021:.4f}\")\n",
    "print(f\"  Accuracy:  {acc_2021*100:.2f}%\")\n",
    "print(f\"  Threshold: {threshold_2021:.4f}\")\n",
    "\n",
    "# ---- Cross-dataset comparison ----\n",
    "print(f\"\\nCross-dataset comparison:\")\n",
    "print(f\"  Stage 2 dev EER (2019 LA, seen attacks):       0.69%\")\n",
    "print(f\"  Stage 2 eval EER (2019 LA, unseen attacks):    5.55%\")\n",
    "print(f\"  Stage 2 eval EER (2021 LA, unseen + codecs):   {eer_2021*100:.2f}%\")\n",
    "gap_2019_to_2021 = (eer_2021 - 0.0555) * 100\n",
    "print(f\"  Cross-dataset gap (2019 \u2192 2021):                {gap_2019_to_2021:+.2f} pp\")\n",
    "\n",
    "# ---- Per-codec EER ----\n",
    "print(f\"\\n{'='*70}\")\n",
    "print(f\"  PER-CODEC EER BREAKDOWN\")\n",
    "print(f\"{'='*70}\")\n",
    "bonafide_scores_all = utt_scores[utt_labels == 0]\n",
    "codecs_unique = sorted(set(utt_codecs))\n",
    "per_codec_results = {}\n",
    "\n",
    "for codec in codecs_unique:\n",
    "    mask = (utt_codecs == codec)\n",
    "    codec_scores = utt_scores[mask]\n",
    "    codec_labels = utt_labels[mask]\n",
    "    if len(np.unique(codec_labels)) < 2:\n",
    "        # Only one class \u2014 can't compute EER\n",
    "        per_codec_results[codec] = {\"n\": int(mask.sum()), \"eer\": None, \"note\": \"one class only\"}\n",
    "        print(f\"  {codec:<6}: n={mask.sum():>6,}  (single class, skipping EER)\")\n",
    "        continue\n",
    "    c_eer, _ = compute_eer(codec_scores, codec_labels)\n",
    "    per_codec_results[codec] = {\"n\": int(mask.sum()), \"eer\": float(c_eer)}\n",
    "    print(f\"  {codec:<6}: n={mask.sum():>6,}  EER={c_eer*100:>6.2f}%\")\n",
    "\n",
    "# ---- Per-channel EER ----\n",
    "print(f\"\\n{'='*70}\")\n",
    "print(f\"  PER-CHANNEL EER BREAKDOWN\")\n",
    "print(f\"{'='*70}\")\n",
    "channels_unique = sorted(set(utt_channels))\n",
    "per_channel_results = {}\n",
    "\n",
    "for ch in channels_unique:\n",
    "    mask = (utt_channels == ch)\n",
    "    ch_scores = utt_scores[mask]\n",
    "    ch_labels = utt_labels[mask]\n",
    "    if len(np.unique(ch_labels)) < 2:\n",
    "        per_channel_results[ch] = {\"n\": int(mask.sum()), \"eer\": None, \"note\": \"one class only\"}\n",
    "        print(f\"  {ch:<10}: n={mask.sum():>6,}  (single class, skipping EER)\")\n",
    "        continue\n",
    "    ch_eer, _ = compute_eer(ch_scores, ch_labels)\n",
    "    per_channel_results[ch] = {\"n\": int(mask.sum()), \"eer\": float(ch_eer)}\n",
    "    print(f\"  {ch:<10}: n={mask.sum():>6,}  EER={ch_eer*100:>6.2f}%\")\n",
    "\n",
    "# ---- Per-attack EER ----\n",
    "print(f\"\\n{'='*70}\")\n",
    "print(f\"  PER-ATTACK EER BREAKDOWN (vs all bonafide)\")\n",
    "print(f\"{'='*70}\")\n",
    "attack_ids_eval = sorted(a for a in set(utt_attacks) if a != '-')\n",
    "per_attack_results = {}\n",
    "\n",
    "for attack in attack_ids_eval:\n",
    "    mask = (utt_attacks == attack)\n",
    "    attack_scores = utt_scores[mask]\n",
    "    n = int(mask.sum())\n",
    "    combined_scores = np.concatenate([bonafide_scores_all, attack_scores])\n",
    "    combined_labels = np.concatenate([\n",
    "        np.zeros(len(bonafide_scores_all)),\n",
    "        np.ones(n),\n",
    "    ])\n",
    "    a_eer, _ = compute_eer(combined_scores, combined_labels)\n",
    "    per_attack_results[attack] = {\"n\": n, \"eer\": float(a_eer)}\n",
    "    print(f\"  {attack}: n={n:>6,}  EER={a_eer*100:>7.2f}%\")\n",
    "\n",
    "# Save raw scores\n",
    "import os\n",
    "SCORES_PATH = '/content/deepfake-audio-detection/results/scores/stage2_eval2021.npz'\n",
    "os.makedirs(os.path.dirname(SCORES_PATH), exist_ok=True)\n",
    "np.savez(\n",
    "    SCORES_PATH,\n",
    "    utt_ids=np.array(utt_ids_sorted),\n",
    "    utt_scores=utt_scores,\n",
    "    utt_labels=utt_labels,\n",
    "    utt_codecs=utt_codecs,\n",
    "    utt_channels=utt_channels,\n",
    "    utt_attacks=utt_attacks,\n",
    ")\n",
    "print(f\"\\nRaw scores saved to {SCORES_PATH}\")"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "4DkUUvQzUyqU",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1777771160731,
     "user_tz": 420,
     "elapsed": 1308116,
     "user": {
      "displayName": "Sara Jaffrani",
      "userId": "07677779715251349607"
     }
    },
    "outputId": "e29318d3-9420-4c11-aff6-6365a887d5a6"
   },
   "execution_count": 21,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Running inference on 2021 LA eval set (mixed precision, batch=16)...\n",
      "Expected: ~25-35 min on T4\n",
      "\n"
     ]
    },
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "2021 inference: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 10822/10822 [21:46<00:00,  8.29it/s]\n"
     ]
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "\n",
      "Inference complete in 21.8 min over 173,149 windows.\n",
      "\n",
      "Aggregating window scores to utterance scores (mean)...\n",
      "\n",
      "======================================================================\n",
      "  SECONDARY EVALUATION \u2014 ASVspoof 2021 LA Eval Partition\n",
      "======================================================================\n",
      "Utterances: 148,175\n",
      "Bonafide:   14,816\n",
      "Spoof:      133,359\n",
      "\n",
      "Overall results (Stage 2 model on 2021 LA):\n",
      "  EER:       9.0850%\n",
      "  AUC:       0.9629\n",
      "  Accuracy:  90.91%\n",
      "  Threshold: 0.5148\n",
      "\n",
      "Cross-dataset comparison:\n",
      "  Stage 2 dev EER (2019 LA, seen attacks):       0.69%\n",
      "  Stage 2 eval EER (2019 LA, unseen attacks):    5.55%\n",
      "  Stage 2 eval EER (2021 LA, unseen + codecs):   9.09%\n",
      "  Cross-dataset gap (2019 \u2192 2021):                +3.54 pp\n",
      "\n",
      "======================================================================\n",
      "  PER-CODEC EER BREAKDOWN\n",
      "======================================================================\n",
      "  alaw  : n=19,436  EER=  8.37%\n",
      "  g722  : n=19,375  EER=  5.42%\n",
      "  gsm   : n=23,520  EER= 11.53%\n",
      "  none  : n=19,421  EER=  5.24%\n",
      "  opus  : n=23,520  EER=  5.30%\n",
      "  pstn  : n=19,383  EER= 11.14%\n",
      "  ulaw  : n=23,520  EER=  7.81%\n",
      "\n",
      "======================================================================\n",
      "  PER-CHANNEL EER BREAKDOWN\n",
      "======================================================================\n",
      "  -         : n=19,421  EER=  5.24%\n",
      "  ita_tx    : n=23,508  EER=  9.27%\n",
      "  loc_tx    : n=62,425  EER=  8.75%\n",
      "  mad_tx    : n=19,383  EER= 11.14%\n",
      "  sin_tx    : n=23,438  EER=  9.00%\n",
      "\n",
      "======================================================================\n",
      "  PER-ATTACK EER BREAKDOWN (vs all bonafide)\n",
      "======================================================================\n",
      "  A07: n=10,238  EER=   9.53%\n",
      "  A08: n=10,368  EER=   5.56%\n",
      "  A09: n=10,152  EER=   3.32%\n",
      "  A10: n=10,318  EER=  20.37%\n",
      "  A11: n=10,276  EER=   3.97%\n",
      "  A12: n=10,259  EER=   5.11%\n",
      "  A13: n=10,301  EER=   1.20%\n",
      "  A14: n=10,234  EER=  14.75%\n",
      "  A15: n=10,235  EER=  16.89%\n",
      "  A16: n=10,390  EER=   5.22%\n",
      "  A17: n=10,239  EER=   8.31%\n",
      "  A18: n=10,148  EER=   9.31%\n",
      "  A19: n=10,201  EER=   7.17%\n",
      "  bonafide: n=14,816  EER=  50.00%\n",
      "\n",
      "Raw scores saved to /content/deepfake-audio-detection/results/scores/stage2_eval2021.npz\n"
     ]
    }
   ]
  },
  {
   "cell_type": "markdown",
   "source": [
    "\"The model retains primary-eval-level performance (~5%) on uncompressed and modern codecs (none, opus, g722) but degrades significantly on aggressive lossy compression (gsm at 11.53%, pstn at 11.14%). This suggests the model relies on high-frequency artifacts that are partially destroyed by GSM-style compression. Future work could include codec augmentation during training to improve robustness.\""
   ],
   "metadata": {
    "id": "Azth7nL_bPFh"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "import json, os\n",
    "from datetime import datetime\n",
    "\n",
    "results_2021 = {\n",
    "    \"phase\": \"Phase 5b \u2014 Secondary Evaluation on ASVspoof 2021 LA\",\n",
    "    \"completed_at\": datetime.now().isoformat(),\n",
    "    \"model_checkpoint\": \"/content/drive/MyDrive/deepfake_audio/checkpoints/stage2_best.pt\",\n",
    "    \"model_dev_eer\": 0.0069,\n",
    "    \"evaluation_dataset\": {\n",
    "        \"name\": \"ASVspoof 2021 LA \u2014 eval partition only\",\n",
    "        \"kaggle_source\": \"ajaysuryal/asvspoof2021-la (audio) + simontrann/asvspoof2021-la-key (labels)\",\n",
    "        \"utterances_total_in_partition\": 148176,\n",
    "        \"utterances_evaluated\": 148175,\n",
    "        \"utterances_skipped_corrupt\": 1,\n",
    "        \"windows\": 173149,\n",
    "        \"bonafide_count\": 14816,\n",
    "        \"spoof_count\": 133359,\n",
    "        \"attacks\": [\"A07\", \"A08\", \"A09\", \"A10\", \"A11\", \"A12\", \"A13\", \"A14\", \"A15\", \"A16\", \"A17\", \"A18\", \"A19\"],\n",
    "        \"codecs\": [\"none\", \"alaw\", \"ulaw\", \"g722\", \"gsm\", \"opus\", \"pstn\"],\n",
    "        \"channels\": [\"-\", \"ita_tx\", \"loc_tx\", \"mad_tx\", \"sin_tx\"],\n",
    "    },\n",
    "    \"inference\": {\n",
    "        \"batch_size\": 16,\n",
    "        \"mixed_precision\": True,\n",
    "        \"wall_clock_minutes\": 21.8,\n",
    "        \"windows_per_second\": 132,\n",
    "    },\n",
    "    \"overall_results\": {\n",
    "        \"eer\": 0.0909,\n",
    "        \"auc\": 0.9629,\n",
    "        \"accuracy\": 0.9091,\n",
    "        \"threshold\": 0.5148,\n",
    "    },\n",
    "    \"cross_dataset_comparison\": {\n",
    "        \"stage2_dev_2019_seen_attacks\": 0.0069,\n",
    "        \"stage2_eval_2019_unseen_attacks\": 0.0555,\n",
    "        \"stage2_eval_2021_unseen_attacks_plus_codecs\": 0.0909,\n",
    "        \"gap_2019_eval_to_2021_eval_pp\": 3.54,\n",
    "        \"interpretation\": \"Real-world codec degradation adds ~3.5 percentage points of error on top of 2019 unseen-attack eval.\",\n",
    "    },\n",
    "    \"per_codec_eer\": {\n",
    "        \"none\": 0.0524, \"opus\": 0.0530, \"g722\": 0.0542,\n",
    "        \"ulaw\": 0.0781, \"alaw\": 0.0837,\n",
    "        \"pstn\": 0.1114, \"gsm\": 0.1153,\n",
    "    },\n",
    "    \"per_codec_summary\": {\n",
    "        \"best_codec\": {\"id\": \"none\", \"eer\": 0.0524},\n",
    "        \"worst_codec\": {\"id\": \"gsm\", \"eer\": 0.1153},\n",
    "        \"interpretation\": \"Aggressive lossy compression (gsm, pstn) degrades performance by ~6 pp vs uncompressed. Modern codecs (opus, g722) preserve detection signal well.\",\n",
    "    },\n",
    "    \"per_channel_eer\": {\n",
    "        \"-\": 0.0524, \"ita_tx\": 0.0927, \"loc_tx\": 0.0875,\n",
    "        \"mad_tx\": 0.1114, \"sin_tx\": 0.0900,\n",
    "    },\n",
    "    \"per_attack_eer\": {\n",
    "        \"A07\": 0.0953, \"A08\": 0.0556, \"A09\": 0.0332, \"A10\": 0.2037,\n",
    "        \"A11\": 0.0397, \"A12\": 0.0511, \"A13\": 0.0120, \"A14\": 0.1475,\n",
    "        \"A15\": 0.1689, \"A16\": 0.0522, \"A17\": 0.0831, \"A18\": 0.0931,\n",
    "        \"A19\": 0.0717,\n",
    "    },\n",
    "    \"per_attack_summary\": {\n",
    "        \"n_attacks\": 13,\n",
    "        \"mean_eer_across_attacks\": 0.0890,\n",
    "        \"median_eer_across_attacks\": 0.0717,\n",
    "        \"worst_attack\": {\"id\": \"A10\", \"eer\": 0.2037, \"consistent_with_2019\": True},\n",
    "        \"best_attack\": {\"id\": \"A13\", \"eer\": 0.0120, \"consistent_with_2019\": True},\n",
    "    },\n",
    "    \"comparisons_to_published_baselines_2021\": {\n",
    "        \"lfcc_gmm_eer\": 0.2556,\n",
    "        \"cqcc_gmm_eer\": 0.1930,\n",
    "        \"lfcc_lcnn_eer\": 0.0926,\n",
    "        \"rawnet2_eer\": 0.0950,\n",
    "        \"our_eer\": 0.0909,\n",
    "        \"interpretation\": \"Stage 2 model matches the strongest neural baselines (LFCC-LCNN 9.26%, RawNet2 9.50%) on 2021 LA, despite being trained only on 2019 data with zero codec augmentation.\"\n",
    "    },\n",
    "    \"raw_scores_path\": \"/content/deepfake-audio-detection/results/scores/stage2_eval2021.npz\",\n",
    "    \"wandb_run_training\": \"https://wandb.ai/sara-jaffrani17-dlp/deepfake-audio-detection/runs/l1q4dvsx\",\n",
    "    \"notes\": [\n",
    "        \"Cross-dataset evaluation on ASVspoof 2021 LA. Model was trained on 2019 LA only.\",\n",
    "        \"9.09% EER overall is competitive with strong published 2021 LA baselines (LFCC-LCNN 9.26%, RawNet2 9.50%).\",\n",
    "        \"Per-codec analysis reveals model's vulnerability to aggressive lossy compression (gsm 11.53%, pstn 11.14%).\",\n",
    "        \"Per-attack rankings consistent with 2019: A10/A14/A15 hardest, A13/A09 easiest.\",\n",
    "        \"Phase 5c next: cross-dataset evaluation on WaveFake (vocoder-only synthesis).\",\n",
    "    ]\n",
    "}\n",
    "\n",
    "OUTPUT = '/content/deepfake-audio-detection/results/metrics/stage2_eval2021_results.json'\n",
    "os.makedirs(os.path.dirname(OUTPUT), exist_ok=True)\n",
    "with open(OUTPUT, 'w') as f:\n",
    "    json.dump(results_2021, f, indent=2)\n",
    "\n",
    "print(f\"Wrote {OUTPUT}\")\n",
    "print(f\"Size: {os.path.getsize(OUTPUT)} bytes\")"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "DWmsb3ypbgVG",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1777771358190,
     "user_tz": 420,
     "elapsed": 12,
     "user": {
      "displayName": "Sara Jaffrani",
      "userId": "07677779715251349607"
     }
    },
    "outputId": "80741e73-062e-45d7-a88f-17a0af5b2a3b"
   },
   "execution_count": 22,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Wrote /content/deepfake-audio-detection/results/metrics/stage2_eval2021_results.json\n",
      "Size: 3954 bytes\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "from google.colab import userdata\n",
    "import os\n",
    "\n",
    "GITHUB_TOKEN = userdata.get('GITHUB_TOKEN')\n",
    "os.chdir('/content/deepfake-audio-detection')\n",
    "\n",
    "!git config user.email \"95262824+Saracasm@users.noreply.github.com\"\n",
    "!git config user.name \"Sara Iqbal\"\n",
    "\n",
    "# Stage all the new files\n",
    "!git add results/metrics/stage2_eval2021_results.json\n",
    "!git add results/scores/stage2_eval2021.npz\n",
    "!git add src/data/protocols_2021.py\n",
    "!git status\n",
    "\n",
    "!git commit -m \"Phase 5b: 2021 LA cross-dataset eval \u2014 9.09% EER, matches strongest baseline\"\n",
    "\n",
    "push_url = f\"https://Saracasm:{GITHUB_TOKEN}@github.com/Saracasm/deepfake-audio-detection.git\"\n",
    "!git push {push_url} main 2>&1 | grep -v {GITHUB_TOKEN}"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "NELdErV0bo5L",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1777771394847,
     "user_tz": 420,
     "elapsed": 7177,
     "user": {
      "displayName": "Sara Jaffrani",
      "userId": "07677779715251349607"
     }
    },
    "outputId": "aa07955a-1a74-4dc7-ce4f-5c54ef2269af"
   },
   "execution_count": 23,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "On branch main\n",
      "Your branch is up to date with 'origin/main'.\n",
      "\n",
      "Changes to be committed:\n",
      "  (use \"git restore --staged <file>...\" to unstage)\n",
      "\t\u001b[32mnew file:   results/metrics/stage2_eval2021_results.json\u001b[m\n",
      "\t\u001b[32mnew file:   results/scores/stage2_eval2021.npz\u001b[m\n",
      "\t\u001b[32mnew file:   src/data/protocols_2021.py\u001b[m\n",
      "\n",
      "[main 6b144b9] Phase 5b: 2021 LA cross-dataset eval \u2014 9.09% EER, matches strongest baseline\n",
      " 3 files changed, 219 insertions(+)\n",
      " create mode 100644 results/metrics/stage2_eval2021_results.json\n",
      " create mode 100644 results/scores/stage2_eval2021.npz\n",
      " create mode 100644 src/data/protocols_2021.py\n",
      "To https://github.com/Saracasm/deepfake-audio-detection.git\n",
      "   888b5b5..6b144b9  main -> main\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "print(\"Searching Kaggle for WaveFake datasets...\\n\")\n",
    "print(\"--- Search 1: 'wavefake' ---\")\n",
    "!kaggle datasets list -s \"wavefake\" --max-size 100000000000 2>&1 | head -15\n",
    "\n",
    "print(\"\\n--- Search 2: 'wave fake audio' ---\")\n",
    "!kaggle datasets list -s \"wave fake audio\" --max-size 100000000000 2>&1 | head -10\n",
    "\n",
    "print(\"\\n--- Search 3: 'audio deepfake vocoder' ---\")\n",
    "!kaggle datasets list -s \"audio deepfake vocoder\" --max-size 100000000000 2>&1 | head -10"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "WWAMPtyRcLj0",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1777771531983,
     "user_tz": 420,
     "elapsed": 2797,
     "user": {
      "displayName": "Sara Jaffrani",
      "userId": "07677779715251349607"
     }
    },
    "outputId": "b28eb80a-b0c4-47b0-99dc-7e32896a2e4b"
   },
   "execution_count": 24,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Searching Kaggle for WaveFake datasets...\n",
      "\n",
      "--- Search 1: 'wavefake' ---\n",
      "ref                                         title                                              size  lastUpdated                 downloadCount  voteCount  usabilityRating  \n",
      "------------------------------------------  ------------------------------------------  -----------  --------------------------  -------------  ---------  ---------------  \n",
      "andreadiubaldo/wavefake-test                wavefake                                    28915177091  2023-04-03 18:48:36.313000           1222          4  0.3125           \n",
      "walimuhammadahmad/fakeaudio                 WaveFake: DeepFake Audio Detection Dataset  28915177091  2023-08-09 04:55:59.317000           4935          8  0.875            \n",
      "dinaahmed11/wavefake                        wavefake                                    56767983528  2026-02-05 18:12:47.467000              1          0  0.125            \n",
      "gustavovrr/mel-image-ljspeech-and-wavefake  Mel Image LJspeech and WaveFake              8481190515  2024-08-06 05:45:02.543000             14          0  0.3125           \n",
      "rohan576/wavefake-vocoders-subset           wavefake-vocoders-subset                     3415496664  2026-04-10 17:33:32.973000              0          0  0.125            \n",
      "utsavavaiya/wavefake-jsut-25                WaveFake_Jsut_25                               71174763  2024-10-08 09:00:09.790000              4          0  0.25             \n",
      "utsavavaiya/wavefake-1500                   wavefake_1500                                  71387688  2024-10-08 09:37:51.230000              4          0  0.25             \n",
      "maryamkhan2025/wavefakedatasetformodel      wavefakedataset                             33905505462  2026-02-03 18:47:16.080000             20          0  0.23529412       \n",
      "\n",
      "--- Search 2: 'wave fake audio' ---\n",
      "ref                          title                                              size  lastUpdated                 downloadCount  voteCount  usabilityRating  \n",
      "---------------------------  ------------------------------------------  -----------  --------------------------  -------------  ---------  ---------------  \n",
      "walimuhammadahmad/fakeaudio  WaveFake: DeepFake Audio Detection Dataset  28915177091  2023-08-09 04:55:59.317000           4935          8  0.875            \n",
      "\n",
      "--- Search 3: 'audio deepfake vocoder' ---\n",
      "ref                         title                                                size  lastUpdated                 downloadCount  voteCount  usabilityRating  \n",
      "--------------------------  ---------------------------------------------  ----------  --------------------------  -------------  ---------  ---------------  \n",
      "ameythakur20/deepfakeaudio  Neural Voice Cloning: Deepfake Audio & Models   576855751  2026-01-29 08:24:10.073000            140          3                1  \n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "print(\"Inspecting utsavavaiya/wavefake-1500 file structure...\\n\")\n",
    "!kaggle datasets files utsavavaiya/wavefake-1500 2>&1 | head -50"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "IEYwB49ecfsm",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1777771612339,
     "user_tz": 420,
     "elapsed": 666,
     "user": {
      "displayName": "Sara Jaffrani",
      "userId": "07677779715251349607"
     }
    },
    "outputId": "6954152c-6d67-4a47-8243-23b0cbcf7b00"
   },
   "execution_count": 25,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Inspecting utsavavaiya/wavefake-1500 file structure...\n",
      "\n",
      "Next Page Token = CfDJ8OqP5ZkTT9ZGj66XXRbxXyAzyWwK6AN5v2GabJ7gDPZhzZu1UH1lHISiRxNputpyoIfveTiCWiZVesQJrRHwU68t9LwFE7s-3dajvU47VG_qaGbYeI0hVUviZY6SlsvWa396g_c0vT9mBY9jNKyz8KCpuOKSDkyfBQcFyKlYO7uyHuVLY8p2xnoMYhi4snd2s-Wc\n",
      "name                                                          size  creationDate                \n",
      "-----------------------------------------------------------  -----  --------------------------  \n",
      "wavefake_sample/fake/BASIC5000_0001_gen_mel_spectrogram.png  24024  2024-10-08 09:37:53.275000  \n",
      "wavefake_sample/fake/BASIC5000_0002_gen_mel_spectrogram.png  22988  2024-10-08 09:37:53.251000  \n",
      "wavefake_sample/fake/BASIC5000_0003_gen_mel_spectrogram.png  24395  2024-10-08 09:37:53.232000  \n",
      "wavefake_sample/fake/BASIC5000_0004_gen_mel_spectrogram.png  23114  2024-10-08 09:37:53.239000  \n",
      "wavefake_sample/fake/BASIC5000_0005_gen_mel_spectrogram.png  24481  2024-10-08 09:37:53.232000  \n",
      "wavefake_sample/fake/BASIC5000_0006_gen_mel_spectrogram.png  24173  2024-10-08 09:37:53.239000  \n",
      "wavefake_sample/fake/BASIC5000_0007_gen_mel_spectrogram.png  24986  2024-10-08 09:37:53.232000  \n",
      "wavefake_sample/fake/BASIC5000_0008_gen_mel_spectrogram.png  23405  2024-10-08 09:37:53.245000  \n",
      "wavefake_sample/fake/BASIC5000_0009_gen_mel_spectrogram.png  24403  2024-10-08 09:37:53.239000  \n",
      "wavefake_sample/fake/BASIC5000_0010_gen_mel_spectrogram.png  23255  2024-10-08 09:37:53.245000  \n",
      "wavefake_sample/fake/BASIC5000_0011_gen_mel_spectrogram.png  24027  2024-10-08 09:37:53.245000  \n",
      "wavefake_sample/fake/BASIC5000_0012_gen_mel_spectrogram.png  24173  2024-10-08 09:37:53.257000  \n",
      "wavefake_sample/fake/BASIC5000_0013_gen_mel_spectrogram.png  23707  2024-10-08 09:37:53.263000  \n",
      "wavefake_sample/fake/BASIC5000_0014_gen_mel_spectrogram.png  23346  2024-10-08 09:37:53.275000  \n",
      "wavefake_sample/fake/BASIC5000_0015_gen_mel_spectrogram.png  23602  2024-10-08 09:37:53.263000  \n",
      "wavefake_sample/fake/BASIC5000_0016_gen_mel_spectrogram.png  22401  2024-10-08 09:37:53.275000  \n",
      "wavefake_sample/fake/BASIC5000_0017_gen_mel_spectrogram.png  23905  2024-10-08 09:37:53.322000  \n",
      "wavefake_sample/fake/BASIC5000_0018_gen_mel_spectrogram.png  24210  2024-10-08 09:37:53.251000  \n",
      "wavefake_sample/fake/BASIC5000_0019_gen_mel_spectrogram.png  23930  2024-10-08 09:37:53.257000  \n",
      "wavefake_sample/fake/BASIC5000_0020_gen_mel_spectrogram.png  21199  2024-10-08 09:37:53.275000  \n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "print(\"Inspecting walimuhammadahmad/fakeaudio structure...\\n\")\n",
    "!kaggle datasets files walimuhammadahmad/fakeaudio 2>&1 | head -50"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "ojQA28G3crCR",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1777771658821,
     "user_tz": 420,
     "elapsed": 712,
     "user": {
      "displayName": "Sara Jaffrani",
      "userId": "07677779715251349607"
     }
    },
    "outputId": "75862e71-f640-4431-b26f-11ba7daf42af"
   },
   "execution_count": 26,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Inspecting walimuhammadahmad/fakeaudio structure...\n",
      "\n",
      "Next Page Token = CfDJ8OqP5ZkTT9ZGj66XXRbxXyBSiZQWZWX3OETQuaaRtanntVBQnEzjHglWqjEUOP4aGsbr3DtRgwuw5CNK9bjBUbiAazu9dC7w1TKd1UJTnkjMCgtCx0xULSeC50Vq2BdMQqJZ3JgqJ8kBUjPXeKKJ2qa9wZv7hGoxjWQX-iOB8M8K9MPSUnAfE79U6cU4E9QFNjnKp2KjiEXC_wYZoOGpu9VXg4AFw8v_Ozkytcvpf8tXjpQ\n",
      "name                                                                                           size  creationDate                \n",
      "-------------------------------------------------------------------------------------------  ------  --------------------------  \n",
      "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_0.wav      160812  2023-08-09 09:26:04.307000  \n",
      "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_1.wav       54316  2023-08-09 09:26:07.753000  \n",
      "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10.wav     313900  2023-08-09 09:26:13.366000  \n",
      "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_100.wav    106540  2023-08-09 09:26:06.029000  \n",
      "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_1000.wav   186924  2023-08-09 09:26:21.576000  \n",
      "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10000.wav   91180  2023-08-09 09:26:40.292000  \n",
      "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10001.wav  231980  2023-08-09 09:26:37.497000  \n",
      "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10002.wav  178732  2023-08-09 09:26:34.107000  \n",
      "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10003.wav  238636  2023-08-09 09:26:36.553000  \n",
      "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10004.wav  217644  2023-08-09 09:26:26.943000  \n",
      "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10005.wav  291884  2023-08-09 09:26:29.391000  \n",
      "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10006.wav  179244  2023-08-09 09:26:32.823000  \n",
      "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10007.wav   88620  2023-08-09 09:26:30.645000  \n",
      "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10008.wav  237612  2023-08-09 09:26:23.381000  \n",
      "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10009.wav  227884  2023-08-09 09:26:25.659000  \n",
      "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_1001.wav    90156  2023-08-09 09:26:19.720000  \n",
      "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10010.wav  137260  2023-08-09 09:26:26.951000  \n",
      "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10011.wav  134700  2023-08-09 09:26:29.294000  \n",
      "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10012.wav  233516  2023-08-09 09:26:32.976000  \n",
      "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10013.wav  192556  2023-08-09 09:26:30.592000  \n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "import subprocess\n",
    "\n",
    "print(\"Getting full file listing for walimuhammadahmad/fakeaudio...\")\n",
    "print(\"(this might take a few seconds \u2014 large dataset)\\n\")\n",
    "\n",
    "# Run kaggle datasets files and capture all output\n",
    "result = subprocess.run(\n",
    "    ['kaggle', 'datasets', 'files', 'walimuhammadahmad/fakeaudio', '-v'],\n",
    "    capture_output=True, text=True, timeout=60\n",
    ")\n",
    "output = result.stdout\n",
    "\n",
    "# Extract unique folder paths\n",
    "import re\n",
    "folders = set()\n",
    "for line in output.split('\\n'):\n",
    "    parts = line.split('/')\n",
    "    if len(parts) >= 2 and parts[0] in ['generated_audio', 'real_audio', 'training_audio', 'fake', 'real']:\n",
    "        # First two levels of path\n",
    "        folder = f\"{parts[0]}/{parts[1]}\"\n",
    "        folders.add(folder)\n",
    "    elif len(parts) >= 1 and parts[0] not in ['name', '-' * 5, '']:\n",
    "        folders.add(parts[0])\n",
    "\n",
    "print(\"Top-level folders detected:\")\n",
    "for f in sorted(folders):\n",
    "    if '/' in f:\n",
    "        print(f\"  {f}/\")\n",
    "    else:\n",
    "        print(f\"  {f}/  (top-level)\")\n",
    "\n",
    "# Also try paginated listing\n",
    "print(\"\\nFirst 200 file paths (sample to identify all vocoder folders):\")\n",
    "print(output[:5000])  # show first 5KB of listing\n"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "5F9o0D4NdZ-H",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1777771853274,
     "user_tz": 420,
     "elapsed": 716,
     "user": {
      "displayName": "Sara Jaffrani",
      "userId": "07677779715251349607"
     }
    },
    "outputId": "788e03f4-dc05-4892-e6a5-7422b4517741"
   },
   "execution_count": 27,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Getting full file listing for walimuhammadahmad/fakeaudio...\n",
      "(this might take a few seconds \u2014 large dataset)\n",
      "\n",
      "Top-level folders detected:\n",
      "  Next Page Token = CfDJ8OqP5ZkTT9ZGj66XXRbxXyBoI2XKwzEDNs4rPWYF8XEvV6A6AtAEPlyA_QntgBaZqDQYTQaKhZmMUri3lcPgEmSU2njgsXOqWqsgAl3aWs9EbSRQNuAVNKK0t8kEbK4wW5Qn-1MDdSUwQUnKW8hBUrJ41dSnVvxoGlSB7kvFgUkEDkcCiccTnTrluynAMyDO3z4xgov5QyCiVhHKLNbx02wFRatYdcCLiVwy-GuwgSFaxJs/  (top-level)\n",
      "  generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/\n",
      "  name,size,creationDate/  (top-level)\n",
      "\n",
      "First 200 file paths (sample to identify all vocoder folders):\n",
      "Next Page Token = CfDJ8OqP5ZkTT9ZGj66XXRbxXyBoI2XKwzEDNs4rPWYF8XEvV6A6AtAEPlyA_QntgBaZqDQYTQaKhZmMUri3lcPgEmSU2njgsXOqWqsgAl3aWs9EbSRQNuAVNKK0t8kEbK4wW5Qn-1MDdSUwQUnKW8hBUrJ41dSnVvxoGlSB7kvFgUkEDkcCiccTnTrluynAMyDO3z4xgov5QyCiVhHKLNbx02wFRatYdcCLiVwy-GuwgSFaxJs\n",
      "name,size,creationDate\n",
      "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_0.wav,160812,2023-08-09 09:26:04.307000\n",
      "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_1.wav,54316,2023-08-09 09:26:07.753000\n",
      "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10.wav,313900,2023-08-09 09:26:13.366000\n",
      "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_100.wav,106540,2023-08-09 09:26:06.029000\n",
      "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_1000.wav,186924,2023-08-09 09:26:21.576000\n",
      "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10000.wav,91180,2023-08-09 09:26:40.292000\n",
      "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10001.wav,231980,2023-08-09 09:26:37.497000\n",
      "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10002.wav,178732,2023-08-09 09:26:34.107000\n",
      "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10003.wav,238636,2023-08-09 09:26:36.553000\n",
      "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10004.wav,217644,2023-08-09 09:26:26.943000\n",
      "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10005.wav,291884,2023-08-09 09:26:29.391000\n",
      "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10006.wav,179244,2023-08-09 09:26:32.823000\n",
      "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10007.wav,88620,2023-08-09 09:26:30.645000\n",
      "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10008.wav,237612,2023-08-09 09:26:23.381000\n",
      "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10009.wav,227884,2023-08-09 09:26:25.659000\n",
      "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_1001.wav,90156,2023-08-09 09:26:19.720000\n",
      "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10010.wav,137260,2023-08-09 09:26:26.951000\n",
      "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10011.wav,134700,2023-08-09 09:26:29.294000\n",
      "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10012.wav,233516,2023-08-09 09:26:32.976000\n",
      "generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/gen_10013.wav,192556,2023-08-09 09:26:30.592000\n",
      "\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "import os, time\n",
    "\n",
    "DOWNLOAD_DIR = '/content/wavefake'\n",
    "os.makedirs(DOWNLOAD_DIR, exist_ok=True)\n",
    "\n",
    "# Check disk space first\n",
    "print(\"Checking disk space...\")\n",
    "!df -h /content | tail -1\n",
    "print()\n",
    "\n",
    "print(\"Downloading WaveFake (~29 GB)...\")\n",
    "print(\"Expected: 8-15 min on Colab\\n\")\n",
    "\n",
    "start = time.time()\n",
    "!kaggle datasets download -d walimuhammadahmad/fakeaudio -p {DOWNLOAD_DIR} --unzip --force --quiet\n",
    "elapsed_min = (time.time() - start) / 60\n",
    "print(f\"\\nDownload+unzip done in {elapsed_min:.1f} min.\")\n",
    "\n",
    "# Show actual structure\n",
    "print(\"\\nTop-level structure:\")\n",
    "!ls -la {DOWNLOAD_DIR}/\n",
    "\n",
    "print(\"\\nIf there's a 'generated_audio' folder, list its subfolders:\")\n",
    "gen_dir = f'{DOWNLOAD_DIR}/generated_audio'\n",
    "if os.path.exists(gen_dir):\n",
    "    !ls -la {gen_dir}/"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "pYZZ0MbNeKu1",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1777772913411,
     "user_tz": 420,
     "elapsed": 863123,
     "user": {
      "displayName": "Sara Jaffrani",
      "userId": "07677779715251349607"
     }
    },
    "outputId": "12869aa4-1f10-4213-9765-b27b929a3b81"
   },
   "execution_count": 28,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Checking disk space...\n",
      "overlay         236G   60G  177G  26% /\n",
      "\n",
      "Downloading WaveFake (~29 GB)...\n",
      "Expected: 8-15 min on Colab\n",
      "\n",
      "Dataset URL: https://www.kaggle.com/datasets/walimuhammadahmad/fakeaudio\n",
      "License(s): ODC Public Domain Dedication and Licence (PDDL)\n",
      "\n",
      "Download+unzip done in 14.4 min.\n",
      "\n",
      "Top-level structure:\n",
      "total 12\n",
      "drwxr-xr-x  3 root root 4096 May  3 01:48 .\n",
      "drwxr-xr-x  1 root root 4096 May  3 01:34 ..\n",
      "drwxr-xr-x 12 root root 4096 May  3 01:47 generated_audio\n",
      "\n",
      "If there's a 'generated_audio' folder, list its subfolders:\n",
      "total 4700\n",
      "drwxr-xr-x 12 root root   4096 May  3 01:47 .\n",
      "drwxr-xr-x  3 root root   4096 May  3 01:48 ..\n",
      "drwxr-xr-x  3 root root 520192 May  3 01:41 common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech\n",
      "drwxr-xr-x  2 root root 233472 May  3 01:42 jsut_multi_band_melgan\n",
      "drwxr-xr-x  2 root root 233472 May  3 01:42 jsut_parallel_wavegan\n",
      "drwxr-xr-x  2 root root 536576 May  3 01:43 ljspeech_full_band_melgan\n",
      "drwxr-xr-x  2 root root 589824 May  3 01:44 ljspeech_hifiGAN\n",
      "drwxr-xr-x  2 root root 536576 May  3 01:45 ljspeech_melgan\n",
      "drwxr-xr-x  2 root root 536576 May  3 01:45 ljspeech_melgan_large\n",
      "drwxr-xr-x  2 root root 536576 May  3 01:46 ljspeech_multi_band_melgan\n",
      "drwxr-xr-x  2 root root 536576 May  3 01:47 ljspeech_parallel_wavegan\n",
      "drwxr-xr-x  2 root root 503808 May  3 01:48 ljspeech_waveglow\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "print(\"Searching Kaggle for LJSpeech...\\n\")\n",
    "!kaggle datasets list -s \"ljspeech\" --max-size 10000000000 2>&1 | head -20"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "b51Ym3gxiBuZ",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1777773063295,
     "user_tz": 420,
     "elapsed": 1008,
     "user": {
      "displayName": "Sara Jaffrani",
      "userId": "07677779715251349607"
     }
    },
    "outputId": "0157f260-2d98-4e34-ec1a-cf3b0fa8125e"
   },
   "execution_count": 29,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Searching Kaggle for LJSpeech...\n",
      "\n",
      "ref                                                        title                             size  lastUpdated                 downloadCount  voteCount  usabilityRating  \n",
      "---------------------------------------------------------  --------------------------  ----------  --------------------------  -------------  ---------  ---------------  \n",
      "dromosys/ljspeech                                          LJSpeech                    6422684440  2018-09-17 00:20:03.623000            972         18  0.1764706        \n",
      "awsaf49/ljspeech-sr16k-dataset                             LJSpeech sr16k Dataset      2342225987  2023-09-13 21:16:44.393000           1260          7  0.5294118        \n",
      "maxbr0wn/culledjane-eyre-ljspeech                          Cleaned Jane Eyre LJSpeech  1597594099  2024-04-22 12:54:41.707000            264          4  0.9411765        \n",
      "mathurinache/the-lj-speech-dataset                         The LJ Speech Dataset       3211137023  2021-02-15 09:19:54.243000          11034        172  1                \n",
      "juliocaquino/ljspeech                                      LJSpeech                     354243568  2024-04-10 13:07:40.557000             17          1  0.3125           \n",
      "victorling/ljspeech                                        LJSpeech                    3211137023  2021-10-25 11:42:35.830000             21          2  0.3529412        \n",
      "ryanrudes/ljspeech                                         LJSpeech                    3212652913  2022-04-10 13:36:24.367000             16          2  0.23529412       \n",
      "prasannakasar/ljspeech-1-1-with-mel-and-mag-of-each-audio  LJSpeech-1.1                         0  2024-10-08 04:00:06.253000              5          1  0.4117647        \n",
      "rahulbhalley/ljspeech11                                    LJSpeech-1.1                3211137023  2021-12-05 10:32:25.580000            306          2  0.1764706        \n",
      "maxbr0wn/janeeyre                                          Jane Eyre LJSpeech          1615805879  2024-01-29 16:37:04.650000             62          0  0.9411765        \n",
      "tttzof351/ljspeech-meta                                    ljspeech_meta                   613174  2023-06-20 21:33:07.487000            575          2  0.1764706        \n",
      "phhasian0710/ljspeech                                      LJSpeech                    6422684440  2019-06-15 07:30:23.807000             76          1  0.11764706       \n",
      "awsaf49/ljspeech-dataset                                   LJSpeech: Dataset           3211137023  2022-02-06 15:15:30.893000            337          0  0.5294118        \n",
      "mobassir/comprehensive-bangla-tts                          comprehensive bangla tts    5860311293  2023-08-24 07:49:28.087000            557          7  0.6875           \n",
      "fredrelec/ljspeech-indian                                  Ljspeech Indian             2198467078  2025-12-10 03:48:47.830000              0          1  0.3125           \n",
      "fag9897/ljspeech                                           LJSpeech                    3211137023  2022-01-10 17:43:02.657000              4          1  0.11764706       \n",
      "ashokneupane/ljspeecj                                      ljspeech                    1514409347  2024-05-29 20:21:59.727000              2          0  0.1764706        \n",
      "saramedhat38/ljspeech                                      LJSPEECH                    3211137023  2025-04-20 22:49:51.600000              7          0  0.1764706        \n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "import os, time\n",
    "\n",
    "LJSPEECH_DIR = '/content/ljspeech'\n",
    "os.makedirs(LJSPEECH_DIR, exist_ok=True)\n",
    "\n",
    "print(\"Downloading LJSpeech (~3.2 GB)...\")\n",
    "print(\"Expected: 2-5 min\\n\")\n",
    "\n",
    "start = time.time()\n",
    "!kaggle datasets download -d mathurinache/the-lj-speech-dataset -p {LJSPEECH_DIR} --unzip --force --quiet\n",
    "elapsed_min = (time.time() - start) / 60\n",
    "print(f\"\\nDownload+unzip done in {elapsed_min:.1f} min.\")\n",
    "\n",
    "# Show structure\n",
    "print(\"\\nTop-level structure:\")\n",
    "!ls -la {LJSPEECH_DIR}/\n",
    "\n",
    "# Look for the wavs folder\n",
    "import glob\n",
    "wav_candidates = []\n",
    "for root, dirs, files in os.walk(LJSPEECH_DIR):\n",
    "    if 'wav' in root.lower() and any(f.endswith('.wav') for f in files):\n",
    "        n_wavs = len([f for f in files if f.endswith('.wav')])\n",
    "        wav_candidates.append((root, n_wavs))\n",
    "\n",
    "print(\"\\nWAV folders found:\")\n",
    "for path, n in wav_candidates:\n",
    "    print(f\"  {path}: {n:,} files\")\n",
    "\n",
    "# Check disk space after\n",
    "print(\"\\nDisk space:\")\n",
    "!df -h /content | tail -1"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "JiLh9UuciNQT",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1777773202398,
     "user_tz": 420,
     "elapsed": 93056,
     "user": {
      "displayName": "Sara Jaffrani",
      "userId": "07677779715251349607"
     }
    },
    "outputId": "3fc161c4-7cfa-47a8-97e6-c75f0b6cf897"
   },
   "execution_count": 30,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Downloading LJSpeech (~3.2 GB)...\n",
      "Expected: 2-5 min\n",
      "\n",
      "Dataset URL: https://www.kaggle.com/datasets/mathurinache/the-lj-speech-dataset\n",
      "License(s): CC0-1.0\n",
      "\n",
      "Download+unzip done in 1.5 min.\n",
      "\n",
      "Top-level structure:\n",
      "total 12\n",
      "drwxr-xr-x 3 root root 4096 May  3 01:53 .\n",
      "drwxr-xr-x 1 root root 4096 May  3 01:51 ..\n",
      "drwxr-xr-x 3 root root 4096 May  3 01:52 LJSpeech-1.1\n",
      "\n",
      "WAV folders found:\n",
      "  /content/ljspeech/LJSpeech-1.1/wavs: 13,100 files\n",
      "\n",
      "Disk space:\n",
      "overlay         236G   96G  141G  41% /\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "import os, random, glob\n",
    "\n",
    "random.seed(42)  # reproducibility\n",
    "\n",
    "# ---- Bonafide pool: LJSpeech ----\n",
    "LJ_WAV_DIR = '/content/ljspeech/LJSpeech-1.1/wavs'\n",
    "all_lj_files = sorted(glob.glob(f'{LJ_WAV_DIR}/*.wav'))\n",
    "print(f\"LJSpeech total: {len(all_lj_files):,} files\")\n",
    "\n",
    "# Sample 1,500 bonafide\n",
    "sampled_bonafide = random.sample(all_lj_files, 1500)\n",
    "print(f\"Sampled bonafide: {len(sampled_bonafide)}\")\n",
    "\n",
    "# ---- Spoof pool: 9 vocoders ----\n",
    "WAVEFAKE_DIR = '/content/wavefake/generated_audio'\n",
    "target_vocoders = [\n",
    "    'ljspeech_melgan',\n",
    "    'ljspeech_melgan_large',\n",
    "    'ljspeech_multi_band_melgan',\n",
    "    'ljspeech_full_band_melgan',\n",
    "    'ljspeech_parallel_wavegan',\n",
    "    'ljspeech_waveglow',\n",
    "    'ljspeech_hifiGAN',\n",
    "    'jsut_multi_band_melgan',\n",
    "    'jsut_parallel_wavegan',\n",
    "]\n",
    "\n",
    "vocoder_samples = {}\n",
    "print(\"\\nSampling spoofed audio per vocoder:\")\n",
    "for vocoder in target_vocoders:\n",
    "    folder = f'{WAVEFAKE_DIR}/{vocoder}'\n",
    "    files = sorted(glob.glob(f'{folder}/*.wav'))\n",
    "    if len(files) == 0:\n",
    "        print(f\"  WARNING: {vocoder} \u2014 no .wav files found, checking other extensions\")\n",
    "        files = sorted(glob.glob(f'{folder}/*'))\n",
    "    n_avail = len(files)\n",
    "    n_target = min(1000, n_avail)\n",
    "    sampled = random.sample(files, n_target) if n_avail >= 1000 else files\n",
    "    vocoder_samples[vocoder] = sampled\n",
    "    print(f\"  {vocoder}: {n_avail:,} available \u2192 sampled {len(sampled):,}\")\n",
    "\n",
    "# ---- Build the unified utterance list ----\n",
    "# Each entry: (file_path, label_int, vocoder_or_bonafide_id, utterance_id)\n",
    "class Utt:\n",
    "    \"\"\"Lightweight utterance record for WaveFake eval.\"\"\"\n",
    "    def __init__(self, flac_path, label_int, vocoder, utterance_id):\n",
    "        self.flac_path = flac_path  # named flac_path for compat with ASVspoofDataset\n",
    "        self.label_int = label_int\n",
    "        self.vocoder = vocoder  # custom field for breakdown\n",
    "        self.utterance_id = utterance_id\n",
    "        self.label = 'bonafide' if label_int == 0 else 'spoof'\n",
    "\n",
    "utts_wavefake = []\n",
    "\n",
    "# Add bonafide\n",
    "for path in sampled_bonafide:\n",
    "    uid = os.path.basename(path).replace('.wav', '')  # e.g. LJ001-0001\n",
    "    utts_wavefake.append(Utt(path, 0, 'bonafide_LJSpeech', uid))\n",
    "\n",
    "# Add spoof per vocoder\n",
    "for vocoder, files in vocoder_samples.items():\n",
    "    for path in files:\n",
    "        uid = f\"{vocoder}_{os.path.basename(path).replace('.wav', '')}\"\n",
    "        utts_wavefake.append(Utt(path, 1, vocoder, uid))\n",
    "\n",
    "# Shuffle for randomness in batching\n",
    "random.shuffle(utts_wavefake)\n",
    "\n",
    "# Summary\n",
    "from collections import Counter\n",
    "print(f\"\\nTotal utterances: {len(utts_wavefake):,}\")\n",
    "print(f\"Bonafide: {sum(1 for u in utts_wavefake if u.label_int == 0):,}\")\n",
    "print(f\"Spoof:    {sum(1 for u in utts_wavefake if u.label_int == 1):,}\")\n",
    "print(f\"\\nVocoder distribution:\")\n",
    "for vocoder, n in Counter(u.vocoder for u in utts_wavefake).most_common():\n",
    "    print(f\"  {vocoder}: {n:,}\")"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "u-s9T5-ulgIG",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1777773974319,
     "user_tz": 420,
     "elapsed": 1726,
     "user": {
      "displayName": "Sara Jaffrani",
      "userId": "07677779715251349607"
     }
    },
    "outputId": "125a4910-b5f3-40e5-a40f-e32d154cce6d"
   },
   "execution_count": 31,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "LJSpeech total: 13,100 files\n",
      "Sampled bonafide: 1500\n",
      "\n",
      "Sampling spoofed audio per vocoder:\n",
      "  ljspeech_melgan: 13,100 available \u2192 sampled 1,000\n",
      "  ljspeech_melgan_large: 13,100 available \u2192 sampled 1,000\n",
      "  ljspeech_multi_band_melgan: 13,100 available \u2192 sampled 1,000\n",
      "  ljspeech_full_band_melgan: 13,100 available \u2192 sampled 1,000\n",
      "  ljspeech_parallel_wavegan: 13,100 available \u2192 sampled 1,000\n",
      "  ljspeech_waveglow: 13,100 available \u2192 sampled 1,000\n",
      "  ljspeech_hifiGAN: 13,100 available \u2192 sampled 1,000\n",
      "  jsut_multi_band_melgan: 5,000 available \u2192 sampled 1,000\n",
      "  jsut_parallel_wavegan: 5,000 available \u2192 sampled 1,000\n",
      "\n",
      "Total utterances: 10,500\n",
      "Bonafide: 1,500\n",
      "Spoof:    9,000\n",
      "\n",
      "Vocoder distribution:\n",
      "  bonafide_LJSpeech: 1,500\n",
      "  ljspeech_melgan: 1,000\n",
      "  ljspeech_full_band_melgan: 1,000\n",
      "  ljspeech_parallel_wavegan: 1,000\n",
      "  ljspeech_waveglow: 1,000\n",
      "  jsut_parallel_wavegan: 1,000\n",
      "  ljspeech_melgan_large: 1,000\n",
      "  ljspeech_multi_band_melgan: 1,000\n",
      "  jsut_multi_band_melgan: 1,000\n",
      "  ljspeech_hifiGAN: 1,000\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "import torchaudio\n",
    "import torch\n",
    "\n",
    "print(\"Sanity-checking a few files from each category...\\n\")\n",
    "\n",
    "samples_to_test = [\n",
    "    (\"LJSpeech bonafide\", utts_wavefake[0]),\n",
    "]\n",
    "\n",
    "seen_vocoders = {'bonafide_LJSpeech'}\n",
    "for u in utts_wavefake:\n",
    "    if u.vocoder not in seen_vocoders:\n",
    "        samples_to_test.append((u.vocoder, u))\n",
    "        seen_vocoders.add(u.vocoder)\n",
    "    if len(samples_to_test) >= 10:\n",
    "        break\n",
    "\n",
    "for label, u in samples_to_test:\n",
    "    try:\n",
    "        w, sr = torchaudio.load(u.flac_path)\n",
    "        duration = w.shape[1] / sr\n",
    "        print(f\"  [{label:<35}] sr={sr}, shape={tuple(w.shape)}, duration={duration:.2f}s\")\n",
    "    except Exception as e:\n",
    "        print(f\"  [{label:<35}] FAILED: {type(e).__name__}: {e}\")"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "DO-omUogmPUM",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1777774165833,
     "user_tz": 420,
     "elapsed": 175,
     "user": {
      "displayName": "Sara Jaffrani",
      "userId": "07677779715251349607"
     }
    },
    "outputId": "00f969a5-800b-4583-d2ce-7d3193a68ae7"
   },
   "execution_count": 32,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Sanity-checking a few files from each category...\n",
      "\n",
      "  [LJSpeech bonafide                  ] sr=22050, shape=(1, 84224), duration=3.82s\n",
      "  [ljspeech_melgan                    ] sr=22050, shape=(1, 84224), duration=3.82s\n",
      "  [ljspeech_full_band_melgan          ] sr=22050, shape=(1, 42496), duration=1.93s\n",
      "  [ljspeech_parallel_wavegan          ] sr=22050, shape=(1, 94976), duration=4.31s\n",
      "  [ljspeech_waveglow                  ] sr=22050, shape=(1, 154880), duration=7.02s\n",
      "  [jsut_parallel_wavegan              ] sr=24000, shape=(1, 123900), duration=5.16s\n",
      "  [ljspeech_melgan_large              ] sr=22050, shape=(1, 122880), duration=5.57s\n",
      "  [ljspeech_multi_band_melgan         ] sr=22050, shape=(1, 87040), duration=3.95s\n",
      "  [jsut_multi_band_melgan             ] sr=24000, shape=(1, 87900), duration=3.66s\n",
      "  [ljspeech_hifiGAN                   ] sr=22050, shape=(1, 208896), duration=9.47s\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "import sys, importlib\n",
    "\n",
    "# Make sure we have the latest preprocessing module\n",
    "if 'src.data.preprocessing' in sys.modules:\n",
    "    importlib.reload(sys.modules['src.data.preprocessing'])\n",
    "from src.data.preprocessing import load_audio, WINDOW_SAMPLES, SAMPLE_RATE\n",
    "\n",
    "print(f\"Pipeline target SR: {SAMPLE_RATE} Hz\")\n",
    "print(f\"Window samples (4 sec at target SR): {WINDOW_SAMPLES}\\n\")\n",
    "\n",
    "# Test on one LJSpeech file (22050 Hz source)\n",
    "test_lj = utts_wavefake[0]  # first one is bonafide LJSpeech (we shuffled but bonafide are most common at start of list)\n",
    "# Find a bonafide one explicitly\n",
    "test_lj = next(u for u in utts_wavefake if u.vocoder == 'bonafide_LJSpeech')\n",
    "print(f\"Testing LJSpeech file: {test_lj.utterance_id}\")\n",
    "\n",
    "w_loaded = load_audio(test_lj.flac_path)\n",
    "print(f\"  Loaded shape: {w_loaded.shape}\")\n",
    "print(f\"  Implied duration at 16 kHz: {w_loaded.shape[0] / 16000:.2f}s\")\n",
    "print(f\"  Min/max: {w_loaded.min():.3f} / {w_loaded.max():.3f}\")\n",
    "\n",
    "# Test on a JSUT file (24000 Hz source)\n",
    "test_jsut = next(u for u in utts_wavefake if u.vocoder == 'jsut_parallel_wavegan')\n",
    "print(f\"\\nTesting JSUT file: {test_jsut.utterance_id}\")\n",
    "w_loaded = load_audio(test_jsut.flac_path)\n",
    "print(f\"  Loaded shape: {w_loaded.shape}\")\n",
    "print(f\"  Implied duration at 16 kHz: {w_loaded.shape[0] / 16000:.2f}s\")\n",
    "print(f\"  Min/max: {w_loaded.min():.3f} / {w_loaded.max():.3f}\")"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "L8SQKEj6mdYi",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1777774224630,
     "user_tz": 420,
     "elapsed": 508,
     "user": {
      "displayName": "Sara Jaffrani",
      "userId": "07677779715251349607"
     }
    },
    "outputId": "357b042e-971b-416c-f010-2a2289efead2"
   },
   "execution_count": 33,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Pipeline target SR: 16000 Hz\n",
      "Window samples (4 sec at target SR): 64000\n",
      "\n",
      "Testing LJSpeech file: LJ016-0377\n",
      "  Loaded shape: torch.Size([155967])\n",
      "  Implied duration at 16 kHz: 9.75s\n",
      "  Min/max: -0.502 / 0.559\n",
      "\n",
      "Testing JSUT file: jsut_parallel_wavegan_BASIC5000_4802_gen\n",
      "  Loaded shape: torch.Size([82600])\n",
      "  Implied duration at 16 kHz: 5.16s\n",
      "  Min/max: -0.315 / 0.284\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "import torchaudio\n",
    "from tqdm import tqdm\n",
    "\n",
    "print(\"Measuring durations on full WaveFake eval set (10,500 utterances)...\")\n",
    "print(\"Expected: ~3-4 min (resampling overhead included)\\n\")\n",
    "\n",
    "eval_durs_wf = []\n",
    "failed_ids_wf = []\n",
    "\n",
    "for u in tqdm(utts_wavefake, desc=\"WaveFake durations\"):\n",
    "    try:\n",
    "        # Use load_audio which resamples to 16 kHz\n",
    "        from src.data.preprocessing import load_audio\n",
    "        w = load_audio(u.flac_path)\n",
    "        eval_durs_wf.append(w.shape[0])\n",
    "    except Exception as e:\n",
    "        eval_durs_wf.append(None)\n",
    "        failed_ids_wf.append((u.utterance_id, str(e)))\n",
    "\n",
    "n_valid = sum(1 for d in eval_durs_wf if d is not None)\n",
    "print(f\"\\nMeasurement complete.\")\n",
    "print(f\"Total recorded:  {len(eval_durs_wf):,}\")\n",
    "print(f\"Valid:           {n_valid:,}\")\n",
    "print(f\"Failed:          {len(failed_ids_wf):,}\")\n",
    "\n",
    "if failed_ids_wf:\n",
    "    print(f\"\\nFirst 10 failures:\")\n",
    "    for uid, err in failed_ids_wf[:10]:\n",
    "        print(f\"  {uid}: {err[:80]}\")"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "EQ50sl-PmnYy",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1777774473102,
     "user_tz": 420,
     "elapsed": 205818,
     "user": {
      "displayName": "Sara Jaffrani",
      "userId": "07677779715251349607"
     }
    },
    "outputId": "937ea57a-f775-42e1-8fcb-33600945747c"
   },
   "execution_count": 34,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Measuring durations on full WaveFake eval set (10,500 utterances)...\n",
      "Expected: ~3-4 min (resampling overhead included)\n",
      "\n"
     ]
    },
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "WaveFake durations: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 10500/10500 [03:25<00:00, 51.02it/s]"
     ]
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "\n",
      "Measurement complete.\n",
      "Total recorded:  10,500\n",
      "Valid:           10,500\n",
      "Failed:          0\n"
     ]
    },
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "from src.data.dataset import ASVspoofDataset\n",
    "from torch.utils.data import DataLoader\n",
    "import torch\n",
    "import sys, importlib\n",
    "\n",
    "# Make sure model class is fresh\n",
    "if 'src.models.wav2vec_classifier' in sys.modules:\n",
    "    importlib.reload(sys.modules['src.models.wav2vec_classifier'])\n",
    "from src.models.wav2vec_classifier import Wav2VecClassifier\n",
    "\n",
    "# Build dataset (ASVspoofDataset works because Utt has the same fields it needs)\n",
    "eval_ds_wf = ASVspoofDataset(utts_wavefake, durations_samples=eval_durs_wf)\n",
    "eval_loader_wf = DataLoader(\n",
    "    eval_ds_wf, batch_size=16, shuffle=False, num_workers=2, pin_memory=True\n",
    ")\n",
    "\n",
    "print(f\"WaveFake dataset: {len(eval_ds_wf):,} windows from {len(utts_wavefake):,} utterances\")\n",
    "inflation = len(eval_ds_wf) / len(utts_wavefake)\n",
    "print(f\"Inflation factor: {inflation:.2f}x\")\n",
    "\n",
    "# Reload Stage 2 model (in case it got cleared)\n",
    "print(\"\\nLoading Stage 2 best checkpoint...\")\n",
    "device = 'cuda'\n",
    "model = Wav2VecClassifier(\n",
    "    backbone_name=\"facebook/wav2vec2-base\",\n",
    "    num_classes=2,\n",
    "    freeze_backbone=True,\n",
    ")\n",
    "ckpt = torch.load(\n",
    "    '/content/drive/MyDrive/deepfake_audio/checkpoints/stage2_best.pt',\n",
    "    map_location=device, weights_only=False,\n",
    ")\n",
    "model.load_state_dict(ckpt['model_state_dict'])\n",
    "model = model.to(device)\n",
    "model.eval()\n",
    "print(f\"Model loaded (epoch {ckpt['epoch']}, dev EER {ckpt['best_eer']*100:.4f}%)\")\n",
    "\n",
    "# Run inference\n",
    "import numpy as np\n",
    "import time\n",
    "from tqdm import tqdm\n",
    "from src.evaluation.metrics import compute_eer, compute_auc, aggregate_window_scores_to_utterance\n",
    "\n",
    "# Build lookup for vocoder per utterance\n",
    "utt_vocoder_map = {u.utterance_id: u.vocoder for u in utts_wavefake}\n",
    "\n",
    "print(f\"\\nRunning inference (mixed precision, batch=16)...\")\n",
    "print(f\"Expected: ~3-5 min\\n\")\n",
    "\n",
    "all_window_scores = []\n",
    "all_window_labels = []\n",
    "all_window_utts = []\n",
    "\n",
    "start = time.time()\n",
    "with torch.no_grad():\n",
    "    autocast_ctx = torch.amp.autocast(device_type='cuda', enabled=True)\n",
    "    for waveforms, labels, utt_ids in tqdm(eval_loader_wf, desc=\"WaveFake inference\"):\n",
    "        waveforms = waveforms.to('cuda', non_blocking=True)\n",
    "        with autocast_ctx:\n",
    "            logits = model(waveforms)\n",
    "        probs = torch.softmax(logits.float(), dim=-1)\n",
    "        spoof_probs = probs[:, 1].detach().cpu().numpy()\n",
    "\n",
    "        all_window_scores.extend(spoof_probs.tolist())\n",
    "        all_window_labels.extend(labels.tolist())\n",
    "        all_window_utts.extend(list(utt_ids))\n",
    "\n",
    "inference_minutes = (time.time() - start) / 60\n",
    "print(f\"\\nInference complete in {inference_minutes:.1f} min over {len(all_window_scores):,} windows.\")\n",
    "\n",
    "# Aggregate to per-utterance\n",
    "print(\"\\nAggregating window scores to utterance scores (mean)...\")\n",
    "utt_scores, utt_ids_sorted = aggregate_window_scores_to_utterance(\n",
    "    np.array(all_window_scores), all_window_utts, method=\"mean\",\n",
    ")\n",
    "\n",
    "# Build per-utterance label and vocoder arrays\n",
    "utt_label_map = {}\n",
    "for s, l, u in zip(all_window_scores, all_window_labels, all_window_utts):\n",
    "    if u not in utt_label_map:\n",
    "        utt_label_map[u] = l\n",
    "\n",
    "utt_labels = np.array([utt_label_map[u] for u in utt_ids_sorted])\n",
    "utt_vocoders = np.array([utt_vocoder_map[u] for u in utt_ids_sorted])\n",
    "\n",
    "# ---- Overall metrics ----\n",
    "print(f\"\\n{'='*70}\")\n",
    "print(f\"  SUPPLEMENTARY EVALUATION \u2014 WaveFake (LJSpeech + JSUT)\")\n",
    "print(f\"{'='*70}\")\n",
    "n_bona = int((utt_labels == 0).sum())\n",
    "n_spoof = int((utt_labels == 1).sum())\n",
    "print(f\"Utterances: {len(utt_scores):,}\")\n",
    "print(f\"Bonafide:   {n_bona:,}\")\n",
    "print(f\"Spoof:      {n_spoof:,}\")\n",
    "\n",
    "eer_wf, threshold_wf = compute_eer(utt_scores, utt_labels)\n",
    "auc_wf = compute_auc(utt_scores, utt_labels)\n",
    "preds_wf = (utt_scores > threshold_wf).astype(int)\n",
    "acc_wf = float((preds_wf == utt_labels).mean())\n",
    "\n",
    "print(f\"\\nOverall results (Stage 2 model on WaveFake):\")\n",
    "print(f\"  EER:       {eer_wf*100:.4f}%\")\n",
    "print(f\"  AUC:       {auc_wf:.4f}\")\n",
    "print(f\"  Accuracy:  {acc_wf*100:.2f}%\")\n",
    "print(f\"  Threshold: {threshold_wf:.4f}\")\n",
    "\n",
    "# ---- Cross-dataset comparison ----\n",
    "print(f\"\\nCross-dataset comparison:\")\n",
    "print(f\"  Stage 2 dev EER (2019 LA, seen attacks):       0.69%\")\n",
    "print(f\"  Stage 2 eval EER (2019 LA, unseen attacks):    5.55%\")\n",
    "print(f\"  Stage 2 eval EER (2021 LA, codec degraded):    9.09%\")\n",
    "print(f\"  Stage 2 eval EER (WaveFake, novel vocoders):   {eer_wf*100:.2f}%\")\n",
    "\n",
    "# ---- Per-vocoder EER ----\n",
    "print(f\"\\n{'='*70}\")\n",
    "print(f\"  PER-VOCODER EER BREAKDOWN (vs LJSpeech bonafide)\")\n",
    "print(f\"{'='*70}\")\n",
    "bonafide_scores_all = utt_scores[utt_labels == 0]\n",
    "spoof_vocoders = sorted(set(v for v in utt_vocoders if v != 'bonafide_LJSpeech'))\n",
    "\n",
    "per_vocoder_results = {}\n",
    "for vocoder in spoof_vocoders:\n",
    "    mask = (utt_vocoders == vocoder)\n",
    "    voc_scores = utt_scores[mask]\n",
    "    n = int(mask.sum())\n",
    "    combined_scores = np.concatenate([bonafide_scores_all, voc_scores])\n",
    "    combined_labels = np.concatenate([\n",
    "        np.zeros(len(bonafide_scores_all)),\n",
    "        np.ones(n),\n",
    "    ])\n",
    "    v_eer, _ = compute_eer(combined_scores, combined_labels)\n",
    "    per_vocoder_results[vocoder] = {\"n\": n, \"eer\": float(v_eer)}\n",
    "    print(f\"  {vocoder:<35}: n={n:>5,}  EER={v_eer*100:>6.2f}%\")\n",
    "\n",
    "# Save raw scores\n",
    "import os\n",
    "SCORES_PATH = '/content/deepfake-audio-detection/results/scores/stage2_eval_wavefake.npz'\n",
    "os.makedirs(os.path.dirname(SCORES_PATH), exist_ok=True)\n",
    "np.savez(\n",
    "    SCORES_PATH,\n",
    "    utt_ids=np.array(utt_ids_sorted),\n",
    "    utt_scores=utt_scores,\n",
    "    utt_labels=utt_labels,\n",
    "    utt_vocoders=utt_vocoders,\n",
    ")\n",
    "print(f\"\\nRaw scores saved to {SCORES_PATH}\")"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 1000,
     "referenced_widgets": [
      "bb896bf13a8a48c7b31ee6f97ff9c387",
      "104dd6aba61b4ad4ac0f520328cc1fb4",
      "5cbd77bc7a3f48c98423dd04c79b22a6",
      "447f31f2587a418c9133d3d139e8ba4d",
      "2bfeb4a9753e48fc854c0f1a64a61318",
      "69b8155b46784de887c14f710d5de5d0",
      "2426e9f09a0e47c5afc37980c0f43a87",
      "778e478f9a114526b094a713275b6a3e",
      "63205811a7904068a7b3a69a7ae7cd89",
      "59d6be54ba9344f7b9fd3a6b4257a2fb",
      "1110ef40cbb24e9aaa1bd6cfea6c0f2a"
     ]
    },
    "id": "WmOcBoRXn8Cm",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1777775090716,
     "user_tz": 420,
     "elapsed": 479257,
     "user": {
      "displayName": "Sara Jaffrani",
      "userId": "07677779715251349607"
     }
    },
    "outputId": "d53a3e7a-4286-4a48-db2e-c56ab04bf918"
   },
   "execution_count": 35,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "WaveFake dataset: 27,483 windows from 10,500 utterances\n",
      "Inflation factor: 2.62x\n",
      "\n",
      "Loading Stage 2 best checkpoint...\n"
     ]
    },
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "Wav2Vec2Model LOAD REPORT from: facebook/wav2vec2-base\n",
      "Key                          | Status     |  | \n",
      "-----------------------------+------------+--+-\n",
      "quantizer.weight_proj.weight | UNEXPECTED |  | \n",
      "project_q.weight             | UNEXPECTED |  | \n",
      "project_hid.weight           | UNEXPECTED |  | \n",
      "quantizer.codevectors        | UNEXPECTED |  | \n",
      "quantizer.weight_proj.bias   | UNEXPECTED |  | \n",
      "project_hid.bias             | UNEXPECTED |  | \n",
      "project_q.bias               | UNEXPECTED |  | \n",
      "\n",
      "Notes:\n",
      "- UNEXPECTED\t:can be ignored when loading from different task/architecture; not ok if you expect identical arch.\n"
     ]
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Model loaded (epoch 9, dev EER 0.6941%)\n",
      "\n",
      "Running inference (mixed precision, batch=16)...\n",
      "Expected: ~3-5 min\n",
      "\n"
     ]
    },
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "WaveFake inference: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 1718/1718 [07:52<00:00,  3.63it/s]\n"
     ]
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "\n",
      "Inference complete in 7.9 min over 27,483 windows.\n",
      "\n",
      "Aggregating window scores to utterance scores (mean)...\n",
      "\n",
      "======================================================================\n",
      "  SUPPLEMENTARY EVALUATION \u2014 WaveFake (LJSpeech + JSUT)\n",
      "======================================================================\n",
      "Utterances: 10,500\n",
      "Bonafide:   1,500\n",
      "Spoof:      9,000\n",
      "\n",
      "Overall results (Stage 2 model on WaveFake):\n",
      "  EER:       26.3333%\n",
      "  AUC:       0.8250\n",
      "  Accuracy:  73.68%\n",
      "  Threshold: 0.0000\n",
      "\n",
      "Cross-dataset comparison:\n",
      "  Stage 2 dev EER (2019 LA, seen attacks):       0.69%\n",
      "  Stage 2 eval EER (2019 LA, unseen attacks):    5.55%\n",
      "  Stage 2 eval EER (2021 LA, codec degraded):    9.09%\n",
      "  Stage 2 eval EER (WaveFake, novel vocoders):   26.33%\n",
      "\n",
      "======================================================================\n",
      "  PER-VOCODER EER BREAKDOWN (vs LJSpeech bonafide)\n",
      "======================================================================\n",
      "  jsut_multi_band_melgan             : n=1,000  EER=  1.13%\n",
      "  jsut_parallel_wavegan              : n=1,000  EER=  0.83%\n",
      "  ljspeech_full_band_melgan          : n=1,000  EER= 30.60%\n",
      "  ljspeech_hifiGAN                   : n=1,000  EER= 33.23%\n",
      "  ljspeech_melgan                    : n=1,000  EER= 31.12%\n",
      "  ljspeech_melgan_large              : n=1,000  EER= 33.85%\n",
      "  ljspeech_multi_band_melgan         : n=1,000  EER= 21.92%\n",
      "  ljspeech_parallel_wavegan          : n=1,000  EER= 26.12%\n",
      "  ljspeech_waveglow                  : n=1,000  EER= 29.60%\n",
      "\n",
      "Raw scores saved to /content/deepfake-audio-detection/results/scores/stage2_eval_wavefake.npz\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "import json, os\n",
    "from datetime import datetime\n",
    "\n",
    "results_wf = {\n",
    "    \"phase\": \"Phase 5c \u2014 Supplementary Evaluation on WaveFake\",\n",
    "    \"completed_at\": datetime.now().isoformat(),\n",
    "    \"model_checkpoint\": \"/content/drive/MyDrive/deepfake_audio/checkpoints/stage2_best.pt\",\n",
    "    \"model_dev_eer\": 0.0069,\n",
    "    \"evaluation_dataset\": {\n",
    "        \"name\": \"WaveFake (Frank et al., 2021) \u2014 sampled subset\",\n",
    "        \"kaggle_source_spoof\": \"walimuhammadahmad/fakeaudio\",\n",
    "        \"kaggle_source_bonafide\": \"mathurinache/the-lj-speech-dataset\",\n",
    "        \"sampling_strategy\": \"Random sample of 1,500 LJSpeech bonafide + 1,000 spoof per vocoder \u00d7 9 vocoders\",\n",
    "        \"utterances_total\": 10500,\n",
    "        \"windows\": 27483,\n",
    "        \"bonafide_count\": 1500,\n",
    "        \"spoof_count\": 9000,\n",
    "        \"vocoders\": [\n",
    "            \"ljspeech_melgan\", \"ljspeech_melgan_large\", \"ljspeech_multi_band_melgan\",\n",
    "            \"ljspeech_full_band_melgan\", \"ljspeech_parallel_wavegan\",\n",
    "            \"ljspeech_waveglow\", \"ljspeech_hifiGAN\",\n",
    "            \"jsut_multi_band_melgan\", \"jsut_parallel_wavegan\"\n",
    "        ],\n",
    "    },\n",
    "    \"inference\": {\n",
    "        \"batch_size\": 16,\n",
    "        \"mixed_precision\": True,\n",
    "        \"wall_clock_minutes\": 7.9,\n",
    "        \"windows_per_second\": 58,\n",
    "        \"note\": \"Slower windows/sec than ASVspoof because of resampling 22050/24000 \u2192 16000\",\n",
    "    },\n",
    "    \"overall_results\": {\n",
    "        \"eer\": 0.2633,\n",
    "        \"auc\": 0.8250,\n",
    "        \"accuracy\": 0.7368,\n",
    "        \"threshold\": 0.0000,\n",
    "    },\n",
    "    \"cross_dataset_comparison\": {\n",
    "        \"stage2_dev_2019_seen_attacks\": 0.0069,\n",
    "        \"stage2_eval_2019_unseen_attacks\": 0.0555,\n",
    "        \"stage2_eval_2021_unseen_attacks_plus_codecs\": 0.0909,\n",
    "        \"stage2_eval_wavefake_novel_vocoders\": 0.2633,\n",
    "        \"interpretation\": \"Largest cross-dataset gap. Model trained on ASVspoof attacks generalizes only weakly to standalone neural vocoder pipelines.\",\n",
    "    },\n",
    "    \"per_vocoder_eer\": {\n",
    "        \"ljspeech_melgan\": 0.3112,\n",
    "        \"ljspeech_melgan_large\": 0.3385,\n",
    "        \"ljspeech_multi_band_melgan\": 0.2192,\n",
    "        \"ljspeech_full_band_melgan\": 0.3060,\n",
    "        \"ljspeech_parallel_wavegan\": 0.2612,\n",
    "        \"ljspeech_waveglow\": 0.2960,\n",
    "        \"ljspeech_hifiGAN\": 0.3323,\n",
    "        \"jsut_multi_band_melgan\": 0.0113,\n",
    "        \"jsut_parallel_wavegan\": 0.0083,\n",
    "    },\n",
    "    \"methodological_caveats\": [\n",
    "        \"JSUT vocoder EERs (~1%) are likely inflated by domain shortcuts: bonafide is English LJSpeech, JSUT spoofs are Japanese audio at different sample rate (24 kHz vs 22 kHz). Model may be classifying language/speaker rather than detecting spoofing.\",\n",
    "        \"The LJSpeech-based vocoder EERs (22-34%) are the methodologically meaningful results: same speaker, same content, same recording quality as bonafide; only the synthesis differs.\",\n",
    "        \"High EERs on LJSpeech vocoders (mean 29.4%) reveal that ASVspoof-trained models generalize poorly to clean neural vocoder pipelines. This matches the original WaveFake paper's observations.\",\n",
    "        \"Model has not been adapted to WaveFake \u2014 pure cross-dataset evaluation.\",\n",
    "    ],\n",
    "    \"key_findings\": [\n",
    "        \"Cross-dataset robustness varies substantially by distribution shift type:\",\n",
    "        \"  - Unseen attack types in same dataset: +4.86 pp (0.69% \u2192 5.55%)\",\n",
    "        \"  - Real-world codec degradation: +3.54 pp (5.55% \u2192 9.09%)\",\n",
    "        \"  - Novel vocoder pipelines on different domain: +17.24 pp (9.09% \u2192 26.33%)\",\n",
    "        \"Model has learned to detect ASVspoof-specific synthesis artifacts but not pure vocoder artifacts.\",\n",
    "        \"Future work direction: include vocoder-only spoofing data during training to improve cross-dataset generalization.\",\n",
    "    ],\n",
    "    \"raw_scores_path\": \"/content/deepfake-audio-detection/results/scores/stage2_eval_wavefake.npz\",\n",
    "}\n",
    "\n",
    "OUTPUT = '/content/deepfake-audio-detection/results/metrics/stage2_eval_wavefake_results.json'\n",
    "os.makedirs(os.path.dirname(OUTPUT), exist_ok=True)\n",
    "with open(OUTPUT, 'w') as f:\n",
    "    json.dump(results_wf, f, indent=2)\n",
    "\n",
    "print(f\"Wrote {OUTPUT}\")\n",
    "print(f\"Size: {os.path.getsize(OUTPUT)} bytes\")"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "IeogEOWqrFxv",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1777775437818,
     "user_tz": 420,
     "elapsed": 75,
     "user": {
      "displayName": "Sara Jaffrani",
      "userId": "07677779715251349607"
     }
    },
    "outputId": "45839087-0591-4039-fb45-7a854ffc6508"
   },
   "execution_count": 36,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Wrote /content/deepfake-audio-detection/results/metrics/stage2_eval_wavefake_results.json\n",
      "Size: 3479 bytes\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "from google.colab import userdata\n",
    "import os\n",
    "\n",
    "GITHUB_TOKEN = userdata.get('GITHUB_TOKEN')\n",
    "os.chdir('/content/deepfake-audio-detection')\n",
    "\n",
    "!git config user.email \"95262824+Saracasm@users.noreply.github.com\"\n",
    "!git config user.name \"Sara Iqbal\"\n",
    "\n",
    "!git add results/metrics/stage2_eval_wavefake_results.json\n",
    "!git add results/scores/stage2_eval_wavefake.npz\n",
    "!git status\n",
    "\n",
    "!git commit -m \"Phase 5c: WaveFake eval \u2014 26.33% EER, reveals ASVspoof-specific overfitting\"\n",
    "\n",
    "push_url = f\"https://Saracasm:{GITHUB_TOKEN}@github.com/Saracasm/deepfake-audio-detection.git\"\n",
    "!git push {push_url} main 2>&1 | grep -v {GITHUB_TOKEN}"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "o6M26soOrUYq",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1777775499812,
     "user_tz": 420,
     "elapsed": 2350,
     "user": {
      "displayName": "Sara Jaffrani",
      "userId": "07677779715251349607"
     }
    },
    "outputId": "3b4afe21-70c1-463a-d4c8-d58453f645dc"
   },
   "execution_count": 37,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "On branch main\n",
      "Your branch is ahead of 'origin/main' by 1 commit.\n",
      "  (use \"git push\" to publish your local commits)\n",
      "\n",
      "Changes to be committed:\n",
      "  (use \"git restore --staged <file>...\" to unstage)\n",
      "\t\u001b[32mnew file:   results/metrics/stage2_eval_wavefake_results.json\u001b[m\n",
      "\t\u001b[32mnew file:   results/scores/stage2_eval_wavefake.npz\u001b[m\n",
      "\n",
      "[main 258c630] Phase 5c: WaveFake eval \u2014 26.33% EER, reveals ASVspoof-specific overfitting\n",
      " 2 files changed, 73 insertions(+)\n",
      " create mode 100644 results/metrics/stage2_eval_wavefake_results.json\n",
      " create mode 100644 results/scores/stage2_eval_wavefake.npz\n",
      "To https://github.com/Saracasm/deepfake-audio-detection.git\n",
      "   6b144b9..258c630  main -> main\n"
     ]
    }
   ]
  },
  {
   "cell_type": "markdown",
   "source": [
    "We fine-tune Wav2Vec 2.0 for synthetic speech detection on ASVspoof 2019 LA and characterize its cross-dataset robustness across three distribution shift types: (1) unseen attacks in the original dataset (5.55% EER), (2) real-world codec degradation in ASVspoof 2021 LA (9.09% EER, matching the strongest published baselines), and (3) novel vocoder pipelines on a different domain (WaveFake, 26.33% EER). We show that fine-tuned Wav2Vec features generalize well to attack and codec variations but degrade significantly on out-of-distribution vocoder synthesis, suggesting the model has learned ASVspoof-specific synthesis artifacts rather than universal spoofing detection."
   ],
   "metadata": {
    "id": "Di-Vv52nrupT"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "PREDICT_PY = '''\"\"\"\n",
    "Inference module for deepfake audio detection.\n",
    "\n",
    "Wraps the Stage 2 Wav2Vec 2.0 classifier with a clean public API.\n",
    "\n",
    "Usage:\n",
    "    from src.inference.predict import DeepfakeDetector\n",
    "    detector = DeepfakeDetector(checkpoint_path=\"path/to/stage2_best.pt\")\n",
    "    result = detector.predict(\"path/to/audio.wav\")\n",
    "    print(result)\n",
    "    # {\"spoof_probability\": 0.84, \"prediction\": \"spoof\", \"confidence\": 0.84,\n",
    "    #  \"utterance_duration_sec\": 3.42, \"n_windows\": 1, \"model_version\": \"stage2\"}\n",
    "\"\"\"\n",
    "\n",
    "import os\n",
    "from typing import Dict, Optional, Union\n",
    "import torch\n",
    "import torch.nn.functional as F\n",
    "import numpy as np\n",
    "\n",
    "from src.models.wav2vec_classifier import Wav2VecClassifier\n",
    "from src.data.preprocessing import load_audio, segment_waveform, WINDOW_SAMPLES\n",
    "\n",
    "\n",
    "# Default classifier threshold. 0.5 is naive; we tuned it during eval.\n",
    "# Values closer to 0.5 = balanced; lower = more sensitive (more false alarms);\n",
    "# higher = more conservative (more misses).\n",
    "DEFAULT_THRESHOLD = 0.5\n",
    "\n",
    "\n",
    "class DeepfakeDetector:\n",
    "    \"\"\"Anti-spoofing classifier wrapper for one-shot inference.\"\"\"\n",
    "\n",
    "    def __init__(\n",
    "        self,\n",
    "        checkpoint_path: str,\n",
    "        device: Optional[str] = None,\n",
    "        backbone_name: str = \"facebook/wav2vec2-base\",\n",
    "        threshold: float = DEFAULT_THRESHOLD,\n",
    "        use_mixed_precision: bool = True,\n",
    "    ):\n",
    "        \"\"\"\n",
    "        Args:\n",
    "            checkpoint_path: path to a Stage 2 .pt checkpoint\n",
    "            device: 'cuda', 'cpu', or None (auto-detect)\n",
    "            backbone_name: HuggingFace model name for Wav2Vec backbone\n",
    "            threshold: probability threshold above which we predict \"spoof\"\n",
    "            use_mixed_precision: use fp16 inference (faster on GPU)\n",
    "        \"\"\"\n",
    "        if device is None:\n",
    "            device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
    "        self.device = device\n",
    "        self.threshold = threshold\n",
    "        self.use_mixed_precision = use_mixed_precision and (device == \"cuda\")\n",
    "\n",
    "        # Build model and load weights\n",
    "        self.model = Wav2VecClassifier(\n",
    "            backbone_name=backbone_name,\n",
    "            num_classes=2,\n",
    "            freeze_backbone=True,\n",
    "        )\n",
    "        ckpt = torch.load(checkpoint_path, map_location=device, weights_only=False)\n",
    "        self.model.load_state_dict(ckpt[\"model_state_dict\"])\n",
    "        self.model = self.model.to(device)\n",
    "        self.model.eval()\n",
    "\n",
    "        # Store metadata for transparency\n",
    "        self.checkpoint_metadata = {\n",
    "            \"epoch\": ckpt.get(\"epoch\"),\n",
    "            \"best_eer\": ckpt.get(\"best_eer\"),\n",
    "            \"checkpoint_path\": checkpoint_path,\n",
    "        }\n",
    "\n",
    "    @torch.no_grad()\n",
    "    def predict(\n",
    "        self,\n",
    "        audio_input: Union[str, torch.Tensor, np.ndarray],\n",
    "        return_per_window: bool = False,\n",
    "    ) -> Dict:\n",
    "        \"\"\"Predict bonafide vs spoof for a single audio input.\n",
    "\n",
    "        Args:\n",
    "            audio_input: either a file path (str), a 1-D Tensor at 16 kHz, or\n",
    "                         a 1-D numpy array at 16 kHz.\n",
    "            return_per_window: if True, include per-window probabilities in\n",
    "                               the result for debugging.\n",
    "\n",
    "        Returns:\n",
    "            Dict with keys:\n",
    "                spoof_probability: float in [0, 1]\n",
    "                bonafide_probability: float in [0, 1]\n",
    "                prediction: \"bonafide\" or \"spoof\"\n",
    "                confidence: float in [0, 1] (probability of the predicted class)\n",
    "                utterance_duration_sec: total audio length\n",
    "                n_windows: number of 4-sec windows the audio was split into\n",
    "                window_scores: (only if return_per_window=True) list of per-window spoof probs\n",
    "        \"\"\"\n",
    "        # Step 1: Load and resample audio if needed\n",
    "        if isinstance(audio_input, str):\n",
    "            waveform = load_audio(audio_input)  # returns 1-D tensor at 16 kHz\n",
    "        elif isinstance(audio_input, np.ndarray):\n",
    "            waveform = torch.from_numpy(audio_input.astype(np.float32))\n",
    "        elif isinstance(audio_input, torch.Tensor):\n",
    "            waveform = audio_input.float()\n",
    "            if waveform.dim() > 1:\n",
    "                waveform = waveform.squeeze()\n",
    "        else:\n",
    "            raise ValueError(\n",
    "                f\"audio_input must be str, np.ndarray, or torch.Tensor; got {type(audio_input)}\"\n",
    "            )\n",
    "\n",
    "        duration_sec = float(waveform.shape[0] / 16000)\n",
    "\n",
    "        # Step 2: Segment into 4-sec windows\n",
    "        windows = segment_waveform(waveform)  # list of 1-D tensors of length 64000\n",
    "        n_windows = len(windows)\n",
    "\n",
    "        # Step 3: Stack into a batch and run inference\n",
    "        batch = torch.stack(windows, dim=0).to(self.device, non_blocking=True)\n",
    "\n",
    "        if self.use_mixed_precision:\n",
    "            with torch.amp.autocast(device_type=\"cuda\", enabled=True):\n",
    "                logits = self.model(batch)\n",
    "        else:\n",
    "            logits = self.model(batch)\n",
    "\n",
    "        # Step 4: Compute per-window probabilities, then aggregate (mean)\n",
    "        probs = torch.softmax(logits.float(), dim=-1).cpu().numpy()  # (n_windows, 2)\n",
    "        window_spoof_probs = probs[:, 1].tolist()\n",
    "        utt_spoof_prob = float(np.mean(window_spoof_probs))\n",
    "        utt_bonafide_prob = 1.0 - utt_spoof_prob\n",
    "\n",
    "        # Step 5: Apply threshold for hard prediction\n",
    "        prediction = \"spoof\" if utt_spoof_prob > self.threshold else \"bonafide\"\n",
    "        confidence = utt_spoof_prob if prediction == \"spoof\" else utt_bonafide_prob\n",
    "\n",
    "        result = {\n",
    "            \"spoof_probability\": utt_spoof_prob,\n",
    "            \"bonafide_probability\": utt_bonafide_prob,\n",
    "            \"prediction\": prediction,\n",
    "            \"confidence\": confidence,\n",
    "            \"utterance_duration_sec\": duration_sec,\n",
    "            \"n_windows\": n_windows,\n",
    "            \"threshold_used\": self.threshold,\n",
    "        }\n",
    "        if return_per_window:\n",
    "            result[\"window_scores\"] = window_spoof_probs\n",
    "        return result\n",
    "\n",
    "    def info(self) -> Dict:\n",
    "        \"\"\"Return metadata about this model checkpoint.\"\"\"\n",
    "        return {\n",
    "            **self.checkpoint_metadata,\n",
    "            \"device\": self.device,\n",
    "            \"threshold\": self.threshold,\n",
    "            \"mixed_precision\": self.use_mixed_precision,\n",
    "        }\n",
    "'''\n",
    "\n",
    "import os\n",
    "\n",
    "# Destination of the inference module inside the repository checkout.\n",
    "PATH = '/content/deepfake-audio-detection/src/inference/predict.py'\n",
    "os.makedirs(os.path.dirname(PATH), exist_ok=True)\n",
    "\n",
    "# Also create __init__.py for the module (an existing one is left untouched)\n",
    "init_path = '/content/deepfake-audio-detection/src/inference/__init__.py'\n",
    "if not os.path.exists(init_path):\n",
    "    with open(init_path, 'w'):\n",
    "        pass\n",
    "\n",
    "# Encode explicitly: the file content no longer depends on the platform's\n",
    "# default encoding, and the size printed below is a true byte count rather\n",
    "# than a character count.\n",
    "predict_bytes = PREDICT_PY.encode('utf-8')\n",
    "with open(PATH, 'wb') as f:\n",
    "    f.write(predict_bytes)\n",
    "\n",
    "print(f\"Wrote {PATH} ({len(predict_bytes)} bytes)\")"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "1FmHjpqjtUqO",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1777776024694,
     "user_tz": 420,
     "elapsed": 56,
     "user": {
      "displayName": "Sara Jaffrani",
      "userId": "07677779715251349607"
     }
    },
    "outputId": "9ebc3240-cad4-4127-9848-43d4fda0bf6c"
   },
   "execution_count": 38,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Wrote /content/deepfake-audio-detection/src/inference/predict.py (6047 bytes)\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "import sys\n",
    "\n",
    "# Put the repository checkout first on the import path so `src` resolves to it.\n",
    "sys.path.insert(0, '/content/deepfake-audio-detection')\n",
    "from src.data.protocols import parse_all_partitions\n",
    "\n",
    "# Re-parse every partition found under the LA root.\n",
    "LA_ROOT = '/content/kaggle_download/LA'\n",
    "splits = parse_all_partitions(LA_ROOT)\n",
    "\n",
    "# One summary line per partition: total utterances and how many are bonafide.\n",
    "print(\"Re-parsed:\")\n",
    "for split_name in splits:\n",
    "    split_utts = splits[split_name]\n",
    "    bonafide_count = len([utt for utt in split_utts if utt.label == 'bonafide'])\n",
    "    print(f\"  {split_name}: {len(split_utts):,} (bonafide: {bonafide_count:,})\")"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "qu09Ku0at_Sf",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1777776197587,
     "user_tz": 420,
     "elapsed": 387,
     "user": {
      "displayName": "Sara Jaffrani",
      "userId": "07677779715251349607"
     }
    },
    "outputId": "f9acc39e-0f22-431e-ac0e-6a8a00610578"
   },
   "execution_count": 40,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Re-parsed:\n",
      "  train: 25,380 (bonafide: 2,580)\n",
      "  dev: 24,844 (bonafide: 2,548)\n",
      "  eval: 71,237 (bonafide: 7,355)\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "import glob\n",
    "import importlib\n",
    "import sys\n",
    "import time\n",
    "\n",
    "# Reload modules so a freshly rewritten predict.py replaces any cached import\n",
    "for mod in ['src.inference.predict']:\n",
    "    if mod in sys.modules:\n",
    "        importlib.reload(sys.modules[mod])\n",
    "from src.inference.predict import DeepfakeDetector\n",
    "\n",
    "\n",
    "class _LightUtt:\n",
    "    \"\"\"Minimal utterance record for audio files that are not in the parsed splits.\n",
    "\n",
    "    Exposes the three attributes the prediction loop below reads:\n",
    "    flac_path, utterance_id and label. The label is passed in explicitly so\n",
    "    that bonafide files are not silently recorded as spoof.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, path, uid, label):\n",
    "        self.flac_path = path\n",
    "        self.utterance_id = uid\n",
    "        self.label = label  # ground truth: 'bonafide' or 'spoof'\n",
    "\n",
    "\n",
    "# Build the detector once\n",
    "print(\"Loading detector...\")\n",
    "CKPT = '/content/drive/MyDrive/deepfake_audio/checkpoints/stage2_best.pt'\n",
    "detector = DeepfakeDetector(checkpoint_path=CKPT)\n",
    "print(f\"\\nDetector loaded. Info:\")\n",
    "for k, v in detector.info().items():\n",
    "    print(f\"  {k}: {v}\")\n",
    "\n",
    "# Pick test samples from 2019 LA eval set\n",
    "print(\"\\n\" + \"=\" * 70)\n",
    "print(\"  TESTING ON REAL AUDIO\")\n",
    "print(\"=\" * 70)\n",
    "\n",
    "test_cases = []\n",
    "eval_utts = splits['eval']\n",
    "\n",
    "# 1. Bonafide from 2019 eval\n",
    "bonafide_eval = [u for u in eval_utts if u.label == 'bonafide']\n",
    "test_cases.append((\"2019 eval bonafide\", bonafide_eval[0]))\n",
    "\n",
    "# 2-4. First utterance of an easy (A13), a hard (A10) and a medium (A07) attack\n",
    "for attack_id, difficulty in [('A13', 'easy'), ('A10', 'hard'), ('A07', 'medium')]:\n",
    "    attack_utts = [u for u in eval_utts if u.attack_id == attack_id]\n",
    "    test_cases.append((f\"2019 eval spoof ({attack_id}, {difficulty})\", attack_utts[0]))\n",
    "\n",
    "# 5. WaveFake spoof (LJSpeech-based, model struggles here)\n",
    "wf_files = sorted(glob.glob('/content/wavefake/generated_audio/ljspeech_hifiGAN/*.wav'))\n",
    "if wf_files:\n",
    "    test_cases.append((\"WaveFake spoof (HiFi-GAN)\", _LightUtt(wf_files[0], 'wavefake_hifigan_0', 'spoof')))\n",
    "\n",
    "# 6. Real LJSpeech (bonafide, but the model wasn't trained on this domain)\n",
    "lj_files = sorted(glob.glob('/content/ljspeech/LJSpeech-1.1/wavs/*.wav'))\n",
    "if lj_files:\n",
    "    test_cases.append((\"LJSpeech bonafide (out-of-domain)\", _LightUtt(lj_files[0], 'lj_bonafide_0', 'bonafide')))\n",
    "\n",
    "# Run predictions\n",
    "for label, utt in test_cases:\n",
    "    # perf_counter is monotonic, so the elapsed time cannot be skewed by clock adjustments\n",
    "    start = time.perf_counter()\n",
    "    result = detector.predict(utt.flac_path)\n",
    "    elapsed_ms = (time.perf_counter() - start) * 1000\n",
    "\n",
    "    expected = utt.label\n",
    "    actual = result['prediction']\n",
    "    correct = \"\u2713\" if expected == actual else \"\u2717\"\n",
    "\n",
    "    print(f\"\\n  [{label}]\")\n",
    "    print(f\"    File: {utt.utterance_id}\")\n",
    "    print(f\"    Expected: {expected}\")\n",
    "    print(f\"    Predicted: {actual}  {correct}\")\n",
    "    print(f\"    Spoof probability: {result['spoof_probability']:.4f}\")\n",
    "    print(f\"    Confidence: {result['confidence']:.4f}\")\n",
    "    print(f\"    Duration: {result['utterance_duration_sec']:.2f}s ({result['n_windows']} window{'s' if result['n_windows'] != 1 else ''})\")\n",
    "    print(f\"    Inference time: {elapsed_ms:.0f}ms\")"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 1000,
     "referenced_widgets": [
      "154d0f8dc23e4b8fb191661027beb270",
      "e983a97e0345486c9c7b8c767ab0b40e",
      "69fba5a3128e4877b6ac264e7ed954c2",
      "9c48d6067614434e8ac17be30d67faeb",
      "835875abf0e049a38d7e7355b3f2341a",
      "5ce1e1c41b47497cb9a087c1ff0b1704",
      "6fcb3d97485a4d719e7b3b310526ff07",
      "8993ebc97b3d4621a74a08a3df02eaf5",
      "1c5f39a4a4b3436c9f6bf91e67ae5796",
      "9c444522fbcd43fb8bcfdd2c3c8c79da",
      "0b029b9c45c4447d907b1276be045cd0"
     ]
    },
    "id": "h0DqZhbAtix3",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1777776221825,
     "user_tz": 420,
     "elapsed": 4169,
     "user": {
      "displayName": "Sara Jaffrani",
      "userId": "07677779715251349607"
     }
    },
    "outputId": "0f3c8ba9-195b-41d1-b0b0-446ecfec0b1c"
   },
   "execution_count": 41,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Loading detector...\n"
     ]
    },
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "Wav2Vec2Model LOAD REPORT from: facebook/wav2vec2-base\n",
      "Key                          | Status     |  | \n",
      "-----------------------------+------------+--+-\n",
      "quantizer.weight_proj.weight | UNEXPECTED |  | \n",
      "project_q.weight             | UNEXPECTED |  | \n",
      "project_hid.weight           | UNEXPECTED |  | \n",
      "quantizer.codevectors        | UNEXPECTED |  | \n",
      "quantizer.weight_proj.bias   | UNEXPECTED |  | \n",
      "project_hid.bias             | UNEXPECTED |  | \n",
      "project_q.bias               | UNEXPECTED |  | \n",
      "\n",
      "Notes:\n",
      "- UNEXPECTED\t:can be ignored when loading from different task/architecture; not ok if you expect identical arch.\n"
     ]
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "\n",
      "Detector loaded. Info:\n",
      "  epoch: 9\n",
      "  best_eer: 0.006940865275480051\n",
      "  checkpoint_path: /content/drive/MyDrive/deepfake_audio/checkpoints/stage2_best.pt\n",
      "  device: cuda\n",
      "  threshold: 0.5\n",
      "  mixed_precision: True\n",
      "\n",
      "======================================================================\n",
      "  TESTING ON REAL AUDIO\n",
      "======================================================================\n",
      "\n",
      "  [2019 eval bonafide]\n",
      "    File: LA_E_5849185\n",
      "    Expected: bonafide\n",
      "    Predicted: bonafide  \u2713\n",
      "    Spoof probability: 0.0000\n",
      "    Confidence: 1.0000\n",
      "    Duration: 4.39s (2 windows)\n",
      "    Inference time: 240ms\n",
      "\n",
      "  [2019 eval spoof (A13, easy)]\n",
      "    File: LA_E_5932896\n",
      "    Expected: spoof\n",
      "    Predicted: spoof  \u2713\n",
      "    Spoof probability: 1.0000\n",
      "    Confidence: 1.0000\n",
      "    Duration: 5.80s (2 windows)\n",
      "    Inference time: 73ms\n",
      "\n",
      "  [2019 eval spoof (A10, hard)]\n",
      "    File: LA_E_8339197\n",
      "    Expected: spoof\n",
      "    Predicted: bonafide  \u2717\n",
      "    Spoof probability: 0.0001\n",
      "    Confidence: 0.9999\n",
      "    Duration: 1.46s (1 window)\n",
      "    Inference time: 93ms\n",
      "\n",
      "  [2019 eval spoof (A07, medium)]\n",
      "    File: LA_E_8844552\n",
      "    Expected: spoof\n",
      "    Predicted: spoof  \u2713\n",
      "    Spoof probability: 0.6621\n",
      "    Confidence: 0.6621\n",
      "    Duration: 4.12s (2 windows)\n",
      "    Inference time: 59ms\n",
      "\n",
      "  [WaveFake spoof (HiFi-GAN)]\n",
      "    File: wavefake_hifigan_0\n",
      "    Expected: spoof\n",
      "    Predicted: bonafide  \u2717\n",
      "    Spoof probability: 0.2500\n",
      "    Confidence: 0.7500\n",
      "    Duration: 9.65s (4 windows)\n",
      "    Inference time: 85ms\n",
      "\n",
      "  [LJSpeech bonafide (out-of-domain)]\n",
      "    File: lj_bonafide_0\n",
      "    Expected: spoof\n",
      "    Predicted: bonafide  \u2717\n",
      "    Spoof probability: 0.0000\n",
      "    Confidence: 1.0000\n",
      "    Duration: 9.66s (4 windows)\n",
      "    Inference time: 83ms\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "from google.colab import userdata\n",
    "import os\n",
    "import subprocess\n",
    "\n",
    "# The token is read from Colab's secret store so it never appears in the notebook source.\n",
    "GITHUB_TOKEN = userdata.get('GITHUB_TOKEN')\n",
    "if not GITHUB_TOKEN:\n",
    "    # An empty token would produce a malformed push URL and an empty mask pattern below.\n",
    "    raise RuntimeError(\"GITHUB_TOKEN secret is empty; set it in Colab's Secrets panel before pushing.\")\n",
    "\n",
    "os.chdir('/content/deepfake-audio-detection')\n",
    "\n",
    "!git config user.email \"95262824+Saracasm@users.noreply.github.com\"\n",
    "!git config user.name \"Sara Iqbal\"\n",
    "\n",
    "!git add src/inference/__init__.py src/inference/predict.py\n",
    "!git status\n",
    "!git commit -m \"Phase 6: add production inference module (DeepfakeDetector wrapper)\"\n",
    "\n",
    "push_url = f\"https://Saracasm:{GITHUB_TOKEN}@github.com/Saracasm/deepfake-audio-detection.git\"\n",
    "\n",
    "# Push without a shell: the token is passed as a plain argument, so it is never\n",
    "# re-parsed as shell syntax or as a grep pattern. The token is masked in the\n",
    "# captured output instead of dropping whole lines, which used to hide any git\n",
    "# error message that echoed the URL.\n",
    "push = subprocess.run(\n",
    "    ['git', 'push', push_url, 'main'],\n",
    "    stdout=subprocess.PIPE,\n",
    "    stderr=subprocess.STDOUT,  # merge stderr into stdout, like the former 2>&1\n",
    "    text=True,\n",
    ")\n",
    "print(push.stdout.replace(GITHUB_TOKEN, '***'), end='')\n",
    "if push.returncode != 0:\n",
    "    # Fail loudly: a rejected push must not look like a successful cell run.\n",
    "    raise RuntimeError(f\"git push failed with exit code {push.returncode}\")"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "y5K__qKRuVyl",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1777776291704,
     "user_tz": 420,
     "elapsed": 1898,
     "user": {
      "displayName": "Sara Jaffrani",
      "userId": "07677779715251349607"
     }
    },
    "outputId": "1ae864ee-974e-48a6-944e-6b5af6895cad"
   },
   "execution_count": 42,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "On branch main\n",
      "Your branch is ahead of 'origin/main' by 2 commits.\n",
      "  (use \"git push\" to publish your local commits)\n",
      "\n",
      "Changes to be committed:\n",
      "  (use \"git restore --staged <file>...\" to unstage)\n",
      "\t\u001b[32mnew file:   src/inference/__init__.py\u001b[m\n",
      "\t\u001b[32mnew file:   src/inference/predict.py\u001b[m\n",
      "\n",
      "[main 0e975e7] Phase 6: add production inference module (DeepfakeDetector wrapper)\n",
      " 2 files changed, 157 insertions(+)\n",
      " create mode 100644 src/inference/__init__.py\n",
      " create mode 100644 src/inference/predict.py\n",
      "To https://github.com/Saracasm/deepfake-audio-detection.git\n",
      "   258c630..0e975e7  main -> main\n"
     ]
    }
   ]
  }
 ]
}