ropedia-xperience-10m-task-baselines / scripts /verify_live_publication.py
cy0307's picture
Publish Ropedia Xperience-10M task baseline cards
f5818d7 verified
Raw
History Blame
34.8 kB
#!/usr/bin/env python3
"""Verify the already-published GitHub Pages and Hugging Face mirrors.
This is the post-publish companion to the local publication gates. It fetches
public URLs and compares them with the local release artifacts so a reader can
see that the live surfaces match the repo/HF bundles that were prepared.
"""
from __future__ import annotations
import argparse
import hashlib
import json
import subprocess
import tempfile
from datetime import datetime, timezone
from pathlib import Path
from urllib.error import HTTPError, URLError
from urllib.parse import urlsplit, urlunsplit
from urllib.request import Request, urlopen
# Parent of the directory that contains this script; the "local_path" entries
# below are joined onto it.
ROOT = Path(__file__).resolve().parents[1]
# Default destination of the JSON report written by main().
DEFAULT_OUTPUT = ROOT / "docs/data/live_publication_status.json"
# Timeout in seconds, used for urlopen() and as curl's --max-time.
TIMEOUT_SECONDS = 30
# User-Agent sent by both the urllib request and the curl fallback.
USER_AGENT = "ropedia-xperience-10m-live-verifier/1.0"
# Substrings that must not appear in published pages/reports.
# NOTE(review): the markers are built by concatenation, presumably so this file
# never contains the literal strings itself — confirm.
LOCAL_PATH_FORBIDDEN_MARKERS = ["/" + "Users/", "/" + "private/"]
# Artifacts whose live copies are compared by SHA-256 with the local file.
# Each entry carries:
#   "id" / "title"  - identifiers copied into the report,
#   "local_path"    - path of the reference file, relative to ROOT,
#   "urls"          - surface name -> public URL fetched by hash_group_record().
HASH_GROUPS = [
    {
        "id": "task_suite_infographic",
        "title": "Task-suite infographic",
        "local_path": "docs/assets/task_suite_infographic.png",
        "urls": {
            "github_pages": "https://chaoyue0307.github.io/ropedia-xperience-10m-task-suite/assets/task_suite_infographic.png",
            "hf_space": "https://huggingface.co/spaces/cy0307/ropedia-xperience-10m-task-suite/resolve/main/assets/task_suite_infographic.png",
            "hf_artifacts": "https://huggingface.co/datasets/cy0307/ropedia-xperience-10m-task-suite-artifacts/resolve/main/docs/assets/task_suite_infographic.png",
            "hf_model": "https://huggingface.co/cy0307/ropedia-xperience-10m-task-baselines/resolve/main/assets/task_suite_infographic.png",
        },
    },
    {
        "id": "quality_gates_json",
        "title": "Quality-gate JSON",
        "local_path": "docs/data/quality_gates.json",
        "urls": {
            "github_pages": "https://chaoyue0307.github.io/ropedia-xperience-10m-task-suite/data/quality_gates.json",
            "hf_space": "https://huggingface.co/spaces/cy0307/ropedia-xperience-10m-task-suite/raw/main/data/quality_gates.json",
            "hf_artifacts": "https://huggingface.co/datasets/cy0307/ropedia-xperience-10m-task-suite-artifacts/resolve/main/docs/data/quality_gates.json",
            "hf_model": "https://huggingface.co/cy0307/ropedia-xperience-10m-task-baselines/resolve/main/metrics/quality_gates.json",
        },
    },
    {
        "id": "single_episode_explorer_json",
        "title": "Single-episode explorer JSON",
        "local_path": "docs/data/single_episode_explorer.json",
        "urls": {
            "github_pages": "https://chaoyue0307.github.io/ropedia-xperience-10m-task-suite/data/single_episode_explorer.json",
            "hf_space": "https://huggingface.co/spaces/cy0307/ropedia-xperience-10m-task-suite/raw/main/data/single_episode_explorer.json",
            "hf_artifacts": "https://huggingface.co/datasets/cy0307/ropedia-xperience-10m-task-suite-artifacts/resolve/main/docs/data/single_episode_explorer.json",
            "hf_model": "https://huggingface.co/cy0307/ropedia-xperience-10m-task-baselines/resolve/main/metrics/single_episode_explorer.json",
        },
    },
    {
        "id": "single_episode_explorer_html",
        "title": "Single-episode explorer HTML",
        "local_path": "docs/single_episode_explorer.html",
        "urls": {
            "github_pages": "https://chaoyue0307.github.io/ropedia-xperience-10m-task-suite/single_episode_explorer.html",
            "hf_space": "https://huggingface.co/spaces/cy0307/ropedia-xperience-10m-task-suite/raw/main/single_episode_explorer.html",
            "hf_artifacts": "https://huggingface.co/datasets/cy0307/ropedia-xperience-10m-task-suite-artifacts/resolve/main/docs/single_episode_explorer.html",
            "hf_model": "https://huggingface.co/cy0307/ropedia-xperience-10m-task-baselines/resolve/main/single_episode_explorer.html",
        },
    },
    {
        "id": "research_roadmap_html",
        "title": "Interactive research roadmap HTML",
        "local_path": "docs/research_roadmap.html",
        "urls": {
            "github_pages": "https://chaoyue0307.github.io/ropedia-xperience-10m-task-suite/research_roadmap.html",
            "hf_space": "https://huggingface.co/spaces/cy0307/ropedia-xperience-10m-task-suite/raw/main/research_roadmap.html",
            "hf_artifacts": "https://huggingface.co/datasets/cy0307/ropedia-xperience-10m-task-suite-artifacts/resolve/main/docs/research_roadmap.html",
            "hf_model": "https://huggingface.co/cy0307/ropedia-xperience-10m-task-baselines/resolve/main/research_roadmap.html",
        },
    },
    # The next two artifacts live under results/ and are checked against
    # raw.githubusercontent.com ("github_raw") instead of GitHub Pages.
    {
        "id": "single_episode_diagnostics_provenance",
        "title": "Single-episode diagnostics provenance",
        "local_path": "results/single_episode_diagnostics/provenance.json",
        "urls": {
            "github_raw": "https://raw.githubusercontent.com/ChaoYue0307/ropedia-xperience-10m-task-suite/main/results/single_episode_diagnostics/provenance.json",
            "hf_space": "https://huggingface.co/spaces/cy0307/ropedia-xperience-10m-task-suite/raw/main/results/single_episode_diagnostics/provenance.json",
            "hf_artifacts": "https://huggingface.co/datasets/cy0307/ropedia-xperience-10m-task-suite-artifacts/resolve/main/results/single_episode_diagnostics/provenance.json",
            "hf_model": "https://huggingface.co/cy0307/ropedia-xperience-10m-task-baselines/resolve/main/results/single_episode_diagnostics/provenance.json",
        },
    },
    {
        "id": "single_episode_object_vocab",
        "title": "Single-episode object vocabulary",
        "local_path": "results/single_episode_diagnostics/object_labels/object_vocab.json",
        "urls": {
            "github_raw": "https://raw.githubusercontent.com/ChaoYue0307/ropedia-xperience-10m-task-suite/main/results/single_episode_diagnostics/object_labels/object_vocab.json",
            "hf_space": "https://huggingface.co/spaces/cy0307/ropedia-xperience-10m-task-suite/raw/main/results/single_episode_diagnostics/object_labels/object_vocab.json",
            "hf_artifacts": "https://huggingface.co/datasets/cy0307/ropedia-xperience-10m-task-suite-artifacts/resolve/main/results/single_episode_diagnostics/object_labels/object_vocab.json",
            "hf_model": "https://huggingface.co/cy0307/ropedia-xperience-10m-task-baselines/resolve/main/results/single_episode_diagnostics/object_labels/object_vocab.json",
        },
    },
    {
        "id": "public_surface_qa_json",
        "title": "Public presentation JSON",
        "local_path": "docs/data/public_surface_qa.json",
        "urls": {
            "github_pages": "https://chaoyue0307.github.io/ropedia-xperience-10m-task-suite/data/public_surface_qa.json",
            "hf_space": "https://huggingface.co/spaces/cy0307/ropedia-xperience-10m-task-suite/raw/main/data/public_surface_qa.json",
            "hf_artifacts": "https://huggingface.co/datasets/cy0307/ropedia-xperience-10m-task-suite-artifacts/resolve/main/docs/data/public_surface_qa.json",
            "hf_model": "https://huggingface.co/cy0307/ropedia-xperience-10m-task-baselines/resolve/main/metrics/public_surface_qa.json",
        },
    },
    {
        "id": "rendered_site_check_json",
        "title": "Rendered website check JSON",
        "local_path": "docs/data/rendered_site_check.json",
        "urls": {
            "github_pages": "https://chaoyue0307.github.io/ropedia-xperience-10m-task-suite/data/rendered_site_check.json",
            "hf_space": "https://huggingface.co/spaces/cy0307/ropedia-xperience-10m-task-suite/raw/main/data/rendered_site_check.json",
            "hf_artifacts": "https://huggingface.co/datasets/cy0307/ropedia-xperience-10m-task-suite-artifacts/resolve/main/docs/data/rendered_site_check.json",
            "hf_model": "https://huggingface.co/cy0307/ropedia-xperience-10m-task-baselines/resolve/main/metrics/rendered_site_check.json",
        },
    },
    {
        "id": "xperience10m_dataset_card_alignment_json",
        "title": "Official Xperience-10M dataset-card alignment JSON",
        "local_path": "docs/data/xperience10m_dataset_card_alignment.json",
        "urls": {
            "github_pages": "https://chaoyue0307.github.io/ropedia-xperience-10m-task-suite/data/xperience10m_dataset_card_alignment.json",
            "hf_space": "https://huggingface.co/spaces/cy0307/ropedia-xperience-10m-task-suite/raw/main/data/xperience10m_dataset_card_alignment.json",
            "hf_artifacts": "https://huggingface.co/datasets/cy0307/ropedia-xperience-10m-task-suite-artifacts/resolve/main/docs/data/xperience10m_dataset_card_alignment.json",
            "hf_model": "https://huggingface.co/cy0307/ropedia-xperience-10m-task-baselines/resolve/main/metrics/xperience10m_dataset_card_alignment.json",
        },
    },
    {
        "id": "source_alignment_json",
        "title": "Source-alignment JSON",
        "local_path": "docs/data/source_alignment_audit.json",
        "urls": {
            "github_pages": "https://chaoyue0307.github.io/ropedia-xperience-10m-task-suite/data/source_alignment_audit.json",
            "hf_space": "https://huggingface.co/spaces/cy0307/ropedia-xperience-10m-task-suite/raw/main/data/source_alignment_audit.json",
            "hf_artifacts": "https://huggingface.co/datasets/cy0307/ropedia-xperience-10m-task-suite-artifacts/resolve/main/docs/data/source_alignment_audit.json",
            "hf_model": "https://huggingface.co/cy0307/ropedia-xperience-10m-task-baselines/resolve/main/metrics/source_alignment_audit.json",
        },
    },
    {
        "id": "project_status_json",
        "title": "Project status JSON",
        "local_path": "docs/data/project_status.json",
        "urls": {
            "github_pages": "https://chaoyue0307.github.io/ropedia-xperience-10m-task-suite/data/project_status.json",
            "hf_space": "https://huggingface.co/spaces/cy0307/ropedia-xperience-10m-task-suite/raw/main/data/project_status.json",
            "hf_artifacts": "https://huggingface.co/datasets/cy0307/ropedia-xperience-10m-task-suite-artifacts/resolve/main/docs/data/project_status.json",
            "hf_model": "https://huggingface.co/cy0307/ropedia-xperience-10m-task-baselines/resolve/main/metrics/project_status.json",
        },
    },
    {
        "id": "evaluation_protocol_json",
        "title": "Evaluation protocol JSON",
        "local_path": "docs/data/evaluation_protocol.json",
        "urls": {
            "github_pages": "https://chaoyue0307.github.io/ropedia-xperience-10m-task-suite/data/evaluation_protocol.json",
            "hf_space": "https://huggingface.co/spaces/cy0307/ropedia-xperience-10m-task-suite/raw/main/data/evaluation_protocol.json",
            "hf_artifacts": "https://huggingface.co/datasets/cy0307/ropedia-xperience-10m-task-suite-artifacts/resolve/main/docs/data/evaluation_protocol.json",
            "hf_model": "https://huggingface.co/cy0307/ropedia-xperience-10m-task-baselines/resolve/main/metrics/evaluation_protocol.json",
        },
    },
    {
        "id": "research_takeaways_json",
        "title": "Research takeaways JSON",
        "local_path": "docs/data/research_takeaways.json",
        "urls": {
            "github_pages": "https://chaoyue0307.github.io/ropedia-xperience-10m-task-suite/data/research_takeaways.json",
            "hf_space": "https://huggingface.co/spaces/cy0307/ropedia-xperience-10m-task-suite/raw/main/data/research_takeaways.json",
            "hf_artifacts": "https://huggingface.co/datasets/cy0307/ropedia-xperience-10m-task-suite-artifacts/resolve/main/docs/data/research_takeaways.json",
            "hf_model": "https://huggingface.co/cy0307/ropedia-xperience-10m-task-baselines/resolve/main/metrics/research_takeaways.json",
        },
    },
    {
        "id": "research_roadmap_json",
        "title": "Research roadmap JSON",
        "local_path": "docs/data/research_roadmap.json",
        "urls": {
            "github_pages": "https://chaoyue0307.github.io/ropedia-xperience-10m-task-suite/data/research_roadmap.json",
            "hf_space": "https://huggingface.co/spaces/cy0307/ropedia-xperience-10m-task-suite/raw/main/data/research_roadmap.json",
            "hf_artifacts": "https://huggingface.co/datasets/cy0307/ropedia-xperience-10m-task-suite-artifacts/resolve/main/docs/data/research_roadmap.json",
            "hf_model": "https://huggingface.co/cy0307/ropedia-xperience-10m-task-baselines/resolve/main/metrics/research_roadmap.json",
        },
    },
    {
        "id": "research_roadmap_interactive_json",
        "title": "Interactive research roadmap JSON",
        "local_path": "docs/data/research_roadmap_interactive.json",
        "urls": {
            "github_pages": "https://chaoyue0307.github.io/ropedia-xperience-10m-task-suite/data/research_roadmap_interactive.json",
            "hf_space": "https://huggingface.co/spaces/cy0307/ropedia-xperience-10m-task-suite/raw/main/data/research_roadmap_interactive.json",
            "hf_artifacts": "https://huggingface.co/datasets/cy0307/ropedia-xperience-10m-task-suite-artifacts/resolve/main/docs/data/research_roadmap_interactive.json",
            "hf_model": "https://huggingface.co/cy0307/ropedia-xperience-10m-task-baselines/resolve/main/metrics/research_roadmap_interactive.json",
        },
    },
    {
        "id": "figure_index_json",
        "title": "Figure index JSON",
        "local_path": "docs/data/figure_index.json",
        "urls": {
            "github_pages": "https://chaoyue0307.github.io/ropedia-xperience-10m-task-suite/data/figure_index.json",
            "hf_space": "https://huggingface.co/spaces/cy0307/ropedia-xperience-10m-task-suite/raw/main/data/figure_index.json",
            "hf_artifacts": "https://huggingface.co/datasets/cy0307/ropedia-xperience-10m-task-suite-artifacts/resolve/main/docs/data/figure_index.json",
            "hf_model": "https://huggingface.co/cy0307/ropedia-xperience-10m-task-baselines/resolve/main/metrics/figure_index.json",
        },
    },
    {
        "id": "task_walkthroughs_json",
        "title": "Task walkthrough JSON",
        "local_path": "docs/data/task_walkthroughs.json",
        "urls": {
            "github_pages": "https://chaoyue0307.github.io/ropedia-xperience-10m-task-suite/data/task_walkthroughs.json",
            "hf_space": "https://huggingface.co/spaces/cy0307/ropedia-xperience-10m-task-suite/raw/main/data/task_walkthroughs.json",
            "hf_artifacts": "https://huggingface.co/datasets/cy0307/ropedia-xperience-10m-task-suite-artifacts/resolve/main/docs/data/task_walkthroughs.json",
            "hf_model": "https://huggingface.co/cy0307/ropedia-xperience-10m-task-baselines/resolve/main/metrics/task_walkthroughs.json",
        },
    },
    {
        "id": "task_surface_integrity_json",
        "title": "Task-surface integrity JSON",
        "local_path": "docs/data/task_surface_integrity.json",
        "urls": {
            "github_pages": "https://chaoyue0307.github.io/ropedia-xperience-10m-task-suite/data/task_surface_integrity.json",
            "hf_space": "https://huggingface.co/spaces/cy0307/ropedia-xperience-10m-task-suite/raw/main/data/task_surface_integrity.json",
            "hf_artifacts": "https://huggingface.co/datasets/cy0307/ropedia-xperience-10m-task-suite-artifacts/resolve/main/docs/data/task_surface_integrity.json",
            "hf_model": "https://huggingface.co/cy0307/ropedia-xperience-10m-task-baselines/resolve/main/metrics/task_surface_integrity.json",
        },
    },
    {
        "id": "brand_assets_json",
        "title": "Brand assets JSON",
        "local_path": "docs/data/brand_assets.json",
        "urls": {
            "github_pages": "https://chaoyue0307.github.io/ropedia-xperience-10m-task-suite/data/brand_assets.json",
            "hf_space": "https://huggingface.co/spaces/cy0307/ropedia-xperience-10m-task-suite/raw/main/data/brand_assets.json",
            "hf_artifacts": "https://huggingface.co/datasets/cy0307/ropedia-xperience-10m-task-suite-artifacts/resolve/main/docs/data/brand_assets.json",
            "hf_model": "https://huggingface.co/cy0307/ropedia-xperience-10m-task-baselines/resolve/main/metrics/brand_assets.json",
        },
    },
    {
        "id": "brand_logo_social_card",
        "title": "Brand logo social card",
        "local_path": "docs/assets/brand/xperience10m-logo-social-card.png",
        "urls": {
            "github_pages": "https://chaoyue0307.github.io/ropedia-xperience-10m-task-suite/assets/brand/xperience10m-logo-social-card.png",
            "hf_space": "https://huggingface.co/spaces/cy0307/ropedia-xperience-10m-task-suite/resolve/main/assets/brand/xperience10m-logo-social-card.png",
            "hf_artifacts": "https://huggingface.co/datasets/cy0307/ropedia-xperience-10m-task-suite-artifacts/resolve/main/docs/assets/brand/xperience10m-logo-social-card.png",
            "hf_model": "https://huggingface.co/cy0307/ropedia-xperience-10m-task-baselines/resolve/main/assets/brand/xperience10m-logo-social-card.png",
        },
    },
    {
        "id": "quality_gates_markdown",
        "title": "Quality-gate Markdown",
        "local_path": "QUALITY_GATES.md",
        "urls": {
            "github_raw": "https://raw.githubusercontent.com/ChaoYue0307/ropedia-xperience-10m-task-suite/main/QUALITY_GATES.md",
            "hf_space": "https://huggingface.co/spaces/cy0307/ropedia-xperience-10m-task-suite/raw/main/QUALITY_GATES.md",
            "hf_artifacts": "https://huggingface.co/datasets/cy0307/ropedia-xperience-10m-task-suite-artifacts/raw/main/QUALITY_GATES.md",
            "hf_model": "https://huggingface.co/cy0307/ropedia-xperience-10m-task-baselines/raw/main/QUALITY_GATES.md",
        },
    },
]
# Text-marker checks consumed by marker_record(): the page at "url" is fetched
# and decoded, every string in "required" must occur in it, and no string in
# "forbidden" may occur.
# NOTE(review): several forbidden markers are assembled with "+", presumably so
# this script never contains the retired identifiers verbatim — confirm.
MARKER_CHECKS = [
    {
        "id": "github_pages_index_current",
        "title": "GitHub Pages index contains current publication markers",
        "url": "https://chaoyue0307.github.io/ropedia-xperience-10m-task-suite/",
        "required": [
            "Release checks are collected",
            "quality_gates.json",
            "xperience10m_dataset_card_alignment.json",
            "source_alignment_audit.json",
            "evaluation_protocol.json",
            "research_takeaways.json",
            "Research Takeaways",
            "figure_index.json",
            "brand_assets.json",
            "xperience10m-logo-social-card.png",
            "project_status.json",
            "cc-by-nc-4.0",
            "12,103 episode folders",
            "xperience10m-taskfirst-v13-modality-xl",
            "Interactive task walkthrough.",
            "taskPlayer",
            "Action Recognition",
            "data/task_walkthroughs.json",
            "task_surface_integrity.json",
            "public_surface_qa.json",
            "Public project surface",
            "research_roadmap.html",
            "research_roadmap_interactive.json",
        ],
        "forbidden": [
            "xperience10m-" + "taskfirst-v10",
            "xperience10m-" + "modalities-v9-large-atlas",
            "artifact-id",
        ],
    },
    # Same required/forbidden markers as the GitHub Pages index, checked
    # against the Hugging Face Space copy of index.html.
    {
        "id": "hf_space_index_current",
        "title": "HF Space index contains current publication markers",
        "url": "https://huggingface.co/spaces/cy0307/ropedia-xperience-10m-task-suite/raw/main/index.html",
        "required": [
            "Release checks are collected",
            "quality_gates.json",
            "xperience10m_dataset_card_alignment.json",
            "source_alignment_audit.json",
            "evaluation_protocol.json",
            "research_takeaways.json",
            "Research Takeaways",
            "figure_index.json",
            "brand_assets.json",
            "xperience10m-logo-social-card.png",
            "project_status.json",
            "cc-by-nc-4.0",
            "12,103 episode folders",
            "xperience10m-taskfirst-v13-modality-xl",
            "Interactive task walkthrough.",
            "taskPlayer",
            "Action Recognition",
            "data/task_walkthroughs.json",
            "task_surface_integrity.json",
            "public_surface_qa.json",
            "Public project surface",
            "research_roadmap.html",
            "research_roadmap_interactive.json",
        ],
        "forbidden": [
            "xperience10m-" + "taskfirst-v10",
            "xperience10m-" + "modalities-v9-large-atlas",
            "artifact-id",
        ],
    },
    {
        "id": "hf_artifacts_card_current",
        "title": "HF artifact card links release checks",
        "url": "https://huggingface.co/datasets/cy0307/ropedia-xperience-10m-task-suite-artifacts/raw/main/README.md",
        "required": [
            "QUALITY_GATES.md",
            "docs/data/quality_gates.json",
            "xperience10m_dataset_card_alignment.json",
            "source_alignment_audit.json",
            "evaluation_protocol.json",
            "research_takeaways.json",
            "Research Takeaways",
            "figure_index.json",
            "brand_assets.json",
            "xperience10m-logo-social-card.png",
            "project_status.json",
            "cc-by-nc-4.0",
            "12,103 episode folders",
            "interactive scrub/play walkthrough storyboard",
            "task_surface_integrity.json",
            "public_surface_qa.json",
            "Public project surface",
        ],
        "forbidden": ["xperience10m-" + "taskfirst-v10"],
    },
    {
        "id": "github_pages_explorer_current",
        "title": "GitHub Pages explorer contains current diagnostics markers",
        "url": "https://chaoyue0307.github.io/ropedia-xperience-10m-task-suite/single_episode_explorer.html",
        "required": [
            "Single-Episode Research Explorer",
            "data/single_episode_explorer.json",
            "window-level exported artifacts only",
            "Feature Blocks",
            "Diagnostics",
            "object labels",
            "prediction rows",
            "ablation",
            "alignment",
        ],
        "forbidden": LOCAL_PATH_FORBIDDEN_MARKERS,
    },
    {
        "id": "hf_space_explorer_current",
        "title": "HF Space explorer contains current diagnostics markers",
        "url": "https://huggingface.co/spaces/cy0307/ropedia-xperience-10m-task-suite/raw/main/single_episode_explorer.html",
        "required": [
            "Single-Episode Research Explorer",
            "data/single_episode_explorer.json",
            "window-level exported artifacts only",
            "Feature Blocks",
            "Diagnostics",
            "object labels",
            "prediction rows",
            "ablation",
            "alignment",
        ],
        "forbidden": LOCAL_PATH_FORBIDDEN_MARKERS,
    },
    # Mirrors the artifact-card check, except that the quality-gate JSON is
    # expected under metrics/ rather than docs/data/.
    {
        "id": "hf_model_card_current",
        "title": "HF model card links release checks",
        "url": "https://huggingface.co/cy0307/ropedia-xperience-10m-task-baselines/raw/main/README.md",
        "required": [
            "QUALITY_GATES.md",
            "metrics/quality_gates.json",
            "xperience10m_dataset_card_alignment.json",
            "source_alignment_audit.json",
            "evaluation_protocol.json",
            "research_takeaways.json",
            "Research Takeaways",
            "figure_index.json",
            "brand_assets.json",
            "xperience10m-logo-social-card.png",
            "project_status.json",
            "cc-by-nc-4.0",
            "12,103 episode folders",
            "interactive scrub/play walkthrough storyboard",
            "task_surface_integrity.json",
            "public_surface_qa.json",
            "Public project surface",
        ],
        "forbidden": ["xperience10m-" + "taskfirst-v10"],
    },
]
# Published JSON reports that local_path_checks() expands into marker checks,
# one per (report, surface) pair. Keys are report ids; "title" is reused in the
# generated check title. Note that "paths" maps surface names to full URLs, not
# filesystem paths.
PATH_HYGIENE_REPORTS = {
    "mirror_parity": {
        "title": "Mirror parity JSON excludes local paths",
        "paths": {
            "github_pages": "https://chaoyue0307.github.io/ropedia-xperience-10m-task-suite/data/mirror_parity.json",
            "hf_space": "https://huggingface.co/spaces/cy0307/ropedia-xperience-10m-task-suite/raw/main/data/mirror_parity.json",
            "hf_artifacts": "https://huggingface.co/datasets/cy0307/ropedia-xperience-10m-task-suite-artifacts/resolve/main/docs/data/mirror_parity.json",
            "hf_model": "https://huggingface.co/cy0307/ropedia-xperience-10m-task-baselines/resolve/main/metrics/mirror_parity.json",
        },
    },
    "publication_audit": {
        "title": "Publication package JSON excludes local paths",
        "paths": {
            "github_pages": "https://chaoyue0307.github.io/ropedia-xperience-10m-task-suite/data/publication_audit.json",
            "hf_space": "https://huggingface.co/spaces/cy0307/ropedia-xperience-10m-task-suite/raw/main/data/publication_audit.json",
            "hf_artifacts": "https://huggingface.co/datasets/cy0307/ropedia-xperience-10m-task-suite-artifacts/resolve/main/docs/data/publication_audit.json",
            "hf_model": "https://huggingface.co/cy0307/ropedia-xperience-10m-task-baselines/resolve/main/metrics/publication_audit.json",
        },
    },
    "website_integrity": {
        "title": "Website integrity JSON excludes local paths",
        "paths": {
            "github_pages": "https://chaoyue0307.github.io/ropedia-xperience-10m-task-suite/data/website_integrity.json",
            "hf_space": "https://huggingface.co/spaces/cy0307/ropedia-xperience-10m-task-suite/raw/main/data/website_integrity.json",
            "hf_artifacts": "https://huggingface.co/datasets/cy0307/ropedia-xperience-10m-task-suite-artifacts/resolve/main/docs/data/website_integrity.json",
            "hf_model": "https://huggingface.co/cy0307/ropedia-xperience-10m-task-baselines/resolve/main/metrics/website_integrity.json",
        },
    },
    "public_surface_qa": {
        "title": "Public presentation JSON excludes local paths",
        "paths": {
            "github_pages": "https://chaoyue0307.github.io/ropedia-xperience-10m-task-suite/data/public_surface_qa.json",
            "hf_space": "https://huggingface.co/spaces/cy0307/ropedia-xperience-10m-task-suite/raw/main/data/public_surface_qa.json",
            "hf_artifacts": "https://huggingface.co/datasets/cy0307/ropedia-xperience-10m-task-suite-artifacts/resolve/main/docs/data/public_surface_qa.json",
            "hf_model": "https://huggingface.co/cy0307/ropedia-xperience-10m-task-baselines/resolve/main/metrics/public_surface_qa.json",
        },
    },
}
def local_path_checks() -> list[dict]:
    """Build one marker check per published hygiene report and surface.

    Every generated check requires the literal text ``"status": "pass"`` in
    the fetched report and forbids the local-path markers. Checks are ordered
    by report first, then by surface, following PATH_HYGIENE_REPORTS.
    """
    return [
        {
            "id": f"{mirror}_{slug}_local_path_check",
            "title": f"{mirror}: {spec['title']}",
            "url": link,
            "required": ['"status": "pass"'],
            "forbidden": LOCAL_PATH_FORBIDDEN_MARKERS,
        }
        for slug, spec in PATH_HYGIENE_REPORTS.items()
        for mirror, link in spec["paths"].items()
    ]
def sha256_bytes(data: bytes) -> str:
    """Return the hexadecimal SHA-256 digest of *data*."""
    hasher = hashlib.sha256()
    hasher.update(data)
    return hasher.hexdigest()
def sha256_file(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    The file is read in 1 MiB pieces so large artifacts are never held in
    memory in full.
    """
    piece_size = 1024 * 1024
    hasher = hashlib.sha256()
    with path.open("rb") as stream:
        while piece := stream.read(piece_size):
            hasher.update(piece)
    return hasher.hexdigest()
def sanitize_url(url: str) -> str:
    """Return *url* with its query string and fragment stripped."""
    scheme, netloc, path, _query, _fragment = urlsplit(url)
    return urlunsplit((scheme, netloc, path, "", ""))
def fetch(url: str) -> dict:
    """Download *url* and describe the outcome as a plain dict.

    On success the dict holds ``ok=True``, ``status_code``, ``bytes``,
    ``sha256``, the raw ``body`` and the sanitized ``final_url``. On failure
    it holds ``ok=False``, ``status_code`` (``None`` unless the server
    answered), ``bytes=0``, ``sha256=None``, an ``error`` message and the
    sanitized ``final_url``.

    HTTP error statuses are reported directly. Transport-level problems
    (unreachable host, TLS failure, timeout, ...) are retried once through
    ``fetch_with_curl`` before a failure is reported.
    """
    request = Request(url, headers={"User-Agent": USER_AGENT})
    try:
        with urlopen(request, timeout=TIMEOUT_SECONDS) as response:
            body = response.read()
            return {
                "ok": True,
                "status_code": int(getattr(response, "status", 200)),
                "bytes": len(body),
                "sha256": sha256_bytes(body),
                "body": body,
                "final_url": sanitize_url(response.geturl()),
            }
    except HTTPError as exc:
        # The server answered with an error status, so there is no point in
        # retrying with curl.
        return {
            "ok": False,
            "status_code": exc.code,
            "bytes": 0,
            "sha256": None,
            "error": str(exc),
            "final_url": sanitize_url(url),
        }
    except URLError as exc:
        # URLError is a subclass of OSError, so it has to be handled before the
        # generic OSError clause; otherwise this branch is never reached and
        # the concise ``reason`` is lost.
        error = str(exc.reason)
    except OSError as exc:
        # Covers TimeoutError and other socket errors, including those raised
        # while the body is being read.
        error = str(exc)

    # Transport-level failure: give curl one chance before giving up.
    fallback = fetch_with_curl(url)
    if fallback["ok"]:
        return fallback
    return {
        "ok": False,
        "status_code": None,
        "bytes": 0,
        "sha256": None,
        "error": error,
        "final_url": sanitize_url(url),
    }
def fetch_with_curl(url: str) -> dict:
    """Download *url* with the ``curl`` command-line tool.

    Returns a dict shaped like the one produced by ``fetch``: ``ok``,
    ``status_code``, ``bytes``, ``sha256``, ``final_url`` plus either ``body``
    (curl exited with 0) or ``error`` (curl failed or could not be started).
    ``ok`` is true only when curl exits with 0 and the final HTTP status is in
    the 200-399 range.

    The response body is written to a temporary file that is always removed
    before returning.
    """
    # Reserve a path for curl's -o output; the handle is closed immediately so
    # curl can write to the file itself.
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        tmp_path = Path(tmp.name)
    try:
        try:
            result = subprocess.run(
                [
                    "curl",
                    "-L",
                    "-sS",
                    "--max-time",
                    str(TIMEOUT_SECONDS),
                    "-A",
                    USER_AGENT,
                    "-o",
                    str(tmp_path),
                    "-w",
                    "%{http_code}\n%{url_effective}",
                    url,
                ],
                check=False,
                capture_output=True,
                text=True,
            )
        except OSError as exc:
            # curl is not installed or cannot be executed. Report a failed
            # fetch instead of letting the exception abort the whole run.
            return {
                "ok": False,
                "status_code": None,
                "bytes": 0,
                "sha256": None,
                "error": f"curl unavailable: {exc}",
                "final_url": sanitize_url(url),
            }

        # The -w template prints "<status>\n<effective url>" on stdout.
        status_text, _, final_url = result.stdout.partition("\n")
        try:
            status_code = int(status_text.strip() or "0")
        except ValueError:
            # Unexpected write-out; treat it as "no status received".
            status_code = 0
        effective_url = sanitize_url(final_url.strip() or url)

        if result.returncode != 0:
            return {
                "ok": False,
                "status_code": status_code or None,
                "bytes": 0,
                "sha256": None,
                "error": result.stderr.strip() or f"curl exited {result.returncode}",
                "final_url": effective_url,
            }

        body = tmp_path.read_bytes() if tmp_path.exists() else b""
        return {
            "ok": 200 <= status_code < 400,
            "status_code": status_code,
            "bytes": len(body),
            "sha256": sha256_bytes(body),
            "body": body,
            "final_url": effective_url,
        }
    finally:
        tmp_path.unlink(missing_ok=True)
def hash_group_record(group: dict) -> dict:
    """Compare every live mirror of one artifact with the local file.

    *group* is an entry of HASH_GROUPS. The returned record carries the local
    file facts, one fetch summary per surface (without the body), the list of
    failures and an overall ``"pass"``/``"fail"`` status. A failure is recorded
    when the local file is missing, a mirror cannot be fetched, or a mirror's
    SHA-256 differs from the local one.
    """
    relative = group["local_path"]
    artifact = ROOT / relative
    present = artifact.exists()
    local = {
        "path": relative,
        "exists": present,
        "bytes": artifact.stat().st_size if present else 0,
        "sha256": sha256_file(artifact) if present else None,
    }

    failures = []
    if not present:
        failures.append({"surface": "local", "kind": "missing", "path": relative})

    mirrors = {}
    for surface, url in group["urls"].items():
        outcome = fetch(url)
        # Keep everything but the raw body in the report.
        summary = {key: value for key, value in outcome.items() if key != "body"}
        summary["url"] = url
        mirrors[surface] = summary

        if not outcome["ok"]:
            failures.append(
                {"surface": surface, "kind": "fetch_failed", "url": url, "error": outcome.get("error")}
            )
        elif present and outcome["sha256"] != local["sha256"]:
            failures.append(
                {
                    "surface": surface,
                    "kind": "hash_mismatch",
                    "url": url,
                    "expected_sha256": local["sha256"],
                    "actual_sha256": outcome["sha256"],
                }
            )

    return {
        "id": group["id"],
        "title": group["title"],
        "status": "fail" if failures else "pass",
        "local": local,
        "mirrors": mirrors,
        "failures": failures,
    }
def marker_record(check: dict) -> dict:
    """Fetch one page and verify its required and forbidden text markers.

    *check* provides ``id``, ``title``, ``url``, ``required`` and
    ``forbidden``. The body is decoded as UTF-8 with undecodable bytes
    dropped. The record fails when the fetch fails, when any required marker
    is absent, or when any forbidden marker is present.
    """
    outcome = fetch(check["url"])
    missing = []
    forbidden_hits = []
    failures = []

    if outcome["ok"]:
        page = outcome["body"].decode("utf-8", errors="ignore")
        missing = [marker for marker in check["required"] if marker not in page]
        forbidden_hits = [marker for marker in check["forbidden"] if marker in page]
        if missing:
            failures.append({"kind": "missing_required_markers", "markers": missing})
        if forbidden_hits:
            failures.append({"kind": "forbidden_markers_present", "markers": forbidden_hits})
    else:
        failures.append({"kind": "fetch_failed", "url": check["url"], "error": outcome.get("error")})

    fetch_summary = {key: value for key, value in outcome.items() if key != "body"}
    return {
        "id": check["id"],
        "title": check["title"],
        "url": check["url"],
        "status": "fail" if failures else "pass",
        "fetch": fetch_summary,
        "required_marker_count": len(check["required"]),
        "missing_markers": missing,
        "forbidden_markers_present": forbidden_hits,
        "failures": failures,
    }
def build_report() -> dict:
    """Run every hash and marker check and assemble the status report.

    Hash groups are verified first, then the static marker checks followed by
    the generated local-path checks. Each failure is flattened into the
    top-level ``failures`` list, tagged with the id of the check it came from.
    The report status is ``"pass"`` only when that list is empty.
    """
    hash_records = []
    for group in HASH_GROUPS:
        hash_records.append(hash_group_record(group))

    marker_records = []
    for check in MARKER_CHECKS + local_path_checks():
        marker_records.append(marker_record(check))

    failures = []
    for record in hash_records + marker_records:
        for failure in record["failures"]:
            failures.append({"check": record["id"], **failure})

    return {
        "title": "Ropedia Xperience-10M Live Publication Status",
        "status": "fail" if failures else "pass",
        "checked_at_utc": datetime.now(timezone.utc).isoformat(timespec="seconds"),
        "scope": "Live GitHub Pages, GitHub raw, Hugging Face Space, artifact dataset, and model card mirrors.",
        "hash_groups": hash_records,
        "marker_checks": marker_records,
        "failure_count": len(failures),
        "failures": failures,
    }
def main() -> int:
    """Command-line entry point.

    Builds the live-publication report, writes it to ``--output`` unless
    ``--no-write`` is given, and prints a one-line status. When the report did
    not pass, up to 20 failures are printed as well.

    Returns 0 when the report status is ``"pass"`` and 1 otherwise.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--output", type=Path, default=DEFAULT_OUTPUT)
    parser.add_argument("--no-write", action="store_true", help="Verify and print status without updating the report file.")
    args = parser.parse_args()

    report = build_report()
    status = report["status"]

    if args.no_write:
        print(f"{status.upper()}: live publication verification")
    else:
        args.output.parent.mkdir(parents=True, exist_ok=True)
        args.output.write_text(json.dumps(report, indent=2) + "\n", encoding="utf-8")
        print(f"{status.upper()}: wrote {args.output}")

    if status == "pass":
        return 0

    # Keep console output short; the full list is in the report itself.
    failures = report["failures"]
    for failure in failures[:20]:
        print(f"- {failure}")
    hidden = len(failures) - 20
    if hidden > 0:
        print(f"- ... {hidden} more failures")
    return 1
# Run the verifier only when executed as a script; the integer returned by
# main() becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())