Robotics
PyTorch
Cosmos
xperience10m_task_baseline_suite
embodied-ai
multimodal
xperience-10m
baseline
evaluation
qwen3-omni
Instructions to use cy0307/ropedia-xperience-10m-task-baselines with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Cosmos
How to use cy0307/ropedia-xperience-10m-task-baselines with Cosmos:
# No code snippets available yet for this library. # To use this model, check the repository files and the library's documentation. # Want to help? PRs adding snippets are welcome at: # https://github.com/huggingface/huggingface.js
- Notebooks
- Google Colab
- Kaggle
| #!/usr/bin/env python3 | |
| """ | |
| End-to-end task suite for one Ropedia/Xperience episode. | |
| The purpose is not to prove generalization from one sample episode. It is to | |
| turn the episode into multiple meaningful supervised/self-supervised learning | |
| problems and write reproducible artifacts for each one. | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import csv | |
| import json | |
| import math | |
| import sys | |
| from collections import Counter, OrderedDict | |
| from pathlib import Path | |
| import numpy as np | |
| from train_all_modalities_model import ( | |
| extract_all_window_features, | |
| prepare_modalities, | |
| ) | |
| from train_min_action_model import ( | |
| add_toolkit_to_path, | |
| compute_metrics, | |
| encode_labels, | |
| fit_scaler, | |
| frame_label, | |
| majority_label, | |
| predict, | |
| portable_path, | |
| softmax, | |
| train_softmax_classifier, | |
| ) | |
# Names of every task the suite can run, in default execution order.
# ``selected_tasks`` validates the ``--tasks`` option against this list and
# ``main`` dispatches on these exact strings (each also names the task's
# output sub-directory).
TASKS = [
    "timeline_action",
    "timeline_subtask",
    "transition_detection",
    "next_action",
    "hand_trajectory_forecast",
    "contact_prediction",
    "object_relevance",
    "caption_grounding",
    "cross_modal_retrieval",
    "modality_reconstruction",
    "temporal_order",
    "misalignment_detection",
]
def parse_args() -> argparse.Namespace:
    """Define the suite's command-line options and parse ``sys.argv``.

    Path defaults are anchored at the workspace root, computed as the second
    parent of this file's resolved path.

    Returns:
        The parsed ``argparse.Namespace``.
    """
    root = Path(__file__).resolve().parents[1]
    sample_annotation = root / "data/sample/xperience-10m-sample/annotation.hdf5"

    cli = argparse.ArgumentParser(description="Run an end-to-end task suite on one Ropedia episode.")
    add = cli.add_argument

    # Input / output locations.
    add("--workspace", type=Path, default=root)
    add("--annotation", type=Path, default=sample_annotation)
    add("--output-dir", type=Path, default=root / "outputs/episode_task_suite")
    add("--cache-dir", type=Path, default=root / "outputs/feature_cache")

    # Windowing and train/test split.
    add("--window-frames", type=int, default=20)
    add("--stride-frames", type=int, default=5)
    add("--min-label-fraction", type=float, default=0.6)
    add("--test-fraction", type=float, default=0.30)

    # Optimisation hyper-parameters.
    add("--epochs", type=int, default=400)
    add("--learning-rate", type=float, default=0.12)
    add("--l2", type=float, default=2e-3)
    add("--ridge-l2", type=float, default=10.0)
    add("--seed", type=int, default=7)

    # Task-specific knobs.
    add("--future-frames", type=int, default=20, help="Future offset for next-action prediction.")
    add("--forecast-frames", type=int, default=10, help="Future hand trajectory length.")
    add("--boundary-tolerance-frames", type=int, default=10)
    add("--misalignment-shift-windows", type=int, default=8)
    add("--tasks", default="all", help="Comma-separated task list or 'all'.")

    # Match train_all_modalities_model defaults used by prepare_modalities.
    add("--force-rebuild-cache", action="store_true")
    add("--video-image-size", type=int, default=32)
    add("--video-grid-size", type=int, default=8)
    add("--video-hist-bins", type=int, default=8)
    add("--depth-grid-size", type=int, default=8)
    add("--text-hash-dim", type=int, default=128)
    add("--include-label-text", action="store_true")
    add("--no-class-weights", action="store_true")

    return cli.parse_args()
def selected_tasks(spec: str) -> list[str]:
    """Resolve the ``--tasks`` option into an ordered list of task names.

    Args:
        spec: Either ``"all"`` (case-insensitive, surrounding whitespace
            ignored) or a comma-separated list of names from ``TASKS``.
            Empty items produced by stray commas are ignored.

    Returns:
        A new list of task names in the order requested, with repeated names
        collapsed to their first occurrence.

    Raises:
        ValueError: If any requested name is not present in ``TASKS``.
    """
    if spec.strip().lower() == "all":
        # Hand out a copy so callers can never mutate the module-level constant.
        return list(TASKS)
    requested = [name.strip() for name in spec.split(",") if name.strip()]
    unknown = [name for name in requested if name not in TASKS]
    if unknown:
        raise ValueError(f"Unknown tasks: {unknown}. Valid tasks: {TASKS}")
    # dict.fromkeys keeps first-seen order while dropping repeats; running the
    # same task twice would only redo the work and overwrite its artifacts.
    return list(dict.fromkeys(requested))
def write_json(path: Path, data: dict | list) -> None:
    """Write ``data`` to ``path`` as 2-space-indented UTF-8 JSON.

    Missing parent directories are created first.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(data, indent=2)
    with path.open("w", encoding="utf-8") as handle:
        handle.write(serialized)
def write_csv(path: Path, rows: list[dict], fieldnames: list[str]) -> None:
    """Write ``rows`` (dicts keyed by ``fieldnames``) to ``path`` as CSV with a header.

    Missing parent directories are created first.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", newline="", encoding="utf-8") as handle:
        out = csv.DictWriter(handle, fieldnames=fieldnames)
        out.writeheader()
        for record in rows:
            out.writerow(record)
def write_confusion(path: Path, cm: np.ndarray, class_names: list[str]) -> None:
    """Write a confusion matrix as CSV with class names on both axes.

    The header row is ``true\\pred`` followed by the class names; each later
    row is a class name followed by row ``i`` of ``cm`` cast to ``int``.
    Missing parent directories are created first.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", newline="", encoding="utf-8") as handle:
        out = csv.writer(handle)
        out.writerow(["true\\pred", *class_names])
        out.writerows([label, *map(int, cm[row_id])] for row_id, label in enumerate(class_names))
def chronological_split_indices(n: int, test_fraction: float) -> tuple[np.ndarray, np.ndarray]:
    """Split ``range(n)`` into a leading train part and a trailing test part.

    The cut point is ``round(n * (1 - test_fraction))``, clamped so that both
    parts contain at least one index.

    Returns:
        ``(train_indices, test_indices)`` as ``int64`` arrays.

    Raises:
        ValueError: If ``n`` is smaller than 2.
    """
    if n < 2:
        raise ValueError("Need at least two samples for train/test split.")
    cut = int(round(n * (1.0 - test_fraction)))
    # Clamp into [1, n - 1] so neither side is empty.
    if cut > n - 1:
        cut = n - 1
    if cut < 1:
        cut = 1
    train_part = np.arange(0, cut, dtype=np.int64)
    test_part = np.arange(cut, n, dtype=np.int64)
    return train_part, test_part
def build_windows(args: argparse.Namespace, ann: dict, extras: dict):
    """Slice the episode into sliding windows and featurize each one.

    Windows of ``args.window_frames`` frames start every ``args.stride_frames``
    frames. Each window gets a majority action and subtask label (via
    ``majority_label`` with ``args.min_label_fraction``) and a feature vector
    from ``extract_all_window_features``.

    Args:
        args: Parsed options; reads ``window_frames``, ``stride_frames`` and
            ``min_label_fraction``.
        ann: Annotation dict; reads ``caption_frame_info_map`` and ``img_names``.
        extras: Opaque per-modality data forwarded to the feature extractor.

    Returns:
        ``(X, rows, feature_manifest)``: ``X`` is the stacked ``float32``
        feature matrix, ``rows`` holds one metadata dict per window, and
        ``feature_manifest`` lists ``{"name", "start", "end", "dim"}`` for each
        feature block in column order.

    Raises:
        ValueError: If the window/stride sizes are not positive or the episode
            is shorter than a single window.
    """
    frame_info = ann["caption_frame_info_map"]
    n_frames = len(ann["img_names"])
    # Fail early with an actionable message instead of an opaque error from
    # range() or from np.stack() on an empty list further down.
    if args.window_frames <= 0 or args.stride_frames <= 0:
        raise ValueError(
            f"--window-frames and --stride-frames must be positive "
            f"(got {args.window_frames} and {args.stride_frames})."
        )
    if n_frames < args.window_frames:
        raise ValueError(
            f"Episode has {n_frames} frames, fewer than one window of "
            f"{args.window_frames} frames."
        )
    rows = []
    X = []
    feature_manifest = None
    for start in range(0, n_frames - args.window_frames + 1, args.stride_frames):
        end = start + args.window_frames
        action_labels = [frame_label(frame_info.get(i, {}), "action") for i in range(start, end)]
        subtask_labels = [frame_label(frame_info.get(i, {}), "subtask") for i in range(start, end)]
        action, action_frac = majority_label(action_labels, args.min_label_fraction)
        subtask, subtask_frac = majority_label(subtask_labels, args.min_label_fraction)
        if feature_manifest is None:
            # Only the first window asks for the block layout; it is assumed to
            # be identical for every later window.
            vec, blocks = extract_all_window_features(ann, extras, start, end, return_blocks=True)
            offset = 0
            feature_manifest = []
            for name, dim in blocks:
                feature_manifest.append({"name": name, "start": offset, "end": offset + dim, "dim": dim})
                offset += dim
        else:
            vec = extract_all_window_features(ann, extras, start, end)
        X.append(vec)
        rows.append({
            "window_index": len(rows),
            "start_frame": start,
            "end_frame": end - 1,  # inclusive last frame of the window
            "center_frame": (start + end - 1) // 2,
            "action_label": action,
            "action_fraction": action_frac,
            "subtask_label": subtask,
            "subtask_fraction": subtask_frac,
        })
    return np.stack(X).astype(np.float32), rows, feature_manifest or []
def block_indices(feature_manifest: list[dict], include: list[str] | None = None, exclude: list[str] | None = None) -> np.ndarray:
    """Return the feature-column indices of the manifest blocks selected by name.

    A block is selected when its name starts with (or equals) any ``include``
    prefix — every block qualifies when ``include`` is empty — and does not
    start with (or equal) any ``exclude`` prefix.

    Returns:
        An ``int64`` array of column indices, in manifest order.
    """
    wanted = tuple(include or ())
    banned = tuple(exclude or ())
    columns = []
    for block in feature_manifest:
        name = block["name"]
        # str.startswith accepts a tuple of prefixes and also matches equality.
        is_wanted = not wanted or name.startswith(wanted)
        is_banned = bool(banned) and name.startswith(banned)
        if is_wanted and not is_banned:
            columns.extend(range(int(block["start"]), int(block["end"])))
    return np.asarray(columns, dtype=np.int64)
def label_array(rows: list[dict], key: str) -> np.ndarray:
    """Collect ``row[key]`` from every row as an object array of strings.

    Missing or falsy values become the empty string.
    """
    values = []
    for row in rows:
        raw = row.get(key, "")
        values.append(str(raw or ""))
    return np.asarray(values, dtype=object)
def classification_task(
    out_dir: Path,
    X: np.ndarray,
    labels: np.ndarray,
    rows: list[dict],
    args: argparse.Namespace,
    task_name: str,
    input_description: str,
) -> dict:
    """Train and evaluate a softmax classifier on a chronological split.

    Windows with an empty/falsy label are dropped. The scaler is fit on the
    training part only, the classifier is trained with
    ``train_softmax_classifier`` and evaluated on the held-out tail.

    Artifacts written to ``out_dir``: ``metrics.json``,
    ``per_class_metrics.csv``, ``confusion_matrix.csv``, ``predictions.csv``
    and ``model.npz``.

    Args:
        out_dir: Directory for the artifacts (created if missing).
        X: Feature matrix with one row per entry of ``rows``.
        labels: One label per row; falsy entries mark unlabeled windows.
        rows: Window metadata dicts from ``build_windows``.
        args: Parsed options; reads ``test_fraction``, ``epochs``,
            ``learning_rate``, ``l2``, ``no_class_weights`` and ``seed``.
        task_name: Stored under ``"task"`` in the metrics.
        input_description: Stored under ``"input"`` in the metrics.

    Returns:
        The metrics dict that was written to ``metrics.json``.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    valid = np.asarray([bool(x) for x in labels])
    valid_idx = np.flatnonzero(valid)
    Xv = X[valid_idx]
    labelv = labels[valid_idx]
    rowv = [rows[int(i)] for i in valid_idx]
    y, class_names = encode_labels(labelv)
    train_local, test_local = chronological_split_indices(len(y), args.test_fraction)
    train_classes = set(int(x) for x in y[train_local])
    test_classes = set(int(x) for x in y[test_local])
    # Classes that only occur after the split point cannot be learned; report them.
    unseen_test_classes = sorted(class_names[i] for i in (test_classes - train_classes))
    mean, std = fit_scaler(Xv[train_local])
    Xs = (Xv - mean) / std
    W, b, history = train_softmax_classifier(
        Xs[train_local],
        y[train_local],
        n_classes=len(class_names),
        epochs=args.epochs,
        lr=args.learning_rate,
        l2=args.l2,
        use_class_weights=not args.no_class_weights,
        seed=args.seed,
    )
    pred, probs = predict(Xs[test_local], W, b)
    metrics, per_class, cm = compute_metrics(y[test_local], pred, class_names)
    majority = Counter(y[train_local]).most_common(1)[0][0]
    metrics.update({
        "task": task_name,
        "input": input_description,
        "split": "chronological",
        "num_windows": int(len(y)),
        "num_train_windows": int(len(train_local)),
        "num_test_windows": int(len(test_local)),
        "num_classes": int(len(class_names)),
        "feature_dim": int(X.shape[1]),
        "majority_baseline_accuracy": float(np.mean(y[test_local] == majority)),
        "train_final_accuracy": float(history[-1]["train_accuracy"]),
        "train_final_loss": float(history[-1]["loss"]),
        "unseen_test_classes": unseen_test_classes,
    })
    pred_rows = []
    # ``probs`` is aligned with ``test_local`` by position, so enumerate gives
    # the row directly; the former list(...).index(...) lookup was quadratic.
    for test_pos, (local_pos, pred_id) in enumerate(zip(test_local, pred)):
        row = rowv[int(local_pos)]
        true_id = int(y[int(local_pos)])
        pred_rows.append({
            "window_index": row["window_index"],
            "start_frame": row["start_frame"],
            "end_frame": row["end_frame"],
            "center_frame": row["center_frame"],
            "true_label": class_names[true_id],
            "predicted_label": class_names[int(pred_id)],
            "confidence": float(probs[test_pos, int(pred_id)]),
            "correct": int(true_id == int(pred_id)),
        })
    write_json(out_dir / "metrics.json", metrics)
    write_csv(out_dir / "per_class_metrics.csv", per_class, ["class_id", "class_name", "support", "predicted", "precision", "recall", "f1"])
    write_confusion(out_dir / "confusion_matrix.csv", cm, class_names)
    write_csv(out_dir / "predictions.csv", pred_rows, ["window_index", "start_frame", "end_frame", "center_frame", "true_label", "predicted_label", "confidence", "correct"])
    np.savez_compressed(out_dir / "model.npz", mean=mean, std=std, W=W, b=b, class_names=np.asarray(class_names, dtype=object))
    return metrics
def binary_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> dict:
    """Compute confusion counts and derived scores for 0/1 labels.

    Returns:
        A dict with ``accuracy``, ``precision``, ``recall``, ``f1``, the four
        confusion counts (``tp``, ``tn``, ``fp``, ``fn``) and the positive
        rate of the true and predicted labels. Ratios with a zero denominator
        are reported as ``0.0``.
    """
    truth = y_true.astype(np.int64)
    guess = y_pred.astype(np.int64)
    truth_pos, truth_neg = truth == 1, truth == 0
    guess_pos, guess_neg = guess == 1, guess == 0

    tp = int(np.count_nonzero(truth_pos & guess_pos))
    tn = int(np.count_nonzero(truth_neg & guess_neg))
    fp = int(np.count_nonzero(truth_neg & guess_pos))
    fn = int(np.count_nonzero(truth_pos & guess_neg))

    predicted_positive = tp + fp
    actual_positive = tp + fn
    precision = tp / predicted_positive if predicted_positive else 0.0
    recall = tp / actual_positive if actual_positive else 0.0
    if precision + recall:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0

    return {
        "accuracy": float((tp + tn) / max(len(truth), 1)),
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "tp": tp,
        "tn": tn,
        "fp": fp,
        "fn": fn,
        "positive_rate_true": float(np.mean(truth)) if len(truth) else 0.0,
        "positive_rate_pred": float(np.mean(guess)) if len(guess) else 0.0,
    }
def boundary_f1(true_frames: list[int], pred_frames: list[int], tolerance: int) -> dict:
    """Greedily match predicted boundaries to true ones and score the matches.

    Predictions are processed in order. Each claims the closest still-unclaimed
    true boundary within ``tolerance`` frames (ties go to the earlier entry of
    ``true_frames``), and a true boundary can be claimed only once.

    Returns:
        Boundary precision/recall/F1, the match and boundary counts, and the
        mean absolute timing error of the matches (``None`` without matches).
    """
    claimed = set()
    timing_errors = []
    for predicted in pred_frames:
        best = None  # (distance, index) of the closest free true boundary
        for idx, actual in enumerate(true_frames):
            if idx in claimed:
                continue
            distance = abs(predicted - actual)
            if distance > tolerance:
                continue
            # Strict '<' keeps the lowest index among equally close boundaries.
            if best is None or distance < best[0]:
                best = (distance, idx)
        if best is None:
            continue
        timing_errors.append(best[0])
        claimed.add(best[1])

    matches = len(timing_errors)
    precision = matches / len(pred_frames) if pred_frames else 0.0
    recall = matches / len(true_frames) if true_frames else 0.0
    if precision + recall:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0
    return {
        "boundary_precision": precision,
        "boundary_recall": recall,
        "boundary_f1": f1,
        "matched_boundaries": matches,
        "true_boundaries": len(true_frames),
        "predicted_boundaries": len(pred_frames),
        "mean_abs_timing_error_frames": float(np.mean(timing_errors)) if timing_errors else None,
    }
def task_transition_detection(out_dir: Path, X: np.ndarray, rows: list[dict], ann: dict, args: argparse.Namespace) -> dict:
    """Classify windows as action ``transition`` vs ``steady`` and score boundaries.

    A true boundary is a frame whose non-empty action label differs from the
    previous frame's non-empty label. A window is a ``transition`` when its
    center frame lies within ``args.boundary_tolerance_frames`` of a boundary.
    After the generic classification run, the test predictions are read back
    from ``predictions.csv`` and boundary precision/recall/F1 is added for the
    true boundaries inside the test span of center frames.

    Returns:
        The merged metrics dict (also rewritten to ``metrics.json``).
    """
    frame_info = ann["caption_frame_info_map"]
    total_frames = len(ann["img_names"])
    actions = [frame_label(frame_info.get(f, {}), "action") for f in range(total_frames)]
    true_boundaries = [
        f
        for f in range(1, total_frames)
        if actions[f] and actions[f - 1] and actions[f] != actions[f - 1]
    ]

    tolerance = args.boundary_tolerance_frames

    def near_boundary(row: dict) -> bool:
        center = int(row["center_frame"])
        return any(abs(center - frame) <= tolerance for frame in true_boundaries)

    labels = np.asarray(["transition" if near_boundary(row) else "steady" for row in rows], dtype=object)
    metrics = classification_task(out_dir, X, labels, rows, args, "transition_detection", "all modalities -> action boundary/steady")

    # Recover the test-split predictions from the artifact just written.
    with (out_dir / "predictions.csv").open("r", encoding="utf-8") as fp:
        pred_rows = list(csv.DictReader(fp))
    centers = [int(record["center_frame"]) for record in pred_rows]
    pred_frames = [
        center
        for center, record in zip(centers, pred_rows)
        if record["predicted_label"] == "transition"
    ]
    test_start = min(centers, default=0)
    test_end = max(centers, default=0)
    true_test = [frame for frame in true_boundaries if test_start <= frame <= test_end]

    metrics.update(boundary_f1(true_test, pred_frames, tolerance))
    write_json(out_dir / "metrics.json", metrics)
    write_csv(out_dir / "true_boundaries.csv", [{"frame": frame} for frame in true_boundaries], ["frame"])
    return metrics
def task_next_action(out_dir: Path, X: np.ndarray, rows: list[dict], ann: dict, args: argparse.Namespace) -> dict:
    """Predict the action label ``args.future_frames`` after each window ends.

    The target frame is clamped to the last frame of the episode, so windows
    near the end are labeled with the final frame's action. Windows whose
    target label is empty are dropped by ``classification_task``.
    """
    frame_info = ann["caption_frame_info_map"]
    last_frame = len(ann["img_names"]) - 1
    future_labels = [
        frame_label(
            frame_info.get(min(last_frame, int(row["end_frame"]) + args.future_frames), {}),
            "action",
        )
        for row in rows
    ]
    targets = np.asarray(future_labels, dtype=object)
    description = f"all modalities at t -> action at t+{args.future_frames} frames"
    return classification_task(out_dir, X, targets, rows, args, "next_action", description)
def ridge_fit_predict(X_train: np.ndarray, Y_train: np.ndarray, X_test: np.ndarray, l2: float):
    """Fit a multi-output ridge regression and predict ``X_test``.

    Inputs are standardized with ``fit_scaler`` and targets with their own
    mean/std (std below 1e-6 is replaced by 1). A constant bias column is
    appended and the system is solved in dual form on the train-by-train Gram
    matrix, then mapped back to primal weights.

    Returns:
        ``(pred, model)``: ``pred`` is the ``float32`` prediction in the
        original target scale; ``model`` holds ``x_mean``, ``x_std``,
        ``y_mean``, ``y_std`` and ``W``.
    """
    x_mean, x_std = fit_scaler(X_train)
    y_mean = Y_train.mean(axis=0)
    raw_std = Y_train.std(axis=0)
    y_std = np.where(raw_std < 1e-6, 1.0, raw_std)

    def with_bias(Z: np.ndarray) -> np.ndarray:
        # Append a constant column so the last weight row acts as the intercept.
        return np.concatenate([Z, np.ones((len(Z), 1), dtype=np.float32)], axis=1)

    train_design = with_bias((X_train - x_mean) / x_std)
    test_design = with_bias((X_test - x_mean) / x_std)
    scaled_targets = (Y_train - y_mean) / y_std

    gram = train_design @ train_design.T
    dual = np.linalg.solve(gram + l2 * np.eye(gram.shape[0], dtype=np.float32), scaled_targets)
    W = train_design.T @ dual

    pred = (test_design @ W) * y_std + y_mean
    model = {
        "x_mean": x_mean,
        "x_std": x_std,
        "y_mean": y_mean.astype(np.float32),
        "y_std": y_std.astype(np.float32),
        "W": W.astype(np.float32),
    }
    return pred.astype(np.float32), model
def regression_metrics(Y_true: np.ndarray, Y_pred: np.ndarray) -> dict:
    """Return MSE, MAE and R² over all entries of the two arrays.

    R² uses the column means of ``Y_true`` as the reference and is reported
    as ``0.0`` when the targets have no variance.
    """
    residual = Y_true - Y_pred
    squared = residual ** 2
    centered = Y_true - Y_true.mean(axis=0)
    ss_res = float(np.sum(squared))
    ss_tot = float(np.sum(centered ** 2))
    if ss_tot > 0:
        r2 = 1.0 - ss_res / ss_tot
    else:
        r2 = 0.0
    return {
        "mse": float(np.mean(squared)),
        "mae": float(np.mean(np.abs(residual))),
        "r2": r2,
    }
def task_hand_forecast(out_dir: Path, X: np.ndarray, rows: list[dict], ann: dict, args: argparse.Namespace) -> dict:
    """Forecast future hand joints from each window with ridge regression.

    The target of a window is the ``args.forecast_frames`` frames after its
    last frame, left and right hand joints concatenated along the joint axis.
    When body joints cover the horizon, the first entry of the body-joint axis
    (presumably the root joint — confirm against the loader) is subtracted so
    targets are root-relative. Windows whose horizon runs past the available
    hand frames are skipped.

    Artifacts written to ``out_dir``: ``metrics.json`` and ``predictions.npz``.

    Args:
        out_dir: Directory for the artifacts (created if missing).
        X: Feature matrix with one row per entry of ``rows``.
        rows: Window metadata dicts; reads ``end_frame``.
        ann: Annotation dict; reads ``hand_left_joints``, ``hand_right_joints``
            and optionally ``smplh_body_joints``.
        args: Parsed options; reads ``forecast_frames``, ``test_fraction`` and
            ``ridge_l2``.

    Returns:
        Regression metrics plus ``mpjpe`` / ``final_frame_mpjpe``.

    Raises:
        ValueError: If hand joints are missing or no window has a complete
            future horizon.
    """
    left = ann.get("hand_left_joints")
    right = ann.get("hand_right_joints")
    body = ann.get("smplh_body_joints")
    if left is None or right is None:
        raise ValueError("Hand joints not available.")
    valid_idx, Y = [], []
    # Both hands must cover the horizon; bounding by the left hand alone would
    # let a shorter right-hand track break the concatenation below.
    n_frames = min(len(left), len(right))
    joint_count = 0
    for i, row in enumerate(rows):
        future_start = int(row["end_frame"]) + 1
        future_end = future_start + args.forecast_frames
        if future_end > n_frames:
            continue
        hand = np.concatenate([left[future_start:future_end], right[future_start:future_end]], axis=1)
        if body is not None and future_end <= len(body):
            root = body[future_start:future_end, :1, :]
            hand = hand - root
        # Remember the joint-axis size so it need not be hard-coded later.
        joint_count = hand.shape[1]
        valid_idx.append(i)
        Y.append(hand.reshape(-1))
    if not Y:
        raise ValueError(
            "No window has a complete future horizon for hand forecasting; "
            "reduce --forecast-frames or use a longer episode."
        )
    valid_idx = np.asarray(valid_idx, dtype=np.int64)
    Y = np.stack(Y).astype(np.float32)
    train, test = chronological_split_indices(len(valid_idx), args.test_fraction)
    pred, model = ridge_fit_predict(X[valid_idx[train]], Y[train], X[valid_idx[test]], args.ridge_l2)
    metrics = regression_metrics(Y[test], pred)
    # Restore (window, frame, joint, xyz) using the observed joint count; the
    # previous hard-coded 42 only worked for 21 joints per hand.
    true_hand = Y[test].reshape(len(test), args.forecast_frames, joint_count, 3)
    pred_hand = pred.reshape(len(test), args.forecast_frames, joint_count, 3)
    mpjpe = np.linalg.norm(true_hand - pred_hand, axis=-1).mean()
    final_error = np.linalg.norm(true_hand[:, -1] - pred_hand[:, -1], axis=-1).mean()
    metrics.update({
        "task": "hand_trajectory_forecast",
        "input": "all modalities at t -> future left/right hand 3D joints",
        "split": "chronological",
        "num_windows": int(len(valid_idx)),
        "num_train_windows": int(len(train)),
        "num_test_windows": int(len(test)),
        "forecast_frames": int(args.forecast_frames),
        "mpjpe": float(mpjpe),
        "final_frame_mpjpe": float(final_error),
        "target_dim": int(Y.shape[1]),
    })
    out_dir.mkdir(parents=True, exist_ok=True)
    write_json(out_dir / "metrics.json", metrics)
    np.savez_compressed(out_dir / "predictions.npz", y_true=Y[test], y_pred=pred, test_window_indices=valid_idx[test], **model)
    return metrics
def task_contact_prediction(out_dir: Path, X: np.ndarray, rows: list[dict], ann: dict, manifest: list[dict], args: argparse.Namespace) -> dict:
    """Classify whether any contact value is positive inside each window.

    The ``body_contacts`` and ``caption_objects_interaction_text`` feature
    blocks are removed from the input before training.

    Raises:
        ValueError: If the annotation has no ``contacts`` entry.
    """
    contacts = ann.get("contacts")
    if contacts is None:
        raise ValueError("Contacts not available.")

    def window_label(row: dict) -> str:
        first, last = int(row["start_frame"]), int(row["end_frame"])
        segment = contacts[first:last + 1]
        return "contact" if np.any(segment > 0) else "no_contact"

    targets = np.asarray([window_label(row) for row in rows], dtype=object)
    feature_cols = block_indices(manifest, exclude=["body_contacts", "caption_objects_interaction_text"])
    return classification_task(
        out_dir,
        X[:, feature_cols],
        targets,
        rows,
        args,
        "contact_prediction",
        "all non-contact/non-caption-label modalities -> any body contact",
    )
def extract_objects(info: dict) -> list[str]:
    """Return the object names stored under ``info["objects"]`` as stripped strings.

    A list value yields its non-blank entries; any other truthy value yields a
    single stripped string; a missing or falsy value yields an empty list.
    """
    raw = info.get("objects")
    if isinstance(raw, list):
        stripped = (str(item).strip() for item in raw)
        return [text for text in stripped if text]
    return [str(raw).strip()] if raw else []
def sigmoid(z: np.ndarray) -> np.ndarray:
    """Element-wise logistic function, with inputs clipped to [-40, 40] before ``exp``."""
    bounded = np.clip(z, -40, 40)
    denominator = 1.0 + np.exp(-bounded)
    return 1.0 / denominator
def train_multilabel_logistic(X: np.ndarray, Y: np.ndarray, epochs: int, lr: float, l2: float, seed: int):
    """Train independent per-label logistic regressors with full-batch gradient descent.

    Positive entries are up-weighted by the negative/positive count ratio of
    their label, clipped to [1, 20]. Training metrics are recorded at the
    first and last epoch and roughly every fifth of the run, computed from the
    probabilities obtained before that epoch's parameter update.

    Returns:
        ``(W, b, history)``: ``float32`` weights and biases, plus a list of
        per-checkpoint metric dicts.
    """
    rng = np.random.default_rng(seed)
    num_samples, num_features = X.shape
    num_labels = Y.shape[1]
    W = rng.normal(0, 0.01, size=(num_features, num_labels)).astype(np.float32)
    b = np.zeros(num_labels, dtype=np.float32)

    positives = Y.sum(axis=0)
    pos_weight = (num_samples - positives) / np.maximum(positives, 1.0)
    pos_weight = np.clip(pos_weight, 1.0, 20.0).astype(np.float32)

    log_every = max(1, epochs // 5)
    history = []
    for epoch in range(1, epochs + 1):
        P = sigmoid(X @ W + b)
        entry_weight = np.where(Y > 0, pos_weight[None, :], 1.0)
        grad_logits = (P - Y) * entry_weight / num_samples
        W -= lr * (X.T @ grad_logits + l2 * W)
        b -= lr * grad_logits.sum(axis=0)
        if epoch in (1, epochs) or epoch % log_every == 0:
            hard = (P >= 0.5).astype(np.float32)
            history.append({"epoch": epoch, **multilabel_metrics(Y, hard)})
    return W.astype(np.float32), b.astype(np.float32), history
def multilabel_metrics(Y: np.ndarray, P: np.ndarray) -> dict:
    """Score binary multi-label predictions ``P`` against targets ``Y``.

    Returns:
        Micro-averaged precision/recall/F1, macro F1 (mean of per-label F1)
        and the exact-match ratio (rows where every label agrees). Ratios
        with a zero denominator are reported as ``0.0``.
    """
    truth = Y.astype(np.int64)
    guess = P.astype(np.int64)
    hits = (truth == 1) & (guess == 1)
    false_alarms = (truth == 0) & (guess == 1)
    misses = (truth == 1) & (guess == 0)

    tp = int(np.sum(hits))
    fp = int(np.sum(false_alarms))
    fn = int(np.sum(misses))
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    micro_f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0

    # Per-label F1 from column-wise counts.
    per_label_f1 = []
    for tpj, fpj, fnj in zip(hits.sum(axis=0), false_alarms.sum(axis=0), misses.sum(axis=0)):
        pj = tpj / (tpj + fpj) if tpj + fpj else 0.0
        rj = tpj / (tpj + fnj) if tpj + fnj else 0.0
        per_label_f1.append(2 * pj * rj / (pj + rj) if pj + rj else 0.0)

    rows_all_correct = np.all(truth == guess, axis=1)
    return {
        "micro_f1": float(micro_f1),
        "macro_f1": float(np.mean(per_label_f1)),
        "exact_match": float(np.mean(rows_all_correct)),
        "precision": precision,
        "recall": recall,
    }
def task_object_relevance(out_dir: Path, X: np.ndarray, rows: list[dict], ann: dict, manifest: list[dict], args: argparse.Namespace) -> dict:
    """Predict, per window, the set of objects mentioned in its frames' captions.

    The object vocabulary is built in first-seen order over all windows. The
    caption text feature block is removed from the input, a multi-label
    logistic model is trained on the chronological train split, and every
    test window is forced to emit at least one object (its most probable).

    Artifacts written to ``out_dir``: ``metrics.json``, ``object_vocab.json``,
    ``predictions.csv`` and ``model.npz``.

    Raises:
        ValueError: If no object label is found in any window.
    """
    frame_info = ann["caption_frame_info_map"]
    vocab = OrderedDict()
    window_objects = []
    for row in rows:
        # A plain dict keeps first-seen order while de-duplicating.
        seen = {}
        for frame in range(int(row["start_frame"]), int(row["end_frame"]) + 1):
            for obj in extract_objects(frame_info.get(frame, {})):
                seen.setdefault(obj, True)
        present = list(seen)
        for obj in present:
            if obj not in vocab:
                vocab[obj] = len(vocab)
        window_objects.append(present)
    if not vocab:
        raise ValueError("No object labels found.")

    Y = np.zeros((len(rows), len(vocab)), dtype=np.float32)
    for row_id, present in enumerate(window_objects):
        for obj in present:
            Y[row_id, vocab[obj]] = 1.0

    feature_cols = block_indices(manifest, exclude=["caption_objects_interaction_text"])
    Xo = X[:, feature_cols]
    train, test = chronological_split_indices(len(rows), args.test_fraction)
    mean, std = fit_scaler(Xo[train])
    Xs = (Xo - mean) / std
    W, b, history = train_multilabel_logistic(Xs[train], Y[train], args.epochs, 0.05, args.l2, args.seed)

    prob = sigmoid(Xs[test] @ W + b)
    pred = (prob >= 0.5).astype(np.float32)
    # Ensure at least one object is emitted per row.
    empty = np.where(pred.sum(axis=1) == 0)[0]
    if len(empty):
        pred[empty, np.argmax(prob[empty], axis=1)] = 1

    metrics = multilabel_metrics(Y[test], pred)
    metrics.update({
        "task": "object_relevance",
        "input": "all non-caption modalities -> current relevant object set",
        "split": "chronological",
        "num_windows": int(len(rows)),
        "num_train_windows": int(len(train)),
        "num_test_windows": int(len(test)),
        "num_objects": int(len(vocab)),
    })

    out_dir.mkdir(parents=True, exist_ok=True)
    write_json(out_dir / "metrics.json", metrics)
    names = list(vocab.keys())
    write_json(out_dir / "object_vocab.json", list(vocab.keys()))

    def joined(indicator: np.ndarray) -> str:
        return "|".join(names[j] for j in np.flatnonzero(indicator > 0))

    rows_out = [
        {
            "window_index": int(global_i),
            "start_frame": rows[int(global_i)]["start_frame"],
            "end_frame": rows[int(global_i)]["end_frame"],
            "true_objects": joined(Y[global_i]),
            "predicted_objects": joined(pred[local_i]),
        }
        for local_i, global_i in enumerate(test)
    ]
    write_csv(out_dir / "predictions.csv", rows_out, ["window_index", "start_frame", "end_frame", "true_objects", "predicted_objects"])
    np.savez_compressed(out_dir / "model.npz", mean=mean, std=std, W=W, b=b, object_vocab=np.asarray(names, dtype=object), history=np.asarray(history, dtype=object))
    return metrics
def normalize_rows(A: np.ndarray) -> np.ndarray:
    """Scale each row of ``A`` to unit L2 norm; norms are floored at 1e-8 to avoid division by zero."""
    row_norms = np.linalg.norm(A, axis=1, keepdims=True)
    safe_norms = np.maximum(row_norms, 1e-8)
    return A / safe_norms
def retrieval_metrics(query: np.ndarray, candidates: np.ndarray, positive_indices: np.ndarray, topks=(1, 5, 10)) -> dict:
    """Rank candidates by cosine similarity for each query and summarize the ranks.

    Args:
        query: One query vector per row.
        candidates: One candidate vector per row.
        positive_indices: For query ``i``, the row of ``candidates`` that counts
            as the correct match.
        topks: Cut-offs for the ``top{k}_accuracy`` entries.

    Returns:
        ``mrr``, ``median_rank``, ``mean_rank``, ``num_queries`` and one
        ``top{k}_accuracy`` per cut-off (ranks are 1-based).
    """
    similarity = normalize_rows(query) @ normalize_rows(candidates).T

    def rank_of(query_id: int, positive: int) -> int:
        # 1-based position of the positive in the descending-similarity order.
        ordering = np.argsort(-similarity[query_id])
        return int(np.where(ordering == positive)[0][0]) + 1

    ranks = np.asarray([rank_of(i, pos) for i, pos in enumerate(positive_indices)])
    out = {
        "mrr": float(np.mean(1.0 / ranks)),
        "median_rank": float(np.median(ranks)),
        "mean_rank": float(np.mean(ranks)),
        "num_queries": int(len(ranks)),
    }
    out.update({f"top{k}_accuracy": float(np.mean(ranks <= k)) for k in topks})
    return out
def task_caption_grounding(out_dir: Path, X: np.ndarray, manifest: list[dict], args: argparse.Namespace) -> dict:
    """Retrieve the matching test window for each caption-text feature vector.

    A ridge model maps the non-caption ("sensor") features into the caption
    text feature space using the chronological train split. Each test
    window's true text features are then used as a query against the
    projected sensor features of all test windows.

    Artifacts written to ``out_dir``: ``metrics.json`` and ``model.npz``.
    """
    caption_block = ["caption_objects_interaction_text"]
    text_idx = block_indices(manifest, include=caption_block)
    sensor_idx = block_indices(manifest, exclude=caption_block)
    train, test = chronological_split_indices(len(X), args.test_fraction)
    X_train, X_test = X[train], X[test]

    pred_text, model = ridge_fit_predict(X_train[:, sensor_idx], X_train[:, text_idx], X_test[:, sensor_idx], args.ridge_l2)
    # Query is true text; candidates are sensor windows projected into text space.
    metrics = retrieval_metrics(X_test[:, text_idx], pred_text, np.arange(len(test)))
    metrics.update({
        "task": "caption_grounding",
        "input": "caption objects/interaction text query + candidate sensor windows",
        "output": "matching time window",
        "split": "chronological",
        "num_train_windows": int(len(train)),
        "num_test_windows": int(len(test)),
    })

    out_dir.mkdir(parents=True, exist_ok=True)
    write_json(out_dir / "metrics.json", metrics)
    np.savez_compressed(out_dir / "model.npz", **model)
    return metrics
def task_cross_modal_retrieval(out_dir: Path, X: np.ndarray, manifest: list[dict], args: argparse.Namespace) -> dict:
    """Retrieve the matching depth/video window from motion features.

    A ridge model predicts the visual feature blocks from the motion blocks
    (hand, body joints/contacts, camera, IMU). Predicted visual features of
    each test window are used as queries against the true visual features of
    all test windows.

    Artifacts written to ``out_dir``: ``metrics.json`` and ``model.npz``.
    """
    motion_idx = block_indices(manifest, include=["hand_", "body_joints", "body_contacts", "camera_", "imu_"])
    visual_idx = block_indices(manifest, include=["depth_confidence", "video_"])
    train, test = chronological_split_indices(len(X), args.test_fraction)
    X_train, X_test = X[train], X[test]

    pred_visual, model = ridge_fit_predict(X_train[:, motion_idx], X_train[:, visual_idx], X_test[:, motion_idx], args.ridge_l2)
    metrics = retrieval_metrics(pred_visual, X_test[:, visual_idx], np.arange(len(test)))
    metrics.update({
        "task": "cross_modal_retrieval",
        "input": "motion/IMU/camera query",
        "output": "matching depth/video window",
        "split": "chronological",
        "num_train_windows": int(len(train)),
        "num_test_windows": int(len(test)),
    })

    out_dir.mkdir(parents=True, exist_ok=True)
    write_json(out_dir / "metrics.json", metrics)
    np.savez_compressed(out_dir / "model.npz", **model)
    return metrics
def task_modality_reconstruction(out_dir: Path, X: np.ndarray, manifest: list[dict], args: argparse.Namespace) -> dict:
    """Regress the depth/video feature blocks from the motion feature blocks.

    Uses ridge regression fit on the chronological train split and reports
    MSE / MAE / R² on the test split.

    Artifacts written to ``out_dir``: ``metrics.json`` and ``predictions.npz``.
    """
    motion_idx = block_indices(manifest, include=["hand_", "body_joints", "body_contacts", "camera_", "imu_"])
    visual_idx = block_indices(manifest, include=["depth_confidence", "video_"])
    train, test = chronological_split_indices(len(X), args.test_fraction)
    X_train, X_test = X[train], X[test]
    visual_truth = X_test[:, visual_idx]

    pred, model = ridge_fit_predict(X_train[:, motion_idx], X_train[:, visual_idx], X_test[:, motion_idx], args.ridge_l2)
    metrics = regression_metrics(visual_truth, pred)
    metrics.update({
        "task": "modality_reconstruction",
        "input": "motion/IMU/camera",
        "output": "depth/video feature vector",
        "split": "chronological",
        "num_train_windows": int(len(train)),
        "num_test_windows": int(len(test)),
        "target_dim": int(len(visual_idx)),
    })

    out_dir.mkdir(parents=True, exist_ok=True)
    write_json(out_dir / "metrics.json", metrics)
    np.savez_compressed(out_dir / "predictions.npz", y_true=visual_truth, y_pred=pred, **model)
    return metrics
def binary_classification_from_arrays(out_dir: Path, X: np.ndarray, y: np.ndarray, args: argparse.Namespace, task: str, input_desc: str) -> dict:
    """Train and evaluate a two-class softmax classifier on pre-built samples.

    Samples are split chronologically by position, features are standardized
    with train-split statistics, and class weights are always enabled.

    Artifacts written to ``out_dir``: ``metrics.json``, ``predictions.csv``
    and ``model.npz``.

    Returns:
        Binary metrics plus task/split bookkeeping.
    """
    train, test = chronological_split_indices(len(y), args.test_fraction)
    mean, std = fit_scaler(X[train])
    Xs = (X - mean) / std
    W, b, history = train_softmax_classifier(
        Xs[train],
        y[train].astype(np.int64),
        n_classes=2,
        epochs=args.epochs,
        lr=args.learning_rate,
        l2=args.l2,
        use_class_weights=True,
        seed=args.seed,
    )
    pred, prob = predict(Xs[test], W, b)

    metrics = binary_metrics(y[test], pred)
    bookkeeping = {
        "task": task,
        "input": input_desc,
        "split": "chronological",
        "num_samples": int(len(y)),
        "num_train_samples": int(len(train)),
        "num_test_samples": int(len(test)),
        "train_final_accuracy": float(history[-1]["train_accuracy"]),
    }
    metrics.update(bookkeeping)

    out_dir.mkdir(parents=True, exist_ok=True)
    write_json(out_dir / "metrics.json", metrics)
    # Column 1 of ``prob`` is the probability of the positive class.
    pred_rows = [
        {
            "sample_index": int(sample),
            "true": int(y[sample]),
            "predicted": int(pred[position]),
            "prob_positive": float(prob[position, 1]),
        }
        for position, sample in enumerate(test)
    ]
    write_csv(out_dir / "predictions.csv", pred_rows, ["sample_index", "true", "predicted", "prob_positive"])
    np.savez_compressed(out_dir / "model.npz", mean=mean, std=std, W=W, b=b)
    return metrics
def task_temporal_order(out_dir: Path, X: np.ndarray, args: argparse.Namespace) -> dict:
    """Classify whether two adjacent windows are presented in temporal order.

    Every consecutive pair of windows yields one positive sample
    ``[earlier, later, later - earlier]`` and one negative sample with the
    roles swapped.
    """
    samples, targets = [], []
    for earlier, later in zip(X[:-1], X[1:]):
        samples.append(np.concatenate([earlier, later, later - earlier]))
        targets.append(1)
        samples.append(np.concatenate([later, earlier, earlier - later]))
        targets.append(0)
    features = np.stack(samples).astype(np.float32)
    labels = np.asarray(targets, dtype=np.int64)
    return binary_classification_from_arrays(out_dir, features, labels, args, "temporal_order", "two adjacent windows -> whether order is correct")
def task_misalignment(out_dir: Path, X: np.ndarray, manifest: list[dict], args: argparse.Namespace) -> dict:
    """Classify motion/visual feature pairs as aligned or temporally shifted.

    For each window ``i`` that has a partner ``i + shift``, one positive sample
    joins the window's own motion and visual features, and one negative sample
    joins its motion features with the visual features of window ``i + shift``.

    Args:
        out_dir: Directory for the artifacts.
        X: Feature matrix with one row per window.
        manifest: Feature block manifest used to locate motion/visual columns.
        args: Parsed options; reads ``misalignment_shift_windows`` plus the
            training options used by ``binary_classification_from_arrays``.

    Raises:
        ValueError: If the shift is not positive or leaves no window with a
            shifted partner.
    """
    shift = args.misalignment_shift_windows
    # A zero shift would make negatives identical to positives, and a negative
    # one would wrap around or run past the end of X, so reject both up front.
    if shift <= 0:
        raise ValueError(f"--misalignment-shift-windows must be positive (got {shift}).")
    if shift >= len(X):
        raise ValueError(
            f"--misalignment-shift-windows ({shift}) must be smaller than the "
            f"number of windows ({len(X)})."
        )
    motion_idx = block_indices(manifest, include=["hand_", "body_joints", "body_contacts", "camera_", "imu_"])
    visual_idx = block_indices(manifest, include=["depth_confidence", "video_"])
    pairs, y = [], []
    for i in range(len(X) - shift):
        motion = X[i, motion_idx]
        pairs.append(np.concatenate([motion, X[i, visual_idx]]))
        y.append(1)
        pairs.append(np.concatenate([motion, X[i + shift, visual_idx]]))
        y.append(0)
    return binary_classification_from_arrays(out_dir, np.stack(pairs).astype(np.float32), np.asarray(y, dtype=np.int64), args, "misalignment_detection", f"motion+visual pair -> aligned vs shifted by {shift} windows")
def main() -> int:
    """Run the selected tasks on one episode and write all artifacts.

    Loads the annotation, prepares the modalities, builds the shared feature
    windows, then runs each selected task in its own sub-directory of
    ``args.output_dir``. A failing task does not stop the suite: its error is
    stored in the summary and in ``<task>/error.json`` (with exception type
    and traceback), and the remaining tasks still run.

    Returns:
        ``0`` once ``summary_report.json`` has been written.
    """
    import traceback  # local import: only needed when a task fails

    args = parse_args()
    add_toolkit_to_path(args.workspace)
    # Importable only after the toolkit directory has been added to sys.path.
    from data_loader import load_from_annotation_hdf5
    args.output_dir.mkdir(parents=True, exist_ok=True)
    tasks = selected_tasks(args.tasks)
    print(f"Loading annotation: {args.annotation}")
    ann = load_from_annotation_hdf5(args.annotation, 0, None, load_slam_point_cloud=True)
    extras, available_modalities = prepare_modalities(args, ann)
    print("Building shared all-modality windows")
    X, rows, manifest = build_windows(args, ann, extras)
    write_json(args.output_dir / "available_modalities.json", available_modalities)
    write_json(args.output_dir / "feature_manifest.json", manifest)
    write_csv(args.output_dir / "windows.csv", rows, ["window_index", "start_frame", "end_frame", "center_frame", "action_label", "action_fraction", "subtask_label", "subtask_fraction"])
    np.savez_compressed(args.output_dir / "shared_windows.npz", X=X, starts=np.asarray([r["start_frame"] for r in rows]), ends=np.asarray([r["end_frame"] for r in rows]))
    summary = {
        "annotation": portable_path(args.annotation, args.workspace),
        "num_frames": int(len(ann["img_names"])),
        "num_windows": int(len(rows)),
        "feature_dim": int(X.shape[1]),
        "window_frames": int(args.window_frames),
        "stride_frames": int(args.stride_frames),
        "tasks": {},
    }
    print(f"Windows: {len(rows)}, feature_dim: {X.shape[1]}")

    # One runner per task name; each receives the task's output directory.
    runners = {
        "timeline_action": lambda out: classification_task(out, X, label_array(rows, "action_label"), rows, args, "timeline_action", "all modalities -> current action label"),
        "timeline_subtask": lambda out: classification_task(out, X, label_array(rows, "subtask_label"), rows, args, "timeline_subtask", "all modalities -> current subtask label"),
        "transition_detection": lambda out: task_transition_detection(out, X, rows, ann, args),
        "next_action": lambda out: task_next_action(out, X, rows, ann, args),
        "hand_trajectory_forecast": lambda out: task_hand_forecast(out, X, rows, ann, args),
        "contact_prediction": lambda out: task_contact_prediction(out, X, rows, ann, manifest, args),
        "object_relevance": lambda out: task_object_relevance(out, X, rows, ann, manifest, args),
        "caption_grounding": lambda out: task_caption_grounding(out, X, manifest, args),
        "cross_modal_retrieval": lambda out: task_cross_modal_retrieval(out, X, manifest, args),
        "modality_reconstruction": lambda out: task_modality_reconstruction(out, X, manifest, args),
        "temporal_order": lambda out: task_temporal_order(out, X, args),
        "misalignment_detection": lambda out: task_misalignment(out, X, manifest, args),
    }

    for task in tasks:
        print(f"\nRunning task: {task}")
        out = args.output_dir / task
        try:
            runner = runners.get(task)
            if runner is None:
                raise ValueError(task)
            metrics = runner(out)
            summary["tasks"][task] = metrics
            key_metrics = {k: metrics[k] for k in ("accuracy", "macro_f1", "f1", "mpjpe", "mrr", "r2", "micro_f1") if k in metrics}
            print(f"  done: {key_metrics}")
        except Exception as exc:
            # Deliberate best-effort boundary: one failing task must not abort
            # the suite. Keep the exception type and traceback so the failure
            # can still be diagnosed from the artifacts.
            summary["tasks"][task] = {"error": str(exc)}
            write_json(out / "error.json", {
                "task": task,
                "error": str(exc),
                "error_type": type(exc).__name__,
                "traceback": traceback.format_exc(),
            })
            print(f"  error: {exc}")
    write_json(args.output_dir / "summary_report.json", summary)
    print(f"\nSuite artifacts written to: {args.output_dir}")
    return 0
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())