ntsrigaud
/

maestro-lstm-hybrid

@@ -17,7 +17,7 @@ metrics:
 - accuracy
 - f1
 model-index:
-- name: two_stream_attn_v1_finetune_20260515T094538Z
   results:
   - task:
       type: gesture-recognition
@@ -26,12 +26,12 @@ model-index:
       type: IPN-Hand
     metrics:
     - type: accuracy
-      value: 0.9539
     - type: f1
-      value: 0.9527
 ---
-# two_stream_attn_v1_finetune_20260515T094538Z
 A real-time hand gesture classifier trained on
 a Hybrid Jester+IPN gesture dataset (Jester dynamic classes + IPN pointing classes).
@@ -139,7 +139,7 @@ from maestro.infrastructure.model.checkpoint_loader import load_inference_artifa
 # Download the artifact (cached after first call)
 local_path = hf_hub_download(
     repo_id="ntsrigaud/maestro-lstm-hybrid",
-    filename="two_stream_attn_v1_finetune_20260515T094538Z_inference.pt",
 )
 # Load the artifact (includes model, class labels, and feature schema)
@@ -171,7 +171,7 @@ Two-phase transfer learning pipeline:
 - **Phase 1 (pretraining):** backbone pretrained on external checkpoint `two_stream_attn_v1_20260513T155730Z.pt` to learn generic gesture dynamics.
 - **Phase 2 (fine-tuning):** head replaced and model adapted on Hybrid Jester+IPN 10-gesture vocabulary.
 - **Stage A (frozen backbone):** 10 epoch(s) head-only warmup.
-- **Stage B (full model):** up to 48 epoch(s) joint fine-tuning with scheduler/early stopping.
 - **Stage B retention defences:** replay_max_samples_per_class=500, distillation_weight=0.0, replay_ce_weight=0.0, backbone_lr_multiplier=0.1, gpm_components=0, forgetting_penalty_weight=0.5.
 ## Training Configuration
@@ -184,38 +184,38 @@ Two-phase transfer learning pipeline:
 | Projection dim | 96 |
 | Num layers | 4 |
 | MHA heads | 8 (head dim: 24) |
-| Dropout | 0.35 |
 | Learning rate | 3e-05 |
-| Weight decay | 0.0005 |
 | Batch size | 128 |
-| Max epochs | 60 |
-| Early stopping patience | 12 |
 | Label smoothing | 0.05 |
 | Class weighting | disabled |
 | Max samples per class | 3000 |
-| LR scheduler | ReduceLROnPlateau (factor=0.5, patience=8) |
 ## Evaluation Results (Test Set)
 | Metric | Value |
 |--------|-------|
-| Accuracy | 95.4% |
-| Macro F1 | 95.3% |
 ### Per-Class Recall
 | Class | Recall |
 |-------|--------|
-| `fist` | 96.6% |
-| `swiping_right` | 95.3% |
-| `swiping_left` | 98.1% |
-| `swiping_down` | 96.4% |
-| `swiping_up` | 97.8% |
-| `zooming_in_full_hand` | 97.3% |
-| `zooming_out_full_hand` | 93.2% |
-| `point_one` | 96.6% |
-| `point_two` | 93.5% |
-| `unknown` | 87.9% |
 ## Comparison with Previous Architecture

 - accuracy
 - f1
 model-index:
+- name: two_stream_attn_v1_finetune_20260515T104743Z
   results:
   - task:
       type: gesture-recognition
       type: IPN-Hand
     metrics:
     - type: accuracy
+      value: 0.9566
     - type: f1
+      value: 0.9556
 ---
+# two_stream_attn_v1_finetune_20260515T104743Z
 A real-time hand gesture classifier trained on
 a Hybrid Jester+IPN gesture dataset (Jester dynamic classes + IPN pointing classes).
 # Download the artifact (cached after first call)
 local_path = hf_hub_download(
     repo_id="ntsrigaud/maestro-lstm-hybrid",
+    filename="two_stream_attn_v1_finetune_20260515T104743Z_inference.pt",
 )
 # Load the artifact (includes model, class labels, and feature schema)
 - **Phase 1 (pretraining):** backbone pretrained on external checkpoint `two_stream_attn_v1_20260513T155730Z.pt` to learn generic gesture dynamics.
 - **Phase 2 (fine-tuning):** head replaced and model adapted on Hybrid Jester+IPN 10-gesture vocabulary.
 - **Stage A (frozen backbone):** 10 epoch(s) head-only warmup.
+- **Stage B (full model):** up to 80 epoch(s) joint fine-tuning with scheduler/early stopping.
 - **Stage B retention defences:** replay_max_samples_per_class=500, distillation_weight=0.0, replay_ce_weight=0.0, backbone_lr_multiplier=0.1, gpm_components=0, forgetting_penalty_weight=0.5.
 ## Training Configuration
 | Projection dim | 96 |
 | Num layers | 4 |
 | MHA heads | 8 (head dim: 24) |
+| Dropout | 0.4 |
 | Learning rate | 3e-05 |
+| Weight decay | 0.001 |
 | Batch size | 128 |
+| Max epochs | 80 |
+| Early stopping patience | 20 |
 | Label smoothing | 0.05 |
 | Class weighting | disabled |
 | Max samples per class | 3000 |
+| LR scheduler | ReduceLROnPlateau (factor=0.5, patience=10) |
 ## Evaluation Results (Test Set)
 | Metric | Value |
 |--------|-------|
+| Accuracy | 95.7% |
+| Macro F1 | 95.6% |
 ### Per-Class Recall
 | Class | Recall |
 |-------|--------|
+| `fist` | 96.2% |
+| `swiping_right` | 95.5% |
+| `swiping_left` | 98.7% |
+| `swiping_down` | 96.6% |
+| `swiping_up` | 97.6% |
+| `zooming_in_full_hand` | 97.5% |
+| `zooming_out_full_hand` | 93.9% |
+| `point_one` | 97.4% |
+| `point_two` | 94.3% |
+| `unknown` | 87.7% |
 ## Comparison with Previous Architecture

config.json CHANGED Viewed

@@ -1,11 +1,11 @@
 {
-  "model_version": "two_stream_attn_v1_finetune_20260515T094538Z",
   "model_config": {
     "model_name": "two_stream_attn_v1",
     "input_size": 147,
     "hidden_size": 96,
     "num_layers": 4,
-    "dropout": 0.35,
     "num_classes": 10
   },
   "feature_schema": {
@@ -16,10 +16,10 @@
     "window_step": 3
   },
   "training_config": {
-    "epochs": 60,
     "batch_size": 128,
     "learning_rate": 3e-05,
-    "weight_decay": 0.0005,
     "grad_clip_norm": 1.0,
     "seed": 42,
     "label_smoothing": 0.05,
@@ -27,38 +27,38 @@
     "max_samples_per_class": 3000,
     "scheduler": {
       "factor": 0.5,
-      "patience": 8,
       "min_lr": 1e-06
     }
   },
   "evaluation": {
-    "test_accuracy": 0.9538638985005767,
-    "test_macro_f1": 0.9526959411661929,
-    "test_loss": 0.42990012703744873,
-    "calibration_ece": 0.030860847365347433,
     "per_class_recall": {
-      "fist": 0.9656357388316151,
-      "swiping_right": 0.9525959367945824,
-      "swiping_left": 0.9813084112149533,
-      "swiping_down": 0.9644970414201184,
-      "swiping_up": 0.9775967413441955,
-      "zooming_in_full_hand": 0.9727272727272728,
-      "zooming_out_full_hand": 0.9324894514767933,
-      "point_one": 0.9658792650918635,
-      "point_two": 0.9347826086956522,
-      "unknown": 0.8790123456790123
     },
     "per_class_precision": {
-      "fist": 0.972318339100346,
-      "swiping_right": 0.9612756264236902,
-      "swiping_left": 0.9868421052631579,
-      "swiping_down": 0.9607072691552063,
-      "swiping_up": 0.9542743538767395,
-      "zooming_in_full_hand": 0.928416485900217,
-      "zooming_out_full_hand": 0.9692982456140351,
-      "point_one": 0.8910411622276029,
-      "point_two": 0.9717514124293786,
-      "unknown": 0.9393139841688655
     }
   },
   "class_labels": [
@@ -73,7 +73,7 @@
     "point_two",
     "unknown"
   ],
-  "created_at": "2026-05-15T09:51:16.703942+00:00",
   "gesture_command_mapping": {
     "commands": {
       "swiping_up": "start_presentation",

 {
+  "model_version": "two_stream_attn_v1_finetune_20260515T104743Z",
   "model_config": {
     "model_name": "two_stream_attn_v1",
     "input_size": 147,
     "hidden_size": 96,
     "num_layers": 4,
+    "dropout": 0.4,
     "num_classes": 10
   },
   "feature_schema": {
     "window_step": 3
   },
   "training_config": {
+    "epochs": 80,
     "batch_size": 128,
     "learning_rate": 3e-05,
+    "weight_decay": 0.001,
     "grad_clip_norm": 1.0,
     "seed": 42,
     "label_smoothing": 0.05,
     "max_samples_per_class": 3000,
     "scheduler": {
       "factor": 0.5,
+      "patience": 10,
       "min_lr": 1e-06
     }
   },
   "evaluation": {
+    "test_accuracy": 0.9566320645905421,
+    "test_macro_f1": 0.9555633825064029,
+    "test_loss": 0.4212703935896512,
+    "calibration_ece": 0.030699590615060227,
     "per_class_recall": {
+      "fist": 0.9621993127147767,
+      "swiping_right": 0.9548532731376975,
+      "swiping_left": 0.9869158878504672,
+      "swiping_down": 0.9664694280078896,
+      "swiping_up": 0.9755600814663951,
+      "zooming_in_full_hand": 0.975,
+      "zooming_out_full_hand": 0.9388185654008439,
+      "point_one": 0.973753280839895,
+      "point_two": 0.9429347826086957,
+      "unknown": 0.8765432098765432
     },
     "per_class_precision": {
+      "fist": 0.9790209790209791,
+      "swiping_right": 0.9701834862385321,
+      "swiping_left": 0.9777777777777777,
+      "swiping_down": 0.9551656920077972,
+      "swiping_up": 0.9618473895582329,
+      "zooming_in_full_hand": 0.9407894736842105,
+      "zooming_out_full_hand": 0.973741794310722,
+      "point_one": 0.8918269230769231,
+      "point_two": 0.9719887955182073,
+      "unknown": 0.9441489361702128
     }
   },
   "class_labels": [
     "point_two",
     "unknown"
   ],
+  "created_at": "2026-05-15T10:57:07.231228+00:00",
   "gesture_command_mapping": {
     "commands": {
       "swiping_up": "start_presentation",