ntsrigaud committed on
Commit
6ef1410
·
verified ·
1 Parent(s): 296b826

Upload two_stream_attn_v1_finetune_20260515T104743Z

Browse files
Files changed (2) hide show
  1. README.md +23 -23
  2. config.json +30 -30
README.md CHANGED
@@ -17,7 +17,7 @@ metrics:
17
  - accuracy
18
  - f1
19
  model-index:
20
- - name: two_stream_attn_v1_finetune_20260515T094538Z
21
  results:
22
  - task:
23
  type: gesture-recognition
@@ -26,12 +26,12 @@ model-index:
26
  type: IPN-Hand
27
  metrics:
28
  - type: accuracy
29
- value: 0.9539
30
  - type: f1
31
- value: 0.9527
32
  ---
33
 
34
- # two_stream_attn_v1_finetune_20260515T094538Z
35
 
36
  A real-time hand gesture classifier trained on
37
  a Hybrid Jester+IPN gesture dataset (Jester dynamic classes + IPN pointing classes).
@@ -139,7 +139,7 @@ from maestro.infrastructure.model.checkpoint_loader import load_inference_artifa
139
  # Download the artifact (cached after first call)
140
  local_path = hf_hub_download(
141
  repo_id="ntsrigaud/maestro-lstm-hybrid",
142
- filename="two_stream_attn_v1_finetune_20260515T094538Z_inference.pt",
143
  )
144
 
145
  # Load the artifact (includes model, class labels, and feature schema)
@@ -171,7 +171,7 @@ Two-phase transfer learning pipeline:
171
  - **Phase 1 (pretraining):** backbone initialized from the external pretrained checkpoint `two_stream_attn_v1_20260513T155730Z.pt`, which had learned generic gesture dynamics.
172
  - **Phase 2 (fine-tuning):** head replaced and model adapted on Hybrid Jester+IPN 10-gesture vocabulary.
173
  - **Stage A (frozen backbone):** 10 epoch(s) head-only warmup.
174
- - **Stage B (full model):** up to 48 epoch(s) joint fine-tuning with scheduler/early stopping.
175
  - **Stage B retention defences:** replay_max_samples_per_class=500, distillation_weight=0.0, replay_ce_weight=0.0, backbone_lr_multiplier=0.1, gpm_components=0, forgetting_penalty_weight=0.5.
176
 
177
  ## Training Configuration
@@ -184,38 +184,38 @@ Two-phase transfer learning pipeline:
184
  | Projection dim | 96 |
185
  | Num layers | 4 |
186
  | MHA heads | 8 (head dim: 24) |
187
- | Dropout | 0.35 |
188
  | Learning rate | 3e-05 |
189
- | Weight decay | 0.0005 |
190
  | Batch size | 128 |
191
- | Max epochs | 60 |
192
- | Early stopping patience | 12 |
193
  | Label smoothing | 0.05 |
194
  | Class weighting | disabled |
195
  | Max samples per class | 3000 |
196
- | LR scheduler | ReduceLROnPlateau (factor=0.5, patience=8) |
197
 
198
  ## Evaluation Results (Test Set)
199
 
200
  | Metric | Value |
201
  |--------|-------|
202
- | Accuracy | 95.4% |
203
- | Macro F1 | 95.3% |
204
 
205
  ### Per-Class Recall
206
 
207
  | Class | Recall |
208
  |-------|--------|
209
- | `fist` | 96.6% |
210
- | `swiping_right` | 95.3% |
211
- | `swiping_left` | 98.1% |
212
- | `swiping_down` | 96.4% |
213
- | `swiping_up` | 97.8% |
214
- | `zooming_in_full_hand` | 97.3% |
215
- | `zooming_out_full_hand` | 93.2% |
216
- | `point_one` | 96.6% |
217
- | `point_two` | 93.5% |
218
- | `unknown` | 87.9% |
219
 
220
  ## Comparison with Previous Architecture
221
 
 
17
  - accuracy
18
  - f1
19
  model-index:
20
+ - name: two_stream_attn_v1_finetune_20260515T104743Z
21
  results:
22
  - task:
23
  type: gesture-recognition
 
26
  type: IPN-Hand
27
  metrics:
28
  - type: accuracy
29
+ value: 0.9566
30
  - type: f1
31
+ value: 0.9556
32
  ---
33
 
34
+ # two_stream_attn_v1_finetune_20260515T104743Z
35
 
36
  A real-time hand gesture classifier trained on
37
  a Hybrid Jester+IPN gesture dataset (Jester dynamic classes + IPN pointing classes).
 
139
  # Download the artifact (cached after first call)
140
  local_path = hf_hub_download(
141
  repo_id="ntsrigaud/maestro-lstm-hybrid",
142
+ filename="two_stream_attn_v1_finetune_20260515T104743Z_inference.pt",
143
  )
144
 
145
  # Load the artifact (includes model, class labels, and feature schema)
 
171
  - **Phase 1 (pretraining):** backbone initialized from the external pretrained checkpoint `two_stream_attn_v1_20260513T155730Z.pt`, which had learned generic gesture dynamics.
172
  - **Phase 2 (fine-tuning):** head replaced and model adapted on Hybrid Jester+IPN 10-gesture vocabulary.
173
  - **Stage A (frozen backbone):** 10 epoch(s) head-only warmup.
174
+ - **Stage B (full model):** up to 80 epoch(s) joint fine-tuning with scheduler/early stopping.
175
  - **Stage B retention defences:** replay_max_samples_per_class=500, distillation_weight=0.0, replay_ce_weight=0.0, backbone_lr_multiplier=0.1, gpm_components=0, forgetting_penalty_weight=0.5.
176
 
177
  ## Training Configuration
 
184
  | Projection dim | 96 |
185
  | Num layers | 4 |
186
  | MHA heads | 8 (head dim: 24) |
187
+ | Dropout | 0.4 |
188
  | Learning rate | 3e-05 |
189
+ | Weight decay | 0.001 |
190
  | Batch size | 128 |
191
+ | Max epochs | 80 |
192
+ | Early stopping patience | 20 |
193
  | Label smoothing | 0.05 |
194
  | Class weighting | disabled |
195
  | Max samples per class | 3000 |
196
+ | LR scheduler | ReduceLROnPlateau (factor=0.5, patience=10) |
197
 
198
  ## Evaluation Results (Test Set)
199
 
200
  | Metric | Value |
201
  |--------|-------|
202
+ | Accuracy | 95.7% |
203
+ | Macro F1 | 95.6% |
204
 
205
  ### Per-Class Recall
206
 
207
  | Class | Recall |
208
  |-------|--------|
209
+ | `fist` | 96.2% |
210
+ | `swiping_right` | 95.5% |
211
+ | `swiping_left` | 98.7% |
212
+ | `swiping_down` | 96.6% |
213
+ | `swiping_up` | 97.6% |
214
+ | `zooming_in_full_hand` | 97.5% |
215
+ | `zooming_out_full_hand` | 93.9% |
216
+ | `point_one` | 97.4% |
217
+ | `point_two` | 94.3% |
218
+ | `unknown` | 87.7% |
219
 
220
  ## Comparison with Previous Architecture
221
 
config.json CHANGED
@@ -1,11 +1,11 @@
1
  {
2
- "model_version": "two_stream_attn_v1_finetune_20260515T094538Z",
3
  "model_config": {
4
  "model_name": "two_stream_attn_v1",
5
  "input_size": 147,
6
  "hidden_size": 96,
7
  "num_layers": 4,
8
- "dropout": 0.35,
9
  "num_classes": 10
10
  },
11
  "feature_schema": {
@@ -16,10 +16,10 @@
16
  "window_step": 3
17
  },
18
  "training_config": {
19
- "epochs": 60,
20
  "batch_size": 128,
21
  "learning_rate": 3e-05,
22
- "weight_decay": 0.0005,
23
  "grad_clip_norm": 1.0,
24
  "seed": 42,
25
  "label_smoothing": 0.05,
@@ -27,38 +27,38 @@
27
  "max_samples_per_class": 3000,
28
  "scheduler": {
29
  "factor": 0.5,
30
- "patience": 8,
31
  "min_lr": 1e-06
32
  }
33
  },
34
  "evaluation": {
35
- "test_accuracy": 0.9538638985005767,
36
- "test_macro_f1": 0.9526959411661929,
37
- "test_loss": 0.42990012703744873,
38
- "calibration_ece": 0.030860847365347433,
39
  "per_class_recall": {
40
- "fist": 0.9656357388316151,
41
- "swiping_right": 0.9525959367945824,
42
- "swiping_left": 0.9813084112149533,
43
- "swiping_down": 0.9644970414201184,
44
- "swiping_up": 0.9775967413441955,
45
- "zooming_in_full_hand": 0.9727272727272728,
46
- "zooming_out_full_hand": 0.9324894514767933,
47
- "point_one": 0.9658792650918635,
48
- "point_two": 0.9347826086956522,
49
- "unknown": 0.8790123456790123
50
  },
51
  "per_class_precision": {
52
- "fist": 0.972318339100346,
53
- "swiping_right": 0.9612756264236902,
54
- "swiping_left": 0.9868421052631579,
55
- "swiping_down": 0.9607072691552063,
56
- "swiping_up": 0.9542743538767395,
57
- "zooming_in_full_hand": 0.928416485900217,
58
- "zooming_out_full_hand": 0.9692982456140351,
59
- "point_one": 0.8910411622276029,
60
- "point_two": 0.9717514124293786,
61
- "unknown": 0.9393139841688655
62
  }
63
  },
64
  "class_labels": [
@@ -73,7 +73,7 @@
73
  "point_two",
74
  "unknown"
75
  ],
76
- "created_at": "2026-05-15T09:51:16.703942+00:00",
77
  "gesture_command_mapping": {
78
  "commands": {
79
  "swiping_up": "start_presentation",
 
1
  {
2
+ "model_version": "two_stream_attn_v1_finetune_20260515T104743Z",
3
  "model_config": {
4
  "model_name": "two_stream_attn_v1",
5
  "input_size": 147,
6
  "hidden_size": 96,
7
  "num_layers": 4,
8
+ "dropout": 0.4,
9
  "num_classes": 10
10
  },
11
  "feature_schema": {
 
16
  "window_step": 3
17
  },
18
  "training_config": {
19
+ "epochs": 80,
20
  "batch_size": 128,
21
  "learning_rate": 3e-05,
22
+ "weight_decay": 0.001,
23
  "grad_clip_norm": 1.0,
24
  "seed": 42,
25
  "label_smoothing": 0.05,
 
27
  "max_samples_per_class": 3000,
28
  "scheduler": {
29
  "factor": 0.5,
30
+ "patience": 10,
31
  "min_lr": 1e-06
32
  }
33
  },
34
  "evaluation": {
35
+ "test_accuracy": 0.9566320645905421,
36
+ "test_macro_f1": 0.9555633825064029,
37
+ "test_loss": 0.4212703935896512,
38
+ "calibration_ece": 0.030699590615060227,
39
  "per_class_recall": {
40
+ "fist": 0.9621993127147767,
41
+ "swiping_right": 0.9548532731376975,
42
+ "swiping_left": 0.9869158878504672,
43
+ "swiping_down": 0.9664694280078896,
44
+ "swiping_up": 0.9755600814663951,
45
+ "zooming_in_full_hand": 0.975,
46
+ "zooming_out_full_hand": 0.9388185654008439,
47
+ "point_one": 0.973753280839895,
48
+ "point_two": 0.9429347826086957,
49
+ "unknown": 0.8765432098765432
50
  },
51
  "per_class_precision": {
52
+ "fist": 0.9790209790209791,
53
+ "swiping_right": 0.9701834862385321,
54
+ "swiping_left": 0.9777777777777777,
55
+ "swiping_down": 0.9551656920077972,
56
+ "swiping_up": 0.9618473895582329,
57
+ "zooming_in_full_hand": 0.9407894736842105,
58
+ "zooming_out_full_hand": 0.973741794310722,
59
+ "point_one": 0.8918269230769231,
60
+ "point_two": 0.9719887955182073,
61
+ "unknown": 0.9441489361702128
62
  }
63
  },
64
  "class_labels": [
 
73
  "point_two",
74
  "unknown"
75
  ],
76
+ "created_at": "2026-05-15T10:57:07.231228+00:00",
77
  "gesture_command_mapping": {
78
  "commands": {
79
  "swiping_up": "start_presentation",