{ "sampled_total": 120, "task_counts": { "": 30, "": 30, "": 60 }, "verify": { "accuracy": 0.5333333333333333, "macro_f1": 0.5385224796989503, "pred_label_dist": { "SUPPORTS": 10, "null": 3, "NEI": 9, "REFUTES": 8 }, "gold_label_dist": { "SUPPORTS": 8, "REFUTES": 9, "NEI": 13 }, "confusion_matrix": { "SUPPORTS": { "SUPPORTS": 4, "REFUTES": 3, "NEI": 0, "null": 1 }, "REFUTES": { "SUPPORTS": 4, "REFUTES": 3, "NEI": 0, "null": 2 }, "NEI": { "SUPPORTS": 2, "REFUTES": 2, "NEI": 9, "null": 0 } } }, "qa": { "accuracy": 0.4, "pred_label_dist": { "no": 8, "yes": 22 }, "gold_label_dist": { "maybe": 12, "no": 10, "yes": 8 }, "majority_label": "maybe", "majority_baseline": 0.4 }, "rerank": { "pair_accuracy": 0.35, "pred_label_dist": { "3": 3, "2": 21, "0": 31, "1": 5 }, "gold_label_dist": { "1": 25, "0": 29, "3": 3, "2": 3 }, "group_count": 6, "group_size_dist": { "10": 6 }, "groups": 6.0, "mrr_at_10": 0.26666666666666666, "recall_at_1": 0.0, "recall_at_3": 0.3333333333333333, "recall_at_5": 0.5 }, "elapsed_sec": 216.709543466568 }