{"candidates": [{"candidate_id": "valid_m_p", "claim": "All M are P", "confidence": 0.42, "raw_output": "All M are P", "source": "learned_candidate_dataset"}, {"candidate_id": "bad_reverse_p_m", "claim": "All P are M", "confidence": 0.99, "raw_output": "All P are M", "source": "learned_candidate_dataset"}, {"candidate_id": "bad_identity_m_p", "claim": "M equals P", "confidence": 0.98, "raw_output": "M equals P", "source": "learned_candidate_dataset"}], "case_id": "stress_high_confidence_wrong", "input_text": "All M are N. All N are O. All O are P. Are all M P?", "labels": {"bad_identity_m_p": {"channels": ["identity_preservation"], "claim": "M equals P", "reason": "candidate collapses distinct graph nodes", "resolver": "reject_identity", "status": "rejected"}, "bad_reverse_p_m": {"channels": ["directionality"], "claim": "All P are M", "reason": "candidate reverses a directed support path", "resolver": "reject_reverse", "status": "rejected"}, "valid_m_p": {"channels": ["logic_transitivity"], "claim": "All M are P", "reason": "candidate is supported by a typed transitive inference", "resolver": "accept_transitive", "status": "accepted"}}, "split": "stress", "tags": ["deeper_chain", "high_confidence_wrong"]}
{"candidates": [{"candidate_id": "valid_cats_animals", "claim": "All cats are animals", "confidence": 0.53, "raw_output": "All cats are animals", "source": "learned_candidate_dataset"}, {"candidate_id": "bad_no_cats_animals", "claim": "No cats are animals", "confidence": 0.9, "raw_output": "No cats are animals", "source": "learned_candidate_dataset"}, {"candidate_id": "malformed_cats", "claim": "cats somehow animalish", "confidence": 0.88, "raw_output": "cats somehow animalish", "source": "learned_candidate_dataset"}], "case_id": "stress_malformed_and_contradictory", "input_text": "All cats are mammals. All mammals are animals. Are all cats animals?", "labels": {"bad_no_cats_animals": {"channels": ["contradiction"], "claim": "No cats are animals", "reason": "candidate contradicts a premise-supported edge", "resolver": "reject_contradiction", "status": "rejected"}, "malformed_cats": {"channels": ["malformed_relation"], "claim": "cats somehow animalish", "reason": "candidate claim could not be parsed as a graph relation", "resolver": "reject_malformed", "status": "rejected"}, "valid_cats_animals": {"channels": ["logic_transitivity"], "claim": "All cats are animals", "reason": "candidate is supported by a typed transitive inference", "resolver": "accept_transitive", "status": "accepted"}}, "split": "stress", "tags": ["malformed", "contradiction", "high_confidence_wrong"]}
{"candidates": [{"candidate_id": "valid_a_d_with_distractor", "claim": "All A are D", "confidence": 0.49, "raw_output": "All A are D", "source": "learned_candidate_dataset"}, {"candidate_id": "wrong_r_d", "claim": "All R are D", "confidence": 0.89, "raw_output": "All R are D", "source": "learned_candidate_dataset"}, {"candidate_id": "wrong_reverse_d_a", "claim": "All D are A", "confidence": 0.91, "raw_output": "All D are A", "source": "learned_candidate_dataset"}], "case_id": "stress_distractor_chain", "input_text": "All A are B. All R are S. All B are C. All S are T. All C are D. Are all A D?", "labels": {"valid_a_d_with_distractor": {"channels": ["logic_transitivity"], "claim": "All A are D", "reason": "candidate is supported by a typed transitive inference", "resolver": "accept_transitive", "status": "accepted"}, "wrong_r_d": {"channels": ["typed_support"], "claim": "All R are D", "reason": "no typed channel produced support or a typed rejection", "resolver": "abstain_unsupported", "status": "abstained"}, "wrong_reverse_d_a": {"channels": ["directionality"], "claim": "All D are A", "reason": "candidate reverses a directed support path", "resolver": "reject_reverse", "status": "rejected"}}, "split": "stress", "tags": ["deeper_chain", "distractor", "high_confidence_wrong"]}
{"candidates": [{"candidate_id": "unsupported_a_z", "claim": "All A are Z", "confidence": 0.86, "raw_output": "All A are Z", "source": "learned_candidate_dataset"}, {"candidate_id": "bad_reverse_z_a", "claim": "All Z are A", "confidence": 0.92, "raw_output": "All Z are A", "source": "learned_candidate_dataset"}], "case_id": "stress_unsupported_abstention", "input_text": "All A are B. All B are C. Are all A Z?", "labels": {"bad_reverse_z_a": {"channels": ["typed_support"], "claim": "All Z are A", "reason": "no typed channel produced support or a typed rejection", "resolver": "abstain_unsupported", "status": "abstained"}, "unsupported_a_z": {"channels": ["typed_support"], "claim": "All A are Z", "reason": "no typed channel produced support or a typed rejection", "resolver": "abstain_unsupported", "status": "abstained"}}, "split": "stress", "tags": ["unsupported", "high_confidence_wrong"]}