DATEXIS
/

sproto

@@ -103,62 +103,121 @@ The model depends on the base `sproto` package (which contains `MultiProtoModule
 ```bash
 pip install "torch>=1.12.1" \
-            "transformers>=4.25.1" \
-            "torchmetrics>=0.10.1" \
-            pytorch-lightning==1.9
 ```
 | Package | Required version | Reason |
 |---------|-----------------|--------|
 | `torch` | `>= 1.12.1` | Minimum version for `nn.PairwiseDistance` and `torch.einsum` patterns used in the prototype layer |
-| `transformers` | `>= 4.25.1` | Minimum version with `trust_remote_code` + `auto_map` support for custom model loading |
-| `torchmetrics` | `>= 0.10.1` | `MultilabelAveragePrecision` was added in 0.10; older versions raise `AttributeError` on load |
 | `pytorch-lightning` | `== 1.9` | `MultiProtoModule` is a `pl.LightningModule`; the exact API (e.g. `validation_epoch_end`) changed in 2.x |
 | `sproto` | bundled | The `sproto/` package is included in this HF repo and downloaded automatically with `trust_remote_code=True` — no separate install needed |
 ## Inference Example
 ```python
-from transformers import AutoTokenizer, AutoModel
 import torch
-tokenizer = AutoTokenizer.from_pretrained("DATEXIS/sproto")
-model = AutoModel.from_pretrained("DATEXIS/sproto", trust_remote_code=True)
-model.eval()
-text_input = [
-    "CHIEF COMPLAINT: Right Carotid Artery Stenosis. "
-    "PRESENT ILLNESS: Ms. ___ is a ___ year old woman with hyperlipidemia, "
-    "cirrhosis with esophageal varices, alcoholism, COPD, left eye blindness, "
-    "and right carotid stenosis status post right carotid endarterectomy."
-]
-inputs = tokenizer(
-    text_input,
-    padding=True,
-    truncation=True,
-    max_length=512,
-    return_tensors="pt"
-)
-tokens = [tokenizer.convert_ids_to_tokens(ids) for ids in inputs["input_ids"]]
-with torch.no_grad():
-    output = model(
-        input_ids=inputs["input_ids"],
-        attention_mask=inputs["attention_mask"],
-        token_type_ids=inputs.get("token_type_ids"),
-        tokens=tokens
     )
-logits = output["logits"]
-max_indices = output["max_indices"]
-metadata = output["metadata"]
-print("Inference successful")
-print("Logits shape:", logits.shape)
-print("Max indices:", max_indices)
-print("Metadata:", metadata)
 ```
 > **Note:** `tokens` (the list of token strings per sample) is **required** when `use_attention=True`

 ```bash
 pip install "torch>=1.12.1" \
+            transformers==4.40.0 \
+            torchmetrics==0.10.3 \
+            pytorch-lightning==1.9 \
+            huggingface-hub \
+            matplotlib
 ```
 | Package | Required version | Reason |
 |---------|-----------------|--------|
 | `torch` | `>= 1.12.1` | Minimum version for `nn.PairwiseDistance` and `torch.einsum` patterns used in the prototype layer |
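+| `transformers` | `== 4.40.0` | Version the inference example is pinned to; a metadata parsing bug in this version is worked around by passing `use_safetensors=False` when loading the model |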
+| `torchmetrics` | `== 0.10.3` | `MultilabelAveragePrecision` was added in 0.10; older versions raise `AttributeError` on load |
 | `pytorch-lightning` | `== 1.9` | `MultiProtoModule` is a `pl.LightningModule`; the exact API (e.g. `validation_epoch_end`) changed in 2.x |
+| `huggingface-hub` | any | Required for fetching additional assets like thresholds and labels |
+| `matplotlib` | any | Used for visualizations |
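 | `sproto` | bundled | The `sproto/` package is included in this HF repo and downloaded automatically with `trust_remote_code=True`; the inference example additionally downloads the repo with `snapshot_download` and adds it to `sys.path` so the package can be imported — no separate install needed |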
 ## Inference Example
 ```python
 import torch
+import sys
+import json
+from huggingface_hub import snapshot_download, hf_hub_download
+from transformers import AutoTokenizer, AutoModel
+def main():
+    # 1. Download the repo and inject it into sys.path to resolve the internal 'sproto' package
+    repo_id = "DATEXIS/sproto"
+    repo_path = snapshot_download(repo_id)
+    if repo_path not in sys.path:
+        sys.path.insert(0, repo_path)
+    # 2. Load Tokenizer and Model
+    tokenizer = AutoTokenizer.from_pretrained(repo_id)
+    # use_safetensors=False is required to bypass a metadata parsing bug in transformers 4.40.0
+    model = AutoModel.from_pretrained(repo_id, trust_remote_code=True, use_safetensors=False)
+    model.eval()
+    # 3. Prepare Input Text
+    text = """CHIEF COMPLAINT: depression, chest pain and vomiting
+    PRESENT ILLNESS: The patient is a 53-year-old woman with a history of hypertension, diabetes, and depression. She developed severe anxiety and depression. She was having chest pains along with significant vomiting and diarrhea.
+    """
+    inputs = tokenizer(
+        text,
+        return_tensors="pt",
+        padding="max_length",
+        truncation=True,
+        max_length=512
     )
+    # Sproto requires raw token strings for its clinical section masking logic
+    tokens = [tokenizer.convert_ids_to_tokens(ids) for ids in inputs["input_ids"]]
+    # 4. Forward Pass
+    with torch.no_grad():
+        outputs = model(
+            input_ids=inputs["input_ids"],
+            attention_mask=inputs["attention_mask"],
+            tokens=tokens
+        )
+    # Apply sigmoid to convert the logits (trained with BCE loss) into per-label probabilities
+    probs = torch.sigmoid(outputs.logits)[0]
+    # 5. Fetch Labels and Thresholds dynamically from Hugging Face Hub
+    try:
+        labels_path = hf_hub_download(repo_id=repo_id, filename="labels.txt")
+        icd_mapping_path = hf_hub_download(repo_id=repo_id, filename="icd_10_mappings.json")
+        thresholds_path = hf_hub_download(repo_id=repo_id, filename="thresholds_per_label.json")
+        with open(labels_path, "r") as f:
+            labels = f.read().strip().split("\n")
+        with open(icd_mapping_path, "r") as f:
+            icd_mapping = json.load(f)
+        with open(thresholds_path, "r") as f:
+            threshold_mapping = json.load(f)
+    except Exception as e:
+        print(f"Warning: Could not load label mapping files from HF Hub: {e}")
+        labels, threshold_mapping = None, None
+    # 6. Evaluate and Print Results
+    print("\n--- Inference Results ---")
+    if labels and threshold_mapping:
+        threshold_tensor = torch.zeros(len(labels))
+        for idx, label in enumerate(labels):
+            val = threshold_mapping.get(label, 0.20)
+            threshold_tensor[idx] = val if val > 0.0 else 0.20 # Enforce valid > 0.0 threshold
+        predicted_indices = torch.where(probs > threshold_tensor)[0]
+    else:
+        predicted_indices = torch.where(probs > 0.20)[0]
+    if len(predicted_indices) == 0:
+        print("No diagnoses predicted above the threshold.")
+    else:
+        results = []
+        for idx in predicted_indices:
+            idx_val = idx.item()
+            prob = probs[idx_val].item()
+            if labels and idx_val < len(labels):
+                icd_code = labels[idx_val]
+                description = icd_mapping.get(icd_code, "Unknown Description")
+                results.append((icd_code, description, prob))
+        # Sort alphabetically by ICD-10 code
+        results.sort(key=lambda x: x[0])
+        for icd_code, description, prob in results:
+            print(f"- {icd_code} ({description}): {prob:.4f}")
+if __name__ == "__main__":
+    main()
 ```
 > **Note:** `tokens` (the list of token strings per sample) is **required** when `use_attention=True`