import torch
import open_clip
from safetensors.torch import load_file
from PIL import Image

# Zero-shot image classification with an open_clip model whose weights are
# loaded from a local safetensors checkpoint: one image is scored against a
# small set of text prompts and the per-prompt probabilities are printed.

model_name = "ViT-L-14"
checkpoint_path = "open_clip_pytorch_model.safetensors"

device = "cuda" if torch.cuda.is_available() else "cpu"

# Build the architecture only (pretrained=None); the weights come from the
# local checkpoint loaded below.
model, _, preprocess = open_clip.create_model_and_transforms(
    model_name,
    pretrained=None,
)

# strict=True makes load_state_dict raise if the checkpoint's keys do not
# match the model's parameters exactly, instead of silently skipping some.
state_dict = load_file(checkpoint_path)
model.load_state_dict(state_dict, strict=True)
model = model.to(device)
model.eval()

tokenizer = open_clip.get_tokenizer(model_name)

# Open the image in a context manager so the underlying file handle is
# released as soon as the pixels have been converted and preprocessed.
with Image.open("example.jpg") as raw_image:
    # unsqueeze(0) adds a leading batch dimension of size 1.
    image = preprocess(raw_image.convert("RGB")).unsqueeze(0).to(device)

texts = tokenizer([
    "chest radiograph",
    "fundus photograph",
    "histopathology image",
]).to(device)

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(texts)

    # L2-normalise so the dot product below is a cosine similarity.
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)

    similarity = image_features @ text_features.T

    # Cosine similarities are bounded in [-1, 1]; a softmax over them directly
    # is almost uniform. Scale by the model's learned temperature first, as
    # CLIP does when computing its logits.
    # NOTE(review): assumes the model exposes `logit_scale` (true for
    # open_clip's standard CLIP models) — confirm for other architectures.
    logits = model.logit_scale.exp() * similarity
    probs = logits.softmax(dim=-1)

print(probs)