nzs234 committed on
Commit
a5a9cc1
·
verified ·
1 Parent(s): 02d82ac

Add Gradio demo app for aesthetic scoring

Browse files
Files changed (3) hide show
  1. README.md +12 -12
  2. app.py +125 -0
  3. requirements.txt +6 -0
README.md CHANGED
@@ -1,12 +1,12 @@
1
- ---
2
- title: Siglip2 Aesthetic Scorer Demo
3
- emoji: 📊
4
- colorFrom: purple
5
- colorTo: green
6
- sdk: gradio
7
- sdk_version: 6.6.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ ---
2
+ title: SigLIP2 Aesthetic Scorer Demo
3
+ emoji: 🖼️
4
+ colorFrom: blue
5
+ colorTo: indigo
6
+ sdk: gradio
7
+ sdk_version: 4.44.1
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ Upload an image to get an aesthetic score (`score_1..score_9`).
app.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from pathlib import Path
3
+
4
+ import gradio as gr
5
+ import torch
6
+ import torch.nn as nn
7
+ from PIL import Image
8
+ from safetensors.torch import load_file
9
+ from transformers import AutoImageProcessor, AutoModel
10
+ from huggingface_hub import snapshot_download
11
+
12
+ MODEL_REPO = "nzs234/siglip2-so400m-aesthetic-scorer-v1"
13
+ CACHE_DIR = Path("./model_cache")
14
+
15
+
16
def infer_feature_dim(vision):
    """Return the width of the image-feature vector produced by ``vision``.

    Lookup order (first hit wins):

    1. ``vision.config`` and then ``vision.config.vision_config``; on each
       object ``projection_dim`` is tried before ``hidden_size``, and only a
       positive ``int`` is accepted.
    2. ``vision.visual_projection.out_features`` when that attribute is an
       ``nn.Linear``.

    Args:
        vision: Backbone model object (anything exposing the attributes above).

    Returns:
        The inferred feature dimension as an ``int``.

    Raises:
        ValueError: If none of the probes yields a usable dimension.
    """
    cfg = getattr(vision, "config", None)
    # getattr() on None simply returns the default, so no extra None guard is
    # needed when probing for the nested vision config.
    candidates = (cfg, getattr(cfg, "vision_config", None))
    for obj in candidates:
        if obj is None:
            continue
        for key in ("projection_dim", "hidden_size"):
            value = getattr(obj, key, None)
            # bool is a subclass of int; a stray True must not be mistaken
            # for a feature dimension of 1.
            if isinstance(value, int) and not isinstance(value, bool) and value > 0:
                return value

    proj = getattr(vision, "visual_projection", None)
    if isinstance(proj, nn.Linear):
        return int(proj.out_features)
    raise ValueError("cannot infer feature dim")
29
+
30
+
31
class Regressor(nn.Module):
    """Vision backbone plus an MLP head that outputs a sigmoid-squashed score.

    The head is an ``nn.Sequential`` of: LayerNorm, four
    Linear -> ReLU -> BatchNorm1d -> Dropout stages (widths ``hidden_dim``,
    512, 256, 128), then Linear(128, 32) -> ReLU -> Linear(32, 1).
    """

    def __init__(self, backbone_dir: str, hidden_dim: int = 2048, dropout: float = 0.2):
        """Load the backbone from ``backbone_dir`` and build the head.

        Args:
            backbone_dir: Local directory passed to ``AutoModel.from_pretrained``
                (loaded with ``local_files_only=True``).
            hidden_dim: Width of the first hidden layer of the head.
            dropout: Base dropout rate; a non-positive value is replaced by
                0.3, and every rate is clamped to ``[0.0, 0.8]``.
        """
        super().__init__()
        self.vision = AutoModel.from_pretrained(backbone_dir, local_files_only=True)
        in_width = infer_feature_dim(self.vision)

        def clamp(rate):
            # Keep every dropout probability inside [0.0, 0.8].
            return float(max(0.0, min(0.8, rate)))

        base_rate = clamp(dropout if dropout > 0 else 0.3)
        # Later stages use progressively smaller dropout rates.
        stage_rates = (base_rate, base_rate, clamp(base_rate * 0.67), clamp(base_rate * 0.33))
        stage_widths = (int(hidden_dim), 512, 256, 128)

        # Layers are appended in the same order as the checkpoint expects, so
        # the Sequential indices (and therefore state-dict keys) are stable.
        layers = [nn.LayerNorm(in_width)]
        prev_width = in_width
        for width, rate in zip(stage_widths, stage_rates):
            layers += [
                nn.Linear(prev_width, width),
                nn.ReLU(),
                nn.BatchNorm1d(width),
                nn.Dropout(rate),
            ]
            prev_width = width
        layers += [nn.Linear(prev_width, 32), nn.ReLU(), nn.Linear(32, 1)]
        self.head = nn.Sequential(*layers)

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        """Embed ``pixel_values``, L2-normalise along dim 1, and score them.

        Returns:
            ``torch.sigmoid`` of the head output with the last dimension
            squeezed away.
        """
        if not hasattr(self.vision, "get_image_features"):
            # Generic backbone: prefer the pooled output, otherwise fall back
            # to the first token of the last hidden state.
            outputs = self.vision(pixel_values=pixel_values)
            pooled = getattr(outputs, "pooler_output", None)
            embedding = pooled if pooled is not None else outputs.last_hidden_state[:, 0, :]
        else:
            embedding = self.vision.get_image_features(pixel_values=pixel_values)
            if not isinstance(embedding, torch.Tensor):
                # Some backbones wrap the features in an output object.
                if hasattr(embedding, "image_embeds"):
                    embedding = embedding.image_embeds
                else:
                    embedding = embedding.pooler_output

        # The epsilon guards against division by zero for all-zero features.
        scale = embedding.norm(dim=1, keepdim=True) + 1e-8
        logits = self.head(embedding / scale).squeeze(-1)
        return torch.sigmoid(logits)
76
+
77
+
78
# Download the scorer repository (backbone, head weights, metadata) at import
# time and build the model before the UI is defined.
print("Downloading model repo snapshot...")
# NOTE: `local_dir_use_symlinks` is intentionally not passed. It is deprecated
# and ignored by the huggingface_hub versions this app requires, and newer
# releases may reject it outright; with `local_dir` set, real files are
# written to that directory.
local_repo = Path(
    snapshot_download(repo_id=MODEL_REPO, repo_type="model", local_dir=str(CACHE_DIR))
)
meta = json.loads((local_repo / "metadata.json").read_text(encoding="utf-8"))
model_cfg = meta.get("model", {})
data_cfg = meta.get("data", {})

_backbone_dir = str(local_repo / "backbone")
processor = AutoImageProcessor.from_pretrained(_backbone_dir, local_files_only=True)
model = Regressor(
    backbone_dir=_backbone_dir,
    hidden_dim=int(model_cfg.get("hidden_dim", 2048)),
    dropout=float(model_cfg.get("dropout", 0.2)),
)
head_state = load_file(str(local_repo / "head.safetensors"), device="cpu")
# strict=False tolerates partial checkpoints, but it must not hide a checkpoint
# that matches nothing: the head would then keep its random initialisation and
# the demo would serve meaningless scores.
_load_result = model.head.load_state_dict(head_state, strict=False)
if not set(head_state) & set(model.head.state_dict()):
    raise RuntimeError(
        "head.safetensors shares no parameter names with the regression head"
    )
if _load_result.missing_keys or _load_result.unexpected_keys:
    print(
        "Warning: head checkpoint mismatch; "
        f"missing={list(_load_result.missing_keys)}, "
        f"unexpected={list(_load_result.unexpected_keys)}"
    )
model.eval()

# Range used to map the model's 0..1 output back onto the label scale.
score_min = float(data_cfg.get("score_min", 1.0))
score_max = float(data_cfg.get("score_max", 9.0))
97
+
98
+
99
def predict(img: Image.Image):
    """Score one image and return a JSON-serialisable result.

    Args:
        img: PIL image supplied by the UI, or ``None`` when nothing was
            uploaded.

    Returns:
        ``{"score": "score_<int>", "score_float": <float>}`` on success, where
        the float is the model output rescaled from 0..1 onto
        ``score_min..score_max`` and the int is that value rounded and clamped
        to the same range. ``{"error": ...}`` when no image was given or the
        model produced a non-finite value.
    """
    import math  # local import keeps this block self-contained

    if img is None:
        return {"error": "no image"}
    if img.mode != "RGB":
        img = img.convert("RGB")

    proc = processor(images=img, return_tensors="pt")
    with torch.inference_mode():
        pred_01 = float(model(proc["pixel_values"]).item())

    # A NaN would slip through the clamp below as 1.0 (min/max comparisons with
    # NaN are always False) and be reported as the top score, so reject it.
    if not math.isfinite(pred_01):
        return {"error": "model returned a non-finite score"}

    pred_01 = max(0.0, min(1.0, pred_01))
    pred_score = pred_01 * (score_max - score_min) + score_min
    # round() resolves exact halves to the nearest even integer.
    score_int = int(round(pred_score))
    score_int = max(int(score_min), min(int(score_max), score_int))
    return {
        "score": f"score_{score_int}",
        "score_float": round(pred_score, 4),
    }
115
+
116
+
117
# Minimal UI: one image input, one JSON output, and a button that runs
# `predict` on the uploaded image.
with gr.Blocks() as demo:
    gr.Markdown("# SigLIP2 Aesthetic Scorer Demo")
    inp = gr.Image(type="pil", label="Image")
    out = gr.JSON(label="Result")
    btn = gr.Button("Predict")
    btn.click(fn=predict, inputs=[inp], outputs=[out])


# Only start the server when executed as a script (e.g. `python app.py`), so
# importing this module does not block on a running web server.
if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ torch>=2.1.0
3
+ transformers>=4.40.0
4
+ safetensors>=0.4.0
5
+ huggingface_hub>=0.24.0
6
+ Pillow>=10.0.0