pauvanbr committed on
Commit
16cb311
·
verified ·
1 Parent(s): fcde35f

Upload src/submission/check_validity.py

Browse files
Files changed (1) hide show
  1. src/submission/check_validity.py +129 -0
src/submission/check_validity.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import os
4
+ import re
5
+ from collections import defaultdict
6
+
7
+ import huggingface_hub
8
+ from huggingface_hub import ModelCard
9
+ from huggingface_hub.hf_api import ModelInfo, get_safetensors_metadata
10
+ from transformers import AutoConfig, AutoTokenizer
11
+
12
+
13
+ # TODO: Translate the error messages
14
def check_model_card(repo_id: str) -> tuple[bool, str]:
    """Verify that a repository has a model card with a license and enough text.

    Args:
        repo_id: Identifier of the repository whose card is loaded.

    Returns:
        ``(True, "")`` when every check passes, otherwise ``(False, message)``
        where ``message`` explains which requirement is not met.
    """
    try:
        model_card = ModelCard.load(repo_id)
    except huggingface_hub.utils.EntryNotFoundError:
        return False, "Please add a model card to your model to explain how you trained/fine-tuned it."

    # A license is required: either the `license` field itself, or both of
    # the `license_name` and `license_link` fields.
    metadata = model_card.data
    if metadata.license is None and ("license_name" not in metadata or "license_link" not in metadata):
        return False, (
            "License not found. Please add a license to your model card using the `license` metadata or a"
            " `license_name`/`license_link` pair."
        )

    # The card body must contain at least this many characters.
    min_text_length = 200
    if len(model_card.text) < min_text_length:
        return False, "Please add a description to your model card, it is too short."

    return True, ""
34
+
35
+
36
def is_model_on_hub(
    model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False
) -> "tuple[bool, str | None, object | None]":
    """Check that a model's config (and optionally tokenizer) loads via AutoClasses.

    Args:
        model_name: Name of the model passed to ``from_pretrained``.
        revision: Revision passed to ``from_pretrained``.
        token: Optional access token forwarded to ``from_pretrained``.
        trust_remote_code: Forwarded to ``from_pretrained``.
        test_tokenizer: When true, also try to load the tokenizer.

    Returns:
        A 3-tuple ``(ok, message, config)``:

        * ``(True, None, config)`` when everything loaded.
        * ``(True, "uses a gated model.", None)`` when the error text reports
          a gated repository.
        * ``(False, message, None)`` otherwise. Messages start with a
          lowercase verb; presumably the caller prefixes the model name
          (to be confirmed against the call sites).
    """
    try:
        # force_download=True is passed so the configuration is fetched again
        # instead of being served from a local cache.
        config = AutoConfig.from_pretrained(
            model_name, revision=revision, trust_remote_code=trust_remote_code, token=token, force_download=True
        )
    except ValueError:
        return (
            False,
            "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
            None,
        )
    except Exception as e:
        if "You are trying to access a gated repo." in str(e):
            return True, "uses a gated model.", None
        # Exceptions raised without arguments have an empty ``args`` tuple;
        # indexing it blindly would raise IndexError from inside this handler.
        detail = e.args[0] if e.args else repr(e)
        return False, f"was not found or misconfigured on the hub! Error raised was {detail}", None

    if test_tokenizer:
        try:
            AutoTokenizer.from_pretrained(
                model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
            )
        except ValueError as e:
            return (False, f"uses a tokenizer which is not in a transformers release: {e}", None)
        except Exception as e:
            return (
                False,
                f"'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured? {e}",
                None,
            )

    return True, None, config
70
+
71
+
72
def get_model_size(model_info: ModelInfo, precision: str) -> float:
    """Estimate the size of a model, expressed in billions of parameters.

    The parameter counts from the safetensors metadata are summed when that
    metadata can be fetched. Otherwise the size is parsed from the model id,
    using the first ``<number>b`` or ``<number>m`` token found in its
    lowercased form (``m`` values are divided by 1000).

    Args:
        model_info: Model description; only its ``id`` is read here.
        precision: Precision label of the submission.

    Returns:
        The size rounded to three decimals, or ``-1`` when it cannot be
        determined. The value is multiplied by 8 when ``precision`` is
        ``"GPTQ"`` or the id contains ``gptq`` (presumably to compensate for
        packed quantized weights; confirm with the maintainers).
    """
    try:
        metadata = get_safetensors_metadata(model_info.id)
    except Exception as e:
        logging.error(f"Failed to get safetensors metadata for model {model_info.id}: {str(e)}")
        metadata = None

    if metadata is None:
        # Fallback: read the size from the model id itself.
        try:
            found = re.compile(r"(\d+\.)?\d+(b|m)").search(model_info.id.lower())
        except AttributeError:
            logging.warning(f"Unable to parse model size from ID: {model_info.id}")
            return -1  # Unknown model size
        if not found:
            return -1  # Unknown model size

        size_token = found.group(0)
        magnitude = float(size_token[:-1])
        if size_token[-1] != "b":
            # An "m" suffix means millions; convert to billions.
            magnitude = magnitude / 1e3
        billions = round(magnitude, 3)
    else:
        billions = round(sum(metadata.parameter_count.values()) / 1e9, 3)

    is_gptq = precision == "GPTQ" or "gptq" in model_info.id.lower()
    multiplier = 8 if is_gptq else 1
    return multiplier * billions
100
+
101
+
102
def get_model_arch(model_info: ModelInfo):
    """Get the model architecture from the configuration.

    Args:
        model_info: Model description whose ``config`` mapping is read.

    Returns:
        The value stored under ``"architectures"`` in the config, or the
        string ``"Unknown"`` when the key is absent or there is no config.
    """
    # ``config`` may be None (no configuration available for the repository);
    # treat that the same as a config without an "architectures" entry.
    config = model_info.config or {}
    return config.get("architectures", "Unknown")
105
+
106
+
107
def already_submitted_models(requested_models_dir: str) -> tuple[set[str], dict[str, list[str]]]:
    """Gather the already submitted models to avoid duplicates.

    Only ``*.json`` files located exactly one folder below
    ``requested_models_dir`` are read.

    Args:
        requested_models_dir: Root folder holding one sub-folder per
            submitter, each containing request files.

    Returns:
        A 2-tuple ``(identifiers, users_to_submission_dates)``:

        * ``identifiers`` is the set of ``"{model}_{revision}_{precision}"``
          strings built from every request file.
        * ``users_to_submission_dates`` is a ``defaultdict(list)`` mapping the
          part of ``model`` before the first ``/`` to the ``submitted_time``
          values found for it. Requests whose model has no ``/`` or that lack
          ``submitted_time`` are left out of this mapping.

    Raises:
        KeyError: If a request file lacks ``model``, ``revision`` or
            ``precision``.
        json.JSONDecodeError: If a request file is not valid JSON.
    """
    depth = 1
    file_names = set()
    users_to_submission_dates = defaultdict(list)

    for root, dirs, files in os.walk(requested_models_dir):
        # Measure depth on the relative path rather than by counting
        # separators, so a trailing separator on the root does not shift it.
        relative = os.path.relpath(root, requested_models_dir)
        current_depth = 0 if relative == os.curdir else relative.count(os.sep) + 1
        if current_depth != depth:
            continue

        # Nothing below this level is inspected, so do not descend further.
        dirs[:] = []

        for file in files:
            if not file.endswith(".json"):
                continue
            with open(os.path.join(root, file), "r", encoding="utf-8") as f:
                info = json.load(f)
            file_names.add(f"{info['model']}_{info['revision']}_{info['precision']}")

            # Select organisation
            if "/" not in info["model"] or "submitted_time" not in info:
                continue
            organisation = info["model"].split("/", 1)[0]
            users_to_submission_dates[organisation].append(info["submitted_time"])

    return file_names, users_to_submission_dates