AI-Joe-git committed on
Commit
776bb60
·
verified ·
1 Parent(s): 535f60d

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +142 -192
app.py CHANGED
@@ -12,21 +12,19 @@ from textwrap import dedent
12
  from apscheduler.schedulers.background import BackgroundScheduler
13
 
14
 
 
15
  HF_TOKEN = os.environ.get("HF_TOKEN")
16
  CONVERSION_SCRIPT = "./llama.cpp/convert_hf_to_gguf.py"
17
- # 🔧 CHANGED: was "ggml-model-mmproj-f32.gguf"
18
- MMPROJ_FILENAME = "ggml-model-mmproj-q8_0.gguf"
19
-
20
 
 
21
  def escape(s: str) -> str:
22
- s = s.replace("&", "&")
23
  s = s.replace("<", "&lt;")
24
  s = s.replace(">", "&gt;")
25
  s = s.replace('"', "&quot;")
26
  s = s.replace("\n", "<br/>")
27
  return s
28
 
29
-
30
  def is_vision_model(model_dir: str) -> bool:
31
  """Check if a HuggingFace model directory contains vision capabilities."""
32
  config_path = Path(model_dir) / "config.json"
@@ -42,13 +40,17 @@ def is_vision_model(model_dir: str) -> bool:
42
 
43
  if "vision_config" in config and config["vision_config"]:
44
  return True
 
45
  if config.get("image_token_id") is not None and config.get("vision_start_token_id") is not None:
46
  return True
 
47
  if "text_config" in config and isinstance(config["text_config"], dict):
48
  if "vision_config" in config["text_config"] and config["text_config"]["vision_config"]:
49
  return True
 
50
  if config.get("mm_cfg"):
51
  return True
 
52
  if config.get("mm_projector"):
53
  return True
54
 
@@ -56,26 +58,20 @@ def is_vision_model(model_dir: str) -> bool:
56
 
57
 
58
  def generate_mmproj(model_dir: str, outdir: str) -> str | None:
59
- """
60
- Generate a Q8_0-quantized mmproj GGUF from a HuggingFace vision model directory.
61
-
62
- Uses --outtype q8_0 directly via convert_hf_to_gguf.py (supported natively).
63
- Q8_0 reduces file size ~50% vs F32 with negligible quality loss for vision encoders.
64
- """ # 🔧 CHANGED: entire docstring and logic updated
65
  print(f"[MMPROJ] Checking model dir: {model_dir}")
66
  if not is_vision_model(model_dir):
67
  print("[MMPROJ] Not a vision model, skipping mmproj generation.")
68
  return None
69
 
70
- # 🔧 CHANGED: outtype is now q8_0 directly; no two-step conversion needed
71
- print(f"[MMPROJ] Vision model detected — generating {MMPROJ_FILENAME} (Q8_0)...")
72
- mmproj_outfile = str(Path(outdir) / MMPROJ_FILENAME)
73
 
74
  result = subprocess.run([
75
  "python", CONVERSION_SCRIPT,
76
  model_dir,
77
  "--outfile", mmproj_outfile,
78
- "--outtype", "q8_0", # 🔧 CHANGED: was "f32"
79
  "--mmproj",
80
  ], shell=False, capture_output=True)
81
 
@@ -85,28 +81,13 @@ def generate_mmproj(model_dir: str, outdir: str) -> str | None:
85
  if result.returncode != 0:
86
  print(f"[MMPROJ] stderr: {stderr[:1000]}")
87
  print(f"[MMPROJ] Return code: {result.returncode}")
88
- # 🆕 NEW: graceful fallback — retry with f16 if q8_0 not supported by this model class
89
- print("[MMPROJ] Q8_0 failed — retrying with f16 as fallback...")
90
- mmproj_outfile_f16 = str(Path(outdir) / "ggml-model-mmproj-f16.gguf")
91
- result2 = subprocess.run([
92
- "python", CONVERSION_SCRIPT,
93
- model_dir,
94
- "--outfile", mmproj_outfile_f16,
95
- "--outtype", "f16",
96
- "--mmproj",
97
- ], shell=False, capture_output=True)
98
- if result2.returncode != 0 or not os.path.isfile(mmproj_outfile_f16):
99
- print(f"[MMPROJ] Fallback f16 also failed: {result2.stderr.decode('utf-8')[:500]}")
100
- return None
101
- print(f"[MMPROJ] Fallback mmproj (f16) generated: {mmproj_outfile_f16}")
102
- return mmproj_outfile_f16
103
 
104
  if not os.path.isfile(mmproj_outfile):
105
  print(f"[MMPROJ] File not found at {mmproj_outfile}")
106
  return None
107
 
108
- size_mb = os.path.getsize(mmproj_outfile) / (1024 * 1024)
109
- print(f"[MMPROJ] {MMPROJ_FILENAME} generated: {mmproj_outfile} ({size_mb:.1f} MB)")
110
  return mmproj_outfile
111
 
112
 
@@ -127,19 +108,18 @@ def generate_importance_matrix(model_path: str, train_data_path: str, output_pat
127
  process = subprocess.Popen(imatrix_command, shell=False)
128
 
129
  try:
130
- process.wait(timeout=60)
131
  except subprocess.TimeoutExpired:
132
  print("Imatrix computation timed out. Sending SIGINT to allow graceful termination...")
133
  process.send_signal(signal.SIGINT)
134
  try:
135
- process.wait(timeout=5)
136
  except subprocess.TimeoutExpired:
137
- print("Imatrix proc still didn't term. Forcefully terminating process...")
138
  process.kill()
139
 
140
  print("Importance matrix generation completed.")
141
 
142
-
143
  def split_upload_model(model_path: str, outdir: str, repo_id: str, oauth_token: gr.OAuthToken | None, split_max_tensors=256, split_max_size=None):
144
  print(f"Model path: {model_path}")
145
  print(f"Output dir: {outdir}")
@@ -147,31 +127,40 @@ def split_upload_model(model_path: str, outdir: str, repo_id: str, oauth_token:
147
  if oauth_token is None or oauth_token.token is None:
148
  raise ValueError("You have to be logged in.")
149
 
150
- split_cmd = ["./llama.cpp/llama-gguf-split", "--split"]
 
 
 
151
  if split_max_size:
152
- split_cmd += ["--split-max-size", split_max_size]
 
153
  else:
154
- split_cmd += ["--split-max-tensors", str(split_max_tensors)]
 
155
 
156
- model_path_prefix = '.'.join(model_path.split('.')[:-1])
157
- split_cmd += [model_path, model_path_prefix]
 
 
158
 
159
  print(f"Split command: {split_cmd}")
 
160
  result = subprocess.run(split_cmd, shell=False, capture_output=True, text=True)
161
  print(f"Split command stdout: {result.stdout}")
162
  print(f"Split command stderr: {result.stderr}")
163
 
164
  if result.returncode != 0:
165
- raise Exception(f"Error splitting the model: {result.stderr}")
 
166
  print("Model split successfully!")
167
 
 
168
  if os.path.exists(model_path):
169
  os.remove(model_path)
170
 
171
  model_file_prefix = model_path_prefix.split('/')[-1]
172
  print(f"Model file name prefix: {model_file_prefix}")
173
  sharded_model_files = [f for f in os.listdir(outdir) if f.startswith(model_file_prefix) and f.endswith(".gguf")]
174
-
175
  if sharded_model_files:
176
  print(f"Sharded model files: {sharded_model_files}")
177
  api = HfApi(token=oauth_token.token)
@@ -191,168 +180,148 @@ def split_upload_model(model_path: str, outdir: str, repo_id: str, oauth_token:
191
 
192
  print("Sharded model has been uploaded successfully!")
193
 
194
-
195
  def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_repo, train_data_file, split_model, split_max_tensors, split_max_size, oauth_token: gr.OAuthToken | None):
196
  if oauth_token is None or oauth_token.token is None:
197
  raise gr.Error("You must be logged in to use GGUF-my-repo")
198
 
 
199
  try:
200
  whoami(oauth_token.token)
201
- except Exception:
202
  raise gr.Error("You must be logged in to use GGUF-my-repo")
203
 
204
  model_name = model_id.split('/')[-1]
205
 
206
  try:
207
  api = HfApi(token=oauth_token.token)
 
208
  dl_pattern = ["*.md", "*.json", "*.model"]
209
 
210
  pattern = (
211
  "*.safetensors"
212
  if any(
213
  file.path.endswith(".safetensors")
214
- for file in api.list_repo_tree(repo_id=model_id, recursive=True)
 
 
 
215
  )
216
  else "*.bin"
217
  )
 
218
  dl_pattern += [pattern]
219
 
220
- os.makedirs("downloads", exist_ok=True)
221
- os.makedirs("outputs", exist_ok=True)
 
 
 
222
 
223
  with tempfile.TemporaryDirectory(dir="outputs") as outdir:
224
- fp16 = str(Path(outdir) / f"{model_name}.fp16.gguf")
225
 
226
  with tempfile.TemporaryDirectory(dir="downloads") as tmpdir:
227
- local_dir = Path(tmpdir) / model_name
 
228
  print(local_dir)
229
- api.snapshot_download(
230
- repo_id=model_id,
231
- local_dir=local_dir,
232
- local_dir_use_symlinks=False,
233
- allow_patterns=dl_pattern,
234
- )
235
  print("Model downloaded successfully!")
236
  print(f"Current working directory: {os.getcwd()}")
237
  print(f"Model directory contents: {os.listdir(local_dir)}")
238
 
239
- config_dir = local_dir / "config.json"
240
- adapter_config_dir = local_dir / "adapter_config.json"
241
  if os.path.exists(adapter_config_dir) and not os.path.exists(config_dir):
242
- raise Exception(
243
- 'adapter_config.json is present.<br/><br/>'
244
- 'If you are converting a LoRA adapter to GGUF, please use '
245
- '<a href="https://huggingface.co/spaces/ggml-org/gguf-my-lora" '
246
- 'target="_blank" style="text-decoration:underline">GGUF-my-lora</a>.'
247
- )
248
 
249
  result = subprocess.run([
250
- "python", CONVERSION_SCRIPT, local_dir,
251
- "--outtype", "f16", "--outfile", fp16
252
  ], shell=False, capture_output=True)
253
  print(result)
254
  if result.returncode != 0:
255
- raise Exception(f"Error converting to fp16: {result.stderr.decode('utf-8')}")
 
256
  print("Model converted to fp16 successfully!")
 
257
 
258
- # 🔧 CHANGED: now generates Q8_0 mmproj; filename tracked dynamically
259
- mmproj_gguf = generate_mmproj(str(local_dir), outdir)
260
- if mmproj_gguf:
261
- mmproj_filename_used = Path(mmproj_gguf).name
262
- print(f"[MMPROJ] Will upload: {mmproj_gguf} as {mmproj_filename_used}")
263
- else:
264
- mmproj_filename_used = None
265
 
266
- imatrix_path = Path(outdir) / "imatrix.dat"
267
 
268
  if use_imatrix:
269
- train_data_path = train_data_file.name if train_data_file else "llama.cpp/groups_merged.txt"
 
 
 
 
270
  print(f"Training data file path: {train_data_path}")
 
271
  if not os.path.isfile(train_data_path):
272
  raise Exception(f"Training data file not found: {train_data_path}")
 
273
  generate_importance_matrix(fp16, train_data_path, imatrix_path)
274
  else:
275
  print("Not using imatrix quantization.")
276
 
277
- active_method = imatrix_q_method if use_imatrix else q_method
278
- quantized_gguf_name = (
279
- f"{model_name.lower()}-{imatrix_q_method.lower()}-imat.gguf"
280
- if use_imatrix
281
- else f"{model_name.lower()}-{q_method.lower()}.gguf"
282
- )
283
- quantized_gguf_path = str(Path(outdir) / quantized_gguf_name)
284
-
285
- quantise_ggml = ["./llama.cpp/llama-quantize"]
286
  if use_imatrix:
287
- quantise_ggml += ["--imatrix", imatrix_path, fp16, quantized_gguf_path, imatrix_q_method]
 
 
 
288
  else:
289
- quantise_ggml += [fp16, quantized_gguf_path, q_method]
290
-
 
 
291
  result = subprocess.run(quantise_ggml, shell=False, capture_output=True)
292
  if result.returncode != 0:
293
- raise Exception(f"Error quantizing: {result.stderr.decode('utf-8')}")
294
- print(f"Quantized successfully with {active_method} option!")
 
 
295
 
 
296
  username = whoami(oauth_token.token)["name"]
297
- new_repo_url = api.create_repo(
298
- repo_id=f"{username}/{model_name}-{active_method}-GGUF",
299
- exist_ok=True,
300
- private=private_repo,
301
- )
302
  new_repo_id = new_repo_url.repo_id
303
  print("Repo created successfully!", new_repo_url)
304
 
305
  try:
306
  card = ModelCard.load(model_id, token=oauth_token.token)
307
- except Exception:
308
  card = ModelCard("")
309
-
310
  if card.data.tags is None:
311
  card.data.tags = []
312
  card.data.tags.append("llama-cpp")
313
  card.data.tags.append("gguf-my-repo")
314
  card.data.base_model = model_id
315
-
316
- # 🔧 CHANGED: mmproj section updated with actual filename, Q8_0 note, and richer instructions
317
  mmproj_note = ""
318
  if mmproj_gguf and os.path.isfile(mmproj_gguf):
319
- mmproj_note = dedent(f"""
320
-
321
- ## 👁️ Vision / Multimodal Support
322
-
323
- This is a **vision-capable model**. A multimodal projector file (`{mmproj_filename_used}`) has been
324
- automatically generated and uploaded alongside the quantized weights.
325
- It is quantized at **Q8_0** — near-lossless compression (~50% smaller than F32)
326
- recommended for vision encoders.
327
-
328
- Use the `--mmproj` flag when running inference:
329
-
330
- ```bash
331
- # CLI
332
- llama-cli -m {quantized_gguf_name} --mmproj {mmproj_filename_used} -p "Describe this image" --image /path/to/image.jpg
333
-
334
- # Server
335
- llama-server -m {quantized_gguf_name} --mmproj {mmproj_filename_used}
336
- ```
337
-
338
- > **Tip:** Both files must be in the same directory, or provide full paths.
339
- > Compatible with llama.cpp builds that include vision support (default since mid-2024).
340
- """)
341
 
342
  card.text = dedent(
343
  f"""
344
  # {new_repo_id}
345
-
346
- This model was converted to GGUF format from [`{model_id}`](https://huggingface.co/{model_id})
347
- using llama.cpp via the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space.
348
  Refer to the [original model card](https://huggingface.co/{model_id}) for more details on the model.
349
  {mmproj_note}
350
  ## Use with llama.cpp
 
351
 
352
- Install llama.cpp through brew (works on Mac and Linux):
353
  ```bash
354
  brew install llama.cpp
 
355
  ```
 
356
 
357
  ### CLI:
358
  ```bash
@@ -364,16 +333,14 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
364
  llama-server --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -c 2048
365
  ```
366
 
367
- Note: You can also use this checkpoint directly through the
368
- [usage steps](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#usage) listed in the Llama.cpp repo.
369
 
370
  Step 1: Clone llama.cpp from GitHub.
371
  ```
372
  git clone https://github.com/ggerganov/llama.cpp
373
  ```
374
 
375
- Step 2: Move into the llama.cpp folder and build it with `LLAMA_CURL=1` flag along with other
376
- hardware-specific flags (for ex: LLAMA_CUDA=1 for Nvidia GPUs on Linux).
377
  ```
378
  cd llama.cpp && LLAMA_CURL=1 make
379
  ```
@@ -388,8 +355,7 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
388
  ```
389
  """
390
  )
391
-
392
- readme_path = Path(outdir) / "README.md"
393
  card.save(readme_path)
394
 
395
  if split_model:
@@ -407,6 +373,7 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
407
 
408
  if os.path.isfile(imatrix_path):
409
  try:
 
410
  api.upload_file(
411
  path_or_fileobj=imatrix_path,
412
  path_in_repo="imatrix.dat",
@@ -415,47 +382,38 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
415
  except Exception as e:
416
  raise Exception(f"Error uploading imatrix.dat: {e}")
417
 
418
- # 🔧 CHANGED: upload path now uses dynamic filename (handles q8_0 or f16 fallback)
419
  if mmproj_gguf and os.path.isfile(mmproj_gguf):
420
  try:
421
- print(f"Uploading {mmproj_filename_used}: {mmproj_gguf}")
422
  api.upload_file(
423
  path_or_fileobj=mmproj_gguf,
424
- path_in_repo=mmproj_filename_used, # 🔧 CHANGED: was hardcoded "ggml-model-mmproj-f32.gguf"
425
  repo_id=new_repo_id,
426
  )
427
  except Exception as e:
428
- print(f"Warning: Failed to upload {mmproj_filename_used}: {e}")
429
 
430
  api.upload_file(
431
  path_or_fileobj=readme_path,
432
  path_in_repo="README.md",
433
  repo_id=new_repo_id,
434
  )
435
- print(f"Uploaded successfully with {active_method} option!")
436
 
437
- # 🆕 NEW: success message notes vision support when applicable
438
- vision_note = ""
439
- if mmproj_gguf:
440
- vision_note = f'<br/>👁️ Vision model detected — <code>{mmproj_filename_used}</code> (Q8_0) also uploaded.'
441
 
442
  return (
443
- f'<h1>✅ DONE</h1><br/>Find your repo here: '
444
- f'<a href="{new_repo_url}" target="_blank" style="text-decoration:underline">{new_repo_id}</a>'
445
- f'{vision_note}',
446
  "llama.png",
447
  )
448
-
449
  except Exception as e:
450
  return (f'<h1>❌ ERROR</h1><br/><pre style="white-space:pre-wrap;">{escape(str(e))}</pre>', "error.png")
451
 
452
 
453
- # ─── UI ──────────────────────────────────────────────────────────────────────
454
-
455
- css = """
456
  .gradio-container {overflow-y: auto;}
457
  """
458
-
459
  model_id = HuggingfaceHubSearch(
460
  label="Hub Model ID",
461
  placeholder="Search for model id on Huggingface",
@@ -463,13 +421,12 @@ model_id = HuggingfaceHubSearch(
463
  )
464
 
465
  q_method = gr.Dropdown(
466
- ["Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L", "Q4_0", "Q4_K_S", "Q4_K_M",
467
- "Q5_0", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0"],
468
  label="Quantization Method",
469
  info="GGML quantization type",
470
  value="Q4_K_M",
471
  filterable=False,
472
- visible=True,
473
  )
474
 
475
  imatrix_q_method = gr.Dropdown(
@@ -478,77 +435,71 @@ imatrix_q_method = gr.Dropdown(
478
  info="GGML imatrix quants type",
479
  value="IQ4_NL",
480
  filterable=False,
481
- visible=False,
482
  )
483
 
484
  use_imatrix = gr.Checkbox(
485
  value=False,
486
  label="Use Imatrix Quantization",
487
- info="Use importance matrix for quantization.",
488
  )
489
 
490
  private_repo = gr.Checkbox(
491
  value=False,
492
  label="Private Repo",
493
- info="Create a private repo under your username.",
494
  )
495
 
496
  train_data_file = gr.File(
497
  label="Training Data File",
498
  file_types=["txt"],
499
- visible=False,
500
  )
501
 
502
  split_model = gr.Checkbox(
503
  value=False,
504
  label="Split Model",
505
- info="Shard the model using gguf-split.",
506
  )
507
 
508
  split_max_tensors = gr.Number(
509
  value=256,
510
  label="Max Tensors per File",
511
  info="Maximum number of tensors per file when splitting model.",
512
- visible=False,
513
  )
514
 
515
  split_max_size = gr.Textbox(
516
  label="Max File Size",
517
- info="Maximum file size when splitting model (--split-max-size). "
518
- "May leave empty to use the default. Accepted suffixes: M, G. Example: 256M, 5G",
519
- visible=False,
520
  )
521
 
522
- # 🔧 CHANGED: description updated to mention vision/mmproj support
523
  iface = gr.Interface(
524
- fn=process_model,
525
- inputs=[
526
- model_id, q_method, use_imatrix, imatrix_q_method,
527
- private_repo, train_data_file, split_model, split_max_tensors, split_max_size,
528
- ],
529
- outputs=[
530
- gr.Markdown(label="output"),
531
- gr.Image(show_label=False),
532
- ],
533
- title="Create your own GGUF Quants, blazingly fast ⚡!",
534
- description=(
535
- "The space takes an HF repo as input, quantizes it and creates a repo containing "
536
- "the selected quant under your HF user namespace.\n\n"
537
- "👁️ **Vision models are automatically detected** — a multimodal projector "
538
- "(`ggml-model-mmproj-q8_0.gguf`) will be generated and uploaded alongside "
539
- "the quantized weights, ready to use with `llama-cli --mmproj`."
540
- ),
541
- api_name=False,
542
- )
 
543
 
 
544
  with gr.Blocks(css=css) as demo:
545
- # 🔧 CHANGED: header updated to mention vision support
546
- gr.Markdown(
547
- "You must be logged in to use GGUF-my-repo.\n\n"
548
- "> 👁️ **Vision model support:** If the selected model has vision capabilities, "
549
- "a `ggml-model-mmproj-q8_0.gguf` projector file will be **automatically generated "
550
- "and uploaded** to your repo alongside the quantized model."
551
- )
552
  gr.LoginButton(min_width=250)
553
 
554
  iface.render()
@@ -559,7 +510,7 @@ with gr.Blocks(css=css) as demo:
559
  split_model.change(
560
  fn=update_split_visibility,
561
  inputs=split_model,
562
- outputs=[split_max_tensors, split_max_size],
563
  )
564
 
565
  def update_visibility(use_imatrix):
@@ -568,16 +519,15 @@ with gr.Blocks(css=css) as demo:
568
  use_imatrix.change(
569
  fn=update_visibility,
570
  inputs=use_imatrix,
571
- outputs=[q_method, imatrix_q_method, train_data_file],
572
  )
573
 
574
-
575
  def restart_space():
576
  HfApi().restart_space(repo_id="ggml-org/gguf-my-repo", token=HF_TOKEN, factory_reboot=True)
577
 
578
-
579
  scheduler = BackgroundScheduler()
580
  scheduler.add_job(restart_space, "interval", seconds=21600)
581
  scheduler.start()
582
 
583
- demo.queue(default_concurrency_limit=1, max_size=5).launch(debug=True, show_api=False)
 
 
12
  from apscheduler.schedulers.background import BackgroundScheduler
13
 
14
 
15
+ # used for restarting the space
16
  HF_TOKEN = os.environ.get("HF_TOKEN")
17
  CONVERSION_SCRIPT = "./llama.cpp/convert_hf_to_gguf.py"
 
 
 
18
 
19
+ # escape HTML for logging
20
  def escape(s: str) -> str:
21
+ s = s.replace("&", "&amp;") # Must be done first!
22
  s = s.replace("<", "&lt;")
23
  s = s.replace(">", "&gt;")
24
  s = s.replace('"', "&quot;")
25
  s = s.replace("\n", "<br/>")
26
  return s
27
 
 
28
  def is_vision_model(model_dir: str) -> bool:
29
  """Check if a HuggingFace model directory contains vision capabilities."""
30
  config_path = Path(model_dir) / "config.json"
 
40
 
41
  if "vision_config" in config and config["vision_config"]:
42
  return True
43
+
44
  if config.get("image_token_id") is not None and config.get("vision_start_token_id") is not None:
45
  return True
46
+
47
  if "text_config" in config and isinstance(config["text_config"], dict):
48
  if "vision_config" in config["text_config"] and config["text_config"]["vision_config"]:
49
  return True
50
+
51
  if config.get("mm_cfg"):
52
  return True
53
+
54
  if config.get("mm_projector"):
55
  return True
56
 
 
58
 
59
 
60
  def generate_mmproj(model_dir: str, outdir: str) -> str | None:
61
+ """Generate mmproj.gguf from a HuggingFace vision model directory."""
 
 
 
 
 
62
  print(f"[MMPROJ] Checking model dir: {model_dir}")
63
  if not is_vision_model(model_dir):
64
  print("[MMPROJ] Not a vision model, skipping mmproj generation.")
65
  return None
66
 
67
+ print(f"[MMPROJ] Vision model detected, generating mmproj.gguf...")
68
+ mmproj_outfile = str(Path(outdir) / "ggml-model-mmproj-f32.gguf")
 
69
 
70
  result = subprocess.run([
71
  "python", CONVERSION_SCRIPT,
72
  model_dir,
73
  "--outfile", mmproj_outfile,
74
+ "--outtype", "f32",
75
  "--mmproj",
76
  ], shell=False, capture_output=True)
77
 
 
81
  if result.returncode != 0:
82
  print(f"[MMPROJ] stderr: {stderr[:1000]}")
83
  print(f"[MMPROJ] Return code: {result.returncode}")
84
+ return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
  if not os.path.isfile(mmproj_outfile):
87
  print(f"[MMPROJ] File not found at {mmproj_outfile}")
88
  return None
89
 
90
+ print(f"[MMPROJ] mmproj.gguf generated: {mmproj_outfile} ({os.path.getsize(mmproj_outfile) / (1024*1024):.1f} MB)")
 
91
  return mmproj_outfile
92
 
93
 
 
108
  process = subprocess.Popen(imatrix_command, shell=False)
109
 
110
  try:
111
+ process.wait(timeout=60) # added wait
112
  except subprocess.TimeoutExpired:
113
  print("Imatrix computation timed out. Sending SIGINT to allow graceful termination...")
114
  process.send_signal(signal.SIGINT)
115
  try:
116
+ process.wait(timeout=5) # grace period
117
  except subprocess.TimeoutExpired:
118
+ print("Imatrix proc still didn't term. Forecfully terming process...")
119
  process.kill()
120
 
121
  print("Importance matrix generation completed.")
122
 
 
123
  def split_upload_model(model_path: str, outdir: str, repo_id: str, oauth_token: gr.OAuthToken | None, split_max_tensors=256, split_max_size=None):
124
  print(f"Model path: {model_path}")
125
  print(f"Output dir: {outdir}")
 
127
  if oauth_token is None or oauth_token.token is None:
128
  raise ValueError("You have to be logged in.")
129
 
130
+ split_cmd = [
131
+ "./llama.cpp/llama-gguf-split",
132
+ "--split",
133
+ ]
134
  if split_max_size:
135
+ split_cmd.append("--split-max-size")
136
+ split_cmd.append(split_max_size)
137
  else:
138
+ split_cmd.append("--split-max-tensors")
139
+ split_cmd.append(str(split_max_tensors))
140
 
141
+ # args for output
142
+ model_path_prefix = '.'.join(model_path.split('.')[:-1]) # remove the file extension
143
+ split_cmd.append(model_path)
144
+ split_cmd.append(model_path_prefix)
145
 
146
  print(f"Split command: {split_cmd}")
147
+
148
  result = subprocess.run(split_cmd, shell=False, capture_output=True, text=True)
149
  print(f"Split command stdout: {result.stdout}")
150
  print(f"Split command stderr: {result.stderr}")
151
 
152
  if result.returncode != 0:
153
+ stderr_str = result.stderr.decode("utf-8")
154
+ raise Exception(f"Error splitting the model: {stderr_str}")
155
  print("Model split successfully!")
156
 
157
+ # remove the original model file if needed
158
  if os.path.exists(model_path):
159
  os.remove(model_path)
160
 
161
  model_file_prefix = model_path_prefix.split('/')[-1]
162
  print(f"Model file name prefix: {model_file_prefix}")
163
  sharded_model_files = [f for f in os.listdir(outdir) if f.startswith(model_file_prefix) and f.endswith(".gguf")]
 
164
  if sharded_model_files:
165
  print(f"Sharded model files: {sharded_model_files}")
166
  api = HfApi(token=oauth_token.token)
 
180
 
181
  print("Sharded model has been uploaded successfully!")
182
 
 
183
  def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_repo, train_data_file, split_model, split_max_tensors, split_max_size, oauth_token: gr.OAuthToken | None):
184
  if oauth_token is None or oauth_token.token is None:
185
  raise gr.Error("You must be logged in to use GGUF-my-repo")
186
 
187
+ # validate the oauth token
188
  try:
189
  whoami(oauth_token.token)
190
+ except Exception as e:
191
  raise gr.Error("You must be logged in to use GGUF-my-repo")
192
 
193
  model_name = model_id.split('/')[-1]
194
 
195
  try:
196
  api = HfApi(token=oauth_token.token)
197
+
198
  dl_pattern = ["*.md", "*.json", "*.model"]
199
 
200
  pattern = (
201
  "*.safetensors"
202
  if any(
203
  file.path.endswith(".safetensors")
204
+ for file in api.list_repo_tree(
205
+ repo_id=model_id,
206
+ recursive=True,
207
+ )
208
  )
209
  else "*.bin"
210
  )
211
+
212
  dl_pattern += [pattern]
213
 
214
+ if not os.path.exists("downloads"):
215
+ os.makedirs("downloads")
216
+
217
+ if not os.path.exists("outputs"):
218
+ os.makedirs("outputs")
219
 
220
  with tempfile.TemporaryDirectory(dir="outputs") as outdir:
221
+ fp16 = str(Path(outdir)/f"{model_name}.fp16.gguf")
222
 
223
  with tempfile.TemporaryDirectory(dir="downloads") as tmpdir:
224
+ # Keep the model name as the dirname so the model name metadata is populated correctly
225
+ local_dir = Path(tmpdir)/model_name
226
  print(local_dir)
227
+ api.snapshot_download(repo_id=model_id, local_dir=local_dir, local_dir_use_symlinks=False, allow_patterns=dl_pattern)
 
 
 
 
 
228
  print("Model downloaded successfully!")
229
  print(f"Current working directory: {os.getcwd()}")
230
  print(f"Model directory contents: {os.listdir(local_dir)}")
231
 
232
+ config_dir = local_dir/"config.json"
233
+ adapter_config_dir = local_dir/"adapter_config.json"
234
  if os.path.exists(adapter_config_dir) and not os.path.exists(config_dir):
235
+ raise Exception('adapter_config.json is present.<br/><br/>If you are converting a LoRA adapter to GGUF, please use <a href="https://huggingface.co/spaces/ggml-org/gguf-my-lora" target="_blank" style="text-decoration:underline">GGUF-my-lora</a>.')
 
 
 
 
 
236
 
237
  result = subprocess.run([
238
+ "python", CONVERSION_SCRIPT, local_dir, "--outtype", "f16", "--outfile", fp16
 
239
  ], shell=False, capture_output=True)
240
  print(result)
241
  if result.returncode != 0:
242
+ stderr_str = result.stderr.decode("utf-8")
243
+ raise Exception(f"Error converting to fp16: {stderr_str}")
244
  print("Model converted to fp16 successfully!")
245
+ print(f"Converted model path: {fp16}")
246
 
247
+ # Generate mmproj.gguf for vision models
248
+ mmproj_gguf_path = str(Path(outdir)/"ggml-model-mmproj-f32.gguf")
249
+ mmproj_result = generate_mmproj(str(local_dir), outdir)
250
+ mmproj_gguf = None
251
+ if mmproj_result:
252
+ mmproj_gguf = mmproj_result
253
+ print(f"[MMPROJ] Will upload mmproj.gguf: {mmproj_gguf}")
254
 
255
+ imatrix_path = Path(outdir)/"imatrix.dat"
256
 
257
  if use_imatrix:
258
+ if train_data_file:
259
+ train_data_path = train_data_file.name
260
+ else:
261
+ train_data_path = "llama.cpp/groups_merged.txt" #fallback calibration dataset
262
+
263
  print(f"Training data file path: {train_data_path}")
264
+
265
  if not os.path.isfile(train_data_path):
266
  raise Exception(f"Training data file not found: {train_data_path}")
267
+
268
  generate_importance_matrix(fp16, train_data_path, imatrix_path)
269
  else:
270
  print("Not using imatrix quantization.")
271
 
272
+ # Quantize the model
273
+ quantized_gguf_name = f"{model_name.lower()}-{imatrix_q_method.lower()}-imat.gguf" if use_imatrix else f"{model_name.lower()}-{q_method.lower()}.gguf"
274
+ quantized_gguf_path = str(Path(outdir)/quantized_gguf_name)
 
 
 
 
 
 
275
  if use_imatrix:
276
+ quantise_ggml = [
277
+ "./llama.cpp/llama-quantize",
278
+ "--imatrix", imatrix_path, fp16, quantized_gguf_path, imatrix_q_method
279
+ ]
280
  else:
281
+ quantise_ggml = [
282
+ "./llama.cpp/llama-quantize",
283
+ fp16, quantized_gguf_path, q_method
284
+ ]
285
  result = subprocess.run(quantise_ggml, shell=False, capture_output=True)
286
  if result.returncode != 0:
287
+ stderr_str = result.stderr.decode("utf-8")
288
+ raise Exception(f"Error quantizing: {stderr_str}")
289
+ print(f"Quantized successfully with {imatrix_q_method if use_imatrix else q_method} option!")
290
+ print(f"Quantized model path: {quantized_gguf_path}")
291
 
292
+ # Create empty repo
293
  username = whoami(oauth_token.token)["name"]
294
+ new_repo_url = api.create_repo(repo_id=f"{username}/{model_name}-{imatrix_q_method if use_imatrix else q_method}-GGUF", exist_ok=True, private=private_repo)
 
 
 
 
295
  new_repo_id = new_repo_url.repo_id
296
  print("Repo created successfully!", new_repo_url)
297
 
298
  try:
299
  card = ModelCard.load(model_id, token=oauth_token.token)
300
+ except:
301
  card = ModelCard("")
 
302
  if card.data.tags is None:
303
  card.data.tags = []
304
  card.data.tags.append("llama-cpp")
305
  card.data.tags.append("gguf-my-repo")
306
  card.data.base_model = model_id
 
 
307
  mmproj_note = ""
308
  if mmproj_gguf and os.path.isfile(mmproj_gguf):
309
+ mmproj_note = f'\n\n## Multimodal Support\n\nThis model includes a multimodal projector (`ggml-model-mmproj-f32.gguf`) for vision capabilities. Use with llama.cpp vision models:\n\n```bash\nllama-cli -m {quantized_gguf_name} --mmproj ggml-model-mmproj-f32.gguf -p "Describe this image"\n```\n\n```bash\nllama-server -m {quantized_gguf_name} --mmproj ggml-model-mmproj-f32.gguf\n```\n'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
310
 
311
  card.text = dedent(
312
  f"""
313
  # {new_repo_id}
314
+ This model was converted to GGUF format from [`{model_id}`](https://huggingface.co/{model_id}) using llama.cpp via the ggml.ai's [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space.
 
 
315
  Refer to the [original model card](https://huggingface.co/{model_id}) for more details on the model.
316
  {mmproj_note}
317
  ## Use with llama.cpp
318
+ Install llama.cpp through brew (works on Mac and Linux)
319
 
 
320
  ```bash
321
  brew install llama.cpp
322
+
323
  ```
324
+ Invoke the llama.cpp server or the CLI.
325
 
326
  ### CLI:
327
  ```bash
 
333
  llama-server --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -c 2048
334
  ```
335
 
336
+ Note: You can also use this checkpoint directly through the [usage steps](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#usage) listed in the Llama.cpp repo as well.
 
337
 
338
  Step 1: Clone llama.cpp from GitHub.
339
  ```
340
  git clone https://github.com/ggerganov/llama.cpp
341
  ```
342
 
343
+ Step 2: Move into the llama.cpp folder and build it with `LLAMA_CURL=1` flag along with other hardware-specific flags (for ex: LLAMA_CUDA=1 for Nvidia GPUs on Linux).
 
344
  ```
345
  cd llama.cpp && LLAMA_CURL=1 make
346
  ```
 
355
  ```
356
  """
357
  )
358
+ readme_path = Path(outdir)/"README.md"
 
359
  card.save(readme_path)
360
 
361
  if split_model:
 
373
 
374
  if os.path.isfile(imatrix_path):
375
  try:
376
+ print(f"Uploading imatrix.dat: {imatrix_path}")
377
  api.upload_file(
378
  path_or_fileobj=imatrix_path,
379
  path_in_repo="imatrix.dat",
 
382
  except Exception as e:
383
  raise Exception(f"Error uploading imatrix.dat: {e}")
384
 
385
+ # Upload mmproj.gguf if it was generated
386
  if mmproj_gguf and os.path.isfile(mmproj_gguf):
387
  try:
388
+ print(f"Uploading mmproj.gguf: {mmproj_gguf}")
389
  api.upload_file(
390
  path_or_fileobj=mmproj_gguf,
391
+ path_in_repo="ggml-model-mmproj-f32.gguf",
392
  repo_id=new_repo_id,
393
  )
394
  except Exception as e:
395
+ print(f"Warning: Failed to upload mmproj.gguf: {e}")
396
 
397
  api.upload_file(
398
  path_or_fileobj=readme_path,
399
  path_in_repo="README.md",
400
  repo_id=new_repo_id,
401
  )
402
+ print(f"Uploaded successfully with {imatrix_q_method if use_imatrix else q_method} option!")
403
 
404
+ # end of the TemporaryDirectory(dir="outputs") block; temporary outputs are deleted here
 
 
 
405
 
406
  return (
407
+ f'<h1>✅ DONE</h1><br/>Find your repo here: <a href="{new_repo_url}" target="_blank" style="text-decoration:underline">{new_repo_id}</a>',
 
 
408
  "llama.png",
409
  )
 
410
  except Exception as e:
411
  return (f'<h1>❌ ERROR</h1><br/><pre style="white-space:pre-wrap;">{escape(str(e))}</pre>', "error.png")
412
 
413
 
414
# Stylesheet handed to gr.Blocks(css=css): lets the Gradio container scroll
# vertically instead of clipping long pages.
css = (
    "/* Custom CSS to allow scrolling */\n"
    ".gradio-container {overflow-y: auto;}\n"
)
 
417
  model_id = HuggingfaceHubSearch(
418
  label="Hub Model ID",
419
  placeholder="Search for model id on Huggingface",
 
421
  )
422
 
423
  q_method = gr.Dropdown(
424
+ ["Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L", "Q4_0", "Q4_K_S", "Q4_K_M", "Q5_0", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0"],
 
425
  label="Quantization Method",
426
  info="GGML quantization type",
427
  value="Q4_K_M",
428
  filterable=False,
429
+ visible=True
430
  )
431
 
432
  imatrix_q_method = gr.Dropdown(
 
435
  info="GGML imatrix quants type",
436
  value="IQ4_NL",
437
  filterable=False,
438
+ visible=False
439
  )
440
 
441
  use_imatrix = gr.Checkbox(
442
  value=False,
443
  label="Use Imatrix Quantization",
444
+ info="Use importance matrix for quantization."
445
  )
446
 
447
  private_repo = gr.Checkbox(
448
  value=False,
449
  label="Private Repo",
450
+ info="Create a private repo under your username."
451
  )
452
 
453
  train_data_file = gr.File(
454
  label="Training Data File",
455
  file_types=["txt"],
456
+ visible=False
457
  )
458
 
459
  split_model = gr.Checkbox(
460
  value=False,
461
  label="Split Model",
462
+ info="Shard the model using gguf-split."
463
  )
464
 
465
  split_max_tensors = gr.Number(
466
  value=256,
467
  label="Max Tensors per File",
468
  info="Maximum number of tensors per file when splitting model.",
469
+ visible=False
470
  )
471
 
472
  split_max_size = gr.Textbox(
473
  label="Max File Size",
474
+ info="Maximum file size when splitting model (--split-max-size). May leave empty to use the default. Accepted suffixes: M, G. Example: 256M, 5G",
475
+ visible=False
 
476
  )
477
 
 
478
  iface = gr.Interface(
479
+ fn=process_model,
480
+ inputs=[
481
+ model_id,
482
+ q_method,
483
+ use_imatrix,
484
+ imatrix_q_method,
485
+ private_repo,
486
+ train_data_file,
487
+ split_model,
488
+ split_max_tensors,
489
+ split_max_size,
490
+ ],
491
+ outputs=[
492
+ gr.Markdown(label="output"),
493
+ gr.Image(show_label=False),
494
+ ],
495
+ title="Create your own GGUF Quants, blazingly fast ⚡!",
496
+ description="The space takes an HF repo as an input, quantizes it and creates a Public repo containing the selected quant under your HF user namespace.",
497
+ api_name=False
498
+ )
499
 
500
+ # Create Gradio interface
501
  with gr.Blocks(css=css) as demo:
502
+ gr.Markdown("You must be logged in to use GGUF-my-repo.")
 
 
 
 
 
 
503
  gr.LoginButton(min_width=250)
504
 
505
  iface.render()
 
510
  split_model.change(
511
  fn=update_split_visibility,
512
  inputs=split_model,
513
+ outputs=[split_max_tensors, split_max_size]
514
  )
515
 
516
  def update_visibility(use_imatrix):
 
519
  use_imatrix.change(
520
  fn=update_visibility,
521
  inputs=use_imatrix,
522
+ outputs=[q_method, imatrix_q_method, train_data_file]
523
  )
524
 
 
525
  def restart_space():
526
  HfApi().restart_space(repo_id="ggml-org/gguf-my-repo", token=HF_TOKEN, factory_reboot=True)
527
 
 
528
  scheduler = BackgroundScheduler()
529
  scheduler.add_job(restart_space, "interval", seconds=21600)
530
  scheduler.start()
531
 
532
+ # Launch the interface
533
+ demo.queue(default_concurrency_limit=1, max_size=5).launch(debug=True, show_api=False)