Spaces:

AI-Joe-git
/

gguf-my-repo-v2

Runtime error

App Files Files Community

AI-Joe-git commited on Apr 29

Commit

776bb60

verified ·

1 Parent(s): 535f60d

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +142 -192

app.py CHANGED Viewed

@@ -12,21 +12,19 @@ from textwrap import dedent
 from apscheduler.schedulers.background import BackgroundScheduler
 HF_TOKEN = os.environ.get("HF_TOKEN")
 CONVERSION_SCRIPT = "./llama.cpp/convert_hf_to_gguf.py"
-# 🔧 CHANGED: was "ggml-model-mmproj-f32.gguf"
-MMPROJ_FILENAME = "ggml-model-mmproj-q8_0.gguf"
 def escape(s: str) -> str:
-    s = s.replace("&", "&amp;")
     s = s.replace("<", "&lt;")
     s = s.replace(">", "&gt;")
     s = s.replace('"', "&quot;")
     s = s.replace("\n", "<br/>")
     return s
 def is_vision_model(model_dir: str) -> bool:
     """Check if a HuggingFace model directory contains vision capabilities."""
     config_path = Path(model_dir) / "config.json"
@@ -42,13 +40,17 @@ def is_vision_model(model_dir: str) -> bool:
     if "vision_config" in config and config["vision_config"]:
         return True
     if config.get("image_token_id") is not None and config.get("vision_start_token_id") is not None:
         return True
     if "text_config" in config and isinstance(config["text_config"], dict):
         if "vision_config" in config["text_config"] and config["text_config"]["vision_config"]:
             return True
     if config.get("mm_cfg"):
         return True
     if config.get("mm_projector"):
         return True
@@ -56,26 +58,20 @@ def is_vision_model(model_dir: str) -> bool:
 def generate_mmproj(model_dir: str, outdir: str) -> str | None:
-    """
-    Generate a Q8_0-quantized mmproj GGUF from a HuggingFace vision model directory.
-    Uses --outtype q8_0 directly via convert_hf_to_gguf.py (supported natively).
-    Q8_0 reduces file size ~50% vs F32 with negligible quality loss for vision encoders.
-    """  # 🔧 CHANGED: entire docstring and logic updated
     print(f"[MMPROJ] Checking model dir: {model_dir}")
     if not is_vision_model(model_dir):
         print("[MMPROJ] Not a vision model, skipping mmproj generation.")
         return None
-    # 🔧 CHANGED: outtype is now q8_0 directly; no two-step conversion needed
-    print(f"[MMPROJ] Vision model detected — generating {MMPROJ_FILENAME} (Q8_0)...")
-    mmproj_outfile = str(Path(outdir) / MMPROJ_FILENAME)
     result = subprocess.run([
         "python", CONVERSION_SCRIPT,
         model_dir,
         "--outfile", mmproj_outfile,
-        "--outtype", "q8_0",   # 🔧 CHANGED: was "f32"
         "--mmproj",
     ], shell=False, capture_output=True)
@@ -85,28 +81,13 @@ def generate_mmproj(model_dir: str, outdir: str) -> str | None:
     if result.returncode != 0:
         print(f"[MMPROJ] stderr: {stderr[:1000]}")
         print(f"[MMPROJ] Return code: {result.returncode}")
-        # 🆕 NEW: graceful fallback — retry with f16 if q8_0 not supported by this model class
-        print("[MMPROJ] Q8_0 failed — retrying with f16 as fallback...")
-        mmproj_outfile_f16 = str(Path(outdir) / "ggml-model-mmproj-f16.gguf")
-        result2 = subprocess.run([
-            "python", CONVERSION_SCRIPT,
-            model_dir,
-            "--outfile", mmproj_outfile_f16,
-            "--outtype", "f16",
-            "--mmproj",
-        ], shell=False, capture_output=True)
-        if result2.returncode != 0 or not os.path.isfile(mmproj_outfile_f16):
-            print(f"[MMPROJ] Fallback f16 also failed: {result2.stderr.decode('utf-8')[:500]}")
-            return None
-        print(f"[MMPROJ] Fallback mmproj (f16) generated: {mmproj_outfile_f16}")
-        return mmproj_outfile_f16
     if not os.path.isfile(mmproj_outfile):
         print(f"[MMPROJ] File not found at {mmproj_outfile}")
         return None
-    size_mb = os.path.getsize(mmproj_outfile) / (1024 * 1024)
-    print(f"[MMPROJ] {MMPROJ_FILENAME} generated: {mmproj_outfile} ({size_mb:.1f} MB)")
     return mmproj_outfile
@@ -127,19 +108,18 @@ def generate_importance_matrix(model_path: str, train_data_path: str, output_pat
     process = subprocess.Popen(imatrix_command, shell=False)
     try:
-        process.wait(timeout=60)
     except subprocess.TimeoutExpired:
         print("Imatrix computation timed out. Sending SIGINT to allow graceful termination...")
         process.send_signal(signal.SIGINT)
         try:
-            process.wait(timeout=5)
         except subprocess.TimeoutExpired:
-            print("Imatrix proc still didn't term. Forcefully terminating process...")
             process.kill()
     print("Importance matrix generation completed.")
 def split_upload_model(model_path: str, outdir: str, repo_id: str, oauth_token: gr.OAuthToken | None, split_max_tensors=256, split_max_size=None):
     print(f"Model path: {model_path}")
     print(f"Output dir: {outdir}")
@@ -147,31 +127,40 @@ def split_upload_model(model_path: str, outdir: str, repo_id: str, oauth_token:
     if oauth_token is None or oauth_token.token is None:
         raise ValueError("You have to be logged in.")
-    split_cmd = ["./llama.cpp/llama-gguf-split", "--split"]
     if split_max_size:
-        split_cmd += ["--split-max-size", split_max_size]
     else:
-        split_cmd += ["--split-max-tensors", str(split_max_tensors)]
-    model_path_prefix = '.'.join(model_path.split('.')[:-1])
-    split_cmd += [model_path, model_path_prefix]
     print(f"Split command: {split_cmd}")
     result = subprocess.run(split_cmd, shell=False, capture_output=True, text=True)
     print(f"Split command stdout: {result.stdout}")
     print(f"Split command stderr: {result.stderr}")
     if result.returncode != 0:
-        raise Exception(f"Error splitting the model: {result.stderr}")
     print("Model split successfully!")
     if os.path.exists(model_path):
         os.remove(model_path)
     model_file_prefix = model_path_prefix.split('/')[-1]
     print(f"Model file name prefix: {model_file_prefix}")
     sharded_model_files = [f for f in os.listdir(outdir) if f.startswith(model_file_prefix) and f.endswith(".gguf")]
     if sharded_model_files:
         print(f"Sharded model files: {sharded_model_files}")
         api = HfApi(token=oauth_token.token)
@@ -191,168 +180,148 @@ def split_upload_model(model_path: str, outdir: str, repo_id: str, oauth_token:
     print("Sharded model has been uploaded successfully!")
 def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_repo, train_data_file, split_model, split_max_tensors, split_max_size, oauth_token: gr.OAuthToken | None):
     if oauth_token is None or oauth_token.token is None:
         raise gr.Error("You must be logged in to use GGUF-my-repo")
     try:
         whoami(oauth_token.token)
-    except Exception:
         raise gr.Error("You must be logged in to use GGUF-my-repo")
     model_name = model_id.split('/')[-1]
     try:
         api = HfApi(token=oauth_token.token)
         dl_pattern = ["*.md", "*.json", "*.model"]
         pattern = (
             "*.safetensors"
             if any(
                 file.path.endswith(".safetensors")
-                for file in api.list_repo_tree(repo_id=model_id, recursive=True)
             )
             else "*.bin"
         )
         dl_pattern += [pattern]
-        os.makedirs("downloads", exist_ok=True)
-        os.makedirs("outputs", exist_ok=True)
         with tempfile.TemporaryDirectory(dir="outputs") as outdir:
-            fp16 = str(Path(outdir) / f"{model_name}.fp16.gguf")
             with tempfile.TemporaryDirectory(dir="downloads") as tmpdir:
-                local_dir = Path(tmpdir) / model_name
                 print(local_dir)
-                api.snapshot_download(
-                    repo_id=model_id,
-                    local_dir=local_dir,
-                    local_dir_use_symlinks=False,
-                    allow_patterns=dl_pattern,
-                )
                 print("Model downloaded successfully!")
                 print(f"Current working directory: {os.getcwd()}")
                 print(f"Model directory contents: {os.listdir(local_dir)}")
-                config_dir = local_dir / "config.json"
-                adapter_config_dir = local_dir / "adapter_config.json"
                 if os.path.exists(adapter_config_dir) and not os.path.exists(config_dir):
-                    raise Exception(
-                        'adapter_config.json is present.<br/><br/>'
-                        'If you are converting a LoRA adapter to GGUF, please use '
-                        '<a href="https://huggingface.co/spaces/ggml-org/gguf-my-lora" '
-                        'target="_blank" style="text-decoration:underline">GGUF-my-lora</a>.'
-                    )
                 result = subprocess.run([
-                    "python", CONVERSION_SCRIPT, local_dir,
-                    "--outtype", "f16", "--outfile", fp16
                 ], shell=False, capture_output=True)
                 print(result)
                 if result.returncode != 0:
-                    raise Exception(f"Error converting to fp16: {result.stderr.decode('utf-8')}")
                 print("Model converted to fp16 successfully!")
-                # 🔧 CHANGED: now generates Q8_0 mmproj; filename tracked dynamically
-                mmproj_gguf = generate_mmproj(str(local_dir), outdir)
-                if mmproj_gguf:
-                    mmproj_filename_used = Path(mmproj_gguf).name
-                    print(f"[MMPROJ] Will upload: {mmproj_gguf} as {mmproj_filename_used}")
-                else:
-                    mmproj_filename_used = None
-            imatrix_path = Path(outdir) / "imatrix.dat"
             if use_imatrix:
-                train_data_path = train_data_file.name if train_data_file else "llama.cpp/groups_merged.txt"
                 print(f"Training data file path: {train_data_path}")
                 if not os.path.isfile(train_data_path):
                     raise Exception(f"Training data file not found: {train_data_path}")
                 generate_importance_matrix(fp16, train_data_path, imatrix_path)
             else:
                 print("Not using imatrix quantization.")
-            active_method = imatrix_q_method if use_imatrix else q_method
-            quantized_gguf_name = (
-                f"{model_name.lower()}-{imatrix_q_method.lower()}-imat.gguf"
-                if use_imatrix
-                else f"{model_name.lower()}-{q_method.lower()}.gguf"
-            )
-            quantized_gguf_path = str(Path(outdir) / quantized_gguf_name)
-            quantise_ggml = ["./llama.cpp/llama-quantize"]
             if use_imatrix:
-                quantise_ggml += ["--imatrix", imatrix_path, fp16, quantized_gguf_path, imatrix_q_method]
             else:
-                quantise_ggml += [fp16, quantized_gguf_path, q_method]
             result = subprocess.run(quantise_ggml, shell=False, capture_output=True)
             if result.returncode != 0:
-                raise Exception(f"Error quantizing: {result.stderr.decode('utf-8')}")
-            print(f"Quantized successfully with {active_method} option!")
             username = whoami(oauth_token.token)["name"]
-            new_repo_url = api.create_repo(
-                repo_id=f"{username}/{model_name}-{active_method}-GGUF",
-                exist_ok=True,
-                private=private_repo,
-            )
             new_repo_id = new_repo_url.repo_id
             print("Repo created successfully!", new_repo_url)
             try:
                 card = ModelCard.load(model_id, token=oauth_token.token)
-            except Exception:
                 card = ModelCard("")
             if card.data.tags is None:
                 card.data.tags = []
             card.data.tags.append("llama-cpp")
             card.data.tags.append("gguf-my-repo")
             card.data.base_model = model_id
-            # 🔧 CHANGED: mmproj section updated with actual filename, Q8_0 note, and richer instructions
             mmproj_note = ""
             if mmproj_gguf and os.path.isfile(mmproj_gguf):
-                mmproj_note = dedent(f"""
-                    ## 👁️ Vision / Multimodal Support
-                    This is a **vision-capable model**. A multimodal projector file (`{mmproj_filename_used}`) has been
-                    automatically generated and uploaded alongside the quantized weights.
-                    It is quantized at **Q8_0** — near-lossless compression (~50% smaller than F32)
-                    recommended for vision encoders.
-                    Use the `--mmproj` flag when running inference:
-                    ```bash
-                    # CLI
-                    llama-cli -m {quantized_gguf_name} --mmproj {mmproj_filename_used} -p "Describe this image" --image /path/to/image.jpg
-                    # Server
-                    llama-server -m {quantized_gguf_name} --mmproj {mmproj_filename_used}
-                    ```
-                    > **Tip:** Both files must be in the same directory, or provide full paths.
-                    > Compatible with llama.cpp builds that include vision support (default since mid-2024).
-                """)
             card.text = dedent(
                 f"""
                 # {new_repo_id}
-                This model was converted to GGUF format from [`{model_id}`](https://huggingface.co/{model_id})
-                using llama.cpp via the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space.
                 Refer to the [original model card](https://huggingface.co/{model_id}) for more details on the model.
                 {mmproj_note}
                 ## Use with llama.cpp
-                Install llama.cpp through brew (works on Mac and Linux):
                 ```bash
                 brew install llama.cpp
                 ```
                 ### CLI:
                 ```bash
@@ -364,16 +333,14 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
                 llama-server --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -c 2048
                 ```
-                Note: You can also use this checkpoint directly through the
-                [usage steps](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#usage) listed in the Llama.cpp repo.
                 Step 1: Clone llama.cpp from GitHub.
                 ```
                 git clone https://github.com/ggerganov/llama.cpp
                 ```
-                Step 2: Move into the llama.cpp folder and build it with `LLAMA_CURL=1` flag along with other
-                hardware-specific flags (for ex: LLAMA_CUDA=1 for Nvidia GPUs on Linux).
                 ```
                 cd llama.cpp && LLAMA_CURL=1 make
                 ```
@@ -388,8 +355,7 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
                 ```
                 """
             )
-            readme_path = Path(outdir) / "README.md"
             card.save(readme_path)
             if split_model:
@@ -407,6 +373,7 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
             if os.path.isfile(imatrix_path):
                 try:
                     api.upload_file(
                         path_or_fileobj=imatrix_path,
                         path_in_repo="imatrix.dat",
@@ -415,47 +382,38 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
                 except Exception as e:
                     raise Exception(f"Error uploading imatrix.dat: {e}")
-            # 🔧 CHANGED: upload path now uses dynamic filename (handles q8_0 or f16 fallback)
             if mmproj_gguf and os.path.isfile(mmproj_gguf):
                 try:
-                    print(f"Uploading {mmproj_filename_used}: {mmproj_gguf}")
                     api.upload_file(
                         path_or_fileobj=mmproj_gguf,
-                        path_in_repo=mmproj_filename_used,   # 🔧 CHANGED: was hardcoded "ggml-model-mmproj-f32.gguf"
                         repo_id=new_repo_id,
                     )
                 except Exception as e:
-                    print(f"Warning: Failed to upload {mmproj_filename_used}: {e}")
             api.upload_file(
                 path_or_fileobj=readme_path,
                 path_in_repo="README.md",
                 repo_id=new_repo_id,
             )
-            print(f"Uploaded successfully with {active_method} option!")
-        # 🆕 NEW: success message notes vision support when applicable
-        vision_note = ""
-        if mmproj_gguf:
-            vision_note = f'<br/>👁️ Vision model detected — <code>{mmproj_filename_used}</code> (Q8_0) also uploaded.'
         return (
-            f'<h1>✅ DONE</h1><br/>Find your repo here: '
-            f'<a href="{new_repo_url}" target="_blank" style="text-decoration:underline">{new_repo_id}</a>'
-            f'{vision_note}',
             "llama.png",
         )
     except Exception as e:
         return (f'<h1>❌ ERROR</h1><br/><pre style="white-space:pre-wrap;">{escape(str(e))}</pre>', "error.png")
-# ─── UI ──────────────────────────────────────────────────────────────────────
-css = """
 .gradio-container {overflow-y: auto;}
 """
 model_id = HuggingfaceHubSearch(
     label="Hub Model ID",
     placeholder="Search for model id on Huggingface",
@@ -463,13 +421,12 @@ model_id = HuggingfaceHubSearch(
 )
 q_method = gr.Dropdown(
-    ["Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L", "Q4_0", "Q4_K_S", "Q4_K_M",
-     "Q5_0", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0"],
     label="Quantization Method",
     info="GGML quantization type",
     value="Q4_K_M",
     filterable=False,
-    visible=True,
 )
 imatrix_q_method = gr.Dropdown(
@@ -478,77 +435,71 @@ imatrix_q_method = gr.Dropdown(
     info="GGML imatrix quants type",
     value="IQ4_NL",
     filterable=False,
-    visible=False,
 )
 use_imatrix = gr.Checkbox(
     value=False,
     label="Use Imatrix Quantization",
-    info="Use importance matrix for quantization.",
 )
 private_repo = gr.Checkbox(
     value=False,
     label="Private Repo",
-    info="Create a private repo under your username.",
 )
 train_data_file = gr.File(
     label="Training Data File",
     file_types=["txt"],
-    visible=False,
 )
 split_model = gr.Checkbox(
     value=False,
     label="Split Model",
-    info="Shard the model using gguf-split.",
 )
 split_max_tensors = gr.Number(
     value=256,
     label="Max Tensors per File",
     info="Maximum number of tensors per file when splitting model.",
-    visible=False,
 )
 split_max_size = gr.Textbox(
     label="Max File Size",
-    info="Maximum file size when splitting model (--split-max-size). "
-         "May leave empty to use the default. Accepted suffixes: M, G. Example: 256M, 5G",
-    visible=False,
 )
-# 🔧 CHANGED: description updated to mention vision/mmproj support
 iface = gr.Interface(
-    fn=process_model,
-    inputs=[
-        model_id, q_method, use_imatrix, imatrix_q_method,
-        private_repo, train_data_file, split_model, split_max_tensors, split_max_size,
-    ],
-    outputs=[
-        gr.Markdown(label="output"),
-        gr.Image(show_label=False),
-    ],
-    title="Create your own GGUF Quants, blazingly fast ⚡!",
-    description=(
-        "The space takes an HF repo as input, quantizes it and creates a repo containing "
-        "the selected quant under your HF user namespace.\n\n"
-        "👁️ **Vision models are automatically detected** — a multimodal projector "
-        "(`ggml-model-mmproj-q8_0.gguf`) will be generated and uploaded alongside "
-        "the quantized weights, ready to use with `llama-cli --mmproj`."
-    ),
-    api_name=False,
-)
 with gr.Blocks(css=css) as demo:
-    # 🔧 CHANGED: header updated to mention vision support
-    gr.Markdown(
-        "You must be logged in to use GGUF-my-repo.\n\n"
-        "> 👁️ **Vision model support:** If the selected model has vision capabilities, "
-        "a `ggml-model-mmproj-q8_0.gguf` projector file will be **automatically generated "
-        "and uploaded** to your repo alongside the quantized model."
-    )
     gr.LoginButton(min_width=250)
     iface.render()
@@ -559,7 +510,7 @@ with gr.Blocks(css=css) as demo:
     split_model.change(
         fn=update_split_visibility,
         inputs=split_model,
-        outputs=[split_max_tensors, split_max_size],
     )
     def update_visibility(use_imatrix):
@@ -568,16 +519,15 @@ with gr.Blocks(css=css) as demo:
     use_imatrix.change(
         fn=update_visibility,
         inputs=use_imatrix,
-        outputs=[q_method, imatrix_q_method, train_data_file],
     )
 def restart_space():
     HfApi().restart_space(repo_id="ggml-org/gguf-my-repo", token=HF_TOKEN, factory_reboot=True)
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=21600)
 scheduler.start()
-demo.queue(default_concurrency_limit=1, max_size=5).launch(debug=True, show_api=False)

 from apscheduler.schedulers.background import BackgroundScheduler
+# used for restarting the space
 HF_TOKEN = os.environ.get("HF_TOKEN")
 CONVERSION_SCRIPT = "./llama.cpp/convert_hf_to_gguf.py"
+# escape HTML for logging
 def escape(s: str) -> str:
+    s = s.replace("&", "&amp;") # Must be done first!
     s = s.replace("<", "&lt;")
     s = s.replace(">", "&gt;")
     s = s.replace('"', "&quot;")
     s = s.replace("\n", "<br/>")
     return s
 def is_vision_model(model_dir: str) -> bool:
     """Check if a HuggingFace model directory contains vision capabilities."""
     config_path = Path(model_dir) / "config.json"
     if "vision_config" in config and config["vision_config"]:
         return True
     if config.get("image_token_id") is not None and config.get("vision_start_token_id") is not None:
         return True
     if "text_config" in config and isinstance(config["text_config"], dict):
         if "vision_config" in config["text_config"] and config["text_config"]["vision_config"]:
             return True
     if config.get("mm_cfg"):
         return True
     if config.get("mm_projector"):
         return True
 def generate_mmproj(model_dir: str, outdir: str) -> str | None:
+    """Generate mmproj.gguf from a HuggingFace vision model directory."""
     print(f"[MMPROJ] Checking model dir: {model_dir}")
     if not is_vision_model(model_dir):
         print("[MMPROJ] Not a vision model, skipping mmproj generation.")
         return None
+    print(f"[MMPROJ] Vision model detected, generating mmproj.gguf...")
+    mmproj_outfile = str(Path(outdir) / "ggml-model-mmproj-f32.gguf")
     result = subprocess.run([
         "python", CONVERSION_SCRIPT,
         model_dir,
         "--outfile", mmproj_outfile,
+        "--outtype", "f32",
         "--mmproj",
     ], shell=False, capture_output=True)
     if result.returncode != 0:
         print(f"[MMPROJ] stderr: {stderr[:1000]}")
         print(f"[MMPROJ] Return code: {result.returncode}")
+        return None
     if not os.path.isfile(mmproj_outfile):
         print(f"[MMPROJ] File not found at {mmproj_outfile}")
         return None
+    print(f"[MMPROJ] mmproj.gguf generated: {mmproj_outfile} ({os.path.getsize(mmproj_outfile) / (1024*1024):.1f} MB)")
     return mmproj_outfile
     process = subprocess.Popen(imatrix_command, shell=False)
     try:
+        process.wait(timeout=60)  # added wait
     except subprocess.TimeoutExpired:
         print("Imatrix computation timed out. Sending SIGINT to allow graceful termination...")
         process.send_signal(signal.SIGINT)
         try:
+            process.wait(timeout=5)  # grace period
         except subprocess.TimeoutExpired:
+            print("Imatrix proc still didn't term. Forecfully terming process...")
             process.kill()
     print("Importance matrix generation completed.")
 def split_upload_model(model_path: str, outdir: str, repo_id: str, oauth_token: gr.OAuthToken | None, split_max_tensors=256, split_max_size=None):
     print(f"Model path: {model_path}")
     print(f"Output dir: {outdir}")
     if oauth_token is None or oauth_token.token is None:
         raise ValueError("You have to be logged in.")
+    split_cmd = [
+        "./llama.cpp/llama-gguf-split",
+        "--split",
+    ]
     if split_max_size:
+        split_cmd.append("--split-max-size")
+        split_cmd.append(split_max_size)
     else:
+        split_cmd.append("--split-max-tensors")
+        split_cmd.append(str(split_max_tensors))
+    # args for output
+    model_path_prefix = '.'.join(model_path.split('.')[:-1]) # remove the file extension
+    split_cmd.append(model_path)
+    split_cmd.append(model_path_prefix)
     print(f"Split command: {split_cmd}")
     result = subprocess.run(split_cmd, shell=False, capture_output=True, text=True)
     print(f"Split command stdout: {result.stdout}")
     print(f"Split command stderr: {result.stderr}")
     if result.returncode != 0:
+        stderr_str = result.stderr.decode("utf-8")
+        raise Exception(f"Error splitting the model: {stderr_str}")
     print("Model split successfully!")
+    # remove the original model file if needed
     if os.path.exists(model_path):
         os.remove(model_path)
     model_file_prefix = model_path_prefix.split('/')[-1]
     print(f"Model file name prefix: {model_file_prefix}")
     sharded_model_files = [f for f in os.listdir(outdir) if f.startswith(model_file_prefix) and f.endswith(".gguf")]
     if sharded_model_files:
         print(f"Sharded model files: {sharded_model_files}")
         api = HfApi(token=oauth_token.token)
     print("Sharded model has been uploaded successfully!")
 def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_repo, train_data_file, split_model, split_max_tensors, split_max_size, oauth_token: gr.OAuthToken | None):
     if oauth_token is None or oauth_token.token is None:
         raise gr.Error("You must be logged in to use GGUF-my-repo")
+    # validate the oauth token
     try:
         whoami(oauth_token.token)
+    except Exception as e:
         raise gr.Error("You must be logged in to use GGUF-my-repo")
     model_name = model_id.split('/')[-1]
     try:
         api = HfApi(token=oauth_token.token)
         dl_pattern = ["*.md", "*.json", "*.model"]
         pattern = (
             "*.safetensors"
             if any(
                 file.path.endswith(".safetensors")
+                for file in api.list_repo_tree(
+                    repo_id=model_id,
+                    recursive=True,
+                )
             )
             else "*.bin"
         )
         dl_pattern += [pattern]
+        if not os.path.exists("downloads"):
+            os.makedirs("downloads")
+        if not os.path.exists("outputs"):
+            os.makedirs("outputs")
         with tempfile.TemporaryDirectory(dir="outputs") as outdir:
+            fp16 = str(Path(outdir)/f"{model_name}.fp16.gguf")
             with tempfile.TemporaryDirectory(dir="downloads") as tmpdir:
+                # Keep the model name as the dirname so the model name metadata is populated correctly
+                local_dir = Path(tmpdir)/model_name
                 print(local_dir)
+                api.snapshot_download(repo_id=model_id, local_dir=local_dir, local_dir_use_symlinks=False, allow_patterns=dl_pattern)
                 print("Model downloaded successfully!")
                 print(f"Current working directory: {os.getcwd()}")
                 print(f"Model directory contents: {os.listdir(local_dir)}")
+                config_dir = local_dir/"config.json"
+                adapter_config_dir = local_dir/"adapter_config.json"
                 if os.path.exists(adapter_config_dir) and not os.path.exists(config_dir):
+                    raise Exception('adapter_config.json is present.<br/><br/>If you are converting a LoRA adapter to GGUF, please use <a href="https://huggingface.co/spaces/ggml-org/gguf-my-lora" target="_blank" style="text-decoration:underline">GGUF-my-lora</a>.')
                 result = subprocess.run([
+                    "python", CONVERSION_SCRIPT, local_dir, "--outtype", "f16", "--outfile", fp16
                 ], shell=False, capture_output=True)
                 print(result)
                 if result.returncode != 0:
+                    stderr_str = result.stderr.decode("utf-8")
+                    raise Exception(f"Error converting to fp16: {stderr_str}")
                 print("Model converted to fp16 successfully!")
+                print(f"Converted model path: {fp16}")
+                # Generate mmproj.gguf for vision models
+                mmproj_gguf_path = str(Path(outdir)/"ggml-model-mmproj-f32.gguf")
+                mmproj_result = generate_mmproj(str(local_dir), outdir)
+                mmproj_gguf = None
+                if mmproj_result:
+                    mmproj_gguf = mmproj_result
+                    print(f"[MMPROJ] Will upload mmproj.gguf: {mmproj_gguf}")
+            imatrix_path = Path(outdir)/"imatrix.dat"
             if use_imatrix:
+                if train_data_file:
+                    train_data_path = train_data_file.name
+                else:
+                    train_data_path = "llama.cpp/groups_merged.txt" #fallback calibration dataset
                 print(f"Training data file path: {train_data_path}")
                 if not os.path.isfile(train_data_path):
                     raise Exception(f"Training data file not found: {train_data_path}")
                 generate_importance_matrix(fp16, train_data_path, imatrix_path)
             else:
                 print("Not using imatrix quantization.")
+            # Quantize the model
+            quantized_gguf_name = f"{model_name.lower()}-{imatrix_q_method.lower()}-imat.gguf" if use_imatrix else f"{model_name.lower()}-{q_method.lower()}.gguf"
+            quantized_gguf_path = str(Path(outdir)/quantized_gguf_name)
             if use_imatrix:
+                quantise_ggml = [
+                    "./llama.cpp/llama-quantize",
+                    "--imatrix", imatrix_path, fp16, quantized_gguf_path, imatrix_q_method
+                ]
             else:
+                quantise_ggml = [
+                    "./llama.cpp/llama-quantize",
+                    fp16, quantized_gguf_path, q_method
+                ]
             result = subprocess.run(quantise_ggml, shell=False, capture_output=True)
             if result.returncode != 0:
+                stderr_str = result.stderr.decode("utf-8")
+                raise Exception(f"Error quantizing: {stderr_str}")
+            print(f"Quantized successfully with {imatrix_q_method if use_imatrix else q_method} option!")
+            print(f"Quantized model path: {quantized_gguf_path}")
+            # Create empty repo
             username = whoami(oauth_token.token)["name"]
+            new_repo_url = api.create_repo(repo_id=f"{username}/{model_name}-{imatrix_q_method if use_imatrix else q_method}-GGUF", exist_ok=True, private=private_repo)
             new_repo_id = new_repo_url.repo_id
             print("Repo created successfully!", new_repo_url)
             try:
                 card = ModelCard.load(model_id, token=oauth_token.token)
+            except:
                 card = ModelCard("")
             if card.data.tags is None:
                 card.data.tags = []
             card.data.tags.append("llama-cpp")
             card.data.tags.append("gguf-my-repo")
             card.data.base_model = model_id
             mmproj_note = ""
             if mmproj_gguf and os.path.isfile(mmproj_gguf):
+                mmproj_note = f'\n\n## Multimodal Support\n\nThis model includes a multimodal projector (`ggml-model-mmproj-f32.gguf`) for vision capabilities. Use with llama.cpp vision models:\n\n```bash\nllama-cli -m {quantized_gguf_name} --mmproj ggml-model-mmproj-f32.gguf -p "Describe this image"\n```\n\n```bash\nllama-server -m {quantized_gguf_name} --mmproj ggml-model-mmproj-f32.gguf\n```\n'
             card.text = dedent(
                 f"""
                 # {new_repo_id}
+                This model was converted to GGUF format from [`{model_id}`](https://huggingface.co/{model_id}) using llama.cpp via the ggml.ai's [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space.
                 Refer to the [original model card](https://huggingface.co/{model_id}) for more details on the model.
                 {mmproj_note}
                 ## Use with llama.cpp
+                Install llama.cpp through brew (works on Mac and Linux)
                 ```bash
                 brew install llama.cpp
                 ```
+                Invoke the llama.cpp server or the CLI.
                 ### CLI:
                 ```bash
                 llama-server --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -c 2048
                 ```
+                Note: You can also use this checkpoint directly through the [usage steps](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#usage) listed in the Llama.cpp repo as well.
                 Step 1: Clone llama.cpp from GitHub.
                 ```
                 git clone https://github.com/ggerganov/llama.cpp
                 ```
+                Step 2: Move into the llama.cpp folder and build it with `LLAMA_CURL=1` flag along with other hardware-specific flags (for ex: LLAMA_CUDA=1 for Nvidia GPUs on Linux).
                 ```
                 cd llama.cpp && LLAMA_CURL=1 make
                 ```
                 ```
                 """
             )
+            readme_path = Path(outdir)/"README.md"
             card.save(readme_path)
             if split_model:
             if os.path.isfile(imatrix_path):
                 try:
+                    print(f"Uploading imatrix.dat: {imatrix_path}")
                     api.upload_file(
                         path_or_fileobj=imatrix_path,
                         path_in_repo="imatrix.dat",
                 except Exception as e:
                     raise Exception(f"Error uploading imatrix.dat: {e}")
+            # Upload mmproj.gguf if it was generated
             if mmproj_gguf and os.path.isfile(mmproj_gguf):
                 try:
+                    print(f"Uploading mmproj.gguf: {mmproj_gguf}")
                     api.upload_file(
                         path_or_fileobj=mmproj_gguf,
+                        path_in_repo="ggml-model-mmproj-f32.gguf",
                         repo_id=new_repo_id,
                     )
                 except Exception as e:
+                    print(f"Warning: Failed to upload mmproj.gguf: {e}")
             api.upload_file(
                 path_or_fileobj=readme_path,
                 path_in_repo="README.md",
                 repo_id=new_repo_id,
             )
+            print(f"Uploaded successfully with {imatrix_q_method if use_imatrix else q_method} option!")
+        # end of the TemporaryDirectory(dir="outputs") block; temporary outputs are deleted here
         return (
+            f'<h1>✅ DONE</h1><br/>Find your repo here: <a href="{new_repo_url}" target="_blank" style="text-decoration:underline">{new_repo_id}</a>',
             "llama.png",
         )
     except Exception as e:
         return (f'<h1>❌ ERROR</h1><br/><pre style="white-space:pre-wrap;">{escape(str(e))}</pre>', "error.png")
+css="""/* Custom CSS to allow scrolling */
 .gradio-container {overflow-y: auto;}
 """
 model_id = HuggingfaceHubSearch(
     label="Hub Model ID",
     placeholder="Search for model id on Huggingface",
 )
 q_method = gr.Dropdown(
+    ["Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L", "Q4_0", "Q4_K_S", "Q4_K_M", "Q5_0", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0"],
     label="Quantization Method",
     info="GGML quantization type",
     value="Q4_K_M",
     filterable=False,
+    visible=True
 )
 imatrix_q_method = gr.Dropdown(
     info="GGML imatrix quants type",
     value="IQ4_NL",
     filterable=False,
+    visible=False
 )
 use_imatrix = gr.Checkbox(
     value=False,
     label="Use Imatrix Quantization",
+    info="Use importance matrix for quantization."
 )
 private_repo = gr.Checkbox(
     value=False,
     label="Private Repo",
+    info="Create a private repo under your username."
 )
 train_data_file = gr.File(
     label="Training Data File",
     file_types=["txt"],
+    visible=False
 )
 split_model = gr.Checkbox(
     value=False,
     label="Split Model",
+    info="Shard the model using gguf-split."
 )
 split_max_tensors = gr.Number(
     value=256,
     label="Max Tensors per File",
     info="Maximum number of tensors per file when splitting model.",
+    visible=False
 )
 split_max_size = gr.Textbox(
     label="Max File Size",
+    info="Maximum file size when splitting model (--split-max-size). May leave empty to use the default. Accepted suffixes: M, G. Example: 256M, 5G",
+    visible=False
 )
 iface = gr.Interface(
+        fn=process_model,
+        inputs=[
+            model_id,
+            q_method,
+            use_imatrix,
+            imatrix_q_method,
+            private_repo,
+            train_data_file,
+            split_model,
+            split_max_tensors,
+            split_max_size,
+        ],
+        outputs=[
+            gr.Markdown(label="output"),
+            gr.Image(show_label=False),
+        ],
+        title="Create your own GGUF Quants, blazingly fast ⚡!",
+        description="The space takes an HF repo as an input, quantizes it and creates a Public repo containing the selected quant under your HF user namespace.",
+        api_name=False
+    )
+# Create Gradio interface
 with gr.Blocks(css=css) as demo:
+    gr.Markdown("You must be logged in to use GGUF-my-repo.")
     gr.LoginButton(min_width=250)
     iface.render()
     split_model.change(
         fn=update_split_visibility,
         inputs=split_model,
+        outputs=[split_max_tensors, split_max_size]
     )
     def update_visibility(use_imatrix):
     use_imatrix.change(
         fn=update_visibility,
         inputs=use_imatrix,
+        outputs=[q_method, imatrix_q_method, train_data_file]
     )
 def restart_space():
     HfApi().restart_space(repo_id="ggml-org/gguf-my-repo", token=HF_TOKEN, factory_reboot=True)
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=21600)
 scheduler.start()
+# Launch the interface
+demo.queue(default_concurrency_limit=1, max_size=5).launch(debug=True, show_api=False)