#!/usr/bin/env python3
"""
Duplicate layer 5 (GQA Attention) in nemotron-cascade-2 GGUF.

Produces a 53-layer model where layers 5 and 6 are identical attention blocks.
All layers >= 6 in the original are shifted up by 1.
Per-layer metadata arrays (head_count_kv, feed_forward_length) are also updated.

The layer counts above describe the default SRC model; the script itself reads
the real block count from the source file and writes ``block_count + 1``.
"""

import re
import numpy as np
from pathlib import Path
from gguf import GGUFReader, GGUFWriter, GGUFValueType

SRC = Path("/usr/share/ollama/.ollama/models/blobs/sha256-9e0c827cfd6a6d000032be3da3d0914668b0c1112977e927186d29c4487466c4")
DST = Path("/home/j/nemotron-cascade-2-attn-repeat-L5.gguf")
DUP_LAYER = 5
ARCH = "nemotron_h_moe"

# Per-layer array fields that need an extra element inserted
PER_LAYER_ARRAY_FIELDS = {
    f"{ARCH}.attention.head_count_kv",
    f"{ARCH}.feed_forward_length",
}

# Matches per-block tensor names: group 1 is the block index, group 2 the rest.
_BLOCK_TENSOR_RE = re.compile(r'^blk\.(\d+)\.(.*)')

# Scalar value type -> (GGUFWriter method name, Python conversion).
# STRING is handled separately because its payload is a byte buffer.
_SCALAR_WRITERS = {
    GGUFValueType.UINT8: ("add_uint8", int),
    GGUFValueType.INT8: ("add_int8", int),
    GGUFValueType.UINT16: ("add_uint16", int),
    GGUFValueType.INT16: ("add_int16", int),
    GGUFValueType.UINT32: ("add_uint32", int),
    GGUFValueType.INT32: ("add_int32", int),
    GGUFValueType.UINT64: ("add_uint64", int),
    GGUFValueType.INT64: ("add_int64", int),
    GGUFValueType.FLOAT32: ("add_float32", float),
    GGUFValueType.FLOAT64: ("add_float64", float),
    GGUFValueType.BOOL: ("add_bool", bool),
}

# Numeric array element types this script re-emits, with the Python
# conversion applied to every element. Other element types are skipped.
_ARRAY_ELEMENT_CASTS = {
    GGUFValueType.UINT32: int,
    GGUFValueType.INT32: int,
    GGUFValueType.FLOAT32: float,
}


def _part_value(part, cast=int):
    """Return the first element of a field part converted with *cast*.

    Accepts a one-element array, a 0-d array or a plain Python scalar.
    """
    return cast(np.asarray(part).flat[0])


def _read_numeric_array(field, cast):
    """Read a numeric array field into a list of *cast*-converted values.

    Relies on the part layout described in ``read_per_layer_array``: the
    element count is in ``parts[4]`` and the elements start at ``parts[5]``.
    Raises IndexError if the field has fewer parts than its count announces.
    """
    parts = field.parts
    count = _part_value(parts[4])
    return [_part_value(parts[5 + i], cast) for i in range(count)]


def read_per_layer_array(field):
    """Read a per-layer uint32 array from GGUF field parts.

    In gguf-py, array fields have parts:
      [0]: key name length
      [1]: key name bytes
      [2]: array type marker (9 = ARRAY)
      [3]: element type (4 = UINT32)
      [4]: array length
      [5..]: individual element values

    Returns:
        A list of Python ints, one per array element.
    """
    return _read_numeric_array(field, int)


def _insert_duplicate(values, index):
    """Return a copy of *values* with element *index* repeated right after itself.

    Raises:
        ValueError: if *index* is not a valid position in *values*.
    """
    if not 0 <= index < len(values):
        raise ValueError(
            f"cannot duplicate index {index} in array of {len(values)} elements"
        )
    return values[:index + 1] + [values[index]] + values[index + 1:]


def _output_tensor_names(name, dup_layer):
    """Return the name(s) under which tensor *name* is written to the new file.

    Non-block tensors and blocks below *dup_layer* keep their name, the
    duplicated block is emitted under its own index and the next one, and
    every later block is shifted up by one.
    """
    m = _BLOCK_TENSOR_RE.match(name)
    if m is None:
        return [name]
    blk = int(m.group(1))
    shifted = f"blk.{blk + 1}.{m.group(2)}"
    if blk < dup_layer:
        return [name]
    if blk == dup_layer:
        return [name, shifted]
    return [shifted]


def _copy_array(writer, field_name, field):
    """Copy one array-valued metadata field; return True if it was written."""
    elem_type = field.types[1]
    if elem_type == GGUFValueType.STRING:
        # field.data lists the indices of the string payload parts, so there
        # is no need to guess at the (length, bytes) pairing inside parts.
        values = [field.parts[idx].tobytes().decode('utf-8') for idx in field.data]
    elif elem_type in _ARRAY_ELEMENT_CASTS:
        values = _read_numeric_array(field, _ARRAY_ELEMENT_CASTS[elem_type])
    else:
        print(f" SKIP array: {field_name} (elem type {elem_type})")
        return False
    if not values:
        # add_array() infers the element type from the values, so an empty
        # array cannot be re-emitted.
        print(f" SKIP array: {field_name} (empty)")
        return False
    writer.add_array(field_name, values)
    return True


def _copy_scalar(writer, field_name, field):
    """Copy one scalar metadata field; return True if it was written."""
    field_type = field.types[-1] if field.types else None
    if field_type == GGUFValueType.STRING:
        writer.add_string(field_name, field.parts[-1].tobytes().decode('utf-8'))
        return True
    entry = _SCALAR_WRITERS.get(field_type)
    add = getattr(writer, entry[0], None) if entry else None
    if add is None:
        print(f" SKIP: {field_name} (type {field_type})")
        return False
    add(field_name, _part_value(field.parts[-1], entry[1]))
    return True


def _copy_field(writer, field_name, field):
    """Copy a metadata field unchanged; return True if it was written."""
    if len(field.types) > 1 and field.types[0] == GGUFValueType.ARRAY:
        return _copy_array(writer, field_name, field)
    try:
        return _copy_scalar(writer, field_name, field)
    except (ValueError, TypeError, IndexError) as e:
        # Best effort: report the field and carry on with the others.
        print(f" ERROR on {field_name}: {e}")
        return False


def main():
    """Read SRC, duplicate block DUP_LAYER and write the result to DST.

    Raises:
        ValueError: if SRC has no block_count field, or DUP_LAYER is not a
            valid block index for it.
    """
    print(f"Reading {SRC} ...")
    reader = GGUFReader(str(SRC))

    # --- Determine layer counts from the file instead of assuming them ---
    block_key = f"{ARCH}.block_count"
    block_field = reader.fields.get(block_key)
    if block_field is None:
        raise ValueError(f"{SRC} has no '{block_key}' metadata field")
    old_blocks = _part_value(block_field.parts[-1])
    if not 0 <= DUP_LAYER < old_blocks:
        raise ValueError(
            f"DUP_LAYER={DUP_LAYER} is out of range for a {old_blocks}-layer model"
        )
    new_blocks = old_blocks + 1

    # --- Build tensor list with duplication ---
    tensors_to_write = [
        (new_name, t)
        for t in reader.tensors
        for new_name in _output_tensor_names(t.name, DUP_LAYER)
    ]

    print(f"Original: {len(reader.tensors)} tensors, {old_blocks} layers")
    print(f"New: {len(tensors_to_write)} tensors, {new_blocks} layers")

    # --- Read per-layer arrays and insert duplicate ---
    per_layer_arrays = {}
    for field_name in PER_LAYER_ARRAY_FIELDS:
        field = reader.fields.get(field_name)
        # A missing or scalar field is left to the generic copy path below.
        if (
            field is not None
            and len(field.types) > 1
            and field.types[0] == GGUFValueType.ARRAY
        ):
            orig = read_per_layer_array(field)
            # Insert duplicate of DUP_LAYER at DUP_LAYER+1
            new_arr = _insert_duplicate(orig, DUP_LAYER)
            per_layer_arrays[field_name] = new_arr
            print(f" {field_name}: {len(orig)} -> {len(new_arr)} elements")
            # Show attention/moe layer positions for verification
            nonzero = [(i, v) for i, v in enumerate(new_arr) if v != 0]
            print(f" non-zero positions: {nonzero}")

    # --- Write new GGUF ---
    print(f"Writing {DST} ...")
    writer = GGUFWriter(str(DST), ARCH)
    not_copied = []
    try:
        for field_name, field in reader.fields.items():
            # "GGUF.*" entries are skipped, as in the original script.
            if field_name.startswith("GGUF."):
                continue

            # Skip architecture (writer adds automatically)
            if field_name == "general.architecture":
                continue

            # Update block_count
            if field_name == block_key:
                print(f" block_count: {old_blocks} -> {new_blocks}")
                writer.add_uint32(block_key, new_blocks)
                continue

            # Handle per-layer arrays with inserted element
            if field_name in per_layer_arrays:
                writer.add_array(field_name, per_layer_arrays[field_name])
                continue

            if not _copy_field(writer, field_name, field):
                not_copied.append(field_name)

        # --- Add tensors ---
        for new_name, tensor in tensors_to_write:
            writer.add_tensor(new_name, tensor.data, raw_dtype=tensor.tensor_type)

        print("Finalizing ...")
        writer.write_header_to_file()
        writer.write_kv_data_to_file()
        writer.write_tensors_to_file()
    finally:
        # Close the writer even when a write step fails.
        writer.close()

    if not_copied:
        print(f" WARNING: {len(not_copied)} metadata field(s) not copied: {not_copied}")

    size_gb = DST.stat().st_size / (1024**3)
    print(f"Done! {DST} ({size_gb:.1f} GB)")


if __name__ == "__main__":
    main()