Upload folder using huggingface_hub

Browse files

Files changed (11) hide show

.gitattributes +1 -0
FiTv1-XL-2-256/README.md +65 -0
FiTv1-XL-2-256/demo.png +3 -0
FiTv1-XL-2-256/model_index.json +1021 -0
FiTv1-XL-2-256/pipeline.py +447 -0
FiTv1-XL-2-256/scheduler/scheduler_config.json +15 -0
FiTv1-XL-2-256/transformer/config.json +16 -0
FiTv1-XL-2-256/transformer/diffusion_pytorch_model.safetensors +3 -0
FiTv1-XL-2-256/transformer/fit_transformer_2d.py +993 -0
FiTv1-XL-2-256/vae/config.json +38 -0
FiTv1-XL-2-256/vae/diffusion_pytorch_model.safetensors +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+FiTv1-XL-2-256/demo.png filter=lfs diff=lfs merge=lfs -text

FiTv1-XL-2-256/README.md ADDED Viewed

	@@ -0,0 +1,65 @@

+---
+license: apache-2.0
+library_name: diffusers
+pipeline_tag: unconditional-image-generation
+tags:
+  - diffusers
+  - fit
+  - image-generation
+  - class-conditional
+  - imagenet
+inference: true
+---
+# FiTv1-XL-2-256
+Self-contained Diffusers checkpoint for **FiTv1-XL/2**, converted from [`InfImagine/FiT`](https://huggingface.co/InfImagine/FiT).
+This folder is a self-contained Diffusers model repo containing:
+- `model_index.json` (includes ImageNet `id2label`)
+- `pipeline.py` (custom `FiTPipeline`)
+- `transformer/fit_transformer_2d.py` and weights
+- `scheduler/scheduler_config.json` (configuration for the stock Diffusers `DDPMScheduler` referenced in `model_index.json`)
+- `vae/diffusion_pytorch_model.safetensors`
+## Recommended inference (256×256)
+| Setting | Value |
+| --- | --- |
+| Resolution | 256×256 |
+| Sampler | improved diffusion (DDPM respaced) |
+| Steps | 250 |
+| CFG scale | 1.5 |
+| Dtype | `bfloat16` (recommended on Ampere+) |
+| VAE | `stabilityai/sd-vae-ft-ema` (bundled under `vae/`) |
+```python
+from pathlib import Path
+import torch
+from diffusers import DiffusionPipeline
+model_dir = Path("./FiTv1-XL-2-256").resolve()
+pipe = DiffusionPipeline.from_pretrained(
+    str(model_dir),
+    local_files_only=True,
+    custom_pipeline=str(model_dir / "pipeline.py"),
+    trust_remote_code=True,
+    torch_dtype=torch.bfloat16,
+)
+pipe.to("cuda")
+print(pipe.id2label[207])
+print(pipe.get_label_ids("golden retriever"))
+generator = torch.Generator(device="cuda").manual_seed(42)
+image = pipe(
+    class_labels="golden retriever",
+    height=256,
+    width=256,
+    num_inference_steps=250,
+    guidance_scale=1.5,
+    generator=generator,
+).images[0]
+image.save("demo.png")
+```

FiTv1-XL-2-256/demo.png ADDED Viewed

Git LFS Details

SHA256: 5e38af76d6f83632058c958517367d351c4a4c8d6d03eb65e337f38ebc977f68
Pointer size: 131 Bytes
Size of remote file: 127 kB

FiTv1-XL-2-256/model_index.json ADDED Viewed

	@@ -0,0 +1,1021 @@

+{
+  "_class_name": [
+    "pipeline",
+    "FiTPipeline"
+  ],
+  "_diffusers_version": "0.36.0",
+  "transformer": [
+    "fit_transformer_2d",
+    "FiTTransformer2DModel"
+  ],
+  "vae": [
+    "diffusers",
+    "AutoencoderKL"
+  ],
+  "scheduler": [
+    "diffusers",
+    "DDPMScheduler"
+  ],
+  "id2label": {
+    "0": "tench, Tinca tinca",
+    "1": "goldfish, Carassius auratus",
+    "2": "great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias",
+    "3": "tiger shark, Galeocerdo cuvieri",
+    "4": "hammerhead, hammerhead shark",
+    "5": "electric ray, crampfish, numbfish, torpedo",
+    "6": "stingray",
+    "7": "cock",
+    "8": "hen",
+    "9": "ostrich, Struthio camelus",
+    "10": "brambling, Fringilla montifringilla",
+    "11": "goldfinch, Carduelis carduelis",
+    "12": "house finch, linnet, Carpodacus mexicanus",
+    "13": "junco, snowbird",
+    "14": "indigo bunting, indigo finch, indigo bird, Passerina cyanea",
+    "15": "robin, American robin, Turdus migratorius",
+    "16": "bulbul",
+    "17": "jay",
+    "18": "magpie",
+    "19": "chickadee",
+    "20": "water ouzel, dipper",
+    "21": "kite",
+    "22": "bald eagle, American eagle, Haliaeetus leucocephalus",
+    "23": "vulture",
+    "24": "great grey owl, great gray owl, Strix nebulosa",
+    "25": "European fire salamander, Salamandra salamandra",
+    "26": "common newt, Triturus vulgaris",
+    "27": "eft",
+    "28": "spotted salamander, Ambystoma maculatum",
+    "29": "axolotl, mud puppy, Ambystoma mexicanum",
+    "30": "bullfrog, Rana catesbeiana",
+    "31": "tree frog, tree-frog",
+    "32": "tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui",
+    "33": "loggerhead, loggerhead turtle, Caretta caretta",
+    "34": "leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea",
+    "35": "mud turtle",
+    "36": "terrapin",
+    "37": "box turtle, box tortoise",
+    "38": "banded gecko",
+    "39": "common iguana, iguana, Iguana iguana",
+    "40": "American chameleon, anole, Anolis carolinensis",
+    "41": "whiptail, whiptail lizard",
+    "42": "agama",
+    "43": "frilled lizard, Chlamydosaurus kingi",
+    "44": "alligator lizard",
+    "45": "Gila monster, Heloderma suspectum",
+    "46": "green lizard, Lacerta viridis",
+    "47": "African chameleon, Chamaeleo chamaeleon",
+    "48": "Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis",
+    "49": "African crocodile, Nile crocodile, Crocodylus niloticus",
+    "50": "American alligator, Alligator mississipiensis",
+    "51": "triceratops",
+    "52": "thunder snake, worm snake, Carphophis amoenus",
+    "53": "ringneck snake, ring-necked snake, ring snake",
+    "54": "hognose snake, puff adder, sand viper",
+    "55": "green snake, grass snake",
+    "56": "king snake, kingsnake",
+    "57": "garter snake, grass snake",
+    "58": "water snake",
+    "59": "vine snake",
+    "60": "night snake, Hypsiglena torquata",
+    "61": "boa constrictor, Constrictor constrictor",
+    "62": "rock python, rock snake, Python sebae",
+    "63": "Indian cobra, Naja naja",
+    "64": "green mamba",
+    "65": "sea snake",
+    "66": "horned viper, cerastes, sand viper, horned asp, Cerastes cornutus",
+    "67": "diamondback, diamondback rattlesnake, Crotalus adamanteus",
+    "68": "sidewinder, horned rattlesnake, Crotalus cerastes",
+    "69": "trilobite",
+    "70": "harvestman, daddy longlegs, Phalangium opilio",
+    "71": "scorpion",
+    "72": "black and gold garden spider, Argiope aurantia",
+    "73": "barn spider, Araneus cavaticus",
+    "74": "garden spider, Aranea diademata",
+    "75": "black widow, Latrodectus mactans",
+    "76": "tarantula",
+    "77": "wolf spider, hunting spider",
+    "78": "tick",
+    "79": "centipede",
+    "80": "black grouse",
+    "81": "ptarmigan",
+    "82": "ruffed grouse, partridge, Bonasa umbellus",
+    "83": "prairie chicken, prairie grouse, prairie fowl",
+    "84": "peacock",
+    "85": "quail",
+    "86": "partridge",
+    "87": "African grey, African gray, Psittacus erithacus",
+    "88": "macaw",
+    "89": "sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita",
+    "90": "lorikeet",
+    "91": "coucal",
+    "92": "bee eater",
+    "93": "hornbill",
+    "94": "hummingbird",
+    "95": "jacamar",
+    "96": "toucan",
+    "97": "drake",
+    "98": "red-breasted merganser, Mergus serrator",
+    "99": "goose",
+    "100": "black swan, Cygnus atratus",
+    "101": "tusker",
+    "102": "echidna, spiny anteater, anteater",
+    "103": "platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus",
+    "104": "wallaby, brush kangaroo",
+    "105": "koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus",
+    "106": "wombat",
+    "107": "jellyfish",
+    "108": "sea anemone, anemone",
+    "109": "brain coral",
+    "110": "flatworm, platyhelminth",
+    "111": "nematode, nematode worm, roundworm",
+    "112": "conch",
+    "113": "snail",
+    "114": "slug",
+    "115": "sea slug, nudibranch",
+    "116": "chiton, coat-of-mail shell, sea cradle, polyplacophore",
+    "117": "chambered nautilus, pearly nautilus, nautilus",
+    "118": "Dungeness crab, Cancer magister",
+    "119": "rock crab, Cancer irroratus",
+    "120": "fiddler crab",
+    "121": "king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica",
+    "122": "American lobster, Northern lobster, Maine lobster, Homarus americanus",
+    "123": "spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish",
+    "124": "crayfish, crawfish, crawdad, crawdaddy",
+    "125": "hermit crab",
+    "126": "isopod",
+    "127": "white stork, Ciconia ciconia",
+    "128": "black stork, Ciconia nigra",
+    "129": "spoonbill",
+    "130": "flamingo",
+    "131": "little blue heron, Egretta caerulea",
+    "132": "American egret, great white heron, Egretta albus",
+    "133": "bittern",
+    "134": "crane",
+    "135": "limpkin, Aramus pictus",
+    "136": "European gallinule, Porphyrio porphyrio",
+    "137": "American coot, marsh hen, mud hen, water hen, Fulica americana",
+    "138": "bustard",
+    "139": "ruddy turnstone, Arenaria interpres",
+    "140": "red-backed sandpiper, dunlin, Erolia alpina",
+    "141": "redshank, Tringa totanus",
+    "142": "dowitcher",
+    "143": "oystercatcher, oyster catcher",
+    "144": "pelican",
+    "145": "king penguin, Aptenodytes patagonica",
+    "146": "albatross, mollymawk",
+    "147": "grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus",
+    "148": "killer whale, killer, orca, grampus, sea wolf, Orcinus orca",
+    "149": "dugong, Dugong dugon",
+    "150": "sea lion",
+    "151": "Chihuahua",
+    "152": "Japanese spaniel",
+    "153": "Maltese dog, Maltese terrier, Maltese",
+    "154": "Pekinese, Pekingese, Peke",
+    "155": "Shih-Tzu",
+    "156": "Blenheim spaniel",
+    "157": "papillon",
+    "158": "toy terrier",
+    "159": "Rhodesian ridgeback",
+    "160": "Afghan hound, Afghan",
+    "161": "basset, basset hound",
+    "162": "beagle",
+    "163": "bloodhound, sleuthhound",
+    "164": "bluetick",
+    "165": "black-and-tan coonhound",
+    "166": "Walker hound, Walker foxhound",
+    "167": "English foxhound",
+    "168": "redbone",
+    "169": "borzoi, Russian wolfhound",
+    "170": "Irish wolfhound",
+    "171": "Italian greyhound",
+    "172": "whippet",
+    "173": "Ibizan hound, Ibizan Podenco",
+    "174": "Norwegian elkhound, elkhound",
+    "175": "otterhound, otter hound",
+    "176": "Saluki, gazelle hound",
+    "177": "Scottish deerhound, deerhound",
+    "178": "Weimaraner",
+    "179": "Staffordshire bullterrier, Staffordshire bull terrier",
+    "180": "American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier",
+    "181": "Bedlington terrier",
+    "182": "Border terrier",
+    "183": "Kerry blue terrier",
+    "184": "Irish terrier",
+    "185": "Norfolk terrier",
+    "186": "Norwich terrier",
+    "187": "Yorkshire terrier",
+    "188": "wire-haired fox terrier",
+    "189": "Lakeland terrier",
+    "190": "Sealyham terrier, Sealyham",
+    "191": "Airedale, Airedale terrier",
+    "192": "cairn, cairn terrier",
+    "193": "Australian terrier",
+    "194": "Dandie Dinmont, Dandie Dinmont terrier",
+    "195": "Boston bull, Boston terrier",
+    "196": "miniature schnauzer",
+    "197": "giant schnauzer",
+    "198": "standard schnauzer",
+    "199": "Scotch terrier, Scottish terrier, Scottie",
+    "200": "Tibetan terrier, chrysanthemum dog",
+    "201": "silky terrier, Sydney silky",
+    "202": "soft-coated wheaten terrier",
+    "203": "West Highland white terrier",
+    "204": "Lhasa, Lhasa apso",
+    "205": "flat-coated retriever",
+    "206": "curly-coated retriever",
+    "207": "golden retriever",
+    "208": "Labrador retriever",
+    "209": "Chesapeake Bay retriever",
+    "210": "German short-haired pointer",
+    "211": "vizsla, Hungarian pointer",
+    "212": "English setter",
+    "213": "Irish setter, red setter",
+    "214": "Gordon setter",
+    "215": "Brittany spaniel",
+    "216": "clumber, clumber spaniel",
+    "217": "English springer, English springer spaniel",
+    "218": "Welsh springer spaniel",
+    "219": "cocker spaniel, English cocker spaniel, cocker",
+    "220": "Sussex spaniel",
+    "221": "Irish water spaniel",
+    "222": "kuvasz",
+    "223": "schipperke",
+    "224": "groenendael",
+    "225": "malinois",
+    "226": "briard",
+    "227": "kelpie",
+    "228": "komondor",
+    "229": "Old English sheepdog, bobtail",
+    "230": "Shetland sheepdog, Shetland sheep dog, Shetland",
+    "231": "collie",
+    "232": "Border collie",
+    "233": "Bouvier des Flandres, Bouviers des Flandres",
+    "234": "Rottweiler",
+    "235": "German shepherd, German shepherd dog, German police dog, alsatian",
+    "236": "Doberman, Doberman pinscher",
+    "237": "miniature pinscher",
+    "238": "Greater Swiss Mountain dog",
+    "239": "Bernese mountain dog",
+    "240": "Appenzeller",
+    "241": "EntleBucher",
+    "242": "boxer",
+    "243": "bull mastiff",
+    "244": "Tibetan mastiff",
+    "245": "French bulldog",
+    "246": "Great Dane",
+    "247": "Saint Bernard, St Bernard",
+    "248": "Eskimo dog, husky",
+    "249": "malamute, malemute, Alaskan malamute",
+    "250": "Siberian husky",
+    "251": "dalmatian, coach dog, carriage dog",
+    "252": "affenpinscher, monkey pinscher, monkey dog",
+    "253": "basenji",
+    "254": "pug, pug-dog",
+    "255": "Leonberg",
+    "256": "Newfoundland, Newfoundland dog",
+    "257": "Great Pyrenees",
+    "258": "Samoyed, Samoyede",
+    "259": "Pomeranian",
+    "260": "chow, chow chow",
+    "261": "keeshond",
+    "262": "Brabancon griffon",
+    "263": "Pembroke, Pembroke Welsh corgi",
+    "264": "Cardigan, Cardigan Welsh corgi",
+    "265": "toy poodle",
+    "266": "miniature poodle",
+    "267": "standard poodle",
+    "268": "Mexican hairless",
+    "269": "timber wolf, grey wolf, gray wolf, Canis lupus",
+    "270": "white wolf, Arctic wolf, Canis lupus tundrarum",
+    "271": "red wolf, maned wolf, Canis rufus, Canis niger",
+    "272": "coyote, prairie wolf, brush wolf, Canis latrans",
+    "273": "dingo, warrigal, warragal, Canis dingo",
+    "274": "dhole, Cuon alpinus",
+    "275": "African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus",
+    "276": "hyena, hyaena",
+    "277": "red fox, Vulpes vulpes",
+    "278": "kit fox, Vulpes macrotis",
+    "279": "Arctic fox, white fox, Alopex lagopus",
+    "280": "grey fox, gray fox, Urocyon cinereoargenteus",
+    "281": "tabby, tabby cat",
+    "282": "tiger cat",
+    "283": "Persian cat",
+    "284": "Siamese cat, Siamese",
+    "285": "Egyptian cat",
+    "286": "cougar, puma, catamount, mountain lion, painter, panther, Felis concolor",
+    "287": "lynx, catamount",
+    "288": "leopard, Panthera pardus",
+    "289": "snow leopard, ounce, Panthera uncia",
+    "290": "jaguar, panther, Panthera onca, Felis onca",
+    "291": "lion, king of beasts, Panthera leo",
+    "292": "tiger, Panthera tigris",
+    "293": "cheetah, chetah, Acinonyx jubatus",
+    "294": "brown bear, bruin, Ursus arctos",
+    "295": "American black bear, black bear, Ursus americanus, Euarctos americanus",
+    "296": "ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus",
+    "297": "sloth bear, Melursus ursinus, Ursus ursinus",
+    "298": "mongoose",
+    "299": "meerkat, mierkat",
+    "300": "tiger beetle",
+    "301": "ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle",
+    "302": "ground beetle, carabid beetle",
+    "303": "long-horned beetle, longicorn, longicorn beetle",
+    "304": "leaf beetle, chrysomelid",
+    "305": "dung beetle",
+    "306": "rhinoceros beetle",
+    "307": "weevil",
+    "308": "fly",
+    "309": "bee",
+    "310": "ant, emmet, pismire",
+    "311": "grasshopper, hopper",
+    "312": "cricket",
+    "313": "walking stick, walkingstick, stick insect",
+    "314": "cockroach, roach",
+    "315": "mantis, mantid",
+    "316": "cicada, cicala",
+    "317": "leafhopper",
+    "318": "lacewing, lacewing fly",
+    "319": "dragonfly, darning needle, devils darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk",
+    "320": "damselfly",
+    "321": "admiral",
+    "322": "ringlet, ringlet butterfly",
+    "323": "monarch, monarch butterfly, milkweed butterfly, Danaus plexippus",
+    "324": "cabbage butterfly",
+    "325": "sulphur butterfly, sulfur butterfly",
+    "326": "lycaenid, lycaenid butterfly",
+    "327": "starfish, sea star",
+    "328": "sea urchin",
+    "329": "sea cucumber, holothurian",
+    "330": "wood rabbit, cottontail, cottontail rabbit",
+    "331": "hare",
+    "332": "Angora, Angora rabbit",
+    "333": "hamster",
+    "334": "porcupine, hedgehog",
+    "335": "fox squirrel, eastern fox squirrel, Sciurus niger",
+    "336": "marmot",
+    "337": "beaver",
+    "338": "guinea pig, Cavia cobaya",
+    "339": "sorrel",
+    "340": "zebra",
+    "341": "hog, pig, grunter, squealer, Sus scrofa",
+    "342": "wild boar, boar, Sus scrofa",
+    "343": "warthog",
+    "344": "hippopotamus, hippo, river horse, Hippopotamus amphibius",
+    "345": "ox",
+    "346": "water buffalo, water ox, Asiatic buffalo, Bubalus bubalis",
+    "347": "bison",
+    "348": "ram, tup",
+    "349": "bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis",
+    "350": "ibex, Capra ibex",
+    "351": "hartebeest",
+    "352": "impala, Aepyceros melampus",
+    "353": "gazelle",
+    "354": "Arabian camel, dromedary, Camelus dromedarius",
+    "355": "llama",
+    "356": "weasel",
+    "357": "mink",
+    "358": "polecat, fitch, foulmart, foumart, Mustela putorius",
+    "359": "black-footed ferret, ferret, Mustela nigripes",
+    "360": "otter",
+    "361": "skunk, polecat, wood pussy",
+    "362": "badger",
+    "363": "armadillo",
+    "364": "three-toed sloth, ai, Bradypus tridactylus",
+    "365": "orangutan, orang, orangutang, Pongo pygmaeus",
+    "366": "gorilla, Gorilla gorilla",
+    "367": "chimpanzee, chimp, Pan troglodytes",
+    "368": "gibbon, Hylobates lar",
+    "369": "siamang, Hylobates syndactylus, Symphalangus syndactylus",
+    "370": "guenon, guenon monkey",
+    "371": "patas, hussar monkey, Erythrocebus patas",
+    "372": "baboon",
+    "373": "macaque",
+    "374": "langur",
+    "375": "colobus, colobus monkey",
+    "376": "proboscis monkey, Nasalis larvatus",
+    "377": "marmoset",
+    "378": "capuchin, ringtail, Cebus capucinus",
+    "379": "howler monkey, howler",
+    "380": "titi, titi monkey",
+    "381": "spider monkey, Ateles geoffroyi",
+    "382": "squirrel monkey, Saimiri sciureus",
+    "383": "Madagascar cat, ring-tailed lemur, Lemur catta",
+    "384": "indri, indris, Indri indri, Indri brevicaudatus",
+    "385": "Indian elephant, Elephas maximus",
+    "386": "African elephant, Loxodonta africana",
+    "387": "lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens",
+    "388": "giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca",
+    "389": "barracouta, snoek",
+    "390": "eel",
+    "391": "coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch",
+    "392": "rock beauty, Holocanthus tricolor",
+    "393": "anemone fish",
+    "394": "sturgeon",
+    "395": "gar, garfish, garpike, billfish, Lepisosteus osseus",
+    "396": "lionfish",
+    "397": "puffer, pufferfish, blowfish, globefish",
+    "398": "abacus",
+    "399": "abaya",
+    "400": "academic gown, academic robe, judge robe",
+    "401": "accordion, piano accordion, squeeze box",
+    "402": "acoustic guitar",
+    "403": "aircraft carrier, carrier, flattop, attack aircraft carrier",
+    "404": "airliner",
+    "405": "airship, dirigible",
+    "406": "altar",
+    "407": "ambulance",
+    "408": "amphibian, amphibious vehicle",
+    "409": "analog clock",
+    "410": "apiary, bee house",
+    "411": "apron",
+    "412": "ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin",
+    "413": "assault rifle, assault gun",
+    "414": "backpack, back pack, knapsack, packsack, rucksack, haversack",
+    "415": "bakery, bakeshop, bakehouse",
+    "416": "balance beam, beam",
+    "417": "balloon",
+    "418": "ballpoint, ballpoint pen, ballpen, Biro",
+    "419": "Band Aid",
+    "420": "banjo",
+    "421": "bannister, banister, balustrade, balusters, handrail",
+    "422": "barbell",
+    "423": "barber chair",
+    "424": "barbershop",
+    "425": "barn",
+    "426": "barometer",
+    "427": "barrel, cask",
+    "428": "barrow, garden cart, lawn cart, wheelbarrow",
+    "429": "baseball",
+    "430": "basketball",
+    "431": "bassinet",
+    "432": "bassoon",
+    "433": "bathing cap, swimming cap",
+    "434": "bath towel",
+    "435": "bathtub, bathing tub, bath, tub",
+    "436": "beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon",
+    "437": "beacon, lighthouse, beacon light, pharos",
+    "438": "beaker",
+    "439": "bearskin, busby, shako",
+    "440": "beer bottle",
+    "441": "beer glass",
+    "442": "bell cote, bell cot",
+    "443": "bib",
+    "444": "bicycle-built-for-two, tandem bicycle, tandem",
+    "445": "bikini, two-piece",
+    "446": "binder, ring-binder",
+    "447": "binoculars, field glasses, opera glasses",
+    "448": "birdhouse",
+    "449": "boathouse",
+    "450": "bobsled, bobsleigh, bob",
+    "451": "bolo tie, bolo, bola tie, bola",
+    "452": "bonnet, poke bonnet",
+    "453": "bookcase",
+    "454": "bookshop, bookstore, bookstall",
+    "455": "bottlecap",
+    "456": "bow",
+    "457": "bow tie, bow-tie, bowtie",
+    "458": "brass, memorial tablet, plaque",
+    "459": "brassiere, bra, bandeau",
+    "460": "breakwater, groin, groyne, mole, bulwark, seawall, jetty",
+    "461": "breastplate, aegis, egis",
+    "462": "broom",
+    "463": "bucket, pail",
+    "464": "buckle",
+    "465": "bulletproof vest",
+    "466": "bullet train, bullet",
+    "467": "butcher shop, meat market",
+    "468": "cab, hack, taxi, taxicab",
+    "469": "caldron, cauldron",
+    "470": "candle, taper, wax light",
+    "471": "cannon",
+    "472": "canoe",
+    "473": "can opener, tin opener",
+    "474": "cardigan",
+    "475": "car mirror",
+    "476": "carousel, carrousel, merry-go-round, roundabout, whirligig",
+    "477": "carpenters kit, tool kit",
+    "478": "carton",
+    "479": "car wheel",
+    "480": "cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM",
+    "481": "cassette",
+    "482": "cassette player",
+    "483": "castle",
+    "484": "catamaran",
+    "485": "CD player",
+    "486": "cello, violoncello",
+    "487": "cellular telephone, cellular phone, cellphone, cell, mobile phone",
+    "488": "chain",
+    "489": "chainlink fence",
+    "490": "chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour",
+    "491": "chain saw, chainsaw",
+    "492": "chest",
+    "493": "chiffonier, commode",
+    "494": "chime, bell, gong",
+    "495": "china cabinet, china closet",
+    "496": "Christmas stocking",
+    "497": "church, church building",
+    "498": "cinema, movie theater, movie theatre, movie house, picture palace",
+    "499": "cleaver, meat cleaver, chopper",
+    "500": "cliff dwelling",
+    "501": "cloak",
+    "502": "clog, geta, patten, sabot",
+    "503": "cocktail shaker",
+    "504": "coffee mug",
+    "505": "coffeepot",
+    "506": "coil, spiral, volute, whorl, helix",
+    "507": "combination lock",
+    "508": "computer keyboard, keypad",
+    "509": "confectionery, confectionary, candy store",
+    "510": "container ship, containership, container vessel",
+    "511": "convertible",
+    "512": "corkscrew, bottle screw",
+    "513": "cornet, horn, trumpet, trump",
+    "514": "cowboy boot",
+    "515": "cowboy hat, ten-gallon hat",
+    "516": "cradle",
+    "517": "crane",
+    "518": "crash helmet",
+    "519": "crate",
+    "520": "crib, cot",
+    "521": "Crock Pot",
+    "522": "croquet ball",
+    "523": "crutch",
+    "524": "cuirass",
+    "525": "dam, dike, dyke",
+    "526": "desk",
+    "527": "desktop computer",
+    "528": "dial telephone, dial phone",
+    "529": "diaper, nappy, napkin",
+    "530": "digital clock",
+    "531": "digital watch",
+    "532": "dining table, board",
+    "533": "dishrag, dishcloth",
+    "534": "dishwasher, dish washer, dishwashing machine",
+    "535": "disk brake, disc brake",
+    "536": "dock, dockage, docking facility",
+    "537": "dogsled, dog sled, dog sleigh",
+    "538": "dome",
+    "539": "doormat, welcome mat",
+    "540": "drilling platform, offshore rig",
+    "541": "drum, membranophone, tympan",
+    "542": "drumstick",
+    "543": "dumbbell",
+    "544": "Dutch oven",
+    "545": "electric fan, blower",
+    "546": "electric guitar",
+    "547": "electric locomotive",
+    "548": "entertainment center",
+    "549": "envelope",
+    "550": "espresso maker",
+    "551": "face powder",
+    "552": "feather boa, boa",
+    "553": "file, file cabinet, filing cabinet",
+    "554": "fireboat",
+    "555": "fire engine, fire truck",
+    "556": "fire screen, fireguard",
+    "557": "flagpole, flagstaff",
+    "558": "flute, transverse flute",
+    "559": "folding chair",
+    "560": "football helmet",
+    "561": "forklift",
+    "562": "fountain",
+    "563": "fountain pen",
+    "564": "four-poster",
+    "565": "freight car",
+    "566": "French horn, horn",
+    "567": "frying pan, frypan, skillet",
+    "568": "fur coat",
+    "569": "garbage truck, dustcart",
+    "570": "gasmask, respirator, gas helmet",
+    "571": "gas pump, gasoline pump, petrol pump, island dispenser",
+    "572": "goblet",
+    "573": "go-kart",
+    "574": "golf ball",
+    "575": "golfcart, golf cart",
+    "576": "gondola",
+    "577": "gong, tam-tam",
+    "578": "gown",
+    "579": "grand piano, grand",
+    "580": "greenhouse, nursery, glasshouse",
+    "581": "grille, radiator grille",
+    "582": "grocery store, grocery, food market, market",
+    "583": "guillotine",
+    "584": "hair slide",
+    "585": "hair spray",
+    "586": "half track",
+    "587": "hammer",
+    "588": "hamper",
+    "589": "hand blower, blow dryer, blow drier, hair dryer, hair drier",
+    "590": "hand-held computer, hand-held microcomputer",
+    "591": "handkerchief, hankie, hanky, hankey",
+    "592": "hard disc, hard disk, fixed disk",
+    "593": "harmonica, mouth organ, harp, mouth harp",
+    "594": "harp",
+    "595": "harvester, reaper",
+    "596": "hatchet",
+    "597": "holster",
+    "598": "home theater, home theatre",
+    "599": "honeycomb",
+    "600": "hook, claw",
+    "601": "hoopskirt, crinoline",
+    "602": "horizontal bar, high bar",
+    "603": "horse cart, horse-cart",
+    "604": "hourglass",
+    "605": "iPod",
+    "606": "iron, smoothing iron",
+    "607": "jack-o-lantern",
+    "608": "jean, blue jean, denim",
+    "609": "jeep, landrover",
+    "610": "jersey, T-shirt, tee shirt",
+    "611": "jigsaw puzzle",
+    "612": "jinrikisha, ricksha, rickshaw",
+    "613": "joystick",
+    "614": "kimono",
+    "615": "knee pad",
+    "616": "knot",
+    "617": "lab coat, laboratory coat",
+    "618": "ladle",
+    "619": "lampshade, lamp shade",
+    "620": "laptop, laptop computer",
+    "621": "lawn mower, mower",
+    "622": "lens cap, lens cover",
+    "623": "letter opener, paper knife, paperknife",
+    "624": "library",
+    "625": "lifeboat",
+    "626": "lighter, light, igniter, ignitor",
+    "627": "limousine, limo",
+    "628": "liner, ocean liner",
+    "629": "lipstick, lip rouge",
+    "630": "Loafer",
+    "631": "lotion",
+    "632": "loudspeaker, speaker, speaker unit, loudspeaker system, speaker system",
+    "633": "loupe, jewelers loupe",
+    "634": "lumbermill, sawmill",
+    "635": "magnetic compass",
+    "636": "mailbag, postbag",
+    "637": "mailbox, letter box",
+    "638": "maillot",
+    "639": "maillot, tank suit",
+    "640": "manhole cover",
+    "641": "maraca",
+    "642": "marimba, xylophone",
+    "643": "mask",
+    "644": "matchstick",
+    "645": "maypole",
+    "646": "maze, labyrinth",
+    "647": "measuring cup",
+    "648": "medicine chest, medicine cabinet",
+    "649": "megalith, megalithic structure",
+    "650": "microphone, mike",
+    "651": "microwave, microwave oven",
+    "652": "military uniform",
+    "653": "milk can",
+    "654": "minibus",
+    "655": "miniskirt, mini",
+    "656": "minivan",
+    "657": "missile",
+    "658": "mitten",
+    "659": "mixing bowl",
+    "660": "mobile home, manufactured home",
+    "661": "Model T",
+    "662": "modem",
+    "663": "monastery",
+    "664": "monitor",
+    "665": "moped",
+    "666": "mortar",
+    "667": "mortarboard",
+    "668": "mosque",
+    "669": "mosquito net",
+    "670": "motor scooter, scooter",
+    "671": "mountain bike, all-terrain bike, off-roader",
+    "672": "mountain tent",
+    "673": "mouse, computer mouse",
+    "674": "mousetrap",
+    "675": "moving van",
+    "676": "muzzle",
+    "677": "nail",
+    "678": "neck brace",
+    "679": "necklace",
+    "680": "nipple",
+    "681": "notebook, notebook computer",
+    "682": "obelisk",
+    "683": "oboe, hautboy, hautbois",
+    "684": "ocarina, sweet potato",
+    "685": "odometer, hodometer, mileometer, milometer",
+    "686": "oil filter",
+    "687": "organ, pipe organ",
+    "688": "oscilloscope, scope, cathode-ray oscilloscope, CRO",
+    "689": "overskirt",
+    "690": "oxcart",
+    "691": "oxygen mask",
+    "692": "packet",
+    "693": "paddle, boat paddle",
+    "694": "paddlewheel, paddle wheel",
+    "695": "padlock",
+    "696": "paintbrush",
+    "697": "pajama, pyjama, pjs, jammies",
+    "698": "palace",
+    "699": "panpipe, pandean pipe, syrinx",
+    "700": "paper towel",
+    "701": "parachute, chute",
+    "702": "parallel bars, bars",
+    "703": "park bench",
+    "704": "parking meter",
+    "705": "passenger car, coach, carriage",
+    "706": "patio, terrace",
+    "707": "pay-phone, pay-station",
+    "708": "pedestal, plinth, footstall",
+    "709": "pencil box, pencil case",
+    "710": "pencil sharpener",
+    "711": "perfume, essence",
+    "712": "Petri dish",
+    "713": "photocopier",
+    "714": "pick, plectrum, plectron",
+    "715": "pickelhaube",
+    "716": "picket fence, paling",
+    "717": "pickup, pickup truck",
+    "718": "pier",
+    "719": "piggy bank, penny bank",
+    "720": "pill bottle",
+    "721": "pillow",
+    "722": "ping-pong ball",
+    "723": "pinwheel",
+    "724": "pirate, pirate ship",
+    "725": "pitcher, ewer",
+    "726": "plane, carpenters plane, woodworking plane",
+    "727": "planetarium",
+    "728": "plastic bag",
+    "729": "plate rack",
+    "730": "plow, plough",
+    "731": "plunger, plumbers helper",
+    "732": "Polaroid camera, Polaroid Land camera",
+    "733": "pole",
+    "734": "police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria",
+    "735": "poncho",
+    "736": "pool table, billiard table, snooker table",
+    "737": "pop bottle, soda bottle",
+    "738": "pot, flowerpot",
+    "739": "potters wheel",
+    "740": "power drill",
+    "741": "prayer rug, prayer mat",
+    "742": "printer",
+    "743": "prison, prison house",
+    "744": "projectile, missile",
+    "745": "projector",
+    "746": "puck, hockey puck",
+    "747": "punching bag, punch bag, punching ball, punchball",
+    "748": "purse",
+    "749": "quill, quill pen",
+    "750": "quilt, comforter, comfort, puff",
+    "751": "racer, race car, racing car",
+    "752": "racket, racquet",
+    "753": "radiator",
+    "754": "radio, wireless",
+    "755": "radio telescope, radio reflector",
+    "756": "rain barrel",
+    "757": "recreational vehicle, RV, R.V.",
+    "758": "reel",
+    "759": "reflex camera",
+    "760": "refrigerator, icebox",
+    "761": "remote control, remote",
+    "762": "restaurant, eating house, eating place, eatery",
+    "763": "revolver, six-gun, six-shooter",
+    "764": "rifle",
+    "765": "rocking chair, rocker",
+    "766": "rotisserie",
+    "767": "rubber eraser, rubber, pencil eraser",
+    "768": "rugby ball",
+    "769": "rule, ruler",
+    "770": "running shoe",
+    "771": "safe",
+    "772": "safety pin",
+    "773": "saltshaker, salt shaker",
+    "774": "sandal",
+    "775": "sarong",
+    "776": "sax, saxophone",
+    "777": "scabbard",
+    "778": "scale, weighing machine",
+    "779": "school bus",
+    "780": "schooner",
+    "781": "scoreboard",
+    "782": "screen, CRT screen",
+    "783": "screw",
+    "784": "screwdriver",
+    "785": "seat belt, seatbelt",
+    "786": "sewing machine",
+    "787": "shield, buckler",
+    "788": "shoe shop, shoe-shop, shoe store",
+    "789": "shoji",
+    "790": "shopping basket",
+    "791": "shopping cart",
+    "792": "shovel",
+    "793": "shower cap",
+    "794": "shower curtain",
+    "795": "ski",
+    "796": "ski mask",
+    "797": "sleeping bag",
+    "798": "slide rule, slipstick",
+    "799": "sliding door",
+    "800": "slot, one-armed bandit",
+    "801": "snorkel",
+    "802": "snowmobile",
+    "803": "snowplow, snowplough",
+    "804": "soap dispenser",
+    "805": "soccer ball",
+    "806": "sock",
+    "807": "solar dish, solar collector, solar furnace",
+    "808": "sombrero",
+    "809": "soup bowl",
+    "810": "space bar",
+    "811": "space heater",
+    "812": "space shuttle",
+    "813": "spatula",
+    "814": "speedboat",
+    "815": "spider web, spiders web",
+    "816": "spindle",
+    "817": "sports car, sport car",
+    "818": "spotlight, spot",
+    "819": "stage",
+    "820": "steam locomotive",
+    "821": "steel arch bridge",
+    "822": "steel drum",
+    "823": "stethoscope",
+    "824": "stole",
+    "825": "stone wall",
+    "826": "stopwatch, stop watch",
+    "827": "stove",
+    "828": "strainer",
+    "829": "streetcar, tram, tramcar, trolley, trolley car",
+    "830": "stretcher",
+    "831": "studio couch, day bed",
+    "832": "stupa, tope",
+    "833": "submarine, pigboat, sub, U-boat",
+    "834": "suit, suit of clothes",
+    "835": "sundial",
+    "836": "sunglass",
+    "837": "sunglasses, dark glasses, shades",
+    "838": "sunscreen, sunblock, sun blocker",
+    "839": "suspension bridge",
+    "840": "swab, swob, mop",
+    "841": "sweatshirt",
+    "842": "swimming trunks, bathing trunks",
+    "843": "swing",
+    "844": "switch, electric switch, electrical switch",
+    "845": "syringe",
+    "846": "table lamp",
+    "847": "tank, army tank, armored combat vehicle, armoured combat vehicle",
+    "848": "tape player",
+    "849": "teapot",
+    "850": "teddy, teddy bear",
+    "851": "television, television system",
+    "852": "tennis ball",
+    "853": "thatch, thatched roof",
+    "854": "theater curtain, theatre curtain",
+    "855": "thimble",
+    "856": "thresher, thrasher, threshing machine",
+    "857": "throne",
+    "858": "tile roof",
+    "859": "toaster",
+    "860": "tobacco shop, tobacconist shop, tobacconist",
+    "861": "toilet seat",
+    "862": "torch",
+    "863": "totem pole",
+    "864": "tow truck, tow car, wrecker",
+    "865": "toyshop",
+    "866": "tractor",
+    "867": "trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi",
+    "868": "tray",
+    "869": "trench coat",
+    "870": "tricycle, trike, velocipede",
+    "871": "trimaran",
+    "872": "tripod",
+    "873": "triumphal arch",
+    "874": "trolleybus, trolley coach, trackless trolley",
+    "875": "trombone",
+    "876": "tub, vat",
+    "877": "turnstile",
+    "878": "typewriter keyboard",
+    "879": "umbrella",
+    "880": "unicycle, monocycle",
+    "881": "upright, upright piano",
+    "882": "vacuum, vacuum cleaner",
+    "883": "vase",
+    "884": "vault",
+    "885": "velvet",
+    "886": "vending machine",
+    "887": "vestment",
+    "888": "viaduct",
+    "889": "violin, fiddle",
+    "890": "volleyball",
+    "891": "waffle iron",
+    "892": "wall clock",
+    "893": "wallet, billfold, notecase, pocketbook",
+    "894": "wardrobe, closet, press",
+    "895": "warplane, military plane",
+    "896": "washbasin, handbasin, washbowl, lavabo, wash-hand basin",
+    "897": "washer, automatic washer, washing machine",
+    "898": "water bottle",
+    "899": "water jug",
+    "900": "water tower",
+    "901": "whiskey jug",
+    "902": "whistle",
+    "903": "wig",
+    "904": "window screen",
+    "905": "window shade",
+    "906": "Windsor tie",
+    "907": "wine bottle",
+    "908": "wing",
+    "909": "wok",
+    "910": "wooden spoon",
+    "911": "wool, woolen, woollen",
+    "912": "worm fence, snake fence, snake-rail fence, Virginia fence",
+    "913": "wreck",
+    "914": "yawl",
+    "915": "yurt",
+    "916": "web site, website, internet site, site",
+    "917": "comic book",
+    "918": "crossword puzzle, crossword",
+    "919": "street sign",
+    "920": "traffic light, traffic signal, stoplight",
+    "921": "book jacket, dust cover, dust jacket, dust wrapper",
+    "922": "menu",
+    "923": "plate",
+    "924": "guacamole",
+    "925": "consomme",
+    "926": "hot pot, hotpot",
+    "927": "trifle",
+    "928": "ice cream, icecream",
+    "929": "ice lolly, lolly, lollipop, popsicle",
+    "930": "French loaf",
+    "931": "bagel, beigel",
+    "932": "pretzel",
+    "933": "cheeseburger",
+    "934": "hotdog, hot dog, red hot",
+    "935": "mashed potato",
+    "936": "head cabbage",
+    "937": "broccoli",
+    "938": "cauliflower",
+    "939": "zucchini, courgette",
+    "940": "spaghetti squash",
+    "941": "acorn squash",
+    "942": "butternut squash",
+    "943": "cucumber, cuke",
+    "944": "artichoke, globe artichoke",
+    "945": "bell pepper",
+    "946": "cardoon",
+    "947": "mushroom",
+    "948": "Granny Smith",
+    "949": "strawberry",
+    "950": "orange",
+    "951": "lemon",
+    "952": "fig",
+    "953": "pineapple, ananas",
+    "954": "banana",
+    "955": "jackfruit, jak, jack",
+    "956": "custard apple",
+    "957": "pomegranate",
+    "958": "hay",
+    "959": "carbonara",
+    "960": "chocolate sauce, chocolate syrup",
+    "961": "dough",
+    "962": "meat loaf, meatloaf",
+    "963": "pizza, pizza pie",
+    "964": "potpie",
+    "965": "burrito",
+    "966": "red wine",
+    "967": "espresso",
+    "968": "cup",
+    "969": "eggnog",
+    "970": "alp",
+    "971": "bubble",
+    "972": "cliff, drop, drop-off",
+    "973": "coral reef",
+    "974": "geyser",
+    "975": "lakeside, lakeshore",
+    "976": "promontory, headland, head, foreland",
+    "977": "sandbar, sand bar",
+    "978": "seashore, coast, seacoast, sea-coast",
+    "979": "valley, vale",
+    "980": "volcano",
+    "981": "ballplayer, baseball player",
+    "982": "groom, bridegroom",
+    "983": "scuba diver",
+    "984": "rapeseed",
+    "985": "daisy",
+    "986": "yellow ladys slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum",
+    "987": "corn",
+    "988": "acorn",
+    "989": "hip, rose hip, rosehip",
+    "990": "buckeye, horse chestnut, conker",
+    "991": "coral fungus",
+    "992": "agaric",
+    "993": "gyromitra",
+    "994": "stinkhorn, carrion fungus",
+    "995": "earthstar",
+    "996": "hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa",
+    "997": "bolete",
+    "998": "ear, spike, capitulum",
+    "999": "toilet tissue, toilet paper, bathroom tissue"
+  }
+}

FiTv1-XL-2-256/pipeline.py ADDED Viewed

	@@ -0,0 +1,447 @@

+"""Hub custom pipeline: FiTPipeline.
+Load with native Hugging Face diffusers and trust_remote_code=True.
+"""
+import importlib
+import inspect
+import json
+import sys
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Union
+import diffusers.schedulers as diffusers_schedulers
+import torch
+from huggingface_hub import snapshot_download
+from diffusers import AutoencoderKL
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils.torch_utils import randn_tensor
+# Local component classes are loaded dynamically in from_pretrained.
+DEFAULT_NATIVE_RESOLUTION = 256
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from pathlib import Path
+        >>> import torch
+        >>> from diffusers import DiffusionPipeline, DDIMScheduler
+        >>> model_dir = Path("./FiTv1-XL-2-256").resolve()
+        >>> pipe = DiffusionPipeline.from_pretrained(
+        ...     str(model_dir),
+        ...     local_files_only=True,
+        ...     custom_pipeline=str(model_dir / "pipeline.py"),
+        ...     trust_remote_code=True,
+        ...     torch_dtype=torch.float32,
+        ... )
+        >>> pipe.to("cuda")
+        >>> pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+        >>> print(pipe.id2label[207])
+        >>> print(pipe.get_label_ids("golden retriever"))
+        >>> generator = torch.Generator(device="cuda").manual_seed(42)
+        >>> image = pipe(
+        ...     class_labels="golden retriever",
+        ...     height=256,
+        ...     width=256,
+        ...     num_inference_steps=250,
+        ...     guidance_scale=1.5,
+        ...     generator=generator,
+        ... ).images[0]
+        >>> image.save("demo.png")
+        ```
+"""
+class FiTPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for class-conditional image generation with FiTv1 (DDPM sampling).
+    """
+    model_cpu_offload_seq = "transformer->vae"
+    _optional_components = ["vae"]
+    def __init__(
+        self,
+        transformer: Any,
+        scheduler: KarrasDiffusionSchedulers,
+        vae: Any = None,
+        id2label: Optional[Dict[Union[int, str], str]] = None,
+        null_class_id: Optional[int] = None,
+    ):
+        super().__init__()
+        self.register_modules(transformer=transformer, scheduler=scheduler, vae=vae)
+        self.image_processor = VaeImageProcessor()
+        if null_class_id is None:
+            null_class_id = int(getattr(self.transformer.config, "num_classes", 1000))
+        self.register_to_config(null_class_id=int(null_class_id))
+        self._id2label = self._normalize_id2label(id2label)
+        self.labels = self._build_label2id(self._id2label)
+        self._labels_loaded_from_model_index = bool(self._id2label)
+    @property
+    def vae_scale_factor(self) -> int:
+        if self.vae is None:
+            return 8
+        block_out_channels = getattr(self.vae.config, "block_out_channels", None)
+        if block_out_channels:
+            return int(2 ** (len(block_out_channels) - 1))
+        return 8
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path=None, subfolder=None, **kwargs):
+        """Load a self-contained variant folder locally or from the Hub.
+        Args:
+            pretrained_model_name_or_path: `None`, `""` or `"."` to load from the folder containing this
+                file; a string containing "/" that is not an existing path is downloaded with
+                `snapshot_download`; anything else is treated as a local path.
+            subfolder: Optional sub-directory of the resolved location that holds the variant.
+            **kwargs: `hub_kwargs` (hub branch only), `id2label` and `null_class_id` are consumed here;
+                every remaining entry is forwarded to each component's `from_pretrained`.
+        Returns:
+            A `FiTPipeline` whose config `_name_or_path` is set to the resolved variant directory.
+        Raises:
+            ValueError: If no loadable transformer or scheduler config is found under the variant.
+        """
+        repo_root = Path(__file__).resolve().parent
+        if pretrained_model_name_or_path in (None, "", "."):
+            variant = repo_root
+        elif (
+            isinstance(pretrained_model_name_or_path, str)
+            and "/" in pretrained_model_name_or_path
+            and not Path(pretrained_model_name_or_path).exists()
+        ):
+            # Looks like a Hub repo id: download a snapshot (optionally restricted to `subfolder`).
+            hub_kwargs = dict(kwargs.pop("hub_kwargs", {}))
+            if subfolder:
+                hub_kwargs.setdefault("allow_patterns", [f"{subfolder}/**"])
+            cache_dir = snapshot_download(pretrained_model_name_or_path, **hub_kwargs)
+            variant = Path(cache_dir) / subfolder if subfolder else Path(cache_dir)
+        else:
+            variant = Path(pretrained_model_name_or_path)
+            if not variant.is_absolute():
+                # Relative paths are tried against the working directory first, then this file's folder.
+                candidate = (Path.cwd() / variant).resolve()
+                variant = candidate if candidate.exists() else (repo_root / variant).resolve()
+            if subfolder:
+                variant = variant / subfolder
+        id2label_override = kwargs.pop("id2label", None)
+        null_class_id_override = kwargs.pop("null_class_id", None)
+        # NOTE(review): `hub_kwargs` is only popped in the Hub branch above, so on the local branches it
+        # would be forwarded to the component loaders along with everything else — confirm this is intended.
+        model_kwargs = dict(kwargs)
+        # Component directories temporarily added to sys.path; removed again in the `finally` below.
+        inserted: List[str] = []
+        def _load_component(folder: str, module_name: str, class_name: str):
+            """Import `class_name` from `<variant>/<folder>/<module_name>.py` and load it, or return None."""
+            comp_dir = variant / folder
+            module_path = comp_dir / f"{module_name}.py"
+            has_weights = (comp_dir / "config.json").exists() or (comp_dir / "scheduler_config.json").exists()
+            if not module_path.exists() or not has_weights:
+                return None
+            comp_path = str(comp_dir)
+            if comp_path not in sys.path:
+                sys.path.insert(0, comp_path)
+                inserted.append(comp_path)
+            # NOTE(review): import_module caches by bare module name in sys.modules, so a second variant
+            # loaded in the same process presumably reuses the first variant's module — verify if that matters.
+            module = importlib.import_module(module_name)
+            component_cls = getattr(module, class_name)
+            return component_cls.from_pretrained(str(comp_dir), **model_kwargs)
+        try:
+            transformer = _load_component("transformer", "fit_transformer_2d", "FiTTransformer2DModel")
+            if transformer is None:
+                raise ValueError(f"No loadable transformer found under {variant}")
+            scheduler = cls._load_scheduler_from_variant(variant, model_kwargs)
+            # The VAE is optional: it is loaded only when `<variant>/vae/config.json` exists.
+            vae = None
+            vae_dir = variant / "vae"
+            if vae_dir.exists() and (vae_dir / "config.json").exists():
+                vae = AutoencoderKL.from_pretrained(str(vae_dir), **model_kwargs)
+            # Explicit overrides win over the values stored in model_index.json.
+            id2label = id2label_override or cls._read_id2label_from_model_index(str(variant))
+            null_class_id = null_class_id_override if null_class_id_override is not None else cls._read_null_class_id(
+                str(variant)
+            )
+            pipe = cls(
+                transformer=transformer,
+                scheduler=scheduler,
+                vae=vae,
+                id2label=id2label,
+                null_class_id=null_class_id,
+            )
+            # Remember where the variant lives so labels can be re-read lazily later.
+            if hasattr(pipe, "register_to_config"):
+                pipe.register_to_config(_name_or_path=str(variant))
+            return pipe
+        finally:
+            # Undo only the sys.path entries this call added.
+            for comp_path in inserted:
+                if comp_path in sys.path:
+                    sys.path.remove(comp_path)
+    @classmethod
+    def _load_scheduler_from_variant(cls, variant: Path, model_kwargs: Dict[str, object]) -> KarrasDiffusionSchedulers:
+        scheduler_dir = variant / "scheduler"
+        config_path = scheduler_dir / "scheduler_config.json"
+        if not config_path.exists():
+            raise ValueError(f"No scheduler config found under {scheduler_dir}")
+        scheduler_entry = None
+        model_index_path = variant / "model_index.json"
+        if model_index_path.exists():
+            scheduler_entry = json.loads(model_index_path.read_text(encoding="utf-8")).get("scheduler")
+        if scheduler_entry is None:
+            class_name = json.loads(config_path.read_text(encoding="utf-8")).get("_class_name")
+            if not class_name:
+                raise ValueError(f"Missing `_class_name` in {config_path}")
+            scheduler_entry = ["diffusers", class_name]
+        if not isinstance(scheduler_entry, list) or len(scheduler_entry) != 2:
+            raise ValueError(f"Invalid scheduler entry in model_index.json: {scheduler_entry}")
+        library_name, class_name = scheduler_entry
+        if library_name != "diffusers":
+            raise ValueError(f"Unsupported scheduler library: {library_name}")
+        scheduler_cls = getattr(diffusers_schedulers, class_name)
+        return scheduler_cls.from_pretrained(str(scheduler_dir), **model_kwargs)
+    @staticmethod
+    def _prepare_model_output_for_scheduler(
+        model_out: torch.Tensor,
+        latent_channels: int,
+        scheduler: KarrasDiffusionSchedulers,
+    ) -> torch.Tensor:
+        if model_out.shape[1] != latent_channels * 2:
+            return model_out
+        variance_type = getattr(scheduler.config, "variance_type", None)
+        if scheduler.__class__.__name__ == "DDPMScheduler" and variance_type in ("learned", "learned_range"):
+            return model_out
+        model_output, _ = torch.split(model_out, latent_channels, dim=1)
+        return model_output
+    @staticmethod
+    def _normalize_id2label(id2label: Optional[Dict[Union[int, str], str]]) -> Dict[int, str]:
+        if not id2label:
+            return {}
+        return {int(key): value for key, value in id2label.items()}
+    @staticmethod
+    def _read_id2label_from_model_index(variant_path: Optional[str]) -> Dict[int, str]:
+        if not variant_path:
+            return {}
+        model_index_path = Path(variant_path).resolve() / "model_index.json"
+        if not model_index_path.exists():
+            return {}
+        raw = json.loads(model_index_path.read_text(encoding="utf-8"))
+        id2label = raw.get("id2label")
+        if not isinstance(id2label, dict):
+            return {}
+        return {int(key): value for key, value in id2label.items()}
+    @staticmethod
+    def _read_null_class_id(variant_path: Optional[str]) -> Optional[int]:
+        if not variant_path:
+            return None
+        model_index_path = Path(variant_path).resolve() / "model_index.json"
+        if not model_index_path.exists():
+            return None
+        raw = json.loads(model_index_path.read_text(encoding="utf-8"))
+        if "null_class_id" in raw:
+            return int(raw["null_class_id"])
+        return None
+    @staticmethod
+    def _build_label2id(id2label: Dict[int, str]) -> Dict[str, int]:
+        label2id: Dict[str, int] = {}
+        for class_id, value in id2label.items():
+            for synonym in value.split(","):
+                synonym = synonym.strip()
+                if synonym:
+                    label2id[synonym] = int(class_id)
+        return dict(sorted(label2id.items()))
+    @property
+    def id2label(self) -> Dict[int, str]:
+        """Return the class id -> label mapping, first attempting the lazy load from model_index.json."""
+        self._ensure_labels_loaded()
+        return self._id2label
+    def _ensure_labels_loaded(self) -> None:
+        if self._labels_loaded_from_model_index:
+            return
+        loaded = self._read_id2label_from_model_index(getattr(self.config, "_name_or_path", None))
+        if loaded:
+            self._id2label = loaded
+            self.labels = self._build_label2id(self._id2label)
+        self._labels_loaded_from_model_index = True
+    def get_label_ids(self, label: Union[str, List[str]]) -> List[int]:
+        labels = [label] if isinstance(label, str) else label
+        self._ensure_labels_loaded()
+        if not self.labels:
+            raise ValueError("No id2label mapping is available in this checkpoint.")
+        missing = [item for item in labels if item not in self.labels]
+        if missing:
+            preview = ", ".join(list(self.labels.keys())[:8])
+            raise ValueError(f"Unknown labels: {missing}. Example valid labels: {preview}, ...")
+        return [self.labels[item] for item in labels]
+    def _normalize_class_labels(
+        self,
+        class_labels: Union[int, str, List[Union[int, str]], torch.Tensor],
+    ) -> List[int]:
+        if isinstance(class_labels, torch.Tensor):
+            class_labels = class_labels.detach().cpu().tolist()
+        if isinstance(class_labels, int):
+            return [class_labels]
+        if isinstance(class_labels, str):
+            return self.get_label_ids(class_labels)
+        if not class_labels:
+            raise ValueError("`class_labels` cannot be empty.")
+        if isinstance(class_labels[0], str):
+            return self.get_label_ids(class_labels)  # type: ignore[arg-type]
+        return [int(class_id) for class_id in class_labels]  # type: ignore[union-attr]
+    @staticmethod
+    def prepare_extra_step_kwargs(
+        scheduler: KarrasDiffusionSchedulers,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]],
+    ) -> Dict[str, Any]:
+        kwargs: Dict[str, Any] = {}
+        step_params = set(inspect.signature(scheduler.step).parameters.keys())
+        if "generator" in step_params:
+            kwargs["generator"] = generator
+        return kwargs
+    @staticmethod
+    def _expand_timestep(timestep, batch_size: int, device: torch.device) -> torch.Tensor:
+        if not torch.is_tensor(timestep):
+            timestep = torch.tensor([timestep], dtype=torch.long, device=device)
+        elif timestep.ndim == 0:
+            timestep = timestep[None].to(device=device)
+        return timestep.expand(batch_size)
+    @staticmethod
+    def _prepare_grid_mask_size(
+        batch_size: int,
+        n_patch_h: int,
+        n_patch_w: int,
+        device: torch.device,
+        dtype: torch.dtype,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        grid_h = torch.arange(n_patch_h, dtype=torch.long, device=device)
+        grid_w = torch.arange(n_patch_w, dtype=torch.long, device=device)
+        grid = torch.meshgrid(grid_w, grid_h, indexing="xy")
+        grid = torch.cat([grid[0].reshape(1, -1), grid[1].reshape(1, -1)], dim=0).repeat(batch_size, 1, 1)
+        mask = torch.ones(batch_size, n_patch_h * n_patch_w, device=device, dtype=dtype)
+        size = torch.tensor((n_patch_h, n_patch_w), device=device, dtype=torch.long).repeat(batch_size, 1)[:, None, :]
+        return grid, mask, size
+    @torch.inference_mode()
+    def __call__(
+        self,
+        class_labels: Union[int, str, List[Union[int, str]], torch.Tensor] = 207,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 250,
+        guidance_scale: float = 1.5,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: str = "pil",
+        return_dict: bool = True,
+    ) -> Union[ImagePipelineOutput, Tuple]:
+        class_labels_list = self._normalize_class_labels(class_labels)
+        batch_size = len(class_labels_list)
+        height = DEFAULT_NATIVE_RESOLUTION if height is None else int(height)
+        width = DEFAULT_NATIVE_RESOLUTION if width is None else int(width)
+        if height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0:
+            raise ValueError(
+                f"`height` and `width` must be divisible by {self.vae_scale_factor}, got ({height}, {width})."
+            )
+        if output_type not in {"pil", "np", "pt", "latent"}:
+            raise ValueError(f"Unsupported `output_type`: {output_type}")
+        device = self._execution_device
+        model_dtype = next(self.transformer.parameters()).dtype
+        latent_h = height // self.vae_scale_factor
+        latent_w = width // self.vae_scale_factor
+        patch_size = int(self.transformer.config.patch_size)
+        n_patch_h, n_patch_w = latent_h // patch_size, latent_w // patch_size
+        latent_channels = (patch_size**2) * int(self.transformer.in_channels)
+        extra_step_kwargs = self.prepare_extra_step_kwargs(self.scheduler, generator=generator)
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        if latents is None:
+            latents = randn_tensor(
+                (batch_size, latent_channels, n_patch_h * n_patch_w),
+                generator=generator,
+                device=device,
+                dtype=model_dtype,
+            )
+        else:
+            latents = latents.to(device=device, dtype=model_dtype)
+            expected = (batch_size, latent_channels, n_patch_h * n_patch_w)
+            if tuple(latents.shape) != expected:
+                raise ValueError(f"Invalid `latents` shape: {tuple(latents.shape)}. Expected {expected}.")
+        grid, mask, size = self._prepare_grid_mask_size(batch_size, n_patch_h, n_patch_w, device, model_dtype)
+        class_labels_tensor = torch.tensor(class_labels_list, device=device, dtype=torch.long)
+        using_cfg = guidance_scale > 1.0
+        if using_cfg:
+            y_null = torch.full((batch_size,), int(self.config.null_class_id), device=device, dtype=torch.long)
+            y = torch.cat([class_labels_tensor, y_null], dim=0)
+            grid = torch.cat([grid, grid], dim=0)
+            mask = torch.cat([mask, mask], dim=0)
+            size = torch.cat([size, size], dim=0)
+        for timestep in self.progress_bar(self.scheduler.timesteps):
+            latent_model_input = latents
+            if using_cfg:
+                latent_model_input = torch.cat([latents, latents], dim=0)
+            timestep_tensor = self._expand_timestep(timestep, latent_model_input.shape[0], device)
+            if using_cfg:
+                model_out = self.transformer.forward_with_cfg(
+                    latent_model_input,
+                    timestep_tensor,
+                    y=y,
+                    grid=grid,
+                    mask=mask,
+                    size=size,
+                    cfg_scale=guidance_scale,
+                )
+                model_out = model_out.chunk(2, dim=0)[0]
+            else:
+                model_out = self.transformer(
+                    latents,
+                    timestep_tensor,
+                    y=class_labels_tensor,
+                    grid=grid,
+                    mask=mask,
+                    size=size,
+                )
+            model_output = self._prepare_model_output_for_scheduler(model_out, latent_channels, self.scheduler)
+            latents = self.scheduler.step(model_output, timestep, latents, **extra_step_kwargs).prev_sample
+        latents = latents[..., : n_patch_h * n_patch_w]
+        latents = self.transformer.unpatchify(latents, (latent_h, latent_w))
+        if self.vae is not None:
+            vae_dtype = next(self.vae.parameters()).dtype
+            latents = latents.to(dtype=vae_dtype)
+            latents = self.vae.decode(latents / self.vae.config.scaling_factor).sample
+            image = self.image_processor.postprocess(latents, output_type=output_type)
+        elif output_type == "latent":
+            image = latents
+        else:
+            raise ValueError("Cannot decode latents without a VAE.")
+        self.maybe_free_model_hooks()
+        if not return_dict:
+            return (image,)
+        return ImagePipelineOutput(images=image)
+__all__ = ["FiTPipeline"]

FiTv1-XL-2-256/scheduler/scheduler_config.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+  "_class_name": "DDPMScheduler",
+  "_diffusers_version": "0.36.0",
+  "beta_end": 0.02,
+  "beta_schedule": "linear",
+  "beta_start": 0.0001,
+  "clip_sample": false,
+  "clip_sample_range": 1.0,
+  "num_train_timesteps": 1000,
+  "prediction_type": "epsilon",
+  "variance_type": "learned_range",
+  "timestep_spacing": "linspace",
+  "steps_offset": 0,
+  "trained_betas": null
+}

FiTv1-XL-2-256/transformer/config.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "_class_name": "FiTTransformer2DModel",
+  "class_dropout_prob": 0.1,
+  "context_size": 256,
+  "depth": 28,
+  "hidden_size": 1152,
+  "in_channels": 4,
+  "learn_sigma": true,
+  "mlp_ratio": 4.0,
+  "num_classes": 1000,
+  "num_heads": 16,
+  "patch_size": 2,
+  "rel_pos_embed": "rope",
+  "use_swiglu": true,
+  "use_swiglu_large": true
+}

FiTv1-XL-2-256/transformer/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e51188080ca2aefd5b1d2d6fe7abc7211663b2dd69049aefb6f40892337aa9e8
+size 3294432464

FiTv1-XL-2-256/transformer/fit_transformer_2d.py ADDED Viewed

	@@ -0,0 +1,993 @@

+# Copyright 2026 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Self-contained FiT Hub module (generated by scripts/bundle_fit_hub_modules.py)."""
+import torch
+from torch import Tensor
+from typing import List, Tuple
+import torch.nn as nn
+import math
+from math import pi
+from typing import Optional, Any, Union, Tuple
+from torch import nn
+from einops import rearrange, repeat
+from functools import lru_cache
+import numpy as np
+import torch.nn.functional as F
+from torch import nn, Tensor
+from torch.jit import Final
+from timm.layers.mlp import SwiGLU, Mlp
+from typing import Any, Callable, Dict, List, Optional, Union, Tuple
+from functools import partial
+from typing import Optional
+from einops import rearrange
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.models.modeling_utils import ModelMixin
+# Prefer the real diffusers mixins; the stubs in the except branch are minimal stand-ins.
+# NOTE(review): the same two imports also appear unconditionally immediately before this block, so a
+# missing diffusers install would raise there first and this fallback looks unreachable — confirm intent.
+try:
+    from diffusers.configuration_utils import ConfigMixin, register_to_config
+    from diffusers.models.modeling_utils import ModelMixin
+except Exception:  # pragma: no cover
+    class ConfigMixin:
+        # Stub: stores registered keyword arguments in a plain dict exposed through `config`.
+        def register_to_config(self, **kwargs):
+            if not hasattr(self, "_config"):
+                self._config = {}
+            self._config.update(kwargs)
+        @property
+        def config(self):
+            return self._config
+    # Stub decorator: returns the decorated function unchanged.
+    def register_to_config(func):
+        return func
+    # Stub: a bare nn.Module with none of the diffusers model helpers.
+    class ModelMixin(nn.Module):
+        pass
+def modulate(x, shift, scale):
+    return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
+def get_parameter_dtype(parameter: torch.nn.Module):
+    try:
+        params = tuple(parameter.parameters())
+        if len(params) > 0:
+            return params[0].dtype
+        buffers = tuple(parameter.buffers())
+        if len(buffers) > 0:
+            return buffers[0].dtype
+    except StopIteration:
+        # For torch.nn.DataParallel compatibility in PyTorch 1.5
+        def find_tensor_attributes(module: torch.nn.Module) -> List[Tuple[str, Tensor]]:
+            tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)]
+            return tuples
+        gen = parameter._named_members(get_members_fn=find_tensor_attributes)
+        first_tuple = next(gen)
+        return first_tuple[1].dtype
+def create_norm(norm_type: str, dim: int, eps: float = 1e-6):
+    if norm_type is None or norm_type == "":
+        return nn.Identity()
+    norm_type = norm_type.lower()
+    if norm_type == "w_layernorm":
+        return nn.LayerNorm(dim, eps=eps, bias=False)
+    elif norm_type == "layernorm":
+        return nn.LayerNorm(dim, eps=eps, elementwise_affine=False, bias=False)
+    elif norm_type == "w_rmsnorm":
+        return RMSNorm(dim, eps=eps)
+    elif norm_type == "rmsnorm":
+        return RMSNorm(dim, include_weight=False, eps=eps)
+    elif norm_type == "none":
+        return nn.Identity()
+    else:
+        raise NotImplementedError(f"Unknown norm_type: '{norm_type}'")
+class RMSNorm(nn.Module):
+    def __init__(self, dim: int, include_weight: bool = True, eps: float = 1e-6):
+        super().__init__()
+        self.eps = eps
+        self.include_weight = include_weight
+        self.weight = nn.Parameter(torch.ones(dim)) if include_weight else None
+    def _norm(self, x: torch.Tensor):
+        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+    def forward(self, x: torch.Tensor):
+        output = self._norm(x.float()).type_as(x)
+        if self.weight is not None:
+            return output * self.weight
+        return output
+    def reset_parameters(self):
+        if self.weight is not None:
+            torch.nn.init.ones_(self.weight)
+# --------------------------------------------------------
+# FiT: A Flexible Vision Transformer for Image Generation
+#
+# Based on the following repository
+# https://github.com/lucidrains/rotary-embedding-torch
+# https://github.com/jquesnelle/yarn/blob/HEAD/scaled_rope
+# https://colab.research.google.com/drive/1VI2nhlyKvd5cw4-zHvAIk00cAVj2lCCC#scrollTo=b80b3f37
+# --------------------------------------------------------
+#################################################################################
+#                                 NTK Operations                                #
+#################################################################################
+def find_correction_factor(num_rotations, dim, base=10000, max_position_embeddings=2048):
+    return (dim * math.log(max_position_embeddings/(num_rotations * 2 * math.pi)))/(2 * math.log(base)) #Inverse dim formula to find number of rotations
+def find_correction_range(low_rot, high_rot, dim, base=10000, max_position_embeddings=2048):
+    low = math.floor(find_correction_factor(low_rot, dim, base, max_position_embeddings))
+    high = math.ceil(find_correction_factor(high_rot, dim, base, max_position_embeddings))
+    return max(low, 0), min(high, dim-1) #Clamp values just in case
+def linear_ramp_mask(min, max, dim):
+    if min == max:
+        max += 0.001 #Prevent singularity
+    linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min)
+    ramp_func = torch.clamp(linear_func, 0, 1)
+    return ramp_func
+def find_newbase_ntk(dim, base=10000, scale=1):
+    # Base change formula
+    return base * scale ** (dim / (dim-2))
+def get_mscale(scale=torch.Tensor):
+    # if scale <= 1:
+    #     return 1.0
+    # return 0.1 * math.log(scale) + 1.0
+    return torch.where(scale <= 1., torch.tensor(1.0), 0.1 * torch.log(scale) + 1.0)
+def get_proportion(L_test, L_train):
+    L_test = L_test * 2
+    return torch.where(torch.tensor(L_test/L_train) <= 1., torch.tensor(1.0), torch.sqrt(torch.log(torch.tensor(L_test))/torch.log(torch.tensor(L_train))))
+    # return torch.sqrt(torch.log(torch.tensor(L_test))/torch.log(torch.tensor(L_train)))
+#################################################################################
+#                                 Rotate Q or K                                 #
+#################################################################################
+def rotate_half(x):
+    x = rearrange(x, '... (d r) -> ... d r', r = 2)
+    x1, x2 = x.unbind(dim = -1)
+    x = torch.stack((-x2, x1), dim = -1)
+    return rearrange(x, '... d r -> ... (d r)')
+#################################################################################
+#                               Core Vision RoPE                                #
+#################################################################################
+class VisionRotaryEmbedding(nn.Module):
+    def __init__(
+        self,
+        head_dim: int,  # embed dimension for each head
+        custom_freqs: str = 'normal',
+        theta: int = 10000,
+        online_rope: bool = False,
+        max_cached_len: int = 256,
+        max_pe_len_h: Optional[int] = None,
+        max_pe_len_w: Optional[int] = None,
+        decouple: bool = False,
+        ori_max_pe_len: Optional[int] = None,
+    ):
+        super().__init__()
+        dim = head_dim // 2
+        assert dim % 2 == 0 # accually, this is important
+        self.dim = dim
+        self.custom_freqs = custom_freqs.lower()
+        self.theta = theta
+        self.decouple = decouple
+        self.ori_max_pe_len = ori_max_pe_len
+        self.custom_freqs = custom_freqs.lower()
+        if not online_rope:
+            if self.custom_freqs == 'normal':
+                freqs_h = 1. / (theta ** (torch.arange(0, dim, 2).float() / dim))
+                freqs_w = 1. / (theta ** (torch.arange(0, dim, 2).float() / dim))
+            else:
+                if decouple:
+                    freqs_h = self.get_1d_rope_freqs(theta, dim, max_pe_len_h, ori_max_pe_len)
+                    freqs_w = self.get_1d_rope_freqs(theta, dim, max_pe_len_w, ori_max_pe_len)
+                else:
+                    max_pe_len = max(max_pe_len_h, max_pe_len_w)
+                    freqs_h = self.get_1d_rope_freqs(theta, dim, max_pe_len, ori_max_pe_len)
+                    freqs_w = self.get_1d_rope_freqs(theta, dim, max_pe_len, ori_max_pe_len)
+                attn_factor = 1.0
+                scale = torch.clamp_min(torch.tensor(max(max_pe_len_h, max_pe_len_w)) / ori_max_pe_len, 1.0)   # dynamic scale
+                self.mscale = get_mscale(scale).to(scale) * attn_factor # Get n-d magnitude scaling corrected for interpolation
+                self.proportion1 = get_proportion(max(max_pe_len_h, max_pe_len_w), ori_max_pe_len)
+                self.proportion2 = get_proportion(max_pe_len_h * max_pe_len_w, ori_max_pe_len ** 2)
+            self.register_buffer('freqs_h', freqs_h, persistent=False)
+            self.register_buffer('freqs_w', freqs_w, persistent=False)
+            freqs_h_cached = torch.einsum('..., f -> ... f', torch.arange(max_cached_len), self.freqs_h)
+            freqs_h_cached = repeat(freqs_h_cached, '... n -> ... (n r)', r = 2)
+            self.register_buffer('freqs_h_cached', freqs_h_cached, persistent=False)
+            freqs_w_cached = torch.einsum('..., f -> ... f', torch.arange(max_cached_len), self.freqs_w)
+            freqs_w_cached = repeat(freqs_w_cached, '... n -> ... (n r)', r = 2)
+            self.register_buffer('freqs_w_cached', freqs_w_cached, persistent=False)
+    def get_1d_rope_freqs(self, theta, dim, max_pe_len, ori_max_pe_len):
+        # scaling operations for extrapolation
+        assert isinstance(ori_max_pe_len, int)
+        # scale = max_pe_len / ori_max_pe_len
+        if not isinstance(max_pe_len, torch.Tensor):
+            max_pe_len = torch.tensor(max_pe_len)
+        scale = torch.clamp_min(max_pe_len / ori_max_pe_len, 1.0)   # dynamic scale
+        if self.custom_freqs == 'linear': # equal to position interpolation
+            freqs = 1. / torch.einsum('..., f -> ... f', scale, theta ** (torch.arange(0, dim, 2).float() / dim))
+        elif self.custom_freqs == 'ntk-aware' or self.custom_freqs == 'ntk-aware-pro1' or self.custom_freqs == 'ntk-aware-pro2':
+            freqs = 1. / torch.pow(
+                find_newbase_ntk(dim, theta, scale).view(-1, 1),
+                (torch.arange(0, dim, 2).to(scale).float() / dim)
+            ).squeeze()
+        elif self.custom_freqs == 'ntk-by-parts':
+            #Interpolation constants found experimentally for LLaMA (might not be totally optimal though)
+            #Do not change unless there is a good reason for doing so!
+            beta_0 = 1.25
+            beta_1 = 0.75
+            gamma_0 = 16
+            gamma_1 = 2
+            ntk_factor = 1
+            extrapolation_factor = 1
+            #Three RoPE extrapolation/interpolation methods
+            freqs_base = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))
+            freqs_linear = 1.0 / torch.einsum('..., f -> ... f', scale, (theta ** (torch.arange(0, dim, 2).to(scale).float() / dim)))
+            freqs_ntk = 1. / torch.pow(
+                find_newbase_ntk(dim, theta, scale).view(-1, 1),
+                (torch.arange(0, dim, 2).to(scale).float() / dim)
+            ).squeeze()
+            #Combine NTK and Linear
+            low, high = find_correction_range(beta_0, beta_1, dim, theta, ori_max_pe_len)
+            freqs_mask = (1 - linear_ramp_mask(low, high, dim // 2).to(scale)) * ntk_factor
+            freqs = freqs_linear * (1 - freqs_mask) + freqs_ntk * freqs_mask
+            #Combine Extrapolation and NTK and Linear
+            low, high = find_correction_range(gamma_0, gamma_1, dim, theta, ori_max_pe_len)
+            freqs_mask = (1 - linear_ramp_mask(low, high, dim // 2).to(scale)) * extrapolation_factor
+            freqs = freqs * (1 - freqs_mask) + freqs_base * freqs_mask
+        elif self.custom_freqs == 'yarn':
+            #Interpolation constants found experimentally for LLaMA (might not be totally optimal though)
+            #Do not change unless there is a good reason for doing so!
+            beta_fast = 32
+            beta_slow = 1
+            extrapolation_factor = 1
+            freqs_extrapolation = 1.0 / (theta ** (torch.arange(0, dim, 2).to(scale).float() / dim))
+            freqs_interpolation = 1.0 / torch.einsum('..., f -> ... f', scale, (theta ** (torch.arange(0, dim, 2).to(scale).float() / dim)))
+            low, high = find_correction_range(beta_fast, beta_slow, dim, theta, ori_max_pe_len)
+            freqs_mask = (1 - linear_ramp_mask(low, high, dim // 2).to(scale).float()) * extrapolation_factor # Get n-d rotational scaling corrected for extrapolation
+            freqs = freqs_interpolation * (1 - freqs_mask) + freqs_extrapolation * freqs_mask
+        else:
+            raise ValueError(f'Unknown modality {self.custom_freqs}. Only support normal, linear, ntk-aware, ntk-by-parts, yarn!')
+        return freqs
+    def online_get_2d_rope_from_grid(self, grid, size):
+        '''
+        grid: (B, 2, N)
+            N = H * W
+            the first dimension represents width, and the second reprensents height
+            e.g.,   [0. 1. 2. 3. 0. 1. 2. 3. 0. 1. 2. 3.]
+                    [0. 0. 0. 0. 1. 1. 1. 1. 2. 2. 2. 2.]
+        size: (B, 1, 2), h goes first and w goes last
+        '''
+        size = size.squeeze()   # (B, 1, 2) -> (B, 2)
+        if self.decouple:
+            size_h = size[:, 0]
+            size_w = size[:, 1]
+            freqs_h = self.get_1d_rope_freqs(self.theta, self.dim, size_h, self.ori_max_pe_len)
+            freqs_w = self.get_1d_rope_freqs(self.theta, self.dim, size_w, self.ori_max_pe_len)
+        else:
+            size_max = torch.max(size[:, 0], size[:, 1])
+            freqs_h = self.get_1d_rope_freqs(self.theta, self.dim, size_max, self.ori_max_pe_len)
+            freqs_w = self.get_1d_rope_freqs(self.theta, self.dim, size_max, self.ori_max_pe_len)
+        freqs_w = grid[:, 0][..., None] * freqs_w[:, None, :]
+        freqs_w = repeat(freqs_w, '... n -> ... (n r)', r = 2)
+        freqs_h = grid[:, 1][..., None] * freqs_h[:, None, :]
+        freqs_h = repeat(freqs_h, '... n -> ... (n r)', r = 2)
+        freqs = torch.cat([freqs_h, freqs_w], dim=-1)   # (B, N, D)
+        if self.custom_freqs == 'yarn':
+            freqs_cos = freqs.cos() * self.mscale[:, None, None]
+            freqs_sin = freqs.sin() * self.mscale[:, None, None]
+        elif self.custom_freqs == 'ntk-aware-pro1':
+            freqs_cos = freqs.cos() * self.proportion1[:, None, None]
+            freqs_sin = freqs.sin() * self.proportion1[:, None, None]
+        elif self.custom_freqs == 'ntk-aware-pro2':
+            freqs_cos = freqs.cos() * self.proportion2[:, None, None]
+            freqs_sin = freqs.sin() * self.proportion2[:, None, None]
+        else:
+            freqs_cos = freqs.cos()
+            freqs_sin = freqs.sin()
+        return freqs_cos, freqs_sin
+    @lru_cache()
+    def get_2d_rope_from_grid(self, grid):
+        '''
+        grid: (B, 2, N)
+            N = H * W
+            the first dimension represents width, and the second reprensents height
+            e.g.,   [0. 1. 2. 3. 0. 1. 2. 3. 0. 1. 2. 3.]
+                    [0. 0. 0. 0. 1. 1. 1. 1. 2. 2. 2. 2.]
+        '''
+        freqs_w = torch.einsum('..., f -> ... f', grid[:, 0], self.freqs_w)
+        freqs_w = repeat(freqs_w, '... n -> ... (n r)', r = 2)
+        freqs_h = torch.einsum('..., f -> ... f', grid[:, 1], self.freqs_h)
+        freqs_h = repeat(freqs_h, '... n -> ... (n r)', r = 2)
+        freqs = torch.cat([freqs_h, freqs_w], dim=-1)   # (B, N, D)
+        if self.custom_freqs == 'yarn':
+            freqs_cos = freqs.cos() * self.mscale
+            freqs_sin = freqs.sin() * self.mscale
+        elif self.custom_freqs == 'ntk-aware-pro1':
+            freqs_cos = freqs.cos() * self.proportion1
+            freqs_sin = freqs.sin() * self.proportion1
+        elif self.custom_freqs == 'ntk-aware-pro2':
+            freqs_cos = freqs.cos() * self.proportion2
+            freqs_sin = freqs.sin() * self.proportion2
+        else:
+            freqs_cos = freqs.cos()
+            freqs_sin = freqs.sin()
+        return freqs_cos, freqs_sin
+    @lru_cache()
+    def get_cached_2d_rope_from_grid(self, grid: torch.Tensor):
+        '''
+        grid: (B, 2, N)
+            N = H * W
+            the first dimension represents width, and the second reprensents height
+            e.g.,   [0. 1. 2. 3. 0. 1. 2. 3. 0. 1. 2. 3.]
+                    [0. 0. 0. 0. 1. 1. 1. 1. 2. 2. 2. 2.]
+        '''
+        freqs_w, freqs_h = self.freqs_w_cached[grid[:, 0]], self.freqs_h_cached[grid[:, 1]]
+        freqs = torch.cat([freqs_h, freqs_w], dim=-1)   # (B, N, D)
+        if self.custom_freqs == 'yarn':
+            freqs_cos = freqs.cos() * self.mscale
+            freqs_sin = freqs.sin() * self.mscale
+        elif self.custom_freqs == 'ntk-aware-pro1':
+            freqs_cos = freqs.cos() * self.proportion1
+            freqs_sin = freqs.sin() * self.proportion1
+        elif self.custom_freqs == 'ntk-aware-pro2':
+            freqs_cos = freqs.cos() * self.proportion2
+            freqs_sin = freqs.sin() * self.proportion2
+        else:
+            freqs_cos = freqs.cos()
+            freqs_sin = freqs.sin()
+        return freqs_cos, freqs_sin
+    @lru_cache()
+    def get_cached_21d_rope_from_grid(self, grid: torch.Tensor): # for 3d rope formulation 2 !
+        '''
+        grid: (B, 3, N)
+            N = H * W * T
+            the first dimension represents width, and the second reprensents height, and the third reprensents time
+            e.g.,   [0. 1. 2. 3. 0. 1. 2. 3. 0. 1. 2. 3.]
+                    [0. 0. 0. 0. 1. 1. 1. 1. 2. 2. 2. 2.]
+                    [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
+        '''
+        freqs_w, freqs_h = self.freqs_w_cached[grid[:, 0]+grid[:, 2]], self.freqs_h_cached[grid[:, 1]+grid[:, 2]]
+        freqs = torch.cat([freqs_h, freqs_w], dim=-1)   # (B, N, D)
+        if self.custom_freqs == 'yarn':
+            freqs_cos = freqs.cos() * self.mscale
+            freqs_sin = freqs.sin() * self.mscale
+        elif self.custom_freqs == 'ntk-aware-pro1':
+            freqs_cos = freqs.cos() * self.proportion1
+            freqs_sin = freqs.sin() * self.proportion1
+        elif self.custom_freqs == 'ntk-aware-pro2':
+            freqs_cos = freqs.cos() * self.proportion2
+            freqs_sin = freqs.sin() * self.proportion2
+        else:
+            freqs_cos = freqs.cos()
+            freqs_sin = freqs.sin()
+        return freqs_cos, freqs_sin
+    def forward(self, x, grid):
+        '''
+        x: (B, n_head, N, D)
+        grid: (B, 2, N)
+        '''
+        # freqs_cos, freqs_sin = self.get_2d_rope_from_grid(grid)
+        # freqs_cos, freqs_sin = freqs_cos.unsqueeze(1), freqs_sin.unsqueeze(1)
+        # using cache to accelerate, this is the same with the above codes:
+        freqs_cos, freqs_sin = self.get_cached_2d_rope_from_grid(grid)
+        freqs_cos, freqs_sin = freqs_cos.unsqueeze(1), freqs_sin.unsqueeze(1)
+        return  x * freqs_cos + rotate_half(x) * freqs_sin
+#################################################################################
+#           Embedding Layers for Patches, Timesteps and Class Labels            #
+#################################################################################
+class PatchEmbedder(nn.Module):
+    """
+    Embeds latent features into vector representations
+    """
+    def __init__(self,
+        input_dim,
+        embed_dim,
+        bias: bool = True,
+        norm_layer: Optional[Callable] = None,
+    ):
+        super().__init__()
+        self.proj = nn.Linear(input_dim, embed_dim, bias=bias)
+        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
+    def forward(self, x):
+        x = self.proj(x)    # (B, L, patch_size ** 2 * C) -> (B, L, D)
+        x = self.norm(x)
+        return x
+class TimestepEmbedder(nn.Module):
+    """
+    Embeds scalar timesteps into vector representations.
+    """
+    def __init__(self, hidden_size, frequency_embedding_size=256):
+        super().__init__()
+        self.mlp = nn.Sequential(
+            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
+            nn.SiLU(),
+            nn.Linear(hidden_size, hidden_size, bias=True),
+        )
+        self.frequency_embedding_size = frequency_embedding_size
+    @staticmethod
+    def timestep_embedding(t, dim, max_period=10000):
+        """
+        Create sinusoidal timestep embeddings.
+        :param t: a 1-D Tensor of N indices, one per batch element.
+                          These may be fractional.
+        :param dim: the dimension of the output.
+        :param max_period: controls the minimum frequency of the embeddings.
+        :return: an (N, D) Tensor of positional embeddings.
+        """
+        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
+        half = dim // 2
+        freqs = torch.exp(
+            -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
+        ).to(device=t.device)
+        args = t[:, None] * freqs[None]
+        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+        if dim % 2:
+            embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1]).to(device=t.device)], dim=-1)
+        return embedding.to(dtype=t.dtype)
+    def forward(self, t):
+        t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
+        t_emb = self.mlp(t_freq)
+        return t_emb
+class LabelEmbedder(nn.Module):
+    """
+    Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance.
+    """
+    def __init__(self, num_classes, hidden_size, dropout_prob):
+        super().__init__()
+        use_cfg_embedding = dropout_prob > 0
+        self.embedding_table = nn.Embedding(num_classes + use_cfg_embedding, hidden_size)
+        self.num_classes = num_classes
+        self.dropout_prob = dropout_prob
+    def token_drop(self, labels, force_drop_ids=None):
+        """
+        Drops labels to enable classifier-free guidance.
+        """
+        if force_drop_ids is None:
+            drop_ids = torch.rand(labels.shape[0], device=labels.device) < self.dropout_prob
+        else:
+            drop_ids = force_drop_ids == 1
+        labels = torch.where(drop_ids, self.num_classes, labels)
+        return labels
+    def forward(self, labels, train, force_drop_ids=None):
+        use_dropout = self.dropout_prob > 0
+        if (train and use_dropout) or (force_drop_ids is not None):
+            labels = self.token_drop(labels, force_drop_ids)
+        embeddings = self.embedding_table(labels)
+        return embeddings
+#################################################################################
+#                                  Attention                                    #
+#################################################################################
+# modified from timm and eva-02
+# https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py
+# https://github.com/baaivision/EVA/blob/master/EVA-02/asuka/modeling_finetune.py
+class Attention(nn.Module):
+    def __init__(self,
+        dim: int,
+        num_heads: int = 8,
+        qkv_bias: bool = False,
+        q_norm: Optional[str] = None,
+        k_norm: Optional[str] = None,
+        qk_norm_weight: bool = False,
+        attn_drop: float = 0.,
+        proj_drop: float = 0.,
+        rel_pos_embed: Optional[str] = None,
+        add_rel_pe_to_v: bool = False,
+    ) -> None:
+        super().__init__()
+        assert dim % num_heads == 0, 'dim should be divisible by num_heads'
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.scale = self.head_dim ** -0.5
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        if q_norm == 'layernorm' and qk_norm_weight == True:
+            q_norm = 'w_layernorm'
+        if k_norm == 'layernorm' and qk_norm_weight == True:
+            k_norm = 'w_layernorm'
+        self.q_norm = create_norm(q_norm, self.head_dim)
+        self.k_norm = create_norm(k_norm, self.head_dim)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+        self.rel_pos_embed = None if rel_pos_embed==None else rel_pos_embed.lower()
+        self.add_rel_pe_to_v = add_rel_pe_to_v
+    def forward(self,
+        x: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+        freqs_cos: Optional[torch.Tensor] = None,
+        freqs_sin: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        B, N, C = x.shape
+        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv.unbind(0) # (B, n_h, N, D_h)
+        q, k = self.q_norm(q), self.k_norm(k)
+        if self.rel_pos_embed in ['rope', 'xpos']:  # multiplicative rel_pos_embed
+            if self.add_rel_pe_to_v:
+                v = v * freqs_cos + rotate_half(v) * freqs_sin
+            q = q * freqs_cos + rotate_half(q) * freqs_sin
+            k = k * freqs_cos + rotate_half(k) * freqs_sin
+        attn_mask = mask[:, None, None, :]  # (B, N) -> (B, 1, 1, N)
+        attn_mask = (attn_mask == attn_mask.transpose(-2, -1))  # (B, 1, 1, N) x (B, 1, N, 1) -> (B, 1, N, N)
+        mask = torch.not_equal(mask, torch.zeros_like(mask)).to(mask)   # (B, N) -> (B, N)
+        if x.device.type == "cpu":
+            x = F.scaled_dot_product_attention(
+                q, k, v, attn_mask=attn_mask,
+                dropout_p=self.attn_drop.p if self.training else 0.,
+            )
+        else:
+            with torch.backends.cuda.sdp_kernel(enable_flash=True):
+                '''
+                F.scaled_dot_product_attention is the efficient implementation equivalent to the following:
+                    attn_mask = torch.ones(L, S, dtype=torch.bool).tril(diagonal=0) if is_causal else attn_mask
+                    attn_mask = attn_mask.masked_fill(not attn_mask, -float('inf')) if attn_mask.dtype==torch.bool else attn_mask
+                    attn_weight = torch.softmax((Q @ K.transpose(-2, -1) / math.sqrt(Q.size(-1))) + attn_mask, dim=-1)
+                    attn_weight = torch.dropout(attn_weight, dropout_p)
+                    return attn_weight @ V
+                In conclusion:
+                    boolean attn_mask will mask the attention matrix where attn_mask is False
+                    non-boolean attn_mask will be directly added to Q@K.T
+                '''
+                x = F.scaled_dot_product_attention(
+                    q, k, v, attn_mask=attn_mask,
+                    dropout_p=self.attn_drop.p if self.training else 0.,
+                )
+        x = x.transpose(1, 2).reshape(B, N, C)
+        x = x * mask[..., None] # mask: (B, N) -> (B, N, 1)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+#################################################################################
+#                               Basic FiT Module                                #
+#################################################################################
+class FiTBlock(nn.Module):
+    """
+    A DiT block with adaptive layer norm zero (adaLN-Zero) conditioning.
+    """
+    def __init__(self,
+        hidden_size,
+        num_heads,
+        mlp_ratio=4.0,
+        swiglu=True,
+        swiglu_large=False,
+        rel_pos_embed=None,
+        add_rel_pe_to_v=False,
+        norm_layer: str = 'layernorm',
+        q_norm: Optional[str] = None,
+        k_norm: Optional[str] = None,
+        qk_norm_weight: bool = False,
+        qkv_bias=True,
+        ffn_bias=True,
+        adaln_bias=True,
+        adaln_type='normal',
+        adaln_lora_dim: int = None,
+        **block_kwargs
+    ):
+        super().__init__()
+        self.norm1 = create_norm(norm_layer, hidden_size)
+        self.norm2 = create_norm(norm_layer, hidden_size)
+        self.attn = Attention(
+            hidden_size, num_heads=num_heads, rel_pos_embed=rel_pos_embed,
+            q_norm=q_norm, k_norm=k_norm, qk_norm_weight=qk_norm_weight,
+            qkv_bias=qkv_bias, add_rel_pe_to_v=add_rel_pe_to_v,
+            **block_kwargs
+        )
+        mlp_hidden_dim = int(hidden_size * mlp_ratio)
+        if swiglu:
+            if swiglu_large:
+                self.mlp = SwiGLU(in_features=hidden_size, hidden_features=mlp_hidden_dim, bias=ffn_bias)
+            else:
+                self.mlp = SwiGLU(in_features=hidden_size, hidden_features=(mlp_hidden_dim*2)//3, bias=ffn_bias)
+        else:
+            self.mlp = Mlp(in_features=hidden_size, hidden_features=mlp_hidden_dim, act_layer=lambda: nn.GELU(approximate="tanh"), bias=ffn_bias)
+        if adaln_type == 'normal':
+            self.adaLN_modulation = nn.Sequential(
+                nn.SiLU(),
+                nn.Linear(hidden_size, 6 * hidden_size, bias=adaln_bias)
+            )
+        elif adaln_type == 'lora':
+            self.adaLN_modulation = nn.Sequential(
+                nn.SiLU(),
+                nn.Linear(hidden_size, adaln_lora_dim, bias=adaln_bias),
+                nn.Linear(adaln_lora_dim, 6 * hidden_size, bias=adaln_bias)
+            )
+        elif adaln_type == 'swiglu':
+            self.adaLN_modulation = SwiGLU(
+                in_features=hidden_size, hidden_features=(hidden_size//4)*3, out_features=6*hidden_size, bias=adaln_bias
+            )
+    def forward(self, x, c, mask, freqs_cos, freqs_sin, global_adaln=0.0):
+        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (self.adaLN_modulation(c) + global_adaln).chunk(6, dim=1)
+        x = x + gate_msa.unsqueeze(1) * self.attn(modulate(self.norm1(x), shift_msa, scale_msa), mask, freqs_cos, freqs_sin)
+        x = x + gate_mlp.unsqueeze(1) * self.mlp(modulate(self.norm2(x), shift_mlp, scale_mlp))
+        return x
+class FinalLayer(nn.Module):
+    """
+    The final layer of DiT.
+    """
+    def __init__(self, hidden_size, patch_size, out_channels, norm_layer: str = 'layernorm', adaln_bias=True, adaln_type='normal'):
+        super().__init__()
+        self.norm_final = create_norm(norm_type=norm_layer, dim=hidden_size)
+        self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
+        if adaln_type == 'swiglu':
+            self.adaLN_modulation = SwiGLU(in_features=hidden_size, hidden_features=hidden_size//2, out_features=2*hidden_size, bias=adaln_bias)
+        else:   # adaln_type in ['normal', 'lora']
+            self.adaLN_modulation = nn.Sequential(
+                nn.SiLU(),
+                nn.Linear(hidden_size, 2 * hidden_size, bias=adaln_bias)
+            )
+    def forward(self, x, c):
+        shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)
+        x = modulate(self.norm_final(x), shift, scale)
+        x = self.linear(x)
+        return x
+class FiTTransformer2DModel(ModelMixin, ConfigMixin):
+    """
+    FiT backbone as a Hugging Face Diffusers `ModelMixin` / `ConfigMixin` module.
+    Checkpoints from the original FiT layout load with identical state dict keys.
+    """
+    config_name = "config.json"
+    _supports_gradient_checkpointing = True
+    # Build the FiT backbone: patch / timestep / label embedders, the rotary position
+    # embedder, `depth` FiTBlocks and the final projection layer, then initialise the
+    # weights and optionally apply a fine-tuning setup.
+    # The constructor is wrapped in diffusers' `register_to_config`, so its signature
+    # is part of the saved model configuration.
+    @register_to_config
+    def __init__(
+        self,
+        context_size: int = 256,
+        patch_size: int = 2,
+        in_channels: int = 4,
+        hidden_size: int = 1152,
+        depth: int = 28,
+        num_heads: int = 16,
+        mlp_ratio: float = 4.0,
+        class_dropout_prob: float = 0.1,
+        num_classes: int = 1000,
+        learn_sigma: bool = True,
+        use_sit: bool = False,
+        use_checkpoint: bool = False,
+        use_swiglu: bool = False,
+        use_swiglu_large: bool = False,
+        rel_pos_embed: Optional[str] = "rope",
+        norm_type: str = "layernorm",
+        q_norm: Optional[str] = None,
+        k_norm: Optional[str] = None,
+        qk_norm_weight: bool = False,
+        qkv_bias: bool = True,
+        ffn_bias: bool = True,
+        adaln_bias: bool = True,
+        adaln_type: str = "normal",
+        adaln_lora_dim: Optional[int] = None,
+        rope_theta: float = 10000.0,
+        custom_freqs: str = "normal",
+        max_pe_len_h: Optional[int] = None,
+        max_pe_len_w: Optional[int] = None,
+        decouple: bool = False,
+        ori_max_pe_len: Optional[int] = None,
+        online_rope: bool = False,
+        add_rel_pe_to_v: bool = False,
+        pretrain_ckpt: Optional[str] = None,
+        ignore_keys: Optional[list] = None,
+        finetune: Optional[str] = None,
+        time_shifting: int = 1,
+    ):
+        super().__init__()
+        self.context_size = context_size
+        self.hidden_size = hidden_size
+        # learn_sigma and use_sit cannot both be enabled
+        assert not (learn_sigma and use_sit)
+        self.learn_sigma = learn_sigma
+        self.use_sit = use_sit
+        self.use_checkpoint = use_checkpoint
+        self.depth = depth
+        self.mlp_ratio = mlp_ratio
+        self.class_dropout_prob = class_dropout_prob
+        self.num_classes = num_classes
+        self.in_channels = in_channels
+        # with learn_sigma the output carries twice as many channels as the input
+        self.out_channels = self.in_channels * 2 if learn_sigma else in_channels
+        self.patch_size = patch_size
+        self.num_heads = num_heads
+        self.adaln_type = adaln_type
+        self.online_rope = online_rope
+        self.time_shifting = time_shifting
+        # one token per patch: in_channels * patch_size**2 input features -> hidden_size
+        self.x_embedder = PatchEmbedder(in_channels * patch_size**2, hidden_size, bias=True)
+        self.t_embedder = TimestepEmbedder(hidden_size)
+        self.y_embedder = LabelEmbedder(num_classes, hidden_size, class_dropout_prob)
+        # rotary embedding is configured per attention head
+        self.rope_embedder = VisionRotaryEmbedding(
+            head_dim=hidden_size // num_heads,
+            theta=rope_theta,
+            custom_freqs=custom_freqs,
+            online_rope=online_rope,
+            max_pe_len_h=max_pe_len_h,
+            max_pe_len_w=max_pe_len_w,
+            decouple=decouple,
+            ori_max_pe_len=ori_max_pe_len,
+        )
+        # a model-level modulation (6 * hidden_size outputs) exists only for adaln_type "lora"
+        if adaln_type == "lora":
+            self.global_adaLN_modulation = nn.Sequential(
+                nn.SiLU(),
+                nn.Linear(hidden_size, 6 * hidden_size, bias=adaln_bias),
+            )
+        else:
+            self.global_adaLN_modulation = None
+        # `depth` identically configured transformer blocks
+        self.blocks = nn.ModuleList(
+            [
+                FiTBlock(
+                    hidden_size,
+                    num_heads,
+                    mlp_ratio=mlp_ratio,
+                    swiglu=use_swiglu,
+                    swiglu_large=use_swiglu_large,
+                    rel_pos_embed=rel_pos_embed,
+                    add_rel_pe_to_v=add_rel_pe_to_v,
+                    norm_layer=norm_type,
+                    q_norm=q_norm,
+                    k_norm=k_norm,
+                    qk_norm_weight=qk_norm_weight,
+                    qkv_bias=qkv_bias,
+                    ffn_bias=ffn_bias,
+                    adaln_bias=adaln_bias,
+                    adaln_type=adaln_type,
+                    adaln_lora_dim=adaln_lora_dim,
+                )
+                for _ in range(depth)
+            ]
+        )
+        self.final_layer = FinalLayer(
+            hidden_size,
+            patch_size,
+            self.out_channels,
+            norm_layer=norm_type,
+            adaln_bias=adaln_bias,
+            adaln_type=adaln_type,
+        )
+        # ignore_keys is passed both to the weight initialisation (as `ignore`) and,
+        # when fine-tuning is requested, to apply_finetune (as `unfreeze`)
+        self.initialize_weights(pretrain_ckpt=pretrain_ckpt, ignore=ignore_keys)
+        if finetune is not None:
+            self.apply_finetune(finetune_type=finetune, unfreeze=ignore_keys)
+    def initialize_weights(self, pretrain_ckpt=None, ignore=None):
+        def _basic_init(module):
+            if isinstance(module, nn.Linear):
+                torch.nn.init.xavier_uniform_(module.weight)
+                if module.bias is not None:
+                    nn.init.constant_(module.bias, 0)
+        self.apply(_basic_init)
+        w = self.x_embedder.proj.weight.data
+        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
+        nn.init.constant_(self.x_embedder.proj.bias, 0)
+        nn.init.normal_(self.y_embedder.embedding_table.weight, std=0.02)
+        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
+        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
+        for block in self.blocks:
+            if self.adaln_type in ["normal", "lora"]:
+                nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
+                nn.init.constant_(block.adaLN_modulation[-1].bias, 0)
+            elif self.adaln_type == "swiglu":
+                nn.init.constant_(block.adaLN_modulation.fc2.weight, 0)
+                nn.init.constant_(block.adaLN_modulation.fc2.bias, 0)
+        if self.adaln_type == "lora":
+            nn.init.constant_(self.global_adaLN_modulation[-1].weight, 0)
+            nn.init.constant_(self.global_adaLN_modulation[-1].bias, 0)
+        if self.adaln_type == "swiglu":
+            nn.init.constant_(self.final_layer.adaLN_modulation.fc2.weight, 0)
+            nn.init.constant_(self.final_layer.adaLN_modulation.fc2.bias, 0)
+        else:
+            nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0)
+            nn.init.constant_(self.final_layer.adaLN_modulation[-1].bias, 0)
+        nn.init.constant_(self.final_layer.linear.weight, 0)
+        nn.init.constant_(self.final_layer.linear.bias, 0)
+        keys = list(self.state_dict().keys())
+        ignore_keys = []
+        if ignore is not None:
+            for ign in ignore:
+                for key in keys:
+                    if ign in key:
+                        ignore_keys.append(key)
+        ignore_keys = list(set(ignore_keys))
+    def unpatchify(self, x, hw):
+        h, w = hw
+        p = self.patch_size
+        if self.use_sit:
+            x = rearrange(x, "b (h w) c -> b h w c", h=h // p, w=w // p)
+            x = rearrange(x, "b h w (c p1 p2) -> b c (h p1) (w p2)", p1=p, p2=p)
+        else:
+            x = rearrange(x, "b c (h w) -> b c h w", h=h // p, w=w // p)
+            x = rearrange(x, "b (c p1 p2) h w -> b c (h p1) (w p2)", p1=p, p2=p)
+        return x
+    def forward(self, x, t, y, grid, mask, size=None):
+        dtype = self.x_embedder.proj.weight.dtype
+        x = x.to(dtype=dtype)
+        mask = mask.to(dtype=dtype)
+        # Flow-matching (FiTv2 / use_sit) expects t in [0, 1]. Improved diffusion (FiTv1)
+        # passes integer timesteps 0..T-1 directly to TimestepEmbedder, like DiT.
+        if self.use_sit:
+            t = torch.clamp(self.time_shifting * t / (1 + (self.time_shifting - 1) * t), max=1.0)
+        t = t.float().to(dtype)
+        if not self.use_sit:
+            x = rearrange(x, "B C N -> B N C")
+        x = self.x_embedder(x)
+        t = self.t_embedder(t)
+        y = self.y_embedder(y, self.training)
+        c = t + y
+        if self.online_rope:
+            freqs_cos, freqs_sin = self.rope_embedder.online_get_2d_rope_from_grid(grid, size)
+            freqs_cos, freqs_sin = freqs_cos.unsqueeze(1), freqs_sin.unsqueeze(1)
+        else:
+            freqs_cos, freqs_sin = self.rope_embedder.get_cached_2d_rope_from_grid(grid)
+            freqs_cos, freqs_sin = freqs_cos.unsqueeze(1), freqs_sin.unsqueeze(1)
+        freqs_cos = freqs_cos.to(dtype=dtype)
+        freqs_sin = freqs_sin.to(dtype=dtype)
+        if self.global_adaLN_modulation is not None:
+            global_adaln = self.global_adaLN_modulation(c)
+        else:
+            global_adaln = 0.0
+        if not self.use_checkpoint:
+            for block in self.blocks:
+                x = block(x, c, mask, freqs_cos, freqs_sin, global_adaln)
+        else:
+            for block in self.blocks:
+                x = torch.utils.checkpoint.checkpoint(
+                    self.ckpt_wrapper(block), x, c, mask, freqs_cos, freqs_sin, global_adaln, use_reentrant=False
+                )
+        x = self.final_layer(x, c)
+        x = x * mask[..., None]
+        if not self.use_sit:
+            x = rearrange(x, "B N C -> B C N")
+        return x
+    def forward_with_cfg(self, x, t, y, grid, mask, size, cfg_scale, scale_pow=0.0):
+        half = x[: len(x) // 2]
+        combined = torch.cat([half, half], dim=0)
+        model_out = self.forward(combined, t, y, grid, mask, size)
+        C_cfg = 3 * self.patch_size * self.patch_size
+        if self.use_sit:
+            eps, rest = model_out[:, :, :C_cfg], model_out[:, :, C_cfg:]
+        else:
+            eps, rest = model_out[:, :C_cfg], model_out[:, C_cfg:]
+        cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)
+        if scale_pow == 0.0:
+            real_cfg_scale = cfg_scale
+        else:
+            scale_step = (1 - torch.cos(((1 - torch.clamp_max(t, 1.0)) ** scale_pow) * torch.pi)) * 1 / 2
+            real_cfg_scale = (cfg_scale - 1) * scale_step + 1
+            real_cfg_scale = real_cfg_scale[: len(x) // 2].view(-1, 1, 1)
+            if self.use_sit:
+                t = t / (self.time_shifting + (1 - self.time_shifting) * t)
+        half_eps = uncond_eps + real_cfg_scale * (cond_eps - uncond_eps)
+        eps = torch.cat([half_eps, half_eps], dim=0)
+        if self.use_sit:
+            return torch.cat([eps, rest], dim=2)
+        return torch.cat([eps, rest], dim=1)
+    def ckpt_wrapper(self, module):
+        def ckpt_forward(*inputs):
+            return module(*inputs)
+        return ckpt_forward
+    def apply_finetune(self, finetune_type, unfreeze):
+        if finetune_type == "full":
+            return
+        for _, param in self.named_parameters():
+            param.requires_grad = False
+        if unfreeze is None:
+            return
+        for unf in unfreeze:
+            for name, param in self.named_parameters():
+                if unf in name:
+                    param.requires_grad = True

FiTv1-XL-2-256/vae/config.json ADDED Viewed

	@@ -0,0 +1,38 @@

+{
+  "_class_name": "AutoencoderKL",
+  "_diffusers_version": "0.38.0",
+  "_name_or_path": "stabilityai/sd-vae-ft-ema",
+  "act_fn": "silu",
+  "block_out_channels": [
+    128,
+    256,
+    512,
+    512
+  ],
+  "down_block_types": [
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D"
+  ],
+  "force_upcast": true,
+  "in_channels": 3,
+  "latent_channels": 4,
+  "latents_mean": null,
+  "latents_std": null,
+  "layers_per_block": 2,
+  "mid_block_add_attention": true,
+  "norm_num_groups": 32,
+  "out_channels": 3,
+  "sample_size": 256,
+  "scaling_factor": 0.18215,
+  "shift_factor": null,
+  "up_block_types": [
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D"
+  ],
+  "use_post_quant_conv": true,
+  "use_quant_conv": true
+}

FiTv1-XL-2-256/vae/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:703abdcd7c389316b5128faa9b750a530ea1680b453170b27afebac5e4db30c4
+size 334643268