Text Generation
Safetensors
NemotronH_Nano_Omni_Reasoning_V3
nvidia
unsloth
nemotron-3
multimodal
conversational
custom_code
File size: 3,738 Bytes
1cc1a90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import torch
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, AutoImageProcessor, AutoProcessor
from PIL import Image
from pathlib import Path


# Checkpoint directory (current working directory) and the device every
# tensor and the model itself are placed on.
model_path = "."
device = "cuda:0"

# Load the language model in bfloat16 directly onto `device` and put it in
# eval mode. trust_remote_code=True opts in to running model code bundled
# with the checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
    device_map=device,
    torch_dtype=torch.bfloat16,
).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path)
# NOTE(review): `config` and `image_processor` are loaded here but are not
# referenced again in the visible part of this script.
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
image_processor = AutoImageProcessor.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

# NOTE(review): these generation settings are defined but never passed to any
# model.generate() call below; each test supplies its own max_new_tokens.
generation_config = {
    "max_new_tokens": 1024,
    "do_sample": False,
    "eos_token_id": tokenizer.eos_token_id,
}

# Sample images, relative to the working directory, used by the image tests.
img_lst = [
    "images/example1a.jpeg",
    "images/example1b.jpeg",
    "images/table.png",
    "images/tech.png",
]

# ---------------------------------------------------------------------------
# Text-only test: run a prompt with no image through the tokenizer and model.
# ---------------------------------------------------------------------------
print("=" * 50, "Text-only test", "=" * 50, sep="\n")

messages = [
    {"role": "system", "content": "/no_think"},
    {"role": "user", "content": [{"type": "text", "text": "Write a short haiku about the moon."}]},
]
# Render the conversation to a prompt string ending with the generation prompt.
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
text_inputs = tokenizer([prompt], return_tensors="pt").to(device)
text_outputs = model.generate(
    input_ids=text_inputs.input_ids,
    attention_mask=text_inputs.attention_mask,
    max_new_tokens=64,
)

# Slice off the first `prompt_len` positions (the prompt) before decoding so
# that only the continuation is printed.
prompt_len = text_inputs.input_ids.shape[1]
new_tokens = text_outputs[:, prompt_len:]
decoded = tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
print(decoded[0])

# ---------------------------------------------------------------------------
# Single-image test: describe every image in `img_lst`, one image per request.
# ---------------------------------------------------------------------------
print("="*50)
print("Test single image")
print("="*50)

# The conversation (and therefore the rendered prompt) is the same for every
# image, so it is built once instead of on every loop iteration.
messages = [
    {"role": "system", "content": "/no_think"},
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "<image>\nDescribe the image.",
            },
        ],
    }
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

for img_path in img_lst:
    # Open each file exactly once and close it as soon as the processor has
    # produced its tensors (previously every image was opened twice and the
    # handles were never closed).
    with Image.open(img_path) as image:
        inputs = processor(
            text=[prompt],
            images=[image],
            return_tensors="pt",
        )
    inputs = inputs.to(device)

    # Inference: Generation of the output
    generated_ids = model.generate(
        pixel_values=inputs.pixel_values,
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=128,
    )
    # Drop the first len(in_ids) tokens of each output row so that only the
    # tokens after the prompt are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    print(f"Prompt: {prompt}\nOutput: {output_text[0]}\n\n\n")

# ---------------------------------------------------------------------------
# Multi-image test: send two images in a single request, one per "<image>"
# placeholder in the user message.
# ---------------------------------------------------------------------------
print("="*50)
print("Test multi-images")
print("="*50)
multi_img_lst = [
    "images/example1a.jpeg",
    "images/example1b.jpeg",
]
messages = [
    {"role": "system", "content": "/no_think"},
    {"role": "user", "content": "Image-1: <image>\nImage-2: <image>\nDescribe the two images in detail."},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

images = [Image.open(p) for p in multi_img_lst]
try:
    inputs = processor(
        text=[prompt],
        images=images,
        return_tensors="pt",
    )
finally:
    # The processor output is all that is needed from here on; release the
    # underlying file handles instead of leaving them open until exit.
    for img in images:
        img.close()
inputs = inputs.to(device)

generated_ids = model.generate(
    pixel_values=inputs.pixel_values,
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1024,
)
# Drop the first len(in_ids) tokens of each output row so that only the
# tokens after the prompt are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(f"Output: {output_text[0]}\n")