import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoProcessor
from PIL import Image

import video_io

# ---------------------------------------------------------------------------
# Model setup
# ---------------------------------------------------------------------------
# The checkpoint is read from the current working directory and placed on a
# single GPU.
model_path = "."
device = "cuda:0"

# NOTE: trust_remote_code=True lets transformers execute Python code shipped
# with the checkpoint; only point model_path at a checkpoint you trust.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
    device_map=device,
    dtype=torch.bfloat16,
)
# Switch to inference mode; the returned module is rebound to `model`.
model = model.eval()

tokenizer = AutoTokenizer.from_pretrained(model_path)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

# Greedy decoding (sampling disabled), capped at 1024 new tokens, stopping on
# the tokenizer's EOS token id.
generation_config = {
    "max_new_tokens": 1024,
    "do_sample": False,
    "eos_token_id": tokenizer.eos_token_id,
}

# ---------------------------------------------------------------------------
# Video input
# ---------------------------------------------------------------------------
video_path = "images/demo.mp4"
# Sampling parameters forwarded to video_io below. Their exact semantics
# (e.g. what nframe_max=-1 means) are defined by that helper -- presumably
# -1 disables the cap; TODO confirm against video_io.
video_fps = 1
video_nframe = 8
video_nframe_max = -1

# Ask video_io for the sampled frames (as image URLs) together with the
# metadata it reports. fps and nframe are coerced to int and clamped so that
# they are never negative; nframe_max is only coerced to int.
image_urls, metadata = video_io.maybe_path_or_url_to_data_urls(
    video_path,
    fps=max(0, int(video_fps)),
    nframe=max(0, int(video_nframe)),
    nframe_max=int(video_nframe_max),
)

# Convert every returned URL into an image object via video_io, keeping the
# order in which the URLs were returned.
frames = list(map(video_io.pil_image_from_base64, image_urls))
print(f"Metadata: {metadata}")

# Build prompt with tokens per frame (since