Add files using upload-large-folder tool

09ddc2a verified about 2 months ago

25.9 kB

	"""Inference utilities for MolmoAct2"""

	from dataclasses import dataclass
	from typing import Any, Iterable, Optional, Sequence, Tuple

	import torch
	from torch.nn import functional as F
	from transformers.cache_utils import Cache
	from transformers.configuration_utils import PretrainedConfig


	@dataclass
	class _ActionFlowInputs:
	trajectory: torch.Tensor
	context: Any
	modulations: Sequence[Any]
	action_dim_is_pad: Optional[torch.Tensor]


	@dataclass
	class _ActionFlowCudaGraph:
	key: Tuple[Any, ...]
	graph: torch.cuda.CUDAGraph
	static_inputs: _ActionFlowInputs
	output: torch.Tensor


	@dataclass
	class _DepthDecodeCudaGraphLayerStage:
	residual: torch.Tensor
	query: torch.Tensor
	key: torch.Tensor
	value: torch.Tensor


	@dataclass
	class _DepthDecodeCudaGraphPostStage:
	graph: torch.cuda.CUDAGraph
	attn_context: torch.Tensor


	@dataclass
	class _DepthDecodeCudaGraph:
	cache_key: Tuple[Any, ...]
	pre_graph: torch.cuda.CUDAGraph
	token_ids: torch.Tensor
	cos: torch.Tensor
	sin: torch.Tensor
	positions: torch.Tensor
	stages: Sequence[_DepthDecodeCudaGraphLayerStage]
	post_graphs: Sequence[_DepthDecodeCudaGraphPostStage]
	output: torch.Tensor


	@dataclass
	class _DepthDecodeCudaGraphSpec:
	eligible: bool
	cache_key_prefix: Tuple[Any, ...]
	num_hidden_layers: int
	head_dim: int
	num_attention_heads: int


	def _cache_seq_len_int(past_key_values: Optional[Cache]) -> int:
	if past_key_values is None:
	return 0
	seq_len = past_key_values.get_seq_length()
	if torch.is_tensor(seq_len):
	return int(seq_len.item())
	return int(seq_len)


	def _cache_max_len_int(past_key_values: Optional[Cache]) -> int:
	if past_key_values is None:
	return -1
	max_len = past_key_values.get_max_cache_shape()
	if torch.is_tensor(max_len):
	return int(max_len.item())
	return int(max_len)


	def _iter_cache_key_values(
	past_key_values: Cache,
	) -> Iterable[Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]]:
	layers = getattr(past_key_values, "layers", None)
	if layers is not None:
	for layer in layers:
	yield getattr(layer, "keys", None), getattr(layer, "values", None)
	return
	for layer in past_key_values:
	yield layer[0], layer[1]


	class _DepthDecodeStaticLayerCache:
	is_compileable = False
	is_sliding = False

	def __init__(self, max_cache_len: int) -> None:
	self.max_cache_len = int(max_cache_len)
	self.cumulative_length = 0
	self.keys: Optional[torch.Tensor] = None
	self.values: Optional[torch.Tensor] = None

	def _allocate(self, key_states: torch.Tensor, value_states: torch.Tensor) -> None:
	bsz, n_heads = key_states.shape[:2]
	self.keys = torch.empty(
	(bsz, n_heads, self.max_cache_len, key_states.shape[-1]),
	dtype=key_states.dtype,
	device=key_states.device,
	)
	self.values = torch.empty(
	(bsz, n_heads, self.max_cache_len, value_states.shape[-1]),
	dtype=value_states.dtype,
	device=value_states.device,
	)

	def update(
	self,
	key_states: torch.Tensor,
	value_states: torch.Tensor,
	*args,
	**kwargs,
	) -> Tuple[torch.Tensor, torch.Tensor]:
	if self.keys is None:
	self._allocate(key_states, value_states)
	start = self.cumulative_length
	end = start + key_states.shape[-2]
	if end > self.max_cache_len:
	raise RuntimeError(
	f"KV cache length {end} exceeds max_cache_len={self.max_cache_len}."
	)
	self.keys[:, :, start:end, :].copy_(key_states)
	self.values[:, :, start:end, :].copy_(value_states)
	self.cumulative_length = end
	return self.keys[:, :, :end, :], self.values[:, :, :end, :]

	def get_seq_length(self) -> int:
	return self.cumulative_length

	def get_max_cache_shape(self) -> int:
	return -1

	def reset(self) -> None:
	self.cumulative_length = 0


	class _DepthDecodeStaticCache(Cache):
	def __init__(self, config: PretrainedConfig, max_cache_len: int) -> None:
	text_config = config.get_text_config(decoder=True)
	super().__init__(
	layers=[
	_DepthDecodeStaticLayerCache(max_cache_len=max_cache_len)
	for _ in range(text_config.num_hidden_layers)
	]
	)

	def get_seq_length(self, layer_idx: int = 0) -> int:
	return self.layers[layer_idx].get_seq_length()

	def get_max_cache_shape(self, layer_idx: int = 0) -> int:
	return self.layers[layer_idx].get_max_cache_shape()

	def reset(self) -> None:
	for layer in self.layers:
	layer.reset()


	class ActionCudaGraphManager:
	def __init__(self, model: Any) -> None:
	self.model = model
	self.enabled = True
	self.action_flow_graph: Optional[_ActionFlowCudaGraph] = None

	def set_enabled(self, enabled: bool) -> None:
	self.enabled = bool(enabled)

	def can_use_action_flow(self, inputs: _ActionFlowInputs) -> bool:
	action_model = self.model
	if not self.enabled:
	return False
	if action_model.training or action_model._require_action_expert().training:
	return False
	if inputs.trajectory.device.type != "cuda":
	return False

	def all_on_cuda():
	yield inputs.trajectory
	for k, v in inputs.context.kv_contexts:
	yield k
	yield v
	for t in (
	inputs.context.cross_mask,
	inputs.context.self_mask,
	inputs.context.valid_action,
	inputs.action_dim_is_pad,
	):
	if t is not None:
	yield t
	if inputs.context.rope_cache is not None:
	yield from inputs.context.rope_cache
	for step in inputs.modulations:
	yield step.conditioning
	for block_modulation in step.block_modulations:
	yield from block_modulation
	yield from step.final_modulation

	return all(t.device.type == "cuda" for t in all_on_cuda())

	def run_action_flow(
	self,
	inputs: _ActionFlowInputs,
	steps: int,
	run_loop,
	) -> torch.Tensor:
	key = _cuda_graph_key(inputs, steps)
	cache = self.action_flow_graph
	if cache is None or cache.key != key:
	static_inputs = _clone_static_inputs(inputs)
	graph, output = _capture_cuda_graph(
	lambda: run_loop(static_inputs, steps),
	inputs.trajectory.device,
	after_warmup=lambda: static_inputs.trajectory.copy_(inputs.trajectory),
	)
	cache = _ActionFlowCudaGraph(
	key=key,
	graph=graph,
	static_inputs=static_inputs,
	output=output,
	)
	self.action_flow_graph = cache
	else:
	_copy_inputs_(cache.static_inputs, inputs)

	cache.graph.replay()
	return cache.output.clone()


	class DepthDecodeCudaGraphManager:
	def __init__(self, model: Any) -> None:
	self.model = model
	self.backbone = model.model
	self.enabled = True
	self.graph: Optional[_DepthDecodeCudaGraph] = None
	self.graph_spec: Optional[_DepthDecodeCudaGraphSpec] = None

	def set_enabled(self, enabled: bool) -> None:
	self.enabled = bool(enabled)

	def make_static_cache(self, max_cache_len: int) -> _DepthDecodeStaticCache:
	return _DepthDecodeStaticCache(
	config=self.model.config.text_config,
	max_cache_len=max_cache_len,
	)

	def _depth_decode_spec(self) -> _DepthDecodeCudaGraphSpec:
	static = self.graph_spec
	if static is None:
	cfg = self.backbone.transformer.config
	rotary_emb = getattr(self.backbone.transformer, "rotary_emb", None)
	static = _DepthDecodeCudaGraphSpec(
	eligible=(
	not cfg.norm_after
	and cfg.rope_scaling_layers is None
	and getattr(rotary_emb, "rope_type", None) == "default"
	and cfg._attn_implementation == "sdpa"
	),
	cache_key_prefix=(
	cfg.hidden_size,
	cfg.num_attention_heads,
	cfg.num_key_value_heads,
	cfg.head_dim,
	cfg.num_hidden_layers,
	cfg.use_qk_norm,
	cfg.qk_norm_type,
	cfg._attn_implementation,
	),
	num_hidden_layers=cfg.num_hidden_layers,
	head_dim=cfg.head_dim,
	num_attention_heads=cfg.num_attention_heads,
	)
	self.graph_spec = static
	return static

	def can_use(
	self,
	next_input_ids: torch.Tensor,
	*,
	past_key_values: Cache,
	attention_bias: torch.Tensor,
	) -> bool:
	if (
	not self.enabled
	or self.model.training
	or self.backbone.transformer.training
	):
	return False
	if next_input_ids.device.type != "cuda":
	return False
	if (
	next_input_ids.ndim != 2
	or next_input_ids.shape[0] != 1
	or next_input_ids.shape[1] != 1
	):
	return False
	if not isinstance(past_key_values, _DepthDecodeStaticCache):
	return False
	if (
	not torch.is_tensor(attention_bias)
	or attention_bias.device != next_input_ids.device
	):
	return False
	return self._depth_decode_spec().eligible

	def _depth_decode_key(
	self,
	next_input_ids: torch.Tensor,
	attention_bias: torch.Tensor,
	) -> Tuple[Any, ...]:
	device = next_input_ids.device
	return (
	self._depth_decode_spec().cache_key_prefix,
	device.type,
	device.index,
	self.model.lm_head.weight.dtype,
	attention_bias.shape[-1],
	)

	def _select_depth_decode_rope(
	self, cos: torch.Tensor, sin: torch.Tensor, *, past_length: int
	) -> None:
	emb = self.backbone.transformer.rotary_emb
	cos.copy_(emb._pos_cos_cache[0, :, past_length : past_length + 1, :])
	sin.copy_(emb._pos_sin_cache[0, :, past_length : past_length + 1, :])

	def _depth_decode_pre_layer(
	self,
	layer_idx: int,
	hidden_states: torch.Tensor,
	cos: torch.Tensor,
	sin: torch.Tensor,
	) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
	block = self.backbone.transformer.blocks[layer_idx]
	attention = block.self_attn
	residual = hidden_states
	hidden_states = block.attn_norm(hidden_states)

	input_shape = hidden_states.shape[:-1]
	hidden_shape = (*input_shape, -1, attention.head_dim)
	qkv = attention.att_proj(hidden_states)
	query_states, key_states, value_states = qkv.split(attention.fused_dims, dim=-1)
	value_states = value_states.view(hidden_shape)

	apply_qk_norm = attention.q_norm is not None and attention.k_norm is not None
	norm_after_view = apply_qk_norm and attention.qk_norm_type == "qwen3"

	if apply_qk_norm and not norm_after_view:
	query_states = attention.q_norm(query_states)
	key_states = attention.k_norm(key_states)

	query_states = query_states.view(hidden_shape)
	key_states = key_states.view(hidden_shape)

	if norm_after_view:
	query_states = attention.q_norm(query_states)
	key_states = attention.k_norm(key_states)

	query_states = query_states.transpose(1, 2)
	key_states = key_states.transpose(1, 2)
	value_states = value_states.transpose(1, 2)
	query_states, key_states = _apply_rotary_pos_emb(
	query_states, key_states, cos, sin
	)
	return residual, query_states, key_states, value_states

	def _depth_decode_pre0(
	self,
	token_ids: torch.Tensor,
	cos: torch.Tensor,
	sin: torch.Tensor,
	) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
	inputs_embeds = self.model._embed_base_tokens(token_ids)
	return self._depth_decode_pre_layer(0, inputs_embeds, cos, sin)

	def _depth_decode_post_layer(
	self,
	layer_idx: int,
	residual: torch.Tensor,
	attn_context: torch.Tensor,
	) -> torch.Tensor:
	block = self.backbone.transformer.blocks[layer_idx]
	attention = block.self_attn
	input_shape = residual.shape[:-1]
	attn_output = attn_context.reshape(*input_shape, -1).contiguous()
	attn_output = attention.attn_out(attn_output)
	hidden_states = residual + block.dropout(attn_output)

	residual = hidden_states
	hidden_states = block.ff_norm(hidden_states)
	hidden_states = block.mlp(hidden_states)
	hidden_states = residual + block.dropout(hidden_states)
	return hidden_states

	def _depth_decode_post_and_pre_next(
	self,
	layer_idx: int,
	residual: torch.Tensor,
	attn_context: torch.Tensor,
	cos: torch.Tensor,
	sin: torch.Tensor,
	) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
	hidden_states = self._depth_decode_post_layer(layer_idx, residual, attn_context)
	return self._depth_decode_pre_layer(layer_idx + 1, hidden_states, cos, sin)

	def _depth_decode_last_post(
	self,
	layer_idx: int,
	residual: torch.Tensor,
	attn_context: torch.Tensor,
	) -> torch.Tensor:
	hidden_states = self._depth_decode_post_layer(layer_idx, residual, attn_context)
	return self.backbone.transformer.ln_f(hidden_states)

	def _build_depth_decode_graph(
	self,
	next_input_ids: torch.Tensor,
	*,
	past_length: int,
	attention_bias: torch.Tensor,
	) -> _DepthDecodeCudaGraph:
	text_config = self.backbone.transformer.config
	device = next_input_ids.device
	dtype = self.model.lm_head.weight.dtype
	static = self._depth_decode_spec()
	num_layers = static.num_hidden_layers
	head_dim = static.head_dim
	max_cache_len = int(attention_bias.shape[-1])
	max_rope_len = max(int(text_config.max_position_embeddings or 0), max_cache_len)
	self.backbone.transformer.prepare_rope_cache(
	device=device, max_seq_len=max_rope_len
	)

	token_ids = torch.empty((1, 1), device=device, dtype=torch.long)
	cos = torch.empty((1, 1, head_dim), device=device, dtype=dtype)
	sin = torch.empty_like(cos)
	positions = torch.arange(max_cache_len, device=device, dtype=torch.long)
	context_shape = (1, 1, static.num_attention_heads, head_dim)

	token_ids.copy_(next_input_ids)
	self._select_depth_decode_rope(cos, sin, past_length=past_length)

	pre_graph, pre_output = _capture_cuda_graph(
	lambda: self._depth_decode_pre0(token_ids, cos, sin),
	device,
	)
	stages = [_DepthDecodeCudaGraphLayerStage(*pre_output)]
	post_graphs = []
	for layer_idx in range(num_layers - 1):
	stage = stages[-1]
	attn_context = torch.empty(context_shape, device=device, dtype=dtype)
	graph, output = _capture_cuda_graph(
	lambda layer_idx=layer_idx, stage=stage, attn_context=attn_context: (
	self._depth_decode_post_and_pre_next(
	layer_idx,
	stage.residual,
	attn_context,
	cos,
	sin,
	)
	),
	device,
	)
	post_graphs.append(
	_DepthDecodeCudaGraphPostStage(graph=graph, attn_context=attn_context)
	)
	stages.append(_DepthDecodeCudaGraphLayerStage(*output))

	last_stage = stages[-1]
	last_attn_context = torch.empty(context_shape, device=device, dtype=dtype)
	last_graph, last_output = _capture_cuda_graph(
	lambda: self._depth_decode_last_post(
	num_layers - 1,
	last_stage.residual,
	last_attn_context,
	),
	device,
	)
	post_graphs.append(
	_DepthDecodeCudaGraphPostStage(
	graph=last_graph, attn_context=last_attn_context
	)
	)
	return _DepthDecodeCudaGraph(
	cache_key=self._depth_decode_key(next_input_ids, attention_bias),
	pre_graph=pre_graph,
	token_ids=token_ids,
	cos=cos,
	sin=sin,
	positions=positions,
	stages=tuple(stages),
	post_graphs=tuple(post_graphs),
	output=last_output,
	)

	def _get_depth_decode_graph(
	self,
	next_input_ids: torch.Tensor,
	*,
	past_length: int,
	attention_bias: torch.Tensor,
	) -> _DepthDecodeCudaGraph:
	key = self._depth_decode_key(next_input_ids, attention_bias)
	decode_graph = self.graph
	if decode_graph is None or decode_graph.cache_key != key:
	decode_graph = self._build_depth_decode_graph(
	next_input_ids,
	past_length=past_length,
	attention_bias=attention_bias,
	)
	self.graph = decode_graph
	else:
	decode_graph.token_ids.copy_(next_input_ids)
	self._select_depth_decode_rope(
	decode_graph.cos, decode_graph.sin, past_length=past_length
	)
	return decode_graph

	def _run_depth_decode_attention_core(
	self,
	layer_idx: int,
	stage: _DepthDecodeCudaGraphLayerStage,
	*,
	past_key_values: Cache,
	attention_bias: torch.Tensor,
	cache_position: torch.Tensor,
	cos: torch.Tensor,
	sin: torch.Tensor,
	) -> torch.Tensor:
	attention = self.backbone.transformer.blocks[layer_idx].self_attn
	cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
	key_states, value_states = past_key_values.update(
	stage.key,
	stage.value,
	layer_idx,
	cache_kwargs,
	)
	key_states = _repeat_kv(key_states, attention.num_key_value_groups)
	value_states = _repeat_kv(value_states, attention.num_key_value_groups)
	attn_output = F.scaled_dot_product_attention(
	stage.query,
	key_states,
	value_states,
	attn_mask=attention_bias,
	dropout_p=0.0,
	is_causal=False,
	)
	return attn_output.transpose(1, 2)

	def run(
	self,
	next_input_ids: torch.Tensor,
	*,
	past_key_values: Cache,
	attention_bias: torch.Tensor,
	past_length: int,
	) -> Tuple[torch.Tensor, Cache]:
	end = past_length + 1
	decode_graph = self._get_depth_decode_graph(
	next_input_ids,
	past_length=past_length,
	attention_bias=attention_bias,
	)
	cache_position = decode_graph.positions[past_length:end]
	attention_bias_q = attention_bias[:, :, past_length:end, :end]

	decode_graph.pre_graph.replay()

	for layer_idx, post_graph in enumerate(decode_graph.post_graphs):
	attn_context = self._run_depth_decode_attention_core(
	layer_idx,
	decode_graph.stages[layer_idx],
	past_key_values=past_key_values,
	attention_bias=attention_bias_q,
	cache_position=cache_position,
	cos=decode_graph.cos,
	sin=decode_graph.sin,
	)
	post_graph.attn_context.copy_(attn_context)
	post_graph.graph.replay()

	return decode_graph.output, past_key_values


	def _cuda_graph_tensor_signature(
	tensor: Optional[torch.Tensor],
	) -> Optional[Tuple[Any, ...]]:
	if tensor is None:
	return None
	return (
	tuple(tensor.shape),
	tuple(tensor.stride()),
	str(tensor.dtype),
	str(tensor.device),
	)


	def _cuda_graph_context_signature(context: Any) -> Tuple[Any, ...]:
	sig = _cuda_graph_tensor_signature
	return (
	tuple((sig(k), sig(v)) for k, v in context.kv_contexts),
	sig(context.cross_mask),
	sig(context.self_mask),
	sig(context.valid_action),
	None
	if context.rope_cache is None
	else tuple(sig(t) for t in context.rope_cache),
	)


	def _cuda_graph_modulation_signature(modulations: Sequence[Any]) -> Tuple[Any, ...]:
	sig = _cuda_graph_tensor_signature
	return tuple(
	(
	sig(step.conditioning),
	tuple(
	tuple(sig(t) for t in block_modulation)
	for block_modulation in step.block_modulations
	),
	tuple(sig(t) for t in step.final_modulation),
	)
	for step in modulations
	)


	def _cuda_graph_key(inputs: _ActionFlowInputs, steps: int) -> Tuple[Any, ...]:
	sig = _cuda_graph_tensor_signature
	return (
	sig(inputs.trajectory),
	_cuda_graph_context_signature(inputs.context),
	_cuda_graph_modulation_signature(inputs.modulations),
	sig(inputs.action_dim_is_pad),
	int(steps),
	)


	def _clone_static_tensor(tensor: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
	if tensor is None:
	return None
	static = torch.empty_strided(
	tuple(tensor.shape),
	tuple(tensor.stride()),
	device=tensor.device,
	dtype=tensor.dtype,
	)
	static.copy_(tensor)
	return static


	def _clone_static_context(context: Any) -> Any:
	rope_cache = None
	if context.rope_cache is not None:
	rope_cache = tuple(_clone_static_tensor(t) for t in context.rope_cache)
	return context.__class__(
	kv_contexts=tuple(
	(_clone_static_tensor(k), _clone_static_tensor(v))
	for k, v in context.kv_contexts
	),
	cross_mask=_clone_static_tensor(context.cross_mask),
	self_mask=_clone_static_tensor(context.self_mask),
	valid_action=_clone_static_tensor(context.valid_action),
	rope_cache=rope_cache,
	)


	def _clone_static_modulations(modulations: Sequence[Any]) -> Sequence[Any]:
	return tuple(
	step.__class__(
	conditioning=_clone_static_tensor(step.conditioning),
	block_modulations=tuple(
	tuple(_clone_static_tensor(t) for t in block_modulation)
	for block_modulation in step.block_modulations
	),
	final_modulation=tuple(
	_clone_static_tensor(t) for t in step.final_modulation
	),
	)
	for step in modulations
	)


	def _clone_static_inputs(inputs: _ActionFlowInputs) -> _ActionFlowInputs:
	return _ActionFlowInputs(
	trajectory=_clone_static_tensor(inputs.trajectory),
	context=_clone_static_context(inputs.context),
	modulations=_clone_static_modulations(inputs.modulations),
	action_dim_is_pad=_clone_static_tensor(inputs.action_dim_is_pad),
	)


	def _copy_context_(dst: Any, src: Any) -> None:
	for (dst_k, dst_v), (src_k, src_v) in zip(dst.kv_contexts, src.kv_contexts):
	dst_k.copy_(src_k)
	dst_v.copy_(src_v)
	if src.cross_mask is not None:
	dst.cross_mask.copy_(src.cross_mask)
	if src.self_mask is not None:
	dst.self_mask.copy_(src.self_mask)
	if src.valid_action is not None:
	dst.valid_action.copy_(src.valid_action)
	if src.rope_cache is not None:
	for dst_tensor, src_tensor in zip(dst.rope_cache, src.rope_cache):
	dst_tensor.copy_(src_tensor)


	def _copy_inputs_(dst: _ActionFlowInputs, src: _ActionFlowInputs) -> None:
	dst.trajectory.copy_(src.trajectory)
	_copy_context_(dst.context, src.context)
	if src.action_dim_is_pad is not None:
	dst.action_dim_is_pad.copy_(src.action_dim_is_pad)


	def _rotate_half(x: torch.Tensor) -> torch.Tensor:
	x1 = x[..., : x.shape[-1] // 2]
	x2 = x[..., x.shape[-1] // 2 :]
	return torch.cat((-x2, x1), dim=-1)


	def _apply_rotary_pos_emb(
	q: torch.Tensor,
	k: torch.Tensor,
	cos: torch.Tensor,
	sin: torch.Tensor,
	unsqueeze_dim: int = 1,
	) -> Tuple[torch.Tensor, torch.Tensor]:
	cos = cos.unsqueeze(unsqueeze_dim)
	sin = sin.unsqueeze(unsqueeze_dim)
	q_embed = (q * cos) + (_rotate_half(q) * sin)
	k_embed = (k * cos) + (_rotate_half(k) * sin)
	return q_embed, k_embed


	def _repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
	batch, num_key_value_heads, slen, head_dim = hidden_states.shape
	if n_rep == 1:
	return hidden_states
	hidden_states = hidden_states[:, :, None, :, :].expand(
	batch, num_key_value_heads, n_rep, slen, head_dim
	)
	return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


	def _capture_cuda_graph(
	fn,
	device: torch.device,
	*,
	after_warmup=None,
	) -> Tuple[torch.cuda.CUDAGraph, Any]:
	warmup_stream = torch.cuda.Stream(device=device)
	warmup_stream.wait_stream(torch.cuda.current_stream(device))
	with torch.cuda.stream(warmup_stream):
	fn()
	torch.cuda.current_stream(device).wait_stream(warmup_stream)
	if after_warmup is not None:
	after_warmup()

	graph = torch.cuda.CUDAGraph()
	with torch.cuda.graph(graph):
	output = fn()
	return graph, output