modeling_llama4.py 58 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391
  1. # coding=utf-8
  2. # Copyright 2025 The LLAMA4 and HuggingFace Inc. team. All rights reserved.
  3. #
  4. #
  5. # Licensed under the Apache License, Version 2.0 (the "License");
  6. # you may not use this file except in compliance with the License.
  7. # You may obtain a copy of the License at
  8. #
  9. # http://www.apache.org/licenses/LICENSE-2.0
  10. #
  11. # Unless required by applicable law or agreed to in writing, software
  12. # distributed under the License is distributed on an "AS IS" BASIS,
  13. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. # See the License for the specific language governing permissions and
  15. # limitations under the License.
  16. import math
  17. from dataclasses import dataclass
  18. from typing import Callable, Optional, Union
  19. import torch
  20. import torch.nn as nn
  21. import torch.nn.functional as F
  22. from transformers.models.llama4.configuration_llama4 import Llama4VisionConfig
  23. from ...activations import ACT2FN
  24. from ...cache_utils import Cache, DynamicCache
  25. from ...generation import GenerationMixin
  26. from ...integrations import use_kernel_forward_from_hub
  27. from ...masking_utils import create_causal_mask, create_chunked_causal_mask
  28. from ...modeling_flash_attention_utils import FlashAttentionKwargs
  29. from ...modeling_layers import GradientCheckpointingLayer
  30. from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPast, CausalLMOutputWithPast, ModelOutput
  31. from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
  32. from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
  33. from ...processing_utils import Unpack
  34. from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
  35. from ...utils.deprecation import deprecate_kwarg
  36. from ...utils.generic import check_model_inputs
  37. from .configuration_llama4 import Llama4Config, Llama4TextConfig
  38. logger = logging.get_logger(__name__)
  39. class Llama4TextExperts(nn.Module):
  40. def __init__(self, config: Llama4TextConfig):
  41. super().__init__()
  42. self.num_experts = config.num_local_experts
  43. self.intermediate_size = config.intermediate_size
  44. self.hidden_size = config.hidden_size
  45. self.expert_dim = self.intermediate_size
  46. self.gate_up_proj = nn.Parameter(torch.empty(self.num_experts, self.hidden_size, 2 * self.expert_dim))
  47. self.down_proj = nn.Parameter(torch.empty((self.num_experts, self.expert_dim, self.hidden_size)))
  48. self.act_fn = ACT2FN[config.hidden_act]
  49. def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
  50. """
  51. This should really not be run on a single machine, as we are reaching compute bound:
  52. - the inputs are expected to be "sorted" per expert already.
  53. - the weights are viewed with another dim, to match num_expert, 1, shape * num_tokens, shape
  54. Args:
  55. hidden_states (torch.Tensor): (batch_size * token_num, hidden_size)
  56. selected_experts (torch.Tensor): (batch_size * token_num, top_k)
  57. routing_weights (torch.Tensor): (batch_size * token_num, top_k)
  58. Returns:
  59. torch.Tensor
  60. """
  61. hidden_states = hidden_states.view(self.gate_up_proj.shape[0], -1, self.hidden_size)
  62. gate_up = torch.bmm(hidden_states, self.gate_up_proj)
  63. gate, up = gate_up.chunk(2, dim=-1) # not supported for DTensors
  64. next_states = torch.bmm((up * self.act_fn(gate)), self.down_proj)
  65. next_states = next_states.view(-1, self.hidden_size)
  66. return next_states
  67. # Phi3MLP
  68. class Llama4TextMLP(nn.Module):
  69. def __init__(self, config, intermediate_size=None):
  70. super().__init__()
  71. if intermediate_size is None:
  72. intermediate_size = config.intermediate_size
  73. self.config = config
  74. self.gate_proj = nn.Linear(config.hidden_size, intermediate_size, bias=False)
  75. self.up_proj = nn.Linear(config.hidden_size, intermediate_size, bias=False)
  76. self.down_proj = nn.Linear(intermediate_size, config.hidden_size, bias=False)
  77. self.activation_fn = ACT2FN[config.hidden_act]
  78. def forward(self, x):
  79. down_proj = self.activation_fn(self.gate_proj(x)) * self.up_proj(x)
  80. return self.down_proj(down_proj)
  81. class Llama4TextL2Norm(torch.nn.Module):
  82. def __init__(self, eps: float = 1e-6):
  83. super().__init__()
  84. self.eps = eps
  85. def _norm(self, x):
  86. return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
  87. def forward(self, x):
  88. return self._norm(x.float()).type_as(x)
  89. def extra_repr(self):
  90. return f"eps={self.eps}"
  91. class Llama4TextRMSNorm(nn.Module):
  92. def __init__(self, hidden_size, eps=1e-5):
  93. """
  94. Llama4RMSNorm is equivalent to T5LayerNorm
  95. """
  96. super().__init__()
  97. self.eps = eps
  98. self.weight = nn.Parameter(torch.ones(hidden_size))
  99. def _norm(self, x):
  100. return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
  101. def forward(self, x):
  102. output = self._norm(x.float()).type_as(x)
  103. return output * self.weight
  104. def extra_repr(self):
  105. return f"{tuple(self.weight.shape)}, eps={self.eps}"
  106. class Llama4Router(nn.Linear):
  107. def __init__(self, config):
  108. super().__init__(config.hidden_size, config.num_local_experts, bias=False)
  109. self.num_experts = config.num_local_experts
  110. self.top_k = config.num_experts_per_tok
  111. def forward(self, hidden_states):
  112. router_logits = super().forward(hidden_states)
  113. router_top_value, router_indices = torch.topk(router_logits, self.top_k, dim=1)
  114. router_scores = torch.full_like(router_logits, float("-inf")).scatter_(1, router_indices, router_top_value)
  115. router_scores = torch.nn.functional.sigmoid(router_scores.float()).to(router_scores.dtype)
  116. return router_scores, router_logits
  117. @use_kernel_forward_from_hub("Llama4TextMoe")
  118. class Llama4TextMoe(nn.Module):
  119. def __init__(self, config):
  120. super().__init__()
  121. self.top_k = config.num_experts_per_tok
  122. self.hidden_dim = config.hidden_size
  123. self.num_experts = config.num_local_experts
  124. self.experts = Llama4TextExperts(config)
  125. self.router = Llama4Router(config)
  126. self.shared_expert = Llama4TextMLP(config)
  127. def forward(self, hidden_states):
  128. hidden_states = hidden_states.reshape(-1, self.hidden_dim)
  129. router_scores, router_logits = self.router(hidden_states)
  130. routed_in = hidden_states.repeat(router_scores.shape[1], 1)
  131. routed_in = routed_in * router_scores.transpose(0, 1).reshape(-1, 1)
  132. routed_out = self.experts(routed_in)
  133. out = self.shared_expert(hidden_states)
  134. out.add_(routed_out.reshape(router_scores.shape[1], -1, routed_out.shape[-1]).sum(dim=0))
  135. return out, router_logits
  136. class Llama4TextRotaryEmbedding(nn.Module):
  137. inv_freq: torch.Tensor # fix linting for `register_buffer`
  138. def __init__(self, config: Llama4TextConfig, device=None):
  139. super().__init__()
  140. # BC: "rope_type" was originally "type"
  141. self.rope_type = "llama3" if config.rope_scaling is not None else "default"
  142. self.max_seq_len_cached = config.max_position_embeddings
  143. self.original_max_seq_len = config.max_position_embeddings
  144. self.config = config
  145. self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
  146. inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
  147. self.register_buffer("inv_freq", inv_freq, persistent=False)
  148. self.original_inv_freq = self.inv_freq
  149. @torch.no_grad()
  150. @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope)
  151. def forward(self, x, position_ids):
  152. inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
  153. position_ids_expanded = position_ids[:, None, :].float()
  154. device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
  155. with torch.autocast(device_type=device_type, enabled=False): # Force float32
  156. freqs = (inv_freq_expanded.to(x.device) @ position_ids_expanded).transpose(1, 2)
  157. freqs_cis = torch.polar(torch.ones_like(freqs), freqs) # Convert to complex representation
  158. freqs_cis = freqs_cis * self.attention_scaling
  159. return freqs_cis
  160. def apply_rotary_emb(
  161. xq: torch.Tensor,
  162. xk: torch.Tensor,
  163. freqs_cis: torch.Tensor,
  164. ) -> tuple[torch.Tensor, torch.Tensor]:
  165. xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
  166. xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
  167. xq_out = torch.view_as_real(xq_ * freqs_cis[:, :, None, :]).flatten(3)
  168. xk_out = torch.view_as_real(xk_ * freqs_cis[:, :, None, :]).flatten(3)
  169. return xq_out.type_as(xq), xk_out.type_as(xk)
  170. def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
  171. """
  172. This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
  173. num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
  174. """
  175. batch, num_key_value_heads, slen, head_dim = hidden_states.shape
  176. if n_rep == 1:
  177. return hidden_states
  178. hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
  179. return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
  180. # Adapted from transformers.models.llama.modeling_llama.eager_attention_forward -> llama4 doesn't cast attn weights to fp32
  181. def eager_attention_forward(
  182. module: nn.Module,
  183. query: torch.Tensor,
  184. key: torch.Tensor,
  185. value: torch.Tensor,
  186. attention_mask: Optional[torch.Tensor],
  187. scaling: float,
  188. dropout: float = 0.0,
  189. **kwargs,
  190. ):
  191. key_states = repeat_kv(key, module.num_key_value_groups)
  192. value_states = repeat_kv(value, module.num_key_value_groups)
  193. attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
  194. if attention_mask is not None:
  195. causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
  196. attn_weights = attn_weights + causal_mask
  197. attn_weights = nn.functional.softmax(attn_weights, dim=-1)
  198. attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
  199. attn_output = torch.matmul(attn_weights, value_states)
  200. attn_output = attn_output.transpose(1, 2).contiguous()
  201. return attn_output, attn_weights
  202. # Adapted from transformers.models.llama.modeling_llama.eager_attention_forward -> llama4 doesn't cast attn weights to fp32
  203. def vision_eager_attention_forward(
  204. module: nn.Module,
  205. query: torch.Tensor,
  206. key: torch.Tensor,
  207. value: torch.Tensor,
  208. attention_mask: Optional[torch.Tensor],
  209. scaling: float,
  210. dropout: float = 0.0,
  211. **kwargs,
  212. ):
  213. key_states = repeat_kv(key, module.num_key_value_groups)
  214. value_states = repeat_kv(value, module.num_key_value_groups)
  215. attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * module.head_dim**-0.5
  216. if attention_mask is not None:
  217. causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
  218. attn_weights = attn_weights + causal_mask
  219. attn_weights = nn.functional.softmax(attn_weights, dim=-1)
  220. attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
  221. attn_output = torch.matmul(attn_weights, value_states)
  222. attn_output = attn_output.transpose(1, 2).contiguous()
  223. return attn_output, attn_weights
  224. class Llama4TextAttention(nn.Module):
  225. """Multi-headed attention from 'Attention Is All You Need' paper"""
  226. def __init__(self, config: Llama4TextConfig, layer_idx):
  227. super().__init__()
  228. self.config = config
  229. self.layer_idx = layer_idx
  230. self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
  231. self.num_attention_heads = config.num_attention_heads
  232. self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
  233. self.num_key_value_heads = config.num_key_value_heads
  234. self.scaling = self.head_dim**-0.5
  235. self.attn_scale = config.attn_scale
  236. self.floor_scale = config.floor_scale
  237. self.attn_temperature_tuning = config.attn_temperature_tuning
  238. self.attention_dropout = config.attention_dropout
  239. self.is_causal = True
  240. self.use_rope = config.no_rope_layers[layer_idx]
  241. self.q_proj = nn.Linear(
  242. config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
  243. )
  244. self.k_proj = nn.Linear(
  245. config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
  246. )
  247. self.v_proj = nn.Linear(
  248. config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
  249. )
  250. self.o_proj = nn.Linear(
  251. config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
  252. )
  253. if self.config.use_qk_norm and self.use_rope:
  254. self.qk_norm = Llama4TextL2Norm(config.rms_norm_eps)
  255. @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
  256. def forward(
  257. self,
  258. hidden_states: torch.Tensor,
  259. position_embeddings: tuple[torch.Tensor, torch.Tensor],
  260. attention_mask: Optional[torch.Tensor],
  261. past_key_values: Optional[Cache] = None,
  262. cache_position: Optional[torch.LongTensor] = None,
  263. **kwargs: Unpack[FlashAttentionKwargs],
  264. ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
  265. input_shape = hidden_states.shape[:-1]
  266. hidden_shape = (*input_shape, -1, self.head_dim)
  267. query_states = self.q_proj(hidden_states).view(hidden_shape)
  268. key_states = self.k_proj(hidden_states).view(*input_shape, -1, self.head_dim)
  269. value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
  270. if self.use_rope: # the 16E model skips rope for long context on certain layers
  271. query_states, key_states = apply_rotary_emb(
  272. query_states, key_states, position_embeddings.to(query_states.device)
  273. )
  274. if hasattr(self, "qk_norm"): # the 128E model does not use qk_norm
  275. query_states = self.qk_norm(query_states)
  276. key_states = self.qk_norm(key_states)
  277. # Use temperature tuning from https://huggingface.co/papers/2501.19399) to NoROPE layers
  278. if self.attn_temperature_tuning and not self.use_rope:
  279. attn_scales = (
  280. torch.log1p(torch.floor((cache_position.float() + 1.0) / self.floor_scale)) * self.attn_scale + 1.0
  281. )
  282. attn_scales = attn_scales.view((1, input_shape[-1], 1, 1)).expand((*input_shape, 1, 1)) # batch size > 1
  283. query_states = (query_states * attn_scales).to(query_states.dtype)
  284. query_states = query_states.transpose(1, 2)
  285. key_states = key_states.transpose(1, 2)
  286. if past_key_values is not None:
  287. # sin and cos are specific to RoPE models; cache_position needed for the static cache
  288. cache_kwargs = {"cache_position": cache_position}
  289. key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
  290. attention_interface: Callable = eager_attention_forward
  291. if self.config._attn_implementation != "eager":
  292. attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
  293. attn_output, attn_weights = attention_interface(
  294. self,
  295. query_states,
  296. key_states,
  297. value_states,
  298. attention_mask,
  299. dropout=0.0 if not self.training else self.attention_dropout,
  300. scaling=self.scaling,
  301. **kwargs,
  302. )
  303. attn_output = attn_output.reshape(*input_shape, -1).contiguous()
  304. attn_output = self.o_proj(attn_output)
  305. return attn_output, attn_weights
  306. class Llama4TextDecoderLayer(GradientCheckpointingLayer):
  307. def __init__(self, config, layer_idx):
  308. super().__init__()
  309. self.hidden_size = config.hidden_size
  310. self.layer_idx = layer_idx
  311. self.attention_type = config.layer_types[layer_idx]
  312. self.self_attn = Llama4TextAttention(config, layer_idx)
  313. self.is_moe_layer = layer_idx in config.moe_layers
  314. if self.is_moe_layer: # the 128E model interleaves dense / sparse
  315. self.feed_forward = Llama4TextMoe(config)
  316. else:
  317. self.feed_forward = Llama4TextMLP(config, intermediate_size=config.intermediate_size_mlp)
  318. self.input_layernorm = Llama4TextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
  319. self.post_attention_layernorm = Llama4TextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
  320. @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
  321. def forward(
  322. self,
  323. hidden_states: torch.Tensor,
  324. attention_mask: Optional[torch.Tensor] = None,
  325. position_ids: Optional[torch.LongTensor] = None,
  326. past_key_values: Optional[Cache] = None,
  327. use_cache: Optional[bool] = False,
  328. cache_position: Optional[torch.LongTensor] = None,
  329. position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
  330. **kwargs: Unpack[FlashAttentionKwargs],
  331. ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
  332. residual = hidden_states
  333. hidden_states = self.input_layernorm(hidden_states)
  334. # Self Attention
  335. attention_states, _ = self.self_attn(
  336. hidden_states=hidden_states,
  337. position_embeddings=position_embeddings,
  338. attention_mask=attention_mask,
  339. past_key_values=past_key_values,
  340. use_cache=use_cache,
  341. cache_position=cache_position,
  342. **kwargs,
  343. )
  344. hidden_states = residual + attention_states
  345. # Fully Connected
  346. residual = hidden_states
  347. hidden_states = self.post_attention_layernorm(hidden_states)
  348. hidden_states = self.feed_forward(hidden_states)
  349. if self.is_moe_layer:
  350. hidden_states, _ = hidden_states
  351. hidden_states = residual + hidden_states.view(residual.shape)
  352. return hidden_states
  353. @auto_docstring
  354. class Llama4PreTrainedModel(PreTrainedModel):
  355. config: Llama4Config
  356. supports_gradient_checkpointing = True
  357. _skip_keys_device_placement = ["past_key_values"]
  358. _supports_flash_attn = False
  359. _supports_sdpa = True
  360. _supports_flex_attn = True
  361. _can_compile_fullgraph = True
  362. _supports_attention_backend = True
  363. def _init_weights(self, module):
  364. std = (
  365. self.config.initializer_range
  366. if hasattr(self.config, "initializer_range")
  367. else self.config.text_config.initializer_range
  368. )
  369. if isinstance(module, nn.Linear):
  370. module.weight.data.normal_(mean=0.0, std=std)
  371. if module.bias is not None:
  372. module.bias.data.zero_()
  373. elif isinstance(module, nn.Embedding):
  374. module.weight.data.normal_(mean=0.0, std=std)
  375. if module.padding_idx is not None:
  376. module.weight.data[module.padding_idx].zero_()
  377. elif isinstance(module, nn.LayerNorm):
  378. module.weight.data.fill_(1.0)
  379. module.bias.data.zero_()
  380. elif isinstance(module, Llama4TextRMSNorm):
  381. module.weight.data.fill_(1.0)
  382. elif isinstance(module, Llama4TextExperts):
  383. module.gate_up_proj.data.normal_(mean=0.0, std=std)
  384. module.down_proj.data.normal_(mean=0.0, std=std)
  385. elif isinstance(module, Llama4VisionModel):
  386. module.class_embedding.data.normal_(std=module.scale)
  387. module.positional_embedding_vlm.data.normal_(std=module.scale)
  388. @auto_docstring
  389. class Llama4TextModel(Llama4PreTrainedModel):
  390. _no_split_modules = ["Llama4TextDecoderLayer"]
  391. base_model_prefix = "model"
  392. config: Llama4TextConfig
  393. _can_record_outputs = {
  394. "attentions": Llama4TextAttention,
  395. "hidden_states": Llama4TextDecoderLayer,
  396. "router_logits": Llama4TextMoe,
  397. }
  398. def __init__(self, config: Llama4TextConfig):
  399. super().__init__(config)
  400. self.padding_idx = config.pad_token_id
  401. self.vocab_size = config.vocab_size
  402. self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
  403. self.layers = nn.ModuleList(
  404. [Llama4TextDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
  405. )
  406. self.norm = Llama4TextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
  407. self.rotary_emb = Llama4TextRotaryEmbedding(config=config)
  408. self.gradient_checkpointing = False
  409. # Initialize weights and apply final processing
  410. self.post_init()
  411. @can_return_tuple
  412. @check_model_inputs()
  413. @auto_docstring
  414. def forward(
  415. self,
  416. input_ids: Optional[torch.LongTensor] = None,
  417. attention_mask: Optional[torch.Tensor] = None,
  418. position_ids: Optional[torch.LongTensor] = None,
  419. past_key_values: Optional[Cache] = None,
  420. inputs_embeds: Optional[torch.FloatTensor] = None,
  421. use_cache: Optional[bool] = None,
  422. cache_position: Optional[torch.LongTensor] = None,
  423. **kwargs: Unpack[TransformersKwargs],
  424. ) -> Union[tuple, BaseModelOutputWithPast]:
  425. if (input_ids is None) ^ (inputs_embeds is not None):
  426. raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
  427. if inputs_embeds is None:
  428. inputs_embeds = self.embed_tokens(input_ids.to(self.embed_tokens.weight.device))
  429. if use_cache and past_key_values is None:
  430. past_key_values = DynamicCache(config=self.config)
  431. if cache_position is None:
  432. past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
  433. cache_position = torch.arange(
  434. past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
  435. )
  436. if position_ids is None:
  437. position_ids = cache_position.unsqueeze(0)
  438. # It may already have been prepared by e.g. `generate`
  439. if not isinstance(causal_mask_mapping := attention_mask, dict):
  440. # Prepare mask arguments
  441. mask_kwargs = {
  442. "config": self.config,
  443. "input_embeds": inputs_embeds,
  444. "attention_mask": attention_mask,
  445. "cache_position": cache_position,
  446. "past_key_values": past_key_values,
  447. "position_ids": position_ids,
  448. }
  449. # Create the masks
  450. causal_mask_mapping = {
  451. "full_attention": create_causal_mask(**mask_kwargs),
  452. "chunked_attention": create_chunked_causal_mask(**mask_kwargs),
  453. }
  454. hidden_states = inputs_embeds
  455. # create position embeddings to be shared across the decoder layers
  456. freq_cis = self.rotary_emb(hidden_states, position_ids)
  457. for decoder_layer in self.layers[: self.config.num_hidden_layers]:
  458. hidden_states = decoder_layer(
  459. hidden_states,
  460. attention_mask=causal_mask_mapping[decoder_layer.attention_type],
  461. position_ids=position_ids,
  462. past_key_values=past_key_values,
  463. use_cache=use_cache,
  464. cache_position=cache_position,
  465. position_embeddings=freq_cis,
  466. **kwargs,
  467. )
  468. hidden_states = self.norm(hidden_states)
  469. return BaseModelOutputWithPast(
  470. last_hidden_state=hidden_states,
  471. past_key_values=past_key_values if use_cache else None,
  472. )
  473. class Llama4ForCausalLM(Llama4PreTrainedModel, GenerationMixin):
  474. _no_split_modules = ["Llama4TextDecoderLayer"]
  475. base_model_prefix = "language_model"
  476. _tied_weights_keys = ["lm_head.weight"]
  477. _tp_plan = {"lm_head": "colwise_rep"}
  478. config: Llama4TextConfig
  479. def __init__(self, config: Llama4TextConfig):
  480. super().__init__(config)
  481. self.model = Llama4TextModel(config)
  482. self.vocab_size = config.vocab_size
  483. self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
  484. # Initialize weights and apply final processing
  485. self.post_init()
  486. @can_return_tuple
  487. @auto_docstring
  488. def forward(
  489. self,
  490. input_ids: Optional[torch.LongTensor] = None,
  491. attention_mask: Optional[torch.Tensor] = None,
  492. position_ids: Optional[torch.LongTensor] = None,
  493. past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
  494. inputs_embeds: Optional[torch.FloatTensor] = None,
  495. labels: Optional[torch.LongTensor] = None,
  496. use_cache: Optional[bool] = None,
  497. cache_position: Optional[torch.LongTensor] = None,
  498. logits_to_keep: Union[int, torch.Tensor] = 0,
  499. **kwargs: Unpack[TransformersKwargs],
  500. ) -> Union[tuple, CausalLMOutputWithPast]:
  501. r"""
  502. labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
  503. Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
  504. config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
  505. (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
  506. Example:
  507. ```python
  508. >>> from transformers import AutoTokenizer, Llama4ForCausalLM
  509. >>> model = Llama4ForCausalLM.from_pretrained("meta-llama4/Llama4-2-7b-hf")
  510. >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama4/Llama4-2-7b-hf")
  511. >>> prompt = "Hey, are you conscious? Can you talk to me?"
  512. >>> inputs = tokenizer(prompt, return_tensors="pt")
  513. >>> # Generate
  514. >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
  515. >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
  516. "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
  517. ```"""
  518. outputs = self.model(
  519. input_ids=input_ids,
  520. attention_mask=attention_mask,
  521. position_ids=position_ids,
  522. past_key_values=past_key_values,
  523. inputs_embeds=inputs_embeds,
  524. use_cache=use_cache,
  525. cache_position=cache_position,
  526. **kwargs,
  527. )
  528. hidden_states = outputs[0]
  529. # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
  530. slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
  531. logits = self.lm_head(hidden_states[:, slice_indices, :])
  532. loss = None
  533. if labels is not None:
  534. loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
  535. return CausalLMOutputWithPast(
  536. loss=loss,
  537. logits=logits,
  538. past_key_values=outputs.past_key_values,
  539. hidden_states=outputs.hidden_states,
  540. attentions=outputs.attentions,
  541. )
  542. @dataclass
  543. @auto_docstring(
  544. custom_intro="""
  545. Base class for Llava causal language model (or autoregressive) outputs.
  546. """
  547. )
  548. class Llama4CausalLMOutputWithPast(ModelOutput):
  549. r"""
  550. loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
  551. Language modeling loss (for next-token prediction).
  552. logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
  553. Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
  554. past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
  555. It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
  556. Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
  557. `past_key_values` input) to speed up sequential decoding.
  558. image_hidden_states (`torch.FloatTensor`, *optional*):
  559. A `torch.FloatTensor` of size (batch_size, num_images, sequence_length, hidden_size)`.
  560. image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
  561. """
  562. loss: Optional[torch.FloatTensor] = None
  563. logits: Optional[torch.FloatTensor] = None
  564. past_key_values: Optional[Cache] = None
  565. hidden_states: Optional[tuple[torch.FloatTensor]] = None
  566. attentions: Optional[tuple[torch.FloatTensor]] = None
  567. image_hidden_states: Optional[torch.FloatTensor] = None
  568. class Llama4VisionMLP2(torch.nn.Module):
  569. def __init__(self, config):
  570. super().__init__()
  571. self.hidden_size = config.hidden_size
  572. self.intermediate_size = config.intermediate_size
  573. self.fc1 = nn.Linear(self.intermediate_size, config.projector_input_dim, bias=False)
  574. self.fc2 = nn.Linear(config.projector_output_dim, config.projector_output_dim, bias=False)
  575. self.activation_fn = nn.GELU() # ACT2FN[config.hidden_act]
  576. self.dropout = config.projector_dropout
  577. def forward(self, hidden_states):
  578. hidden_states = self.fc1(hidden_states)
  579. hidden_states = self.activation_fn(hidden_states)
  580. hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
  581. return self.activation_fn(self.fc2(hidden_states))
  582. class Llama4MultiModalProjector(nn.Module):
  583. def __init__(self, config):
  584. super().__init__()
  585. self.linear_1 = nn.Linear(
  586. config.vision_config.vision_output_dim,
  587. config.text_config.hidden_size,
  588. bias=False,
  589. )
  590. def forward(self, image_features):
  591. hidden_states = self.linear_1(image_features)
  592. return hidden_states
  593. def pixel_shuffle(input_tensor, shuffle_ratio):
  594. # input_tensor: [batch_size, num_patches, channels]
  595. batch_size, num_patches, channels = input_tensor.shape
  596. patch_size = int(math.sqrt(num_patches))
  597. input_tensor = input_tensor.view(batch_size, patch_size, patch_size, -1)
  598. batch_size, height, width, channels = input_tensor.size()
  599. reshaped_tensor = input_tensor.view(batch_size, height, int(width * shuffle_ratio), int(channels / shuffle_ratio))
  600. reshaped_tensor = reshaped_tensor.permute(0, 2, 1, 3).contiguous()
  601. reshaped_tensor = reshaped_tensor.view(
  602. batch_size, int(height * shuffle_ratio), int(width * shuffle_ratio), int(channels / (shuffle_ratio**2))
  603. )
  604. reshaped_tensor = reshaped_tensor.permute(0, 2, 1, 3).contiguous()
  605. output_tensor = reshaped_tensor.view(batch_size, -1, reshaped_tensor.shape[-1])
  606. return output_tensor
  607. class Llama4VisionPixelShuffleMLP(nn.Module):
  608. def __init__(self, config):
  609. super().__init__()
  610. self.pixel_shuffle_ratio = config.pixel_shuffle_ratio
  611. self.inner_dim = int(config.projector_input_dim // (self.pixel_shuffle_ratio**2))
  612. self.output_dim = config.projector_output_dim
  613. self.mlp = Llama4VisionMLP2(config)
  614. def forward(self, encoded_patches: torch.Tensor) -> torch.Tensor:
  615. encoded_patches = pixel_shuffle(encoded_patches, self.pixel_shuffle_ratio)
  616. return self.mlp(encoded_patches)
# TODO: the vision encoder uses a different RoPE, defined by the helpers below.
  618. def reshape_for_broadcast(freqs_ci: torch.Tensor, query: torch.Tensor):
  619. ndim = query.ndim
  620. shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(query.shape)]
  621. return freqs_ci.view(*shape)
  622. def vision_apply_rotary_emb(
  623. query: torch.Tensor,
  624. key: torch.Tensor,
  625. freqs_ci: torch.Tensor,
  626. ) -> tuple[torch.Tensor, torch.Tensor]:
  627. query_ = torch.view_as_complex(query.float().reshape(*query.shape[:-1], -1, 2))
  628. key_ = torch.view_as_complex(key.float().reshape(*key.shape[:-1], -1, 2))
  629. freqs_ci = reshape_for_broadcast(freqs_ci=freqs_ci, query=query_) # freqs_ci[:,:,None,:]
  630. freqs_ci = freqs_ci.to(query_.device)
  631. query_out = torch.view_as_real(query_ * freqs_ci).flatten(3)
  632. key_out = torch.view_as_real(key_ * freqs_ci).flatten(3)
  633. return query_out.type_as(query), key_out.type_as(key) # but this drops to 8e-3
  634. class Llama4VisionAttention(nn.Module):
  635. def __init__(self, config: Llama4VisionConfig):
  636. super().__init__()
  637. self.config = config
  638. self.embed_dim = config.hidden_size
  639. self.num_heads = config.num_attention_heads
  640. self.head_dim = config.hidden_size // config.num_attention_heads
  641. self.num_key_value_groups = 1
  642. self.attention_dropout = config.attention_dropout
  643. self.scaling = self.head_dim**-0.5
  644. self.q_proj = nn.Linear(self.embed_dim, self.num_heads * self.head_dim, bias=True)
  645. self.k_proj = nn.Linear(self.embed_dim, self.num_heads * self.head_dim, bias=True)
  646. self.v_proj = nn.Linear(self.embed_dim, self.num_heads * self.head_dim, bias=True)
  647. self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.embed_dim, bias=True)
  648. def forward(
  649. self,
  650. hidden_states: torch.Tensor,
  651. freqs_ci: torch.Tensor,
  652. attention_mask: Optional[torch.Tensor] = None,
  653. past_key_values: Optional[Cache] = None,
  654. **kwargs: Unpack[FlashAttentionKwargs],
  655. ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
  656. input_shape = hidden_states.shape[:-1]
  657. hidden_shape = (*input_shape, -1, self.head_dim)
  658. query_states = self.q_proj(hidden_states).view(hidden_shape)
  659. key_states = self.k_proj(hidden_states).view(hidden_shape)
  660. value_states = self.v_proj(hidden_states).view(hidden_shape)
  661. query_states, key_states = vision_apply_rotary_emb(query_states, key_states, freqs_ci=freqs_ci)
  662. query_states = query_states.transpose(1, 2)
  663. key_states = key_states.transpose(1, 2)
  664. value_states = value_states.transpose(1, 2)
  665. attention_interface: Callable = vision_eager_attention_forward
  666. # flex disable because breaks on TP 8, embed is 88 not power of 2
  667. if self.config._attn_implementation not in ["eager", "flex_attention"]:
  668. attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
  669. attn_output, attn_weights = attention_interface(
  670. self,
  671. query_states,
  672. key_states,
  673. value_states,
  674. None,
  675. dropout=0.0 if not self.training else self.attention_dropout,
  676. scaling=None, # TODO Might be enforced here for TP compatibility as scaling is not just sqrt(head_dim)
  677. is_causal=False, # HAS TO BE ENFORCED
  678. **kwargs,
  679. )
  680. attn_output = attn_output.reshape(*input_shape, -1).contiguous()
  681. attn_output = self.o_proj(attn_output)
  682. return attn_output, attn_weights
  683. class Llama4VisionMLP(nn.Module):
  684. def __init__(self, config):
  685. super().__init__()
  686. self.config = config
  687. self.activation_fn = nn.GELU() # ACT2FN[config.hidden_act]
  688. self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size, bias=True)
  689. self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size, bias=True)
  690. def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
  691. hidden_states = self.fc1(hidden_states)
  692. hidden_states = self.activation_fn(hidden_states)
  693. hidden_states = self.fc2(hidden_states)
  694. return hidden_states
  695. class Llama4VisionEncoderLayer(GradientCheckpointingLayer):
  696. def __init__(self, config: Llama4VisionConfig):
  697. super().__init__()
  698. self.hidden_size = config.hidden_size
  699. self.self_attn = Llama4VisionAttention(config)
  700. self.mlp = Llama4VisionMLP(config)
  701. self.input_layernorm = nn.LayerNorm(config.hidden_size)
  702. self.post_attention_layernorm = nn.LayerNorm(config.hidden_size)
  703. def forward(
  704. self,
  705. hidden_state: torch.Tensor,
  706. freqs_ci: torch.Tensor,
  707. attention_mask: Optional[torch.Tensor] = None,
  708. output_attentions: Optional[bool] = None,
  709. ):
  710. # Self Attention
  711. residual = hidden_state
  712. hidden_state = self.input_layernorm(hidden_state)
  713. hidden_state, attn_weights = self.self_attn(
  714. hidden_state,
  715. freqs_ci=freqs_ci,
  716. attention_mask=attention_mask,
  717. )
  718. hidden_state = residual + hidden_state
  719. # Feed forward
  720. residual = hidden_state
  721. hidden_state = self.post_attention_layernorm(hidden_state)
  722. hidden_state = self.mlp(hidden_state)
  723. hidden_state = residual + hidden_state
  724. outputs = (hidden_state,)
  725. if output_attentions:
  726. outputs += (attn_weights,)
  727. return outputs
  728. class Llama4VisionEncoder(nn.Module):
  729. """
  730. Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
  731. [`Llama4VisionEncoderLayer`].
  732. Args:
  733. config: Llama4VisionConfig
  734. """
  735. def __init__(self, config: Llama4VisionConfig):
  736. super().__init__()
  737. self.config = config
  738. self.layers = nn.ModuleList([Llama4VisionEncoderLayer(config) for _ in range(config.num_hidden_layers)])
  739. self.gradient_checkpointing = False
  740. self.config = config
  741. def forward(
  742. self,
  743. hidden_states: torch.Tensor,
  744. freqs_ci: torch.Tensor, # TODO move this to an attribute instead of keeping it around
  745. attention_mask: Optional[torch.Tensor] = None,
  746. output_attentions: Optional[bool] = None,
  747. output_hidden_states: Optional[bool] = None,
  748. return_dict: Optional[bool] = None,
  749. ) -> Union[tuple, BaseModelOutput]:
  750. r"""
  751. Args:
  752. inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
  753. Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
  754. This is useful if you want more control over how to convert `input_ids` indices into associated vectors
  755. than the model's internal embedding lookup matrix.
  756. attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
  757. Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
  758. - 1 for tokens that are **not masked**,
  759. - 0 for tokens that are **masked**.
  760. [What are attention masks?](../glossary#attention-mask)
  761. output_attentions (`bool`, *optional*):
  762. Whether or not to return the attentions tensors of all attention layers. See `attentions` under
  763. returned tensors for more detail.
  764. output_hidden_states (`bool`, *optional*):
  765. Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
  766. for more detail.
  767. return_dict (`bool`, *optional*):
  768. Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
  769. """
  770. output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
  771. output_hidden_states = (
  772. output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
  773. )
  774. return_dict = return_dict if return_dict is not None else self.config.use_return_dict
  775. encoder_states = () if output_hidden_states else None
  776. all_attentions = () if output_attentions else None
  777. for encoder_layer in self.layers:
  778. if output_hidden_states:
  779. encoder_states = encoder_states + (hidden_states,)
  780. layer_outputs = encoder_layer(
  781. hidden_state=hidden_states,
  782. attention_mask=attention_mask,
  783. output_attentions=output_attentions,
  784. freqs_ci=freqs_ci,
  785. )
  786. if output_attentions:
  787. all_attentions = all_attentions + (layer_outputs[1],)
  788. hidden_states = layer_outputs[0]
  789. if output_hidden_states:
  790. encoder_states = encoder_states + (hidden_states,)
  791. if not return_dict:
  792. return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
  793. return BaseModelOutput(
  794. last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
  795. )
  796. class Llama4UnfoldConvolution(nn.Module):
  797. def __init__(self, config):
  798. super().__init__()
  799. kernel_size = config.patch_size
  800. if isinstance(kernel_size, int):
  801. kernel_size = (kernel_size, kernel_size)
  802. self.unfold = torch.nn.Unfold(kernel_size=kernel_size, stride=config.patch_size)
  803. self.linear = nn.Linear(
  804. config.num_channels * kernel_size[0] * kernel_size[1],
  805. config.hidden_size,
  806. bias=False,
  807. )
  808. def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
  809. hidden_states = self.unfold(hidden_states)
  810. hidden_states = hidden_states.permute(0, 2, 1)
  811. hidden_states = self.linear(hidden_states)
  812. return hidden_states
class Llama4VisionRotaryEmbedding(nn.Module):
    """
    Precomputes the complex rotary frequencies used by the vision attention layers.

    The table is built once in `__init__` from `config.image_size`, `config.patch_size`, `config.hidden_size`,
    `config.num_attention_heads` and `config.rope_theta`. It is stored as a plain attribute (not a registered
    buffer or parameter); `forward` only moves it to the device of its argument.
    """

    def __init__(self, config):
        super().__init__()
        # Number of patches along one side of the image.
        idx = config.image_size // config.patch_size
        # Flat patch indices 0 .. idx**2 - 1 as a column vector.
        img_idx = torch.arange(idx**2, dtype=torch.int32).reshape(idx**2, 1)
        # Append one extra row for the class token and mark it with a negative index.
        img_idx = torch.cat([img_idx, img_idx[:1]], dim=0)
        img_idx[-1, -1] = -2  # ID_CLS_TOKEN
        frequencies_x = img_idx % idx  # get the coordinates of the 2d matrix along x
        frequencies_y = img_idx // idx  # get the coordinates of the 2d matrix along y
        # Half of the per-head dimension.
        freq_dim = config.hidden_size // config.num_attention_heads // 2
        # Inverse frequencies, truncated to freq_dim // 2 entries.
        rope_freq = 1.0 / (config.rope_theta ** (torch.arange(0, freq_dim, 2)[: (freq_dim // 2)].float() / freq_dim))
        # Angles are (coordinate + 1) * inverse frequency; each angle is duplicated along the last dim.
        freqs_x = ((frequencies_x + 1)[..., None] * rope_freq[None, None, :]).repeat_interleave(2, dim=-1)
        freqs_y = ((frequencies_y + 1)[..., None] * rope_freq[None, None, :]).repeat_interleave(2, dim=-1)
        # Concatenate x and y angles, then keep every second entry, which undoes the duplication above.
        freqs = torch.cat([freqs_x, freqs_y], dim=-1).float().contiguous()[..., ::2]
        # Rows with a negative index (the class-token row) get angle 0, i.e. cos = 1 and sin = 0.
        freqs = freqs.masked_fill(img_idx.reshape(-1, 1, 1) < 0, 0)
        # Pack cos/sin pairs into complex numbers cos + i*sin.
        freq_cis = torch.view_as_complex(torch.stack([torch.cos(freqs), torch.sin(freqs)], dim=-1))
        # NOTE(review): the operations above give shape (idx**2 + 1, 1, 2 * rope_freq.numel());
        # the previous comment here read "idx**2, idx**2, idx * 2" — confirm which is intended.
        self.freqs_ci = freq_cis

    def forward(self, hidden_states):
        """Return the precomputed frequencies on the device of `hidden_states` (only its device is read)."""
        return self.freqs_ci.to(hidden_states.device)
  832. class Llama4VisionModel(Llama4PreTrainedModel):
  833. base_model_prefix = "vision_model"
  834. _no_split_modules = ["Llama4VisionEncoderLayer"]
  835. config: Llama4VisionConfig
  836. def __init__(self, config: Llama4VisionConfig):
  837. super().__init__(config)
  838. self.image_size = config.image_size
  839. self.patch_size = config.patch_size
  840. self.hidden_size = config.hidden_size
  841. self.num_channels = config.num_channels
  842. self.num_patches = (self.image_size // self.patch_size) ** 2 + 1
  843. self.scale = config.hidden_size**-0.5
  844. self.patch_embedding = Llama4UnfoldConvolution(config)
  845. self.class_embedding = nn.Parameter(self.scale * torch.randn(self.hidden_size))
  846. self.positional_embedding_vlm = nn.Parameter(self.scale * torch.randn(self.num_patches, self.hidden_size))
  847. self.rotary_embedding = Llama4VisionRotaryEmbedding(config)
  848. # layer norms
  849. self.layernorm_pre = nn.LayerNorm(self.hidden_size)
  850. self.layernorm_post = nn.LayerNorm(self.hidden_size)
  851. # encoders
  852. self.model = Llama4VisionEncoder(config)
  853. self.vision_adapter = Llama4VisionPixelShuffleMLP(config)
  854. self.post_init()
  855. def get_input_embeddings(self):
  856. """
  857. This function is used to fetch the first embedding layer to activate grads on inputs.
  858. """
  859. return self.patch_embedding
  860. def forward(
  861. self,
  862. pixel_values: torch.Tensor,
  863. attention_mask: Optional[torch.Tensor] = None,
  864. output_attentions: Optional[bool] = None,
  865. output_hidden_states: Optional[bool] = None,
  866. return_dict: Optional[bool] = None,
  867. ) -> Union[BaseModelOutput, tuple[torch.Tensor, ...]]:
  868. r"""
  869. Example:
  870. ```python
  871. >>> from PIL import Image
  872. >>> import requests
  873. >>> from transformers import AutoProcessor, MllamaVisionModel
  874. >>> checkpoint = "meta-llama/Llama-3.2-11B-Vision"
  875. >>> model = MllamaVisionModel.from_pretrained(checkpoint)
  876. >>> processor = AutoProcessor.from_pretrained(checkpoint)
  877. >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
  878. >>> image = Image.open(requests.get(url, stream=True).raw)
  879. >>> inputs = processor(images=image, return_tensors="pt")
  880. >>> output = model(**inputs)
  881. >>> print(output.last_hidden_state.shape)
  882. torch.Size([1, 1, 4, 1025, 7680])
  883. ```
  884. """
  885. output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
  886. output_hidden_states = (
  887. output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
  888. )
  889. return_dict = return_dict if return_dict is not None else self.config.use_return_dict
  890. # num_concurrent_media and num_chunks are both currently 1
  891. batch_size_times_num_tiles, num_channels, height, width = pixel_values.shape
  892. num_concurrent_media = 1
  893. num_chunks = 1
  894. hidden_state = self.patch_embedding(pixel_values)
  895. _, num_patches, hidden_dim = hidden_state.shape
  896. # Add cls token
  897. hidden_state = hidden_state.reshape(
  898. batch_size_times_num_tiles * num_concurrent_media * num_chunks, num_patches, hidden_dim
  899. )
  900. class_embedding = self.class_embedding.expand(hidden_state.shape[0], 1, hidden_state.shape[-1])
  901. hidden_state = torch.cat([hidden_state, class_embedding], dim=1)
  902. num_patches += 1
  903. # Position embeddings
  904. hidden_state = hidden_state.reshape(
  905. batch_size_times_num_tiles * num_concurrent_media, num_chunks, num_patches, hidden_dim
  906. )
  907. positional_embedding = self.positional_embedding_vlm.to(dtype=hidden_state.dtype, device=hidden_state.device)
  908. hidden_state = hidden_state + positional_embedding
  909. hidden_state = self.layernorm_pre(hidden_state)
  910. hidden_state = hidden_state.view(batch_size_times_num_tiles, -1, hidden_dim)
  911. freqs_ci = self.rotary_embedding(pixel_values)
  912. output = self.model(
  913. hidden_state,
  914. attention_mask=None,
  915. output_hidden_states=output_hidden_states,
  916. output_attentions=output_attentions,
  917. freqs_ci=freqs_ci,
  918. )
  919. hidden_state = output.last_hidden_state
  920. hidden_state = self.layernorm_post(hidden_state)
  921. hidden_state = hidden_state[:, :-1, :]
  922. # now, we use Llama4VisionPixelShuffle + mlp to project embeddings
  923. hidden_state = self.vision_adapter(hidden_state)
  924. hidden_states = output.hidden_states if output_hidden_states else None
  925. if output_attentions:
  926. attentions = output[2]
  927. else:
  928. attentions = None
  929. if not return_dict:
  930. return tuple(v for v in [hidden_state, hidden_states, attentions] if v is not None)
  931. return BaseModelOutput(
  932. last_hidden_state=hidden_state,
  933. hidden_states=hidden_states,
  934. attentions=attentions,
  935. )
  936. class Llama4ForConditionalGeneration(Llama4PreTrainedModel, GenerationMixin):
  937. _no_split_modules = ["Llama4TextDecoderLayer", "Llama4VisionEncoderLayer"]
  938. _tp_plan = {}
  939. base_model_prefix = ""
  940. config: Llama4Config
  941. def __init__(self, config: Llama4Config):
  942. super().__init__(config)
  943. self.vision_model = Llama4VisionModel(config.vision_config)
  944. self.multi_modal_projector = Llama4MultiModalProjector(config)
  945. self.language_model = Llama4ForCausalLM(config.text_config)
  946. self.vocab_size = config.text_config.vocab_size
  947. self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
  948. self.post_init()
  949. def get_input_embeddings(self):
  950. return self.language_model.get_input_embeddings()
  951. def set_input_embeddings(self, value):
  952. self.language_model.set_input_embeddings(value)
  953. def get_output_embeddings(self):
  954. return self.language_model.get_output_embeddings()
  955. def set_output_embeddings(self, new_embeddings):
  956. self.language_model.set_output_embeddings(new_embeddings)
  957. def set_decoder(self, decoder):
  958. self.language_model.set_decoder(decoder)
  959. def get_decoder(self):
  960. return self.language_model.get_decoder()
  961. def get_image_features(
  962. self,
  963. pixel_values: torch.FloatTensor,
  964. vision_feature_select_strategy: str,
  965. **kwargs,
  966. ):
  967. """
  968. Obtains image last hidden states from the vision tower and apply al projection.
  969. Args:
  970. pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
  971. The tensors corresponding to the input images.
  972. vision_feature_select_strategy (`str`):
  973. The feature selection strategy used to select the vision feature from the vision backbone.
  974. Can be one of `"default"` or `"full"`
  975. Returns:
  976. image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
  977. """
  978. if vision_feature_select_strategy not in ["default", "full"]:
  979. raise ValueError(f"Unexpected select feature strategy: {self.vision_feature_select_strategy}")
  980. kwargs = {k: v for k, v in kwargs.items() if v is not None}
  981. image_outputs = self.vision_model(pixel_values, output_hidden_states=False, **kwargs)
  982. hidden_state = image_outputs.last_hidden_state
  983. return hidden_state
  984. def get_placeholder_mask(
  985. self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
  986. ):
  987. """
  988. Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
  989. equal to the length of multimodal features. If the lengths are different, an error is raised.
  990. """
  991. if input_ids is None:
  992. special_image_mask = inputs_embeds == self.get_input_embeddings()(
  993. torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
  994. )
  995. special_image_mask = special_image_mask.all(-1)
  996. else:
  997. special_image_mask = input_ids == self.config.image_token_id
  998. n_image_tokens = special_image_mask.sum()
  999. special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
  1000. if inputs_embeds[special_image_mask].numel() != image_features.numel():
  1001. raise ValueError(
  1002. f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {image_features.shape[0]}"
  1003. )
  1004. return special_image_mask
  1005. @auto_docstring
  1006. @deprecate_kwarg("vision_feature_layer", version="4.58")
  1007. def forward(
  1008. self,
  1009. input_ids: Optional[torch.LongTensor] = None,
  1010. pixel_values: Optional[torch.FloatTensor] = None,
  1011. attention_mask: Optional[torch.Tensor] = None,
  1012. position_ids: Optional[torch.LongTensor] = None,
  1013. past_key_values: Optional[Cache] = None,
  1014. inputs_embeds: Optional[torch.FloatTensor] = None,
  1015. vision_feature_layer: Optional[Union[int, list[int]]] = None,
  1016. vision_feature_select_strategy: Optional[str] = None,
  1017. labels: Optional[torch.LongTensor] = None,
  1018. use_cache: Optional[bool] = None,
  1019. output_attentions: Optional[bool] = None,
  1020. output_hidden_states: Optional[bool] = None,
  1021. return_dict: Optional[bool] = None,
  1022. cache_position: Optional[torch.LongTensor] = None,
  1023. logits_to_keep: Union[int, torch.Tensor] = 0,
  1024. **kwargs: Unpack[TransformersKwargs],
  1025. ) -> Union[tuple, Llama4CausalLMOutputWithPast]:
  1026. r"""
  1027. labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
  1028. Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
  1029. config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
  1030. (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
  1031. Example:
  1032. ```python
  1033. >>> from PIL import Image
  1034. >>> import requests
  1035. >>> from transformers import AutoProcessor, LlavaForConditionalGeneration
  1036. >>> model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf")
  1037. >>> processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
  1038. >>> prompt = "USER: <image>\nWhat's the content of the image? ASSISTANT:"
  1039. >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
  1040. >>> image = Image.open(requests.get(url, stream=True).raw)
  1041. >>> inputs = processor(images=image, text=prompt, return_tensors="pt")
  1042. >>> # Generate
  1043. >>> generate_ids = model.generate(**inputs, max_new_tokens=15)
  1044. >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
  1045. "USER: \nWhat's the content of the image? ASSISTANT: The image features a busy city street with a stop sign prominently displayed"
  1046. ```"""
  1047. output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
  1048. output_hidden_states = (
  1049. output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
  1050. )
  1051. return_dict = return_dict if return_dict is not None else self.config.use_return_dict
  1052. vision_feature_select_strategy = (
  1053. vision_feature_select_strategy
  1054. if vision_feature_select_strategy is not None
  1055. else self.config.vision_config.vision_feature_select_strategy
  1056. )
  1057. if (input_ids is None) ^ (inputs_embeds is not None):
  1058. raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
  1059. if pixel_values is not None and inputs_embeds is not None:
  1060. raise ValueError(
  1061. "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one"
  1062. )
  1063. if inputs_embeds is None:
  1064. inputs_embeds = self.get_input_embeddings()(input_ids)
  1065. if pixel_values is not None:
  1066. image_features = self.get_image_features(
  1067. pixel_values=pixel_values,
  1068. vision_feature_select_strategy=vision_feature_select_strategy,
  1069. )
  1070. vision_flat = image_features.view(-1, image_features.size(-1))
  1071. projected_vision_flat = self.multi_modal_projector(vision_flat).to(
  1072. inputs_embeds.device, inputs_embeds.dtype
  1073. )
  1074. special_image_mask = self.get_placeholder_mask(
  1075. input_ids, inputs_embeds=inputs_embeds, image_features=projected_vision_flat
  1076. )
  1077. inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, projected_vision_flat)
  1078. outputs = self.language_model(
  1079. attention_mask=attention_mask,
  1080. position_ids=position_ids,
  1081. past_key_values=past_key_values,
  1082. inputs_embeds=inputs_embeds,
  1083. use_cache=use_cache,
  1084. output_attentions=output_attentions,
  1085. output_hidden_states=output_hidden_states,
  1086. return_dict=return_dict,
  1087. cache_position=cache_position,
  1088. logits_to_keep=logits_to_keep,
  1089. **kwargs,
  1090. )
  1091. logits = outputs[0]
  1092. loss = None
  1093. if labels is not None:
  1094. # Shift so that tokens < n predict n
  1095. if attention_mask is not None:
  1096. # we use the input attention mask to shift the logits and labels, because it is 2D.
  1097. # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft
  1098. shift_attention_mask = attention_mask[:, -(logits.shape[1] - 1) :].to(logits.device)
  1099. shift_logits = logits[..., :-1, :][shift_attention_mask.to(logits.device) != 0].contiguous()
  1100. shift_labels = labels[..., 1:][shift_attention_mask.to(labels.device) != 0].contiguous()
  1101. else:
  1102. shift_logits = logits[..., :-1, :].contiguous()
  1103. shift_labels = labels[..., 1:].contiguous()
  1104. # Flatten the tokens
  1105. loss_fct = nn.CrossEntropyLoss()
  1106. loss = loss_fct(
  1107. shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1).to(shift_logits.device)
  1108. )
  1109. if not return_dict:
  1110. output = (logits,) + outputs[1:]
  1111. return (loss,) + output if loss is not None else output
  1112. return Llama4CausalLMOutputWithPast(
  1113. loss=loss,
  1114. logits=logits,
  1115. past_key_values=outputs.past_key_values,
  1116. hidden_states=outputs.hidden_states,
  1117. attentions=outputs.attentions,
  1118. image_hidden_states=image_features if pixel_values is not None else None,
  1119. )
  1120. def prepare_inputs_for_generation(
  1121. self,
  1122. input_ids,
  1123. past_key_values=None,
  1124. inputs_embeds=None,
  1125. pixel_values=None,
  1126. attention_mask=None,
  1127. cache_position=None,
  1128. logits_to_keep=None,
  1129. **kwargs,
  1130. ):
  1131. # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
  1132. model_inputs = self.language_model.prepare_inputs_for_generation(
  1133. input_ids,
  1134. past_key_values=past_key_values,
  1135. inputs_embeds=inputs_embeds,
  1136. attention_mask=attention_mask,
  1137. cache_position=cache_position,
  1138. logits_to_keep=logits_to_keep,
  1139. **kwargs,
  1140. )
  1141. if cache_position[0] == 0:
  1142. # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
  1143. # Otherwise we need pixel values to be passed to model
  1144. model_inputs["pixel_values"] = pixel_values
  1145. return model_inputs
# Names exported by a star-import of this module.
__all__ = [
    "Llama4PreTrainedModel",
    "Llama4TextModel",
    "Llama4VisionModel",
    "Llama4ForCausalLM",
    "Llama4ForConditionalGeneration",
]