plamo-13b-instruct / modeling_plamo.py

Upload folder using huggingface_hub

65dd6ae 11 months ago

29 kB

	from typing import Any, Dict, List, NamedTuple, Optional, Tuple, Union

	import numpy as np
	import torch
	from torch import nn
	from torch.nn import functional as F
	from transformers import PretrainedConfig, PreTrainedModel
	from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast


	class DecoderInput(NamedTuple):
	    """Everything `PlamoDecoder.forward` needs for one pass, bundled into one tuple.

	    The field order is part of the interface: `PlamoModel.forward` fills this
	    tuple positionally.
	    """

	    # Activations fed into the first decoder layer.
	    hidden_states: torch.Tensor
	    # Position indices forwarded unchanged to every decoder layer.
	    position_ids: torch.Tensor
	    # Mask forwarded unchanged to every decoder layer; None means "no mask".
	    attention_mask: Optional[torch.Tensor] = None
	    # Cached key/value states, indexed by layer position inside the decoder.
	    past_key_values: Optional[List[torch.FloatTensor]] = None
	    # Collect the input of every layer into `DecoderOutput.all_hidden_states`.
	    output_hidden_states: Optional[bool] = False
	    # Collect the attention weights of every layer into `DecoderOutput.all_self_attns`.
	    output_attentions: Optional[bool] = False
	    # Collect the key/value cache of every layer into `DecoderOutput.next_decoder_cache`.
	    use_cache: Optional[bool] = False
	    # Run layers through activation checkpointing (only honoured in training mode).
	    gradient_checkpointing: bool = False


	class DecoderOutput(NamedTuple):
	    """Result of one `PlamoDecoder.forward` pass.

	    The three optional fields are None when the matching flag on the
	    `DecoderInput` was falsy, otherwise a tuple with one entry per layer.
	    """

	    # Output of the last decoder layer.
	    hidden_states: torch.Tensor
	    # Input of each decoder layer, in layer order.
	    all_hidden_states: Optional[Tuple[torch.Tensor, ...]]
	    # Attention weights reported by each decoder layer, in layer order.
	    all_self_attns: Optional[Tuple[torch.Tensor, ...]]
	    # Key/value cache reported by each decoder layer, in layer order.
	    next_decoder_cache: Optional[Tuple[torch.Tensor, ...]]


	class PlamoConfig(PretrainedConfig):  # type: ignore
	    """Configuration object for PLaMo models.

	    The architecture hyper-parameters are stored as attributes. The tokenizer
	    class, the special-token ids, `tie_word_embeddings` and any additional
	    keyword arguments are handed on to `PretrainedConfig.__init__`.
	    """

	    model_type: str = "plamo"

	    def __init__(
	        self,
	        vocab_size: int = 32000,
	        hidden_size: int = 4096,
	        intermediate_size: int = 13312,
	        num_hidden_layers: int = 32,
	        num_attention_heads: int = 32,
	        num_key_value_heads: Optional[int] = None,
	        max_position_embeddings: int = 2048,
	        initializer_range: float = 0.02,
	        rms_norm_eps: float = 1e-6,
	        use_cache: bool = True,
	        tokenizer_class: str = "PlamoTokenizer",
	        pad_token_id: Optional[int] = None,
	        bos_token_id: int = 1,
	        eos_token_id: int = 2,
	        n_shared_head: int = 8,
	        tie_word_embeddings: bool = False,
	        **kwargs: Any,
	    ) -> None:
	        # Vocabulary and sequence limits.
	        self.vocab_size = vocab_size
	        self.max_position_embeddings = max_position_embeddings

	        # Layer geometry.
	        self.hidden_size = hidden_size
	        self.intermediate_size = intermediate_size
	        self.num_hidden_layers = num_hidden_layers
	        self.num_attention_heads = num_attention_heads

	        # For backward compatibility: when unset, use one key/value head per attention head.
	        self.num_key_value_heads = (
	            num_attention_heads if num_key_value_heads is None else num_key_value_heads
	        )

	        # Initialisation, normalisation and caching defaults.
	        self.initializer_range = initializer_range
	        self.rms_norm_eps = rms_norm_eps
	        self.use_cache = use_cache

	        # Number of query heads that share one key/value head (read by `Attention`).
	        self.n_shared_head = n_shared_head

	        super().__init__(
	            tokenizer_class=tokenizer_class,
	            pad_token_id=pad_token_id,
	            bos_token_id=bos_token_id,
	            eos_token_id=eos_token_id,
	            tie_word_embeddings=tie_word_embeddings,
	            **kwargs,
	        )


	# Copied from transformers.models.bart.modeling_bart._make_causal_mask
	def _make_causal_mask(
	input_ids_shape: Tuple[int, int], dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
	) -> torch.Tensor:
	"""
	Make causal mask used for bi-directional self-attention.
	"""
	bsz, tgt_len = input_ids_shape
	mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
	mask_cond = torch.arange(mask.size(-1), device=device)
	mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
	mask = mask.to(dtype)

	if past_key_values_length > 0:
	mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
	return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)


	# Copied from transformers.models.bart.modeling_bart._expand_mask
	def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None) -> torch.Tensor:
	"""
	Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
	"""
	bsz, src_len = mask.size()
	tgt_len = tgt_len if tgt_len is not None else src_len

	expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)

	inverted_mask = 1.0 - expanded_mask

	return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) # type: ignore


	class RotaryEmbedding(torch.nn.Module):
	    """Cache of the cos/sin tables used by the rotary position embedding.

	    The tables are built for `max_position_embeddings` positions at construction
	    time and rebuilt whenever `forward` is asked for a longer sequence.
	    """

	    def __init__(
	        self, dim: int, max_position_embeddings: int = 2048, base: int = 10000, device: Optional[torch.device] = None
	    ) -> None:
	        super().__init__()

	        self.dim = dim
	        self.max_position_embeddings = max_position_embeddings
	        self.base = base

	        # One inverse frequency per channel pair: base ** -(2i / dim).
	        exponents = torch.arange(0, self.dim, 2).float().to(device) / self.dim
	        self.register_buffer("inv_freq", 1.0 / (self.base**exponents), persistent=False)

	        # Build here to make `torch.jit.trace` work.
	        self._set_cos_sin_cache(
	            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
	        )

	    def _set_cos_sin_cache(self, seq_len: int, device: Any, dtype: Any) -> None:
	        """(Re)build the `[1, 1, seq_len, dim]` cos/sin buffers for `seq_len` positions."""
	        self.max_seq_len_cached = seq_len
	        positions = torch.arange(seq_len, device=device, dtype=self.inv_freq.dtype)  # type: ignore

	        # Outer product: angle[p, i] = position p * inverse frequency i.
	        angles = torch.einsum("i,j->ij", positions, self.inv_freq)
	        # Different from paper, but it uses a different permutation in order to obtain the same calculation
	        table = torch.cat((angles, angles), dim=-1)
	        for name, values in (("cos_cached", table.cos()), ("sin_cached", table.sin())):
	            self.register_buffer(name, values[None, None, :, :].to(dtype), persistent=False)

	    def forward(self, x: torch.Tensor, seq_len: int) -> Tuple[torch.Tensor, torch.Tensor]:
	        """Return the first `seq_len` rows of the cos and sin tables, cast to `x.dtype`.

	        `x` is used only for its device and dtype.
	        """
	        # x: [bs, num_attention_heads, seq_len, head_size]
	        if seq_len > self.max_seq_len_cached:
	            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)

	        cos = self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype)  # type: ignore
	        sin = self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype)  # type: ignore
	        return cos, sin


	def _rotate_half(x: torch.Tensor) -> torch.Tensor:
	"""Rotates half the hidden dims of the input."""
	x1 = x[..., : x.shape[-1] // 2]
	x2 = x[..., x.shape[-1] // 2 :]
	return torch.cat((-x2, x1), dim=-1)


	def _rotary_pos_emb(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, position_ids: torch.Tensor) -> torch.Tensor:
	    """Apply the rotary position embedding to `x` using the table rows chosen by `position_ids`."""

	    def _rows_for_positions(table: torch.Tensor) -> torch.Tensor:
	        # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
	        flat = table.squeeze(1).squeeze(0)  # [seq_len, dim]
	        return flat[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]

	    return (x * _rows_for_positions(cos)) + (_rotate_half(x) * _rows_for_positions(sin))


	class RMSNorm(nn.Module):
	    """Root-mean-square normalisation over the last dimension with a learned per-channel scale."""

	    def __init__(self, hidden_size: int, eps: float = 1e-6) -> None:
	        super().__init__()
	        self.weight = nn.Parameter(torch.ones(hidden_size))
	        self.variance_epsilon = eps

	    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
	        """Normalise in float32, cast back to the input dtype, then apply `weight`."""
	        original_dtype = hidden_states.dtype
	        upcast = hidden_states.to(torch.float32)
	        mean_square = upcast.pow(2).mean(-1, keepdim=True)
	        normalized = upcast * torch.rsqrt(mean_square + self.variance_epsilon)
	        return self.weight * normalized.to(original_dtype)


	class Attention(torch.nn.Module):
	    """Self-attention with rotary position embeddings and shared key/value heads.

	    Only `ceil(num_attention_heads / n_shared_head)` key/value heads are projected.
	    They are tiled along the head dimension before attention, so query head `i`
	    uses key/value head `i % k_num_heads`.
	    """

	    def __init__(self, config: PlamoConfig) -> None:
	        super().__init__()
	        self.config = config
	        self.hidden_size = config.hidden_size
	        head_dim = self.hidden_size // config.num_attention_heads
	        self.max_position_embeddings = config.max_position_embeddings

	        self.q_num_heads = config.num_attention_heads
	        self.qk_dim = self.v_dim = head_dim
	        self.k_num_heads = self.v_num_heads = int(np.ceil(self.q_num_heads / config.n_shared_head))

	        self.q_proj = nn.Linear(self.hidden_size, self.q_num_heads * self.qk_dim, bias=False)
	        self.k_proj = nn.Linear(self.hidden_size, self.k_num_heads * self.qk_dim, bias=False)
	        self.v_proj = nn.Linear(self.hidden_size, self.v_num_heads * self.v_dim, bias=False)
	        self.o_proj = nn.Linear(self.q_num_heads * self.v_dim, self.hidden_size, bias=False)
	        self.rotary_emb = RotaryEmbedding(self.qk_dim, max_position_embeddings=self.max_position_embeddings)

	    def forward(
	        self,
	        hidden_states: torch.Tensor,
	        attention_mask: Optional[torch.Tensor] = None,
	        position_ids: Optional[torch.Tensor] = None,
	        past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
	        output_attentions: bool = False,
	        use_cache: bool = False,
	    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor, torch.Tensor]]]:
	        """Attend over `hidden_states` (plus any cached keys/values).

	        Args:
	            hidden_states: Input of shape `[bsz, q_len, hidden_size]`.
	            attention_mask: Optional mask broadcastable to `[bsz, heads, q_len, kv_len]`;
	                either additive (floating point) or boolean (True = may attend).
	            position_ids: Positions of the `q_len` new tokens; required.
	            past_key_value: Cached `(key, value)` pair from earlier steps, already
	                expanded to `q_num_heads` heads.
	            output_attentions: Also return the attention probabilities.
	            use_cache: Also return the updated `(key, value)` pair.

	        Returns:
	            `(attn_output, attn_weights, past_key_value)`. `attn_weights` is None unless
	            `output_attentions` is set; `past_key_value` is None unless `use_cache` is set.

	        Raises:
	            ValueError: If `position_ids` is None.
	        """
	        if position_ids is None:
	            raise ValueError("position_ids must be provided to apply the rotary position embedding")

	        bsz, q_len, _ = hidden_states.size()

	        query_states = self.q_proj(hidden_states).view(bsz, q_len, self.q_num_heads, self.qk_dim).transpose(1, 2)
	        key_states = self.k_proj(hidden_states).view(bsz, q_len, self.k_num_heads, self.qk_dim).transpose(1, 2)
	        value_states = self.v_proj(hidden_states).view(bsz, q_len, self.v_num_heads, self.v_dim).transpose(1, 2)

	        def _expand_kv(t: torch.Tensor, repeat: int, target: int) -> torch.Tensor:
	            # Tile the shared heads `repeat` times, then drop any surplus beyond `target` heads.
	            return t.repeat(1, repeat, 1, 1)[:, :target]

	        # expand shared kv
	        assert self.k_num_heads == self.v_num_heads
	        key_states = _expand_kv(key_states, self.config.n_shared_head, self.q_num_heads)
	        value_states = _expand_kv(value_states, self.config.n_shared_head, self.q_num_heads)

	        # The rotary tables must cover the cached positions as well as the new ones.
	        kv_seq_len = key_states.shape[-2]
	        if past_key_value is not None:
	            kv_seq_len += past_key_value[0].shape[-2]
	        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
	        query_states = _rotary_pos_emb(query_states, cos, sin, position_ids)
	        key_states = _rotary_pos_emb(key_states, cos, sin, position_ids)
	        # [bsz, nh, t, hd]

	        if past_key_value is not None:
	            # reuse k, v, self_attention
	            key_states = torch.cat([past_key_value[0], key_states], dim=2)
	            value_states = torch.cat([past_key_value[1], value_states], dim=2)

	        # The cache holds the rotated, head-expanded keys and the head-expanded values.
	        past_key_value = (key_states, value_states) if use_cache else None

	        attn_weights: Optional[torch.Tensor]
	        if output_attentions:
	            # The fused kernel does not expose the attention probabilities, so this path
	            # computes them explicitly. Previously `attn_weights` was left unbound here
	            # and the return statement raised UnboundLocalError.
	            scores = torch.matmul(query_states, key_states.transpose(2, 3)) * (self.qk_dim**-0.5)
	            if attention_mask is not None:
	                if attention_mask.dtype == torch.bool:
	                    scores = scores.masked_fill(~attention_mask, float("-inf"))
	                else:
	                    scores = scores + attention_mask
	            attn_weights = F.softmax(scores, dim=-1, dtype=torch.float32).to(query_states.dtype)
	            attn_output = torch.matmul(attn_weights, value_states)
	        else:
	            attn_weights = None
	            attn_output = F.scaled_dot_product_attention(
	                query_states, key_states, value_states, attn_mask=attention_mask
	            )

	        attn_output = attn_output.transpose(1, 2)
	        attn_output = attn_output.reshape(bsz, q_len, self.q_num_heads * self.v_dim)
	        attn_output = self.o_proj(attn_output)

	        return attn_output, attn_weights, past_key_value


	class MLP(nn.Module):
	    """Gated feed-forward block: `down_proj(silu(gate_proj(x)) * up_proj(x))`, all without bias."""

	    def __init__(self, config: PlamoConfig) -> None:
	        super().__init__()
	        self.config = config
	        self.hidden_size = config.hidden_size
	        self.intermediate_size = config.intermediate_size

	        width, inner = self.hidden_size, self.intermediate_size
	        self.gate_proj = nn.Linear(width, inner, bias=False)
	        self.up_proj = nn.Linear(width, inner, bias=False)
	        self.down_proj = nn.Linear(inner, width, bias=False)
	        self.act_fn = torch.nn.functional.silu

	    def forward(self, x: torch.Tensor) -> torch.Tensor:
	        """Map `x` through the gated projection and back to `hidden_size` channels."""
	        gate = self.act_fn(self.gate_proj(x))
	        return self.down_proj(gate * self.up_proj(x))  # type: ignore


	class PlamoDecoderLayer(torch.nn.Module):
	    """One decoder layer: attention and MLP both read the same normalised input.

	    The two branch outputs are added to the un-normalised input in a single
	    residual sum.
	    """

	    def __init__(self, config: PlamoConfig) -> None:
	        super().__init__()
	        self.config = config
	        self.hidden_size = config.hidden_size
	        self.self_attn = Attention(config)
	        self.mlp = MLP(config)
	        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

	    def forward(
	        self,
	        hidden_states: torch.Tensor,
	        attention_mask: Optional[torch.Tensor] = None,
	        position_ids: Optional[torch.LongTensor] = None,
	        past_key_value: Optional[Tuple[torch.Tensor]] = None,
	        output_attentions: Optional[bool] = False,
	        use_cache: Optional[bool] = False,
	    ) -> Tuple[Any, ...]:
	        """Run the layer.

	        Returns a tuple that starts with the new hidden states, followed by the
	        attention weights when `output_attentions` is truthy, followed by the
	        key/value cache when `use_cache` is truthy.
	        """
	        # from LlamaDecoder
	        normed = self.norm(hidden_states)

	        # Self Attention
	        attn_branch, attn_weights, kv_cache = self.self_attn(
	            hidden_states=normed,
	            attention_mask=attention_mask,
	            position_ids=position_ids,
	            past_key_value=past_key_value,
	            output_attentions=output_attentions,
	            use_cache=use_cache,
	        )

	        # Fully Connected (fed the same normalised input as the attention branch)
	        mlp_branch = self.mlp(normed)

	        # Residual
	        result: Any = (hidden_states + attn_branch + mlp_branch,)
	        if output_attentions:
	            result = result + (attn_weights,)
	        if use_cache:
	            result = result + (kv_cache,)
	        return result  # type: ignore


	class PlamoDecoder(torch.nn.Module):
	    """Stack of `config.num_hidden_layers` decoder layers applied in sequence."""

	    def __init__(self, config: PlamoConfig) -> None:
	        super().__init__()
	        self.layers = torch.nn.ModuleList([PlamoDecoderLayer(config) for _ in range(config.num_hidden_layers)])

	    @staticmethod
	    def _run_checkpointed(layer: torch.nn.Module, hidden_states: torch.Tensor, x: DecoderInput) -> Any:
	        """Run one layer through `torch.utils.checkpoint.checkpoint`.

	        The layer is called positionally as
	        `(hidden_states, attention_mask, position_ids, None, output_attentions, None)`,
	        i.e. with no past key/value and with `use_cache` set to None.
	        """

	        def _replay(*inputs):  # type: ignore
	            return layer(*inputs, x.output_attentions, None)

	        return torch.utils.checkpoint.checkpoint(
	            _replay,  # type: ignore
	            hidden_states,
	            x.attention_mask,
	            x.position_ids,
	            None,
	        )

	    def forward(self, x: DecoderInput) -> DecoderOutput:
	        """Feed `x.hidden_states` through every layer and gather the requested extras."""
	        # Each collector stays None when its flag is off and becomes a tuple at the end.
	        hidden_trace: Optional[List[torch.Tensor]] = [] if x.output_hidden_states else None
	        attn_trace: Optional[List[torch.Tensor]] = [] if x.output_attentions else None
	        cache_trace: Optional[List[torch.Tensor]] = [] if x.use_cache else None

	        # With both extras requested the layer returns (hidden, attn, cache).
	        cache_slot = 2 if x.output_attentions else 1
	        hidden_states = x.hidden_states

	        for idx, decoder_layer in enumerate(self.layers):
	            if x.output_hidden_states:
	                assert hidden_trace is not None
	                hidden_trace.append(hidden_states)

	            if self.training and x.gradient_checkpointing:
	                layer_outputs = self._run_checkpointed(decoder_layer, hidden_states, x)
	            else:
	                layer_outputs = decoder_layer(
	                    hidden_states,
	                    attention_mask=x.attention_mask,
	                    position_ids=x.position_ids,
	                    past_key_value=None if x.past_key_values is None else x.past_key_values[idx],
	                    output_attentions=x.output_attentions,
	                    use_cache=x.use_cache,
	                )

	            hidden_states = layer_outputs[0]

	            if x.use_cache:
	                cache = layer_outputs[cache_slot]
	                assert cache is not None
	                assert cache_trace is not None
	                cache_trace.append(cache)

	            if x.output_attentions:
	                assert layer_outputs[1] is not None
	                assert attn_trace is not None
	                attn_trace.append(layer_outputs[1])

	        def _freeze(items: Optional[List[torch.Tensor]]) -> Optional[Tuple[torch.Tensor, ...]]:
	            return None if items is None else tuple(items)

	        return DecoderOutput(hidden_states, _freeze(hidden_trace), _freeze(attn_trace), _freeze(cache_trace))


	class PlamoPreTrainedModel(PreTrainedModel):  # type: ignore
	    """Common base of the PLaMo models: class-level loading options and weight initialisation."""

	    config_class = PlamoConfig
	    base_model_prefix = "model"
	    supports_gradient_checkpointing = True
	    _no_split_modules: List[str] = ["PlamoDecoderLayer"]
	    _skip_keys_device_placement = "past_key_values"
	    _keys_to_ignore_on_load_unexpected = [r"decoder\.version"]

	    def _init_weights(self, module: torch.nn.Module) -> None:
	        """Initialise `nn.Linear` and `nn.Embedding` weights from N(0, initializer_range).

	        Linear biases and the embedding row at `padding_idx` are zeroed; every
	        other module type is left untouched.
	        """
	        std = self.config.initializer_range
	        if not isinstance(module, (nn.Linear, nn.Embedding)):
	            return

	        module.weight.data.normal_(mean=0.0, std=std)
	        if isinstance(module, nn.Linear):
	            if module.bias is not None:
	                module.bias.data.zero_()
	        elif module.padding_idx is not None:
	            module.weight.data[module.padding_idx].zero_()

	    def _set_gradient_checkpointing(self, module: torch.nn.Module, value: bool = False) -> None:
	        """Store `value` as the `gradient_checkpointing` attribute of `module`."""
	        module.gradient_checkpointing = value  # type: ignore


	class PlamoModel(PlamoPreTrainedModel):
	    """Token embedding, decoder stack and final RMSNorm (no language-model head)."""

	    def __init__(self, config: PlamoConfig):
	        super().__init__(config)
	        self.padding_idx = config.pad_token_id
	        self.vocab_size = config.vocab_size

	        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
	        self.layers = PlamoDecoder(config)  # type: ignore
	        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

	        self.gradient_checkpointing = False
	        # Initialize weights and apply final processing
	        self.post_init()

	    def get_input_embeddings(self) -> torch.nn.Embedding:
	        """Return the token embedding module."""
	        return self.embed_tokens

	    def set_input_embeddings(self, value: torch.nn.Embedding) -> None:
	        """Replace the token embedding module."""
	        self.embed_tokens = value

	    # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
	    def _prepare_decoder_attention_mask(
	        self,
	        attention_mask: torch.Tensor,
	        input_shape: Tuple[int, int],
	        inputs_embeds: Optional[torch.FloatTensor],
	        past_key_values_length: int,
	    ) -> Optional[torch.Tensor]:
	        """Combine the causal mask and the padding mask into one additive mask.

	        The causal part is only built when more than one new token is processed.
	        Returns None when neither part applies.
	        """
	        # create causal mask
	        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
	        combined_attention_mask: Optional[torch.Tensor] = None
	        if input_shape[-1] > 1:
	            assert inputs_embeds is not None
	            combined_attention_mask = _make_causal_mask(
	                input_shape,
	                inputs_embeds.dtype,
	                device=inputs_embeds.device,
	                past_key_values_length=past_key_values_length,
	            )

	        if attention_mask is not None:
	            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
	            assert inputs_embeds is not None
	            expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
	                inputs_embeds.device
	            )
	            combined_attention_mask = (
	                expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
	            )

	        return combined_attention_mask

	    def forward(
	        self,
	        input_ids: Optional[torch.LongTensor] = None,
	        attention_mask: Optional[torch.Tensor] = None,
	        position_ids: Optional[torch.Tensor] = None,
	        past_key_values: Optional[List[torch.FloatTensor]] = None,
	        inputs_embeds: Optional[torch.FloatTensor] = None,
	        use_cache: Optional[bool] = None,
	        output_attentions: Optional[bool] = None,
	        output_hidden_states: Optional[bool] = None,
	        return_dict: Optional[bool] = None,
	    ) -> Union[Tuple, BaseModelOutputWithPast]:
	        """Run the decoder stack.

	        Exactly one of `input_ids` (shape `[bsz, seq_len]`) and `inputs_embeds`
	        (shape `[bsz, seq_len, hidden_size]`) must be given. Flags left as None
	        fall back to the values stored on the config.

	        Returns:
	            A `BaseModelOutputWithPast`, or, when `return_dict` is falsy, a tuple of
	            its non-None fields in the order (last hidden state, cache, hidden
	            states, attentions).

	        Raises:
	            ValueError: If both or neither of `input_ids` and `inputs_embeds` are given.
	        """
	        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
	        output_hidden_states = (
	            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
	        )
	        use_cache = use_cache if use_cache is not None else self.config.use_cache

	        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

	        # retrieve input_ids and inputs_embeds
	        # (An earlier `assert input_ids is not None` made the embeddings-only path unreachable.)
	        if input_ids is not None and inputs_embeds is not None:
	            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
	        if input_ids is not None:
	            batch_size, seq_length = input_ids.shape
	            device = input_ids.device
	        elif inputs_embeds is not None:
	            batch_size, seq_length, _ = inputs_embeds.shape
	            device = inputs_embeds.device
	        else:
	            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")

	        seq_length_with_past = seq_length
	        past_key_values_length = 0

	        if past_key_values is not None:
	            # Cached keys have shape [bsz, heads, past_len, head_dim].
	            past_key_values_length = past_key_values[0][0].shape[2]
	            seq_length_with_past = seq_length_with_past + past_key_values_length

	        if position_ids is None:
	            # Default positions continue right after the cached tokens.
	            position_ids = torch.arange(
	                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
	            )
	            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
	        else:
	            position_ids = position_ids.view(-1, seq_length).long()

	        if inputs_embeds is None:
	            inputs_embeds = self.embed_tokens(input_ids)
	        # embed positions
	        if attention_mask is None:
	            attention_mask = torch.ones(
	                (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
	            )
	        attention_mask = self._prepare_decoder_attention_mask(
	            attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
	        )

	        hidden_states = inputs_embeds

	        # The checkpointed layer call does not return a cache, so caching is switched off.
	        if self.gradient_checkpointing and self.training:
	            if use_cache:
	                use_cache = False

	        # decoder layers
	        out = self.layers(
	            DecoderInput(
	                hidden_states,
	                position_ids,
	                attention_mask,
	                past_key_values,
	                output_hidden_states,
	                output_attentions,
	                use_cache,
	                self.gradient_checkpointing,
	            )
	        )
	        assert isinstance(out, DecoderOutput)
	        hidden_states = out.hidden_states
	        all_hidden_states = out.all_hidden_states
	        all_self_attns = out.all_self_attns
	        next_decoder_cache = out.next_decoder_cache

	        hidden_states = self.norm(hidden_states)

	        # add hidden states from the last decoder layer
	        if output_hidden_states:
	            assert all_hidden_states is not None
	            all_hidden_states += (hidden_states,)

	        next_cache = next_decoder_cache if use_cache else None
	        if not return_dict:
	            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
	        return BaseModelOutputWithPast(
	            last_hidden_state=hidden_states,
	            past_key_values=next_cache,
	            hidden_states=all_hidden_states,
	            attentions=all_self_attns,
	        )


	class PlamoForCausalLM(PlamoPreTrainedModel):
	    """`PlamoModel` followed by a bias-free linear head that produces vocabulary logits."""

	    def __init__(self, config: PretrainedConfig) -> None:
	        super().__init__(config)
	        self.model = PlamoModel(config)

	        self.lm_head: torch.nn.Module = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

	        # Initialize weights and apply final processing
	        self.post_init()

	    def get_input_embeddings(self) -> torch.nn.Embedding:
	        """Return the token embedding module of the wrapped model."""
	        return self.model.embed_tokens

	    def set_input_embeddings(self, value: torch.nn.Embedding) -> None:
	        """Replace the token embedding module of the wrapped model."""
	        self.model.embed_tokens = value

	    def get_output_embeddings(self) -> torch.nn.Module:
	        """Return the language-model head."""
	        return self.lm_head

	    def set_output_embeddings(self, new_embeddings: torch.nn.Module) -> None:
	        """Replace the language-model head."""
	        self.lm_head = new_embeddings

	    def set_decoder(self, decoder: PlamoModel) -> None:
	        """Replace the wrapped `PlamoModel`."""
	        self.model = decoder

	    def get_decoder(self) -> PlamoModel:
	        """Return the wrapped `PlamoModel`."""
	        return self.model

	    def forward(  # type: ignore
	        self,
	        input_ids: Optional[torch.LongTensor] = None,
	        attention_mask: Optional[torch.Tensor] = None,
	        position_ids: Optional[torch.Tensor] = None,
	        past_key_values: Optional[List[torch.FloatTensor]] = None,
	        inputs_embeds: Optional[torch.FloatTensor] = None,
	        labels: Optional[torch.LongTensor] = None,
	        use_cache: Optional[bool] = None,
	        output_attentions: Optional[bool] = None,
	        output_hidden_states: Optional[bool] = None,
	        return_dict: Optional[bool] = None,
	    ) -> Union[Tuple, CausalLMOutputWithPast]:
	        r"""
	        Compute vocabulary logits and, when `labels` is given, the next-token loss.

	        All arguments except `labels` are forwarded unchanged to `self.model`, which
	        validates them.

	        Args:
	            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, optional):
	                Labels for computing the causal language modeling loss. The logits at
	                position `i` are scored against the label at position `i + 1`. Labels set
	                to `-100` are ignored (the default `ignore_index` of `nn.CrossEntropyLoss`).

	        Returns:
	            A `CausalLMOutputWithPast`, or, when `return_dict` is falsy, the tuple
	            `(loss?, logits, *remaining model outputs)` where `loss` is only present
	            if `labels` was given.

	        Example:

	        ```python
	        >>> from transformers import AutoModelForCausalLM, AutoTokenizer

	        >>> model = AutoModelForCausalLM.from_pretrained(PATH_TO_WEIGHTS, trust_remote_code=True)
	        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_TOKENIZER, trust_remote_code=True)

	        >>> prompt = "Hey, are you conscious? Can you talk to me?"
	        >>> inputs = tokenizer(prompt, return_tensors="pt")

	        >>> # Generate
	        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
	        >>> text = tokenizer.batch_decode(generate_ids, skip_special_tokens=True)[0]
	        ```"""
	        # No `assert input_ids is not None` here: `prepare_inputs_for_generation` may
	        # supply `inputs_embeds` instead, and input validation belongs to `self.model`.
	        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
	        output_hidden_states = (
	            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
	        )
	        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

	        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
	        outputs = self.model(
	            input_ids=input_ids,
	            attention_mask=attention_mask,
	            position_ids=position_ids,
	            past_key_values=past_key_values,
	            inputs_embeds=inputs_embeds,
	            use_cache=use_cache,
	            output_attentions=output_attentions,
	            output_hidden_states=output_hidden_states,
	            return_dict=return_dict,
	        )

	        hidden_states = outputs[0]
	        logits = self.lm_head(hidden_states)

	        loss = None
	        if labels is not None:
	            # Shift so that tokens < n predict n
	            shift_logits = logits[..., :-1, :].contiguous()
	            shift_labels = labels[..., 1:].contiguous()
	            # Flatten the tokens
	            loss_fct = nn.CrossEntropyLoss()
	            shift_logits = shift_logits.view(-1, self.config.vocab_size)
	            shift_labels = shift_labels.view(-1)
	            # Enable model parallelism
	            shift_labels = shift_labels.to(shift_logits.device)
	            loss = loss_fct(shift_logits, shift_labels)

	        if not return_dict:
	            output = (logits,) + outputs[1:]
	            return (loss,) + output if loss is not None else output

	        return CausalLMOutputWithPast(
	            loss=loss,
	            logits=logits,
	            past_key_values=outputs.past_key_values,
	            hidden_states=outputs.hidden_states,
	            attentions=outputs.attentions,
	        )

	    def prepare_inputs_for_generation(
	        self,
	        input_ids: torch.Tensor,
	        past_key_values: Optional[List[torch.FloatTensor]] = None,
	        attention_mask: Optional[torch.Tensor] = None,
	        inputs_embeds: Optional[torch.Tensor] = None,
	        **kwargs: Any,
	    ) -> Dict[str, Any]:
	        """Assemble the keyword arguments for one generation step of `forward`.

	        With a non-empty cache only the last token (and its position) is passed on.
	        Missing `position_ids` are derived from `attention_mask`.
	        """
	        if past_key_values:
	            input_ids = input_ids[:, -1:]

	        position_ids = kwargs.get("position_ids", None)
	        if attention_mask is not None and position_ids is None:
	            # create position_ids on the fly for batch generation
	            position_ids = attention_mask.long().cumsum(-1) - 1
	            # Masked-out slots get the placeholder position 1.
	            position_ids.masked_fill_(attention_mask == 0, 1)
	            if past_key_values:
	                position_ids = position_ids[:, -1].unsqueeze(-1)

	        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
	        if inputs_embeds is not None and past_key_values is None:
	            model_inputs: Dict[str, Any] = {"inputs_embeds": inputs_embeds}
	        else:
	            model_inputs = {"input_ids": input_ids}

	        model_inputs.update(
	            {
	                "position_ids": position_ids,
	                "past_key_values": past_key_values,
	                "use_cache": kwargs.get("use_cache"),
	                "attention_mask": attention_mask,
	            }
	        )
	        return model_inputs

	    @staticmethod
	    def _reorder_cache(past_key_values: List[torch.FloatTensor], beam_idx: torch.Tensor) -> Tuple[Any, ...]:
	        """Reorder the batch dimension of every cached tensor according to `beam_idx`.

	        `beam_idx` is an index tensor (as `index_select` requires); it is moved to
	        the device of each cached tensor so caches spread over devices still work.
	        """
	        return tuple(
	            tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
	            for layer_past in past_key_values
	        )