[Unrecoverable binary artifact] This section is the compiled CPython bytecode (a `.pyc` cache file) of `transformers/modeling_utils.py`, dumped as text; the embedded path points at `...\site-packages\transformers\modeling_utils.py` inside a local Windows install. The bytecode cannot be rendered back into source here, so the notes below keep only what survives in the embedded string constants and docstrings.

Module header recoverable from this part of the dump: imports from the standard library, `torch`, `safetensors`, and the package's own utilities (activations, configuration, generation, quantizers, PEFT/DeepSpeed integration, accelerate hooks, hub and file utilities), followed by:

- `is_fsdp_enabled()` and `is_local_dist_rank_0()`: helpers that read `torch.distributed` state and the `ACCELERATE_USE_FSDP`, `FSDP_CPU_RAM_EFFICIENT_LOADING` and `LOCAL_RANK` environment variables.
- `TORCH_INIT_FUNCTIONS`: a mapping of the `torch.nn.init` initializers (`uniform_`, `normal_`, `trunc_normal_`, `constant_`, the `xavier_*` and `kaiming_*` variants).
- `no_init_weights()`: a context manager that globally disables weight initialization to speed up loading large models, by temporarily replacing the `torch.nn.init` functions above with no-ops.
- `set_quantized_state()` and `set_zero3_state()`: context managers toggling the module-level `_is_quantized` / `_is_ds_init_called` flags used while loading quantized or DeepSpeed ZeRO-3 models.
- `get_parameter_device`, `get_first_parameter_dtype`, `get_parameter_dtype`, `get_state_dict_float_dtype`, `get_state_dict_dtype`: helpers that report the device or (floating) dtype of a module's parameters, buffers, or of a state dict, with special handling of the `XLA_USE_BF16` / `XLA_DOWNCAST_BF16` environment variables.
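A minimal sketch of using the `no_init_weights` context manager recovered above; this helper is internal (in normal use `from_pretrained` invokes it for you), and the config and model classes below are just placeholders for the example.

```python
# Hedged sketch: build a model skeleton without running the random initializers.
# no_init_weights() is internal to transformers; from_pretrained() normally
# applies it automatically. BertConfig/BertModel are placeholder classes.
from transformers import BertConfig, BertModel
from transformers.modeling_utils import no_init_weights

config = BertConfig()

with no_init_weights():
    # Inside this block the torch.nn.init functions are no-ops, so the
    # parameters are left uninitialized and must later be filled from a
    # checkpoint (e.g. with load_state_dict / load_sharded_checkpoint).
    model = BertModel(config)

print(sum(p.numel() for p in model.parameters()))
```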
Checkpoint-loading helpers recoverable from this part of the dump:

- `dtype_byte_size(dtype)`: returns the size in bytes of one parameter of the given dtype (its docstring example shows `dtype_byte_size(torch.float32)` returning 4).
- `check_support_param_buffer_assignment(model_to_load, state_dict, start_prefix)`: checks whether the model supports assigning parameters and buffers directly when loading into empty weights; always disabled under DeepSpeed.
- `load_sharded_checkpoint(model, folder, strict=True, prefer_safe=False)`: the sharded counterpart of `torch.nn.Module.load_state_dict`; each shard is loaded into RAM one by one and deleted after being copied into the model, and the function returns a named tuple with `missing_keys` and `unexpected_keys` fields.
- `load_state_dict(checkpoint_file, ...)`: reads a `.safetensors` or PyTorch checkpoint file and raises properly formatted errors for the usual failure modes (invalid safetensors metadata, git-lfs pointer files that were never pulled, TF 2.0 checkpoints passed where `from_tf=True` was needed).
- `set_initialized_submodules`, `_end_ptr`, `_get_tied_weight_keys`, `_find_disjoint`, `_find_identical`: bookkeeping helpers for already-initialized submodules and for tied, shared or overlapping tensors in a state dict.
- `_load_state_dict_into_model`, `find_submodule_and_param_name`, `_move_model_to_meta`, `_load_state_dict_into_meta_model`: the internal routines that copy a state dict into the model (handling the legacy `gamma`/`beta` and weight-norm parameter renames), optionally via the `meta` device, while honoring quantizers, `keep_in_fp32_modules` and CPU/disk offload.
- `_add_variant(weights_name, variant)`: inserts a variant suffix into a weights file name.
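A minimal usage sketch for `load_sharded_checkpoint`, based on the docstring recovered above. The folder path and model class are placeholders; the folder is assumed to contain an index file plus the shard files produced by `save_pretrained`.

```python
# Hedged sketch: load a sharded checkpoint folder shard-by-shard into an
# already-constructed model. The path and model class are placeholders.
from transformers import AutoConfig, AutoModel
from transformers.modeling_utils import load_sharded_checkpoint

folder = "./my-sharded-checkpoint"  # holds *.index.json plus the shard files
config = AutoConfig.from_pretrained(folder)
model = AutoModel.from_config(config)

# Each shard is read into RAM, copied into the model, then freed, so peak
# memory stays close to the size of the largest single shard.
result = load_sharded_checkpoint(model, folder, strict=False)
print("missing keys:", result.missing_keys)
print("unexpected keys:", result.unexpected_keys)
```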
`ModuleUtilsMixin`, the first class in the file, contributes (per the recovered docstrings):

- `add_memory_hooks()` / `reset_memory_hooks_state()`: psutil-based forward hooks that record each sub-module's RSS memory increase in a `mem_rss_diff` attribute.
- `device` and `dtype` properties for the module's parameters.
- `invert_attention_mask(encoder_attention_mask)` and `get_extended_attention_mask(attention_mask, input_shape, device=None, dtype=None)`: turn a `[batch, seq_len]` 0/1 mask into a broadcastable additive mask (0 to attend, the dtype's minimum value to ignore), building a causal mask when the config is a decoder.
- `get_head_mask(head_mask, num_hidden_layers, is_attention_chunked=False)` and `_convert_head_mask_to_5d`: expand a per-head mask to `[num_hidden_layers x batch x num_heads x seq_length x seq_length]`.
- `num_parameters(only_trainable=False, exclude_embeddings=False)`: parameter count with special handling of bitsandbytes 4-bit parameters.
- `estimate_tokens(input_dict)` and `floating_point_ops(input_dict, exclude_embeddings=True)`: a `6 * tokens * parameters`-style FLOPs approximation for forward plus backward, per section 2.1 of https://arxiv.org/pdf/2001.08361.pdf.

Then the `PreTrainedModel` class begins. Its class docstring describes it as the base class for all models, handling loading, downloading and saving, input-embedding resizing and head pruning, with class attributes such as `config_class`, `load_tf_weights`, `base_model_prefix`, `is_parallelizable` and `main_input_name` (`input_ids` for NLP, `pixel_values` for vision, `input_values` for speech). Recoverable early members: `dummy_inputs`, `framework` (always `"pt"`), `__init__` (validates that `config` is a `PretrainedConfig`, resolves the attention implementation, records the default dtype and builds a `GenerationConfig` when `can_generate()` is true), `post_init` (runs `init_weights()`, the gradient-checkpointing backward-compatibility shim and the tensor-parallel plan wiring), `dequantize`, `add_model_tags(tags)`, and the `_from_config` classmethod that honors `torch_dtype`, `attn_implementation` and DeepSpeed ZeRO-3 `zero.Init`.
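A small sketch exercising some of the `ModuleUtilsMixin` helpers summarized above on a loaded model; the checkpoint name is a placeholder and the printed numbers depend on the model.

```python
# Hedged sketch: ModuleUtilsMixin helpers available on every PreTrainedModel.
# "bert-base-uncased" is only a placeholder checkpoint.
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
inputs = tokenizer("hello world", return_tensors="pt")

# [batch, seq_len] 0/1 mask -> broadcastable additive mask whose masked
# positions hold the dtype's minimum value.
ext_mask = model.get_extended_attention_mask(
    inputs["attention_mask"], inputs["input_ids"].shape
)
print(ext_mask.shape)  # (batch, 1, 1, seq_len) for an encoder-only model

print(model.num_parameters(only_trainable=True, exclude_embeddings=True))
print(model.floating_point_ops({"input_ids": inputs["input_ids"]}))
```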
`_autoset_attn_implementation` resolves which attention backend the model will use, in priority order: an explicit `config._attn_implementation` / `attn_implementation` argument, the deprecated `use_flash_attention_2=True` flag, SDPA when available and supported, and finally the model's default (`"eager"`). The accepted values are `"eager"`, `"sdpa"`, `"flash_attention_2"` and `"flex_attention"`. Supporting methods recoverable in this part of the dump: `_set_default_torch_dtype(dtype)` (instantiate the model under a specific floating dtype), the `base_model` property, `can_generate()` (with the v4.50 warning for models that override `prepare_inputs_for_generation` without inheriting from `GenerationMixin`), and `_check_and_enable_flash_attn_2`, which verifies that the `flash_attn` package is installed and at least version 2.1.0, that a CUDA (or supported ROCm) device is available, that the model is not dispatched to CPU or disk and not using BetterTransformer, and that the dtype is `torch.float16` or `torch.bfloat16`.
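A sketch of requesting a specific attention backend and dtype through the public `from_pretrained` arguments that feed `_autoset_attn_implementation`; the checkpoint name is a placeholder.

```python
# Hedged sketch: the backend picked by _autoset_attn_implementation is usually
# requested via from_pretrained(). The checkpoint name is a placeholder.
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",      # placeholder checkpoint
    torch_dtype=torch.bfloat16,      # see _set_default_torch_dtype
    attn_implementation="sdpa",      # or "eager", "flash_attention_2", "flex_attention"
)
print(model.config._attn_implementation)  # "sdpa"
```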
Continuing through `PreTrainedModel`, this part of the dump covers:

- `_check_and_enable_sdpa` and `_check_and_enable_flex_attn`: the corresponding availability checks for `torch.nn.functional.scaled_dot_product_attention` (torch >= 2.1.1) and torch's `flex_attention` (torch >= 2.5.0).
- `enable_input_require_grads()` / `disable_input_require_grads()`: register or remove a hook on the input embeddings so adapter weights can be fine-tuned while the base model weights stay frozen.
- `get_input_embeddings` / `set_input_embeddings` / `get_output_embeddings`, `_init_weights`, `_initialize_weights`: embedding accessors and the per-module weight-initialization entry points.
- `tie_weights()`, `_tie_encoder_decoder_weights`, `_tie_or_clone_weights`: tie input and output embeddings (cloning instead of sharing when `config.torchscript` is set) and recursively tie encoder and decoder modules for `tie_encoder_decoder` models.
- `_get_no_split_modules(device_map)`: collect the `_no_split_modules` consulted by `device_map="auto"` placement.
- `resize_token_embeddings(new_num_tokens, pad_to_multiple_of=None, mean_resizing=True)` together with `_resize_token_embeddings`, `_get_resized_embeddings`, `_get_resized_lm_head` and the `_init_added_*_with_mean` / `_copy_lm_head_original_to_resized` helpers: grow or shrink the input embeddings and LM head, optionally padding the matrix to a multiple of a given value (useful for Tensor Cores) and initializing the new rows from the mean and covariance of the old embeddings (https://nlp.stanford.edu/~johnhew/vocab-expansion.html).
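A short sketch of the vocabulary-resizing path summarized above, after adding tokens to a tokenizer; the checkpoint and the added tokens are placeholders.

```python
# Hedged sketch: grow the vocabulary after adding new tokens. Names are
# placeholders; "gpt2" stands in for any causal LM checkpoint.
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

tokenizer.add_tokens(["<extra_token_0>", "<extra_token_1>"])

# New embedding rows are drawn from the old embeddings' mean/covariance
# (mean_resizing=True); pad_to_multiple_of keeps the matrix Tensor-Core friendly.
model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=64)
print(model.get_input_embeddings().weight.shape)
```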
The remaining recoverable methods before `save_pretrained`:

- `resize_position_embeddings` and `get_position_embeddings`: stubs that raise `NotImplementedError` and must be overridden per model class.
- `init_weights()` and `prune_heads(heads_to_prune)`: apply `_init_weights` across the model, and prune attention heads given a `{layer_index: [head_indices]}` mapping (e.g. `{1: [0, 2], 2: [2, 3]}` in the recovered docstring).
- `gradient_checkpointing_enable(gradient_checkpointing_kwargs=None)`, `_set_gradient_checkpointing`, `gradient_checkpointing_disable` and the `is_gradient_checkpointing` property: activation-checkpointing toggles (the kwargs default to `{"use_reentrant": True}`), with a compatibility path for models that still define the legacy `_set_gradient_checkpointing` hook.
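A sketch of toggling activation checkpointing with the methods recovered above; non-reentrant checkpointing is passed explicitly because the recovered code defaults `use_reentrant` to `True`. The checkpoint name is a placeholder.

```python
# Hedged sketch: enable activation (gradient) checkpointing for training.
# The checkpoint name is a placeholder.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("gpt2")

model.gradient_checkpointing_enable(
    gradient_checkpointing_kwargs={"use_reentrant": False}
)
print(model.is_gradient_checkpointing)  # True

model.gradient_checkpointing_disable()
print(model.is_gradient_checkpointing)  # False
```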
`save_pretrained`: Save a model and its configuration file to a directory, so that it can be re-loaded using the
[`~PreTrainedModel.from_pretrained`] class method.

    Arguments:
        save_directory (`str` or `os.PathLike`):
            Directory to which to save. Will be created if it doesn't exist.
        is_main_process (`bool`, *optional*, defaults to `True`):
            Whether the process calling this is the main process or not. Useful when in distributed training like
            TPUs, where this function needs to be called on all processes. In this case, set `is_main_process=True`
            only on the main process to avoid race conditions.
        state_dict (nested dictionary of `torch.Tensor`):
            The state dictionary of the model to save. Will default to `self.state_dict()`, but can be used to only
            save parts of the model or if special precautions need to be taken when recovering the state dictionary
            of a model (like when using model parallelism).
        save_function (`Callable`):
            The function to use to save the state dictionary. Useful on distributed training like TPUs when one
            needs to replace `torch.save` by another method.
        push_to_hub (`bool`, *optional*, defaults to `False`):
            Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
            repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
            namespace).
        max_shard_size (`int` or `str`, *optional*, defaults to `"5GB"`):
            The maximum size for a checkpoint before being sharded. Checkpoint shards will then each be of a size
            lower than this size. If expressed as a string, it needs to be digits followed by a unit (like `"5MB"`).
            We default it to 5GB in order for models to be able to run easily on free-tier Google Colab instances
            without CPU OOM issues.

            <Tip warning={true}>

            If a single weight of the model is bigger than `max_shard_size`, it will be in its own checkpoint shard
            which will be bigger than `max_shard_size`.

            </Tip>

        safe_serialization (`bool`, *optional*, defaults to `True`):
            Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
        variant (`str`, *optional*):
            If specified, weights are saved in the format `pytorch_model.<variant>.bin`.
        token (`str` or `bool`, *optional*):
            The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use
            the token generated when running `huggingface-cli login` (stored in `~/.huggingface`).
        save_peft_format (`bool`, *optional*, defaults to `True`):
            For backward compatibility with the PEFT library, in case adapter weights are attached to the model, all
            keys of the state dict of adapters need to be pre-pended with `base_model.model`. Advanced users can
            disable this behaviour by setting `save_peft_format` to `False`.
        kwargs (`Dict[str, Any]`, *optional*):
            Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.

The implementation rejects the deprecated `use_auth_token` argument, models whose quantization is not serializable,
and a `save_directory` that points to a file; it moves misplaced generation parameters from the model config into
the generation config, re-keys adapter weights for the PEFT format when needed, deduplicates shared tensors (raising
"The weights trying to be saved contained shared tensors ... that are mismatching the transformers base
configuration. Try saving using `safe_serialization=False` or remove this tensor sharing." when the sharing cannot
be resolved), writes the shards with `safetensors` or the provided `save_function`, removes stale shard files, and
writes an index file when the checkpoint is split.
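A compact usage sketch of the saving options documented above; the directory name and shard size are placeholders:

```python
# Hypothetical usage sketch (paths and sizes are placeholders).
from transformers import AutoModel

model = AutoModel.from_pretrained("bert-base-uncased")
model.save_pretrained(
    "./my_model",
    max_shard_size="2GB",      # split checkpoints larger than 2GB into shards plus an index file
    safe_serialization=True,   # write .safetensors files instead of pickle-based .bin
)
# The directory can be reloaded later with AutoModel.from_pretrained("./my_model").
```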
Saving a model with offloaded parameters additionally requires `accelerate >= 0.31`; otherwise the method raises
"You need accelerate version to be greater or equal than 0.31 to save models with offloaded parameters. Detected
version <x>. Please upgrade accelerate with `pip install -U accelerate`". After writing, it logs either "Model
weights saved in <path>" or "The model is bigger than the maximum size per checkpoint (<max_shard_size>) and is
going to be split in <n> checkpoint shards. You can find where each parameters has been saved in the index located
at <path>", then optionally creates a model card (README.md) and uploads the modified files to the Hub.

`push_to_hub(*args, **kwargs)`: merges the model's tags into the `tags` keyword argument before delegating to
[`~utils.PushToHubMixin.push_to_hub`].

`get_memory_footprint(return_buffers=True)`: Get the memory footprint of a model. This will return the memory
footprint of the current model in bytes. Useful to benchmark the memory footprint of the current model and design
some tests. Solution inspired from the PyTorch discussions:
https://discuss.pytorch.org/t/gpu-memory-that-model-uses/56822/2

    Arguments:
        return_buffers (`bool`, *optional*, defaults to `True`):
            Whether to return the size of the buffer tensors in the computation of the memory footprint. Buffers are
            tensors that do not require gradients and are not registered as parameters, e.g. mean and std in batch
            norm layers. Please see: https://discuss.pytorch.org/t/what-pytorch-means-by-buffers/120266/2

`cuda(*args, **kwargs)` is overridden for quantized models: it is not supported for HQQ-quantized models, for
`8-bit` bitsandbytes models ("Please use the model as it is, since the model has already been set to the correct
devices."), and for `4-bit` models when the installed `bitsandbytes` is older than 0.43.2.
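A one-liner sketch of the memory-footprint helper; the checkpoint name is a placeholder:

```python
# Hypothetical usage sketch (model id is a placeholder).
from transformers import AutoModel

model = AutoModel.from_pretrained("bert-base-uncased")
print(model.get_memory_footprint())                      # parameters + buffers, in bytes
print(model.get_memory_footprint(return_buffers=False))  # parameters only
```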
`to(*args, **kwargs)` is restricted in the same way: it is not supported for HQQ-quantized models; bitsandbytes
models cannot be cast to a new `dtype` ("Make sure to load the model using `from_pretrained` using the desired
`dtype` by passing the correct `torch_dtype` argument."); `8-bit` models cannot be moved at all; moving `4-bit`
models requires `bitsandbytes >= 0.43.2`; and GPTQ models cannot be cast to a new `dtype` either. `half()` and
`float()` likewise raise for quantized models, since the model has already been cast to the correct `dtype`.

`from_pretrained(pretrained_model_name_or_path, *model_args, config=None, cache_dir=None,
ignore_mismatched_sizes=False, force_download=False, local_files_only=False, token=None, revision="main",
use_safetensors=None, weights_only=True, **kwargs)` is the main entry point for loading checkpoints. Part of its
body deals with locating the weights: it tries, in order, sharded and non-sharded `safetensors` files, `.bin` files
(with an optional `variant` suffix), TensorFlow and Flax checkpoints, and GGUF files, optionally triggering an
automatic safetensors conversion on the Hub, and raises `EnvironmentError` messages such as "<repo> does not appear
to have a file named <weights file> ..." or "Can't load the model for '<repo>'. If you were trying to load it from
'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure
'<repo>' is the correct path to a directory containing a file named <weights file>." when nothing usable is found,
before logging "loading weights file <file>".
`from_pretrained` parameters (from the method's documentation):

    pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
        Can be a model id on huggingface.co, a path to a directory saved with [`~PreTrainedModel.save_pretrained`],
        or a path/url to a *TensorFlow index checkpoint file* (with `from_tf=True` and a `config` provided). This
        loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided
        conversion scripts and loading the PyTorch model afterwards. It can also be a path/url to a model folder
        containing a *Flax checkpoint file* in *.msgpack* format (with `from_flax=True`), or `None` if you are both
        providing the configuration and state dictionary (resp. with keyword arguments `config` and `state_dict`).
    model_args (sequence of positional arguments, *optional*):
        All remaining positional arguments will be passed to the underlying model's `__init__` method.
    config (`Union[PretrainedConfig, str, os.PathLike]`, *optional*):
        Either an instance of a class derived from [`PretrainedConfig`], or a string or path valid as input to
        [`~PretrainedConfig.from_pretrained`]. Configuration for the model to use instead of an automatically loaded
        configuration.
    state_dict (`Dict[str, torch.Tensor]`, *optional*):
        A state dictionary to use instead of a state dictionary loaded from saved weights file. This option can be
        used if you want to create a model from a pretrained configuration but load your own weights.
    cache_dir (`Union[str, os.PathLike]`, *optional*):
        Path to a directory in which a downloaded pretrained model configuration should be cached if the standard
        cache should not be used.
    from_tf / from_flax (`bool`, *optional*, defaults to `False`):
        Load the model weights from a TensorFlow or Flax checkpoint save file (see the docstring of
        `pretrained_model_name_or_path`).
    ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`):
        Whether or not to raise an error if some of the weights from the checkpoint do not have the same size as the
        weights of the model (for instance, when instantiating a model with 10 labels from a checkpoint with 3).
    force_download (`bool`, *optional*, defaults to `False`):
        Whether or not to force the (re-)download of the model weights and configuration files, overriding the
        cached versions if they exist.
    resume_download:
        Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v5 of
        Transformers.
    proxies (`Dict[str, str]`, *optional*):
        A dictionary of proxy servers to use by protocol or endpoint, e.g. `{'http': 'foo.bar:3128',
        'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
    output_loading_info (`bool`, *optional*, defaults to `False`):
        Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
    local_files_only (`bool`, *optional*, defaults to `False`):
        Whether or not to only look at local files (i.e., do not try to download the model).
    token (`str` or `bool`, *optional*):
        The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use the
        token generated when running `huggingface-cli login` (stored in `~/.huggingface`).
    revision (`str`, *optional*, defaults to `"main"`):
        The specific model version to use. It can be a branch name, a tag name, or a commit id. To test a pull
        request you made on the Hub, you can pass `revision="refs/pr/<pr_number>"`.
    mirror (`str`, *optional*):
        Mirror source to accelerate downloads in China. Note that we do not guarantee the timeliness or safety.
    _fast_init (`bool`, *optional*, defaults to `True`):
        Whether or not to disable fast initialization. One should only disable `_fast_init` to ensure backwards
        compatibility with `transformers.__version__ < 4.6.0` for seeded model initialization. This argument will be
        removed at the next major version (see pull request 11471).
    attn_implementation (`str`, *optional*):
        The attention implementation to use in the model (if relevant): `"eager"` (manual implementation), `"sdpa"`
        (`F.scaled_dot_product_attention`), or `"flash_attention_2"` (Dao-AILab/flash-attention). By default, SDPA is
        used for torch>=2.1.1 when available, otherwise the manual `"eager"` implementation.
    low_cpu_mem_usage (`bool`, *optional*):
        Tries not to use more than 1x model size in CPU memory (including peak memory) while loading the model.
        Generally should be combined with a `device_map` (such as `"auto"`) for best results. Experimental and
        subject to change. If the model weights are in the same precision as the model loaded in,
        `low_cpu_mem_usage` (without `device_map`) is redundant, but it should still be enabled if you are passing
        in a `device_map`.
    torch_dtype (`str` or `torch.dtype`, *optional*):
        Override the default `torch.dtype` and load the model under a specific `dtype`. Either an explicit dtype
        such as `torch.float16`, the string name of a valid dtype (e.g. `"float32"`), or `"auto"`, in which case the
        `torch_dtype` entry in the model's `config.json` is used if present, otherwise the dtype of the first
        floating-point weight in the checkpoint. For some models the `dtype` they were trained in is unknown - you
        may try to check the model's paper, or reach out to the authors and ask them to add this information to the
        model's card and to insert the `torch_dtype` entry in `config.json` on the hub.
    device_map (`str` or `Dict[str, Union[int, str, torch.device]]` or `int` or `torch.device`, *optional*):
        A map that specifies where each submodule should go. It doesn't need to be refined to each parameter/buffer
        name; once a given module name is inside, every submodule of it will be sent to the same device. Passing
        only a device (e.g. `"cpu"`, `"cuda:1"`, `"mps"`, or a GPU ordinal rank like `1`) maps the entire model to
        that device; `device_map=0` means put the whole model on GPU 0. Set `device_map="auto"` to have Accelerate
        compute the most optimized device map automatically (see
        https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
    max_memory (`Dict`, *optional*):
        A dictionary mapping device identifiers to maximum memory. Will default to the maximum memory available for
        each GPU and the available CPU RAM if unset.
    offload_folder (`str` or `os.PathLike`, *optional*):
        If the `device_map` contains any value `"disk"`, the folder where we will offload weights.
    offload_state_dict (`bool`, *optional*):
        If `True`, temporarily offloads the CPU state dict to the hard drive to avoid running out of CPU RAM.
        Defaults to `True` when there is some disk offload.
    offload_buffers (`bool`, *optional*):
        Whether or not to offload the buffers with the model parameters.
    quantization_config (`Union[QuantizationConfigMixin, Dict]`, *optional*):
        A dictionary of configuration parameters or a `QuantizationConfigMixin` object for quantization (e.g.
        bitsandbytes, gptq). Prefer inserting all quantization-related arguments here rather than passing
        `load_in_4bit` / `load_in_8bit` as separate kwargs.
    subfolder (`str`, *optional*, defaults to `""`):
        In case the relevant files are located inside a subfolder of the model repo, you can specify it here.
    variant (`str`, *optional*):
        If specified, load weights from the `variant` filename, e.g. `pytorch_model.<variant>.bin`. `variant` is
        ignored when using `from_tf` or `from_flax`.
    use_safetensors (`bool`, *optional*, defaults to `None`):
        Whether or not to use `safetensors` checkpoints. If not specified and `safetensors` is not installed, it
        will be set to `False`.
    weights_only (`bool`, *optional*, defaults to `True`):
        Indicates whether the unpickler should be restricted to loading only tensors, primitive types, dictionaries
        and types added via `torch.serialization.add_safe_globals()`. When set to `False`, wrapper tensor subclass
        weights can be loaded.
    kwargs (remaining dictionary of keyword arguments, *optional*):
        Can be used to update the configuration object (after it is loaded) and initiate the model (e.g.
        `output_attentions=True`).
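A hedged sketch tying the big-model-inference parameters above together; the checkpoint name and paths are
placeholders and the options shown assume `accelerate` is installed:

```python
# Hypothetical usage sketch (model id and paths are placeholders; requires `accelerate`).
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "gpt2",
    torch_dtype=torch.float16,   # or "auto" to follow config.torch_dtype / the checkpoint dtype
    device_map="auto",           # let Accelerate place submodules across the available devices
    offload_folder="./offload",  # only used if some weights have to be placed on disk
    low_cpu_mem_usage=True,
)
```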
Behaves differently depending on whether a `config` is provided or automatically loaded:

    - If a configuration is provided with `config`, `**kwargs` will be directly passed to the underlying model's
      `__init__` method (we assume all relevant updates to the configuration have already been done).
    - If a configuration is not provided, `kwargs` will be first passed to the configuration class initialization
      function ([`~PretrainedConfig.from_pretrained`]). Each key of `kwargs` that corresponds to a configuration
      attribute will be used to override said attribute with the supplied `kwargs` value. Remaining keys that do not
      correspond to any configuration attribute will be passed to the underlying model's `__init__` function.

<Tip>

Activate the special ["offline-mode"](https://huggingface.co/transformers/installation.html#offline-mode) to use
this method in a firewalled environment.

</Tip>

Examples:

```python
>>> from transformers import BertConfig, BertModel

>>> # Download model and configuration from huggingface.co and cache.
>>> model = BertModel.from_pretrained("google-bert/bert-base-uncased")
>>> # Model was saved using *save_pretrained('./test/saved_model/')* (for example purposes, not runnable).
>>> model = BertModel.from_pretrained("./test/saved_model/")
>>> # Update configuration during loading.
>>> model = BertModel.from_pretrained("google-bert/bert-base-uncased", output_attentions=True)
>>> assert model.config.output_attentions == True
>>> # Loading from a TF checkpoint file instead of a PyTorch model (slower, for example purposes, not runnable).
>>> config = BertConfig.from_json_file("./tf_model/my_tf_model_config.json")
>>> model = BertModel.from_pretrained("./tf_model/my_tf_checkpoint.ckpt.index", from_tf=True, config=config)
>>> # Loading from a Flax checkpoint file instead of a PyTorch model (slower)
>>> model = BertModel.from_pretrained("google-bert/bert-base-uncased", from_flax=True)
```

The `low_cpu_mem_usage` algorithm is an experimental function that loads the model using ~1x model size of CPU
memory. It works as follows:

    1. save which state_dict keys we have
    2. drop the state_dict before the model is created, since the latter takes 1x model size CPU memory
    3. after the model has been instantiated, switch to the meta device all params/buffers that are going to be
       replaced from the loaded state_dict
    4. load the state_dict a second time
    5. replace the params/buffers from the state_dict

Currently it can't handle deepspeed ZeRO stage 3 and ignores loading errors.
The remainder of `from_pretrained` validates the big-model-inference arguments and reports problems through explicit
error and warning messages, among them: "You can't pass device_map as a negative int. If you want to put the model
on the cpu, pass device_map = 'cpu'"; "Passing along a `device_map` requires `low_cpu_mem_usage=True`"; "DeepSpeed
Zero-3 is not compatible with `low_cpu_mem_usage=True` or with passing a `device_map`"; "Using
`low_cpu_mem_usage=True` or a `device_map` requires Accelerate: `pip install 'accelerate>=<min version>'`"; "You
can't pass `load_in_4bit` or `load_in_8bit` as a kwarg when passing `quantization_config` argument at the same time"
(the two flags themselves are deprecated in favour of a `BitsAndBytesConfig`); and "Error no file named
<weights file> found in directory <path> but there is a file for TensorFlow weights. Use `from_tf=True` to load this
model from those weights" (with an equivalent message for Flax weights). When a safetensors checkpoint is opened,
its metadata must declare a format of `pt`, `tf`, `flax` or `mlx`, otherwise loading fails with "Incompatible
safetensors file. File metadata is not ['pt', 'tf', 'flax', 'mlx'] but <format>"; `tf` and `flax` formats only log
that a TensorFlow/Flax safetensors file is being loaded in a PyTorch model. The `torch_dtype` is then resolved
("Will use torch_dtype=<dtype> as defined in model's config object", or derived from the checkpoint weights when the
config carries no dtype), the quantizer may force `low_cpu_mem_usage=True`, tensor-parallel loading requires
`torch.distributed` to be initialized, and TensorFlow/Flax checkpoints are converted through
`load_tf2_checkpoint_in_pytorch_model` / `load_flax_checkpoint_in_pytorch_model` (raising informative errors when
TensorFlow or Flax is not installed). Finally the weights are loaded via `_load_pretrained_model`, the model is put
in evaluation mode with `model.eval()`, the generation config is loaded when the model can generate, and the model
is dispatched onto the requested devices (with optional offload) before adapters and quantizer post-processing are
applied.
`_load_pretrained_model` performs the actual state-dict loading. If the `device_map` sends some weights to disk and
safetensors serialization is not available, it raises: "The current `device_map` had weights offloaded to the disk.
Please provide an `offload_folder` for them. Alternatively, make sure you have `safetensors` installed if the model
you are using offers the weights in this format." It then normalizes legacy parameter names, computes the missing
and unexpected keys (taking the base-model prefix, tied weights, `_keys_to_ignore_on_load_missing` and
`_keys_to_ignore_on_load_unexpected` into account), drops mismatched-shape weights from the state dict when
`ignore_mismatched_sizes=True` (collecting them as `mismatched_keys`), loads each shard (optionally offloading to
disk or CPU), and aggregates errors. Size mismatches produce "Error(s) in loading state_dict for <class>: <errors>
... You may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method." After loading it
reports, depending on the outcome: "Some weights of the model checkpoint at <path> were not used when initializing
<class>: <keys> - This IS expected if you are initializing <class> from the checkpoint of a model trained on another
task or with another architecture ... - This IS NOT expected if you are initializing <class> from the checkpoint of
a model that you expect to be exactly identical ..."; or "All model checkpoint weights were used when initializing
<class>."; and "Some weights of <class> were not initialized from the model checkpoint at <path> and are newly
initialized: <keys> You should probably TRAIN this model on a down-stream task to be able to use it for predictions
and inference."; or "All the weights of <class> were initialized from the model checkpoint at <path>."
The success message ends with: "If your task is similar to the task the model of the checkpoint was trained on, you
can already use <class> for predictions without further training." A final warning lists the keys that were newly
initialized because their shapes did not match the checkpoint.

`retrieve_modules_from_names`: maps a list of parameter names back to the sub-modules that own them, honouring the
base-model prefix.

`_load_pretrained_model_low_mem`: This is an experimental function that loads the model using ~1x model size CPU
memory. Before you call it: 1. save which state_dict keys are available; 2. drop the state_dict before the model is
created, since the latter takes 1x model size memory. Here then we continue: 3. switch to the meta device all
params/buffers that are going to be replaced from the loaded state_dict; 4. load the state_dict a second time;
5. replace the params/buffers from the state_dict. Currently it doesn't handle missing_keys, unexpected_keys,
mismatched_keys. It can't handle deepspeed. To handle bitsandbytes, it needs a non-empty `hf_quantizer` argument.

`register_for_auto_class(auto_class="AutoModel")`: Register this class with a given auto class. This should only be
used for custom models, as the ones in the library are already mapped with an auto class.

    <Tip warning={true}>

    This API is experimental and may have some slight breaking changes in the next releases.

    </Tip>

    Args:
        auto_class (`str` or `type`, *optional*, defaults to `"AutoModel"`):
            The auto class to register this new model with.

`to_bettertransformer`: Converts the model to use [PyTorch's native attention
implementation](https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html), integrated into
Transformers through the [Optimum library](https://huggingface.co/docs/optimum/bettertransformer/overview). Only a
subset of all Transformers models are supported.
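A minimal sketch of registering a custom model with an Auto class; the config and model class names below are
hypothetical placeholders:

```python
# Hypothetical sketch: the config/model classes are placeholders.
from transformers import PretrainedConfig, PreTrainedModel


class MyConfig(PretrainedConfig):
    model_type = "my-model"


class MyModel(PreTrainedModel):
    config_class = MyConfig


# After this call, saving MyModel with save_pretrained() records the auto class,
# so AutoModel.from_pretrained(..., trust_remote_code=True) can resolve it.
MyModel.register_for_auto_class("AutoModel")
```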
PyTorch's attention fastpath allows to speed up inference through kernel fusions and the use of [nested
tensors](https://pytorch.org/docs/stable/nested.html). Detailed benchmarks can be found in [this blog
post](https://medium.com/pytorch/bettertransformer-out-of-the-box-performance-for-huggingface-transformers-3fbe27d50ab2).
Returns: [`PreTrainedModel`]: The model converted to BetterTransformer. Both `to_bettertransformer` and its inverse
require `optimum >= 1.7.0` ("The package `optimum` is required to use Better Transformer.").

`reverse_bettertransformer`: Reverts the transformation from [`~PreTrainedModel.to_bettertransformer`] so that the
original modeling is used, for example in order to save the model. Returns the model converted back to the original
modeling.

`warn_if_padding_and_no_attention_mask(input_ids, attention_mask)`: Shows a one-time warning if the `input_ids`
appear to contain padding and no attention mask was given: "We strongly recommend passing in an `attention_mask`
since your input_ids may be padded. See
https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked." (with a
note that the warning can be ignored if the `pad_token_id` is identical to the `bos_token_id`, `eos_token_id`, or
`sep_token_id` and the input is not padded).

`_is_quantized_training_enabled`: deprecated in favour of `model.hf_quantizer.is_trainable`.

`supports_tp_plan`: Returns whether the model has a tensor parallelism plan.

`tensor_parallel(device_mesh)`: Tensor parallelize the model across the given device mesh; requires `torch >= 2.5`.

    Args:
        device_mesh (`torch.distributed.DeviceMesh`):
            The device mesh to use for tensor parallelism.

`loss_function`: resolves the loss from `config.loss_type` (falling back to `ForCausalLM` and warning
"`loss_type=<type>` was set in the config but it is unrecognised. Using the default loss: `ForCausalLMLoss`." when
the type is unknown).

`get_compiled_call(compile_config)`: Return a `torch.compile`'d version of `self.__call__`.
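A small round-trip sketch of the BetterTransformer conversion described above; it assumes `optimum >= 1.7.0` is
installed and uses a placeholder checkpoint name:

```python
# Hypothetical usage sketch (model id is a placeholder; requires `optimum>=1.7.0`).
from transformers import AutoModel

model = AutoModel.from_pretrained("bert-base-uncased")
model = model.to_bettertransformer()       # swap in the fused attention fastpath via Optimum
# ... run inference ...
model = model.reverse_bettertransformer()  # restore the original modeling before saving
model.save_pretrained("./restored")
```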
This is useful to dynamically choose between non-compiled/compiled `forward` during inference, especially to switch
between prefill (where we don't want to use the compiled version to avoid recomputing the graph with new shapes) and
iterative decoding (where we want the speed-ups of the compiled version with static shapes).

`PreTrainedModel.push_to_hub` is re-documented with `object="model"`, `object_class="AutoModel"` and
`object_files="model file"`.

`PoolerStartLogits`: Compute SQuAD start logits from sequence hidden states.

    Args:
        config ([`PretrainedConfig`]):
            The config used by the model, will be used to grab the `hidden_size` of the model.

    `forward(hidden_states, p_mask=None)`:
        hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
            The final hidden states of the model.
        p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
            Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means the
            token should be masked.
        Returns: `torch.FloatTensor`: The start logits for SQuAD.

`PoolerEndLogits`: Compute SQuAD end logits from sequence hidden states.

    Args:
        config ([`PretrainedConfig`]):
            The config used by the model, will be used to grab the `hidden_size` of the model and the
            `layer_norm_eps` to use.

    `forward(hidden_states, start_states=None, start_positions=None, p_mask=None)` additionally takes:
        start_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*):
            The hidden states of the first tokens for the labeled span.
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            The position of the first token for the labeled span.
        Returns: `torch.FloatTensor`: The end logits for SQuAD.
N�7One of start_states, start_positions should be not Noner�r~�r�rr?r@) r�r �gatherrGrsr�rIrJrLrAr�r�)r�r=rMrNr>�slen�hszrBrzrzr{rC^s$�   �zPoolerEndLogits.forward�NNN� r�r$r%r&rr1rsrDr � LongTensorrCr7rzrzr:r{rEMs"  ������rEc s\eZdZdZ�fdd�Z   d dejdeejdeejdeejd ejf d d �Z �Z S) �PoolerAnswerClassz� Compute SQuAD 2.0 answer class from classification and start tokens hidden states. Args: config ([`PretrainedConfig`]): The config used by the model, will be used to grab the `hidden_size` of the model. csBt���t�|jd|j�|_t��|_tj|jddd�|_dS)Nr_rF)ru) r0r1rr�r:rGrHrIrLr<r:rzr{r1�s  zPoolerAnswerClass.__init__Nr=rMrN� cls_indexr�cCs�|jd}|dus|dusJd��|dur,|dd�ddf�dd|�}|�d|��d�}|durH|dd�ddf�dd|�}|�d|��d�}n |dd�ddd�f}|�tj||gdd��}|�|�}|�|��d�}|S)a� Args: hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`): The final hidden states of the model. start_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*): The hidden states of the first tokens for the labeled span. start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): The position of the first token for the labeled span. cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Position of the CLS token for each sentence in the batch. If `None`, takes the last token. <Tip> One of `start_states` or `start_positions` should be not `None`. If both are set, `start_positions` overrides `start_states`. </Tip> Returns: `torch.FloatTensor`: The SQuAD 2.0 answer class. r~NrOr�rP) r�r rQrArGrsr�rIrL)r�r=rMrNrXrSZcls_token_staterBrzrzr{rC�s � zPoolerAnswerClass.forwardrT) r�r$r%r&r1rsrDr rVrCr7rzrzr:r{rW�s"  ������rWc@s~eZdZUdZdZeejed<dZ eejed<dZ eej ed<dZ eejed<dZ eej ed<dZeejed<dS) �SquadHeadOutputa� Base class for outputs of question answering models using a [`~modeling_utils.SQuADHead`]. Args: loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned if both `start_positions` and `end_positions` are provided): Classification loss as the sum of start token, end token (and is_impossible if provided) classification losses. start_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided): Log probabilities for the top config.start_n_top start token possibilities (beam-search). start_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided): Indices for the top config.start_n_top start token possibilities (beam-search). end_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided): Log probabilities for the top `config.start_n_top * config.end_n_top` end token possibilities (beam-search). end_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided): Indices for the top `config.start_n_top * config.end_n_top` end token possibilities (beam-search). cls_logits (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned if `start_positions` or `end_positions` is not provided): Log probabilities for the `is_impossible` label of the answers. 
The beam-search fields above are returned only when `start_positions` or `end_positions` is not provided.

`SQuADHead`: A SQuAD head inspired by XLNet.

    Args:
        config ([`PretrainedConfig`]):
            The config used by the model, will be used to grab the `hidden_size` of the model and the
            `layer_norm_eps` to use.

    `forward(hidden_states, start_positions=None, end_positions=None, cls_index=None, is_impossible=None,
    p_mask=None, return_dict=False)`:
        hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
            Final hidden states of the model on the sequence tokens.
        start_positions / end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Positions of the first / last token for the labeled span.
        cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Position of the CLS token for each sentence in the batch. If `None`, takes the last token.
        is_impossible (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Whether the question has a possible answer in the paragraph or not.
        p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
            Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means the
            token should be masked.
        return_dict (`bool`, *optional*, defaults to `False`):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        When labels are given, the total loss sums the start and end token losses (plus half of the answer-class
        loss when `is_impossible` and `cls_index` are provided); otherwise the head runs a beam search over the top
        `start_n_top` / `end_n_top` candidates and returns the `SquadHeadOutput` fields above.

`SequenceSummary`: Compute a single vector summary of a sequence hidden states.

    Args:
        config ([`PretrainedConfig`]):
            The config used by the model. Relevant arguments in the config class of the model are (refer to the
            actual config class of your model for the default values it uses):

            - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:
                - `"last"` -- Take the last token hidden state (like XLNet)
                - `"first"` -- Take the first token hidden state (like Bert)
                - `"mean"` -- Take the mean of all tokens hidden states
                - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
                - `"attn"` -- Not implemented now, use multi-head attention
            - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
            - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels`
              classes (otherwise to `config.hidden_size`).
            - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output,
              another string or `None` will add no activation.
class SequenceSummary(nn.Module):
    r"""
    Compute a single vector summary of a sequence hidden states.

    Args:
        config ([`PretrainedConfig`]):
            The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
            config class of your model for the default values it uses):

            - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:

                - `"last"` -- Take the last token hidden state (like XLNet)
                - `"first"` -- Take the first token hidden state (like Bert)
                - `"mean"` -- Take the mean of all tokens hidden states
                - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
                - `"attn"` -- Not implemented now, use multi-head attention

            - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
            - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes
              (otherwise to `config.hidden_size`).
            - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output,
              another string or `None` will add no activation.
            - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
            - **summary_last_dropout** (`float`) -- Optional dropout probability after the projection and activation.
    """

    def __init__(self, config: PretrainedConfig):
        super().__init__()

        self.summary_type = getattr(config, "summary_type", "last")
        if self.summary_type == "attn":
            # A standard multi-head attention module with absolute positional embedding should be used for that.
            raise NotImplementedError

        self.summary = Identity()
        if hasattr(config, "summary_use_proj") and config.summary_use_proj:
            if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0:
                num_classes = config.num_labels
            else:
                num_classes = config.hidden_size
            self.summary = nn.Linear(config.hidden_size, num_classes)

        activation_string = getattr(config, "summary_activation", None)
        self.activation: Callable = get_activation(activation_string) if activation_string else Identity()

        self.first_dropout = Identity()
        if hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0:
            self.first_dropout = nn.Dropout(config.summary_first_dropout)

        self.last_dropout = Identity()
        if hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0:
            self.last_dropout = nn.Dropout(config.summary_last_dropout)

    def forward(
        self, hidden_states: torch.FloatTensor, cls_index: Optional[torch.LongTensor] = None
    ) -> torch.FloatTensor:
        """
        Compute a single vector summary of a sequence hidden states.

        Args:
            hidden_states (`torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`):
                The hidden states of the last layer.
            cls_index (`torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional
                leading dimensions of `hidden_states`, *optional*):
                Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification token.

        Returns:
            `torch.FloatTensor`: The summary of the sequence hidden states.
        """
        if self.summary_type == "last":
            output = hidden_states[:, -1]
        elif self.summary_type == "first":
            output = hidden_states[:, 0]
        elif self.summary_type == "mean":
            output = hidden_states.mean(dim=1)
        elif self.summary_type == "cls_index":
            if cls_index is None:
                cls_index = torch.full_like(
                    hidden_states[..., :1, :],
                    hidden_states.shape[-2] - 1,
                    dtype=torch.long,
                )
            else:
                cls_index = cls_index.unsqueeze(-1).unsqueeze(-1)
                cls_index = cls_index.expand((-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),))
            # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dims of hidden_states
            output = hidden_states.gather(-2, cls_index).squeeze(-2)  # shape (bsz, XX, hidden_size)
        elif self.summary_type == "attn":
            raise NotImplementedError

        output = self.first_dropout(output)
        output = self.summary(output)
        output = self.activation(output)
        output = self.last_dropout(output)

        return output


def unwrap_model(model: nn.Module, recursive: bool = False) -> nn.Module:
    """
    Recursively unwraps a model from potential containers (as used in distributed training).

    Args:
        model (`torch.nn.Module`): The model to unwrap.
        recursive (`bool`, *optional*, defaults to `False`):
            Whether to recursively extract all cases of `module.module` from `model` as well as unwrap child sublayers
            recursively, not just the top-level distributed containers.
    """
    # Use the accelerate implementation if available (it also handles cases like torch dynamo wrapping)
    if is_accelerate_available():
        kwargs = {}
        if recursive:
            if not is_accelerate_available("0.29.0"):
                raise RuntimeError(
                    "Setting `recursive=True` to `unwrap_model` requires `accelerate` v0.29.0. Please upgrade your"
                    " version of accelerate"
                )
            else:
                kwargs["recursive"] = recursive
        return extract_model_from_parallel(model, **kwargs)
    else:
        # since there could be multiple levels of wrapping, unwrap recursively
        if hasattr(model, "module"):
            return unwrap_model(model.module)
        else:
            return model


def expand_device_map(device_map, param_names, start_prefix):
    """
    Expand a device map to return the correspondence parameter name to device.
    """
    new_device_map = {}
    param_names = [p[len(start_prefix):] for p in param_names if p.startswith(start_prefix)]
    for module, device in device_map.items():
        new_device_map.update(
            {p: device for p in param_names if p == module or p.startswith(f"{module}.") or module == ""}
        )
    return new_device_map


def get_disk_only_shard_files(device_map, sharded_metadata, start_prefix):
    """
    Returns the list of shard files containing only weights offloaded to disk.
    """
    weight_map = {
        p[len(start_prefix):]: v for p, v in sharded_metadata["weight_map"].items() if p.startswith(start_prefix)
    }
    files_content = collections.defaultdict(list)
    for weight_name, filename in weight_map.items():
        while len(weight_name) > 0 and weight_name not in device_map:
            weight_name = ".".join(weight_name.split(".")[:-1])
        files_content[filename].append(device_map[weight_name])

    return [fname for fname, devices in files_content.items() if set(devices) == {"disk"}]
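# Illustrative usage sketch (not prescribed by this module): exercises `unwrap_model` and `expand_device_map` on toy
# inputs. The tiny wrapped model, the device map and the parameter names are made-up example values, and the call to
# `expand_device_map` assumes the `start_prefix` argument reconstructed above.
def _example_loading_helpers():
    # `unwrap_model` strips distributed containers such as nn.DataParallel and returns the underlying module.
    wrapped = torch.nn.DataParallel(torch.nn.Linear(4, 4))
    inner = unwrap_model(wrapped)
    assert isinstance(inner, torch.nn.Linear)

    # `expand_device_map` turns a module-level device map into a per-parameter map (here with an empty prefix).
    device_map = {"encoder": 0, "lm_head": "disk"}
    param_names = ["encoder.layer.0.weight", "encoder.layer.0.bias", "lm_head.weight"]
    per_param = expand_device_map(device_map, param_names, start_prefix="")
    # -> {"encoder.layer.0.weight": 0, "encoder.layer.0.bias": 0, "lm_head.weight": "disk"}
    return inner, per_param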