# NOTE: this module was present only as compiled bytecode
# (…\site-packages\transformers\tokenization_utils.pyc inside the whisper-webui Pinokio env).
# The source below is reconstructed from what survives in that dump: the module docstring, constants,
# class/method signatures and docstrings are recovered verbatim; method bodies are filled in only where
# they are short and unambiguous, and are otherwise left as documented stubs (`...`).
"""
Tokenization classes for python tokenizers. For fast tokenizers (provided by HuggingFace's tokenizers library) see
tokenization_utils_fast.py
"""

import bisect
import itertools
import re
import unicodedata
from collections import OrderedDict
from typing import Any, Dict, List, Optional, Tuple, Union, overload

from .tokenization_utils_base import (
    ENCODE_KWARGS_DOCSTRING,
    ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING,
    INIT_TOKENIZER_DOCSTRING,
    AddedToken,
    BatchEncoding,
    EncodedInput,
    EncodedInputPair,
    PreTokenizedInput,
    PreTokenizedInputPair,
    PreTrainedTokenizerBase,
    TextInput,
    TextInputPair,
    TruncationStrategy,
)
from .utils import PaddingStrategy, TensorType, add_end_docstrings, logging


logger = logging.get_logger(__name__)

SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
ADDED_TOKENS_FILE = "added_tokens.json"
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"


class Trie:
    """
    Trie in Python. Creates a Trie out of a list of words. The trie is used to split on `added_tokens` in one pass.
    Loose reference https://en.wikipedia.org/wiki/Trie
    """

    def __init__(self, *args):
        self.data = {}
        self._tokens = set()
        self._termination_char = ""
        self.update(*args)

    def update(self, *args):
        """
        Updates the Trie with new tokens provided as arguments.

        Args:
            *args: Variable number of words to be added to the Trie.
        """
        for token in tuple(args):
            self.add(token)

    def add(self, word: str):
        """
        Passes over every char (utf-8 char) on word and recursively adds it to the internal `data` trie representation.
        The special key `""` in `self._termination_char` is used to represent termination.

        This function is idempotent, adding twice the same word will leave the trie unchanged.

        Example:

        ```python
        >>> trie = Trie()
        >>> trie.add("Hello 友達")
        >>> trie.data
        {"H": {"e": {"l": {"l": {"o": {" ": {"友": {"達": {"": 1}}}}}}}}}

        >>> trie.add("Hello")
        >>> trie.data
        {"H": {"e": {"l": {"l": {"o": {"": 1, " ": {"友": {"達": {"": 1}}}}}}}}}
        ```
        """
        if not word:
            # Prevent empty string
            return

        self._tokens.add(word)
        ref = self.data
        for char in word:
            ref[char] = ref.setdefault(char, {})
            ref = ref[char]
        ref[self._termination_char] = 1

    def split(self, text: str) -> List[str]:
        """
        Will look for the words added to the trie within `text`. Output is the original string splitted along the
        boundaries of the words found.

        This trie will match the longest possible word first!

        Example:

        ```python
        >>> trie = Trie()
        >>> trie.split("[CLS] This is a extra_id_100")
        ["[CLS] This is a extra_id_100"]

        >>> trie.add("[CLS]")
        >>> trie.add("extra_id_1")
        >>> trie.add("extra_id_100")
        >>> trie.split("[CLS] This is a extra_id_100")
        ["[CLS]", " This is a ", "extra_id_100"]
        ```
        """
        # `states` maps every possible match start (an offset into `text`) to a pointer inside the trie,
        # i.e. a partial match in progress. `offsets` collects every index where the text must be cut.
        states = OrderedDict()
        offsets = [0]
        # Used by the lookahead to skip over text already consumed by a longer match.
        skip = 0

        # Main loop, giving this algorithm O(n) complexity.
        for current, current_char in enumerate(text):
            if skip and current < skip:
                # This character was already consumed by a longer match found by the lookahead
                # (e.g. `extra_id_100` vs `id_100`).
                continue

            # Partial matches that stop matching at this character need to be dropped.
            to_remove = set()
            # Whenever a full match is found, every other partial match is dropped (greedy behaviour).
            reset = False

            for start, trie_pointer in states.items():
                if "" in trie_pointer:
                    # A complete match; look ahead to prefer the longest possible match
                    # (important in case of `extra_id_1` vs `extra_id_100`).
                    for lookstart, looktrie_pointer in states.items():
                        if lookstart > start:
                            # This partial match starts later, stop looking.
                            break
                        elif lookstart < start:
                            # Earlier partial match: its trie pointer was already advanced, so index is +1.
                            lookahead_index = current + 1
                            end = current + 1
                        else:
                            # Same start: the pointer was not advanced yet, use the current indices.
                            lookahead_index = current
                            end = current
                        next_char = text[lookahead_index] if lookahead_index < len(text) else None
                        if "" in looktrie_pointer:
                            start = lookstart
                            end = lookahead_index
                            skip = lookahead_index

                        while next_char in looktrie_pointer:
                            looktrie_pointer = looktrie_pointer[next_char]
                            lookahead_index += 1
                            if "" in looktrie_pointer:
                                start = lookstart
                                end = lookahead_index
                                skip = lookahead_index

                            if lookahead_index == len(text):
                                # End of string
                                break
                            next_char = text[lookahead_index]
                        # End lookahead

                    # Store the cut and reset every other partial match.
                    offsets.append(start)
                    offsets.append(end)
                    reset = True
                    break
                elif current_char in trie_pointer:
                    # The partial match got one character longer; store the advanced pointer back.
                    trie_pointer = trie_pointer[current_char]
                    states[start] = trie_pointer
                else:
                    # The partial match stopped matching; remember to drop it
                    # (we cannot delete from `states` while iterating over it).
                    to_remove.add(start)

            if reset:
                states = {}
            else:
                for start in to_remove:
                    del states[start]

            # If this character starts a word in the trie, start tracking a new partial match.
            if current >= skip and current_char in self.data:
                states[current] = self.data[current_char]

        # Handle a match that runs until the very end of the text.
        for start, trie_pointer in states.items():
            if "" in trie_pointer:
                end = len(text)
                offsets.append(start)
                offsets.append(end)
                # The longest cut is the one with the lowest start, so stop at the first one.
                break

        return self.cut_text(text, offsets)

    def cut_text(self, text, offsets):
        # We have all the offsets now, we just need to do the actual splitting.
        # We need to eventually add the first part of the string and the eventual last part.
        offsets.append(len(text))
        tokens = []
        start = 0
        for end in offsets:
            if start > end:
                logger.error(
                    "There was a bug in Trie algorithm in tokenization. Attempting to recover. Please report it"
                    " anyway."
                )
                continue
            elif start == end:
                # This might happen if there's a match at index 0; it also prevents zero-width cuts
                # in case of two consecutive matches.
                continue
            tokens.append(text[start:end])
            start = end

        return tokens


class ExtensionsTrie(Trie):
    def __init__(self, *args):
        super().__init__(*args)

    def extensions(self, prefix: str):
        """
        Generates all extensions of a given prefix token in the Trie.

        Example:

        ```python
        >>> trie = Trie()
        >>> trie.add("apple")
        >>> trie.add("app")
        >>> trie.add("application")
        >>> trie.extensions("app")
        ['app', 'apple', 'application']
        ```
        """
        prefix_node = self._get_node(prefix)
        ret = self._collect_tokens(prefix_node)
        return [prefix + token for token in ret]

    def _get_node(self, token: str) -> dict:
        """
        Retrieves the node corresponding to the given token in the Trie.

        Args:
            token (str): The token for which the corresponding node needs to be retrieved.

        Returns:
            dict: The node in the Trie corresponding to the given token.
        """
        node = self.data
        for char in token:
            if char not in node:
                break
            node = node[char]
        return node

    def _collect_tokens(self, node: dict) -> list:
        """
        Generates all tokens in the Trie starting from a given node.

        Args:
            node (dict): The node in the Trie from which tokens need to be generated.

        Returns:
            list: List of tokens generated from the given node.
        """
        tokens = [self._termination_char] if self._termination_char in node else []
        for token, subtrie_head in node.items():
            if token != self._termination_char:
                subtokens = self._collect_tokens(subtrie_head)
                tokens.extend([token + subtoken for subtoken in subtokens])
        return tokens


def _is_whitespace(char):
    """Checks whether `char` is a whitespace character."""
    # \t, \n, and \r are technically control characters but we treat them as whitespace
    # since they are generally considered as such.
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        return True
    cat = unicodedata.category(char)
    if cat == "Zs":
        return True
    return False


def _is_control(char):
    """Checks whether `char` is a control character."""
    # These are technically control characters but we count them as whitespace characters.
    if char == "\t" or char == "\n" or char == "\r":
        return False
    cat = unicodedata.category(char)
    if cat.startswith("C"):
        return True
    return False


def _is_punctuation(char):
    """Checks whether `char` is a punctuation character."""
    cp = ord(char)
    # We treat all non-letter/number ASCII as punctuation. Characters such as "^", "$", and "`" are not
    # in the Unicode Punctuation class but we treat them as punctuation anyways, for consistency.
    if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126):
        return True
    cat = unicodedata.category(char)
    if cat.startswith("P"):
        return True
    return False


def _is_end_of_word(text):
    """Checks whether the last character in text is one of a punctuation, control or whitespace character."""
    last_char = text[-1]
    return bool(_is_control(last_char) | _is_punctuation(last_char) | _is_whitespace(last_char))


def _is_start_of_word(text):
    """Checks whether the first character in text is one of a punctuation, control or whitespace character."""
    first_char = text[0]
    return bool(_is_control(first_char) | _is_punctuation(first_char) | _is_whitespace(first_char))


def _insert_one_token_to_ordered_list(token_list: List[str], new_token: str):
    """
    Inserts one token to an ordered list if it does not already exist. Note: token_list must be sorted.
    """
    insertion_idx = bisect.bisect_left(token_list, new_token)
    # Checks if new_token is already in the ordered token_list
    if insertion_idx < len(token_list) and token_list[insertion_idx] == new_token:
        # new_token is in token_list, don't add
        return
    token_list.insert(insertion_idx, new_token)


@add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
class PreTrainedTokenizer(PreTrainedTokenizerBase):
    """
    Base class for all slow tokenizers.

    Inherits from [`~tokenization_utils_base.PreTrainedTokenizerBase`].

    Handles all the shared methods for tokenization and special tokens, as well as methods for
    downloading/caching/loading pretrained tokenizers and adding tokens to the vocabulary.

    This class also contains the added tokens in a unified way on top of all tokenizers so we don't have to handle
    the specific vocabulary augmentation methods of the various underlying dictionary structures (BPE,
    sentencepiece...).
    """

    def __init__(self, **kwargs):
        # 1. Init the trie used to split text on added tokens in one pass.
        self.tokens_trie = Trie()

        # 2. Init `_added_tokens_decoder` if the child class did not.
        if not hasattr(self, "_added_tokens_decoder"):
            self._added_tokens_decoder: Dict[int, AddedToken] = {}

        # 3. If an `added_tokens_decoder` is passed, we are loading from a saved tokenizer, so overwrite.
        self._added_tokens_decoder.update(kwargs.pop("added_tokens_decoder", {}))
        self._added_tokens_encoder: Dict[str, int] = {k.content: v for v, k in self._added_tokens_decoder.items()}

        # 4. Init the parent class.
        super().__init__(**kwargs)

        # 5. If some of the special tokens are not part of the vocab, add them at the end.
        self._add_tokens(
            [token for token in self.all_special_tokens_extended if token not in self._added_tokens_encoder],
            special_tokens=True,
        )

        self._decode_use_source_tokenizer = False

    @property
    def is_fast(self) -> bool:
        return False

    @property
    def vocab_size(self) -> int:
        """
        `int`: Size of the base vocabulary (without the added tokens).
        """
        raise NotImplementedError

    @property
    def added_tokens_encoder(self) -> Dict[str, int]:
        """
        Returns the sorted mapping from string to index. The added tokens encoder is cached for performance
        optimisation in `self._added_tokens_encoder` for the slow tokenizers.
        """
        return {k.content: v for v, k in sorted(self._added_tokens_decoder.items(), key=lambda item: item[0])}

    @property
    def added_tokens_decoder(self) -> Dict[int, AddedToken]:
        """
        Returns the added tokens in the vocabulary as a dictionary of index to AddedToken.

        Returns:
            `Dict[str, int]`: The added tokens.
        """
        return dict(sorted(self._added_tokens_decoder.items(), key=lambda item: item[0]))

    @added_tokens_decoder.setter
    def added_tokens_decoder(self, value: Dict[int, Union[AddedToken, str]]) -> Dict[int, AddedToken]:
        # Always raise an error if a plain string sneaks in, because users should define the behavior explicitly.
        for index, token in value.items():
            if not isinstance(token, (str, AddedToken)) or not isinstance(index, int):
                raise TypeError(
                    f"The provided `added_tokens_decoder` has an element of type {index.__class__, token.__class__},"
                    f" should be a dict of {int, Union[AddedToken, str]}"
                )

            self._added_tokens_decoder[index] = AddedToken(token) if isinstance(token, str) else token
            self._added_tokens_encoder[str(token)] = index
        self._update_total_vocab_size()

    def get_added_vocab(self) -> Dict[str, int]:
        """
        Returns the added tokens in the vocabulary as a dictionary of token to index. Results might be different from
        the fast call because for now we always add the tokens even if they are already in the vocabulary. This is
        something we should change.

        Returns:
            `Dict[str, int]`: The added tokens.
        """
        return self._added_tokens_encoder

    def __len__(self):
        """
        Size of the full vocabulary with the added tokens.
        """
        return self.total_vocab_size

    def _update_total_vocab_size(self):
        """
        Update the size of the full vocabulary with the added tokens. Counts the `keys` and not the `values` because
        otherwise if there is a hole in the vocab, we will add tokenizers at a wrong index. This operation is slow and
        is only updated when adding tokens.
        """
        self.total_vocab_size = len(self.get_vocab())

    def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
        """
        Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added
        to it with indices starting from length of the current vocabulary. Special tokens are sometimes already in
        the vocab which is why they have to be handled specifically.

        Args:
            new_tokens (`List[str]` or `List[tokenizers.AddedToken]`):
                Token(s) to add in vocabulary. A token is counted as added if it's not already in the vocabulary
                (tested by checking if the tokenizer assigns the index of the `unk_token` to them). If a token is part
                of the vocabulary then we simply mark this token as an `AddedToken`, which allows to control the
                stripping and normalization of this token. This is NOT possible in `tokenizers`.
            special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the tokens should be added as special tokens.

        Returns:
            `int`: The number of tokens actually added to the vocabulary.

        Examples:

        ```python
        # Let's see how to increase the vocabulary of Bert model and tokenizer
        tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
        model = BertModel.from_pretrained("google-bert/bert-base-uncased")

        num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"])
        print("We have added", num_added_toks, "tokens")
        # Note: resize_token_embeddings expects to receive the full size of the new vocabulary,
        # i.e. the length of the tokenizer.
        model.resize_token_embeddings(len(tokenizer))
        ```
        """
        ...

    def _update_trie(self, unique_no_split_tokens: Optional[List[str]] = []):
        for token in self._added_tokens_decoder.values():
            if token.content not in self.tokens_trie._tokens:
                self.tokens_trie.add(token.content)
        for token in unique_no_split_tokens:
            if token not in self.tokens_trie._tokens:
                self.tokens_trie.add(token)

    def num_special_tokens_to_add(self, pair: bool = False) -> int:
        """
        Returns the number of added tokens when encoding a sequence with special tokens.

        <Tip>

        This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put
        this inside your training loop.

        </Tip>

        Args:
            pair (`bool`, *optional*, defaults to `False`):
                Whether the number of added tokens should be computed in the case of a sequence pair or a single
                sequence.

        Returns:
            `int`: Number of special tokens added to sequences.
        """
        token_ids_0 = []
        token_ids_1 = []
        return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None))

    def tokenize(self, text: TextInput, **kwargs) -> List[str]:
        """
        Converts a string into a sequence of tokens, using the tokenizer.

        Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies
        (BPE/SentencePieces/WordPieces). Takes care of added tokens.

        Args:
            text (`str`):
                The sequence to be encoded.
            **kwargs (additional keyword arguments):
                Passed along to the model-specific `prepare_for_tokenization` preprocessing method.

        Returns:
            `List[str]`: The list of tokens.
        """
        ...

    def _tokenize(self, text, **kwargs):
        """
        Converts a string into a sequence of tokens (string), using the tokenizer. Split in words for word-based
        vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).

        Do NOT take care of added tokens.
        """
        raise NotImplementedError

    def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
        """
        Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the
        vocabulary.

        Args:
            tokens (`str` or `List[str]`): One or several token(s) to convert to token id(s).

        Returns:
            `int` or `List[int]`: The token id or list of token ids.
        """
        if tokens is None:
            return None

        if isinstance(tokens, str):
            return self._convert_token_to_id_with_added_voc(tokens)

        ids = []
        for token in tokens:
            ids.append(self._convert_token_to_id_with_added_voc(token))
        return ids

    def _convert_token_to_id_with_added_voc(self, token):
        if token is None:
            return None

        if token in self._added_tokens_encoder:
            return self._added_tokens_encoder[token]
        return self._convert_token_to_id(token)

    def _convert_token_to_id(self, token):
        raise NotImplementedError

    def _encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput, EncodedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        padding_side: Optional[str] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        # Note: `return_offsets_mapping` is not available when using Python tokenizers. To use this feature,
        # change your tokenizer to one deriving from transformers.PreTrainedTokenizerFast.
        ...

    def _batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            List[TextInput],
            List[TextInputPair],
            List[PreTokenizedInput],
            List[PreTokenizedInputPair],
            List[EncodedInput],
            List[EncodedInputPair],
        ],
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        padding_side: Optional[str] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        split_special_tokens: bool = False,
    ) -> BatchEncoding:
        ...

    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    def _batch_prepare_for_model(
        self,
        batch_ids_pairs: List[Union[PreTokenizedInputPair, Tuple[List[int], None]]],
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        padding_side: Optional[str] = None,
        return_tensors: Optional[str] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_length: bool = False,
        verbose: bool = True,
    ) -> BatchEncoding:
        """
        Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It
        adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
        manages a moving window (with user defined stride) for overflowing tokens.

        Args:
            batch_ids_pairs: list of tokenized input ids or input ids pairs
        """
        ...

    def prepare_for_tokenization(
        self, text: str, is_split_into_words: bool = False, **kwargs
    ) -> Tuple[str, Dict[str, Any]]:
        """
        Performs any necessary transformations before tokenization.

        This method should pop the arguments from kwargs and return the remaining `kwargs` as well. We test the
        `kwargs` at the end of the encoding process to be sure all the arguments have been used.

        Args:
            text (`str`):
                The text to prepare.
            is_split_into_words (`bool`, *optional*, defaults to `False`):
                Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
                tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
                which it will tokenize. This is useful for NER or token classification.
            kwargs (`Dict[str, Any]`, *optional*):
                Keyword arguments to use for the tokenization.

        Returns:
            `Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs.
        """
        return (text, kwargs)

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

        Args:
            token_ids_0 (`List[int]`):
                List of ids of the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                List of ids of the second sequence.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formatted with special tokens for the model."
                )

            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )
        return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))

    @overload
    def convert_ids_to_tokens(self, ids: int, skip_special_tokens: bool = False) -> str: ...

    @overload
    def convert_ids_to_tokens(self, ids: List[int], skip_special_tokens: bool = False) -> List[str]: ...

    def convert_ids_to_tokens(
        self, ids: Union[int, List[int]], skip_special_tokens: bool = False
    ) -> Union[str, List[str]]:
        """
        Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
        added tokens.

        Args:
            ids (`int` or `List[int]`):
                The token id (or token ids) to convert to tokens.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.

        Returns:
            `str` or `List[str]`: The decoded token(s).
        """
        if isinstance(ids, int):
            if ids in self._added_tokens_decoder:
                return self._added_tokens_decoder[ids].content
            else:
                return self._convert_id_to_token(ids)
        tokens = []
        for index in ids:
            index = int(index)
            if skip_special_tokens and index in self.all_special_ids:
                continue
            if index in self._added_tokens_decoder:
                tokens.append(self._added_tokens_decoder[index].content)
            else:
                tokens.append(self._convert_id_to_token(index))
        return tokens

    def _convert_id_to_token(self, index: int) -> str:
        raise NotImplementedError

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        return " ".join(tokens)

    def _decode(
        self,
        token_ids: List[int],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: Optional[bool] = None,
        spaces_between_special_tokens: bool = True,
        **kwargs,
    ) -> str:
        ...
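
# ---------------------------------------------------------------------------------------------
# Usage sketch (not part of the recovered module): exercises the Trie / ExtensionsTrie classes
# defined above, mirroring the behaviour documented in their docstrings. The token strings are
# illustrative placeholders only.
if __name__ == "__main__":
    trie = Trie()
    # Nothing added yet, so the text comes back as a single chunk.
    print(trie.split("[CLS] This is a extra_id_100"))  # ["[CLS] This is a extra_id_100"]

    trie.add("[CLS]")
    trie.add("extra_id_1")
    trie.add("extra_id_100")
    # The trie matches the longest added token first, so "extra_id_100" wins over "extra_id_1".
    print(trie.split("[CLS] This is a extra_id_100"))  # ["[CLS]", " This is a ", "extra_id_100"]

    ext = ExtensionsTrie()
    ext.update("app", "apple", "application")
    # All added tokens that extend the given prefix.
    print(ext.extensions("app"))  # ['app', 'apple', 'application']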
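
    # Sketch of the module-level helpers (same assumptions as above: illustrative values only).
    # The character-class predicates are used when deciding word boundaries around added tokens.
    print(_is_punctuation("!"), _is_whitespace("\t"), _is_control("\x00"))  # True True True
    print(_is_end_of_word("hello."), _is_start_of_word(" world"))  # True True

    # Keep a sorted token list unique while inserting.
    ordered = ["[CLS]", "[SEP]"]
    _insert_one_token_to_ordered_list(ordered, "[MASK]")
    _insert_one_token_to_ordered_list(ordered, "[CLS]")  # already present, not duplicated
    print(ordered)  # ['[CLS]', '[MASK]', '[SEP]']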