Mirror of https://github.com/mii443/tokenizers.git, synced 2025-08-22 16:25:30 +00:00

[remove black] And use ruff (#1436)

* nits
* Fixing deps.
* Ruff update.
* Import order matters.
* Fix.
* Revert ruff fix.
* Visualizer.
* Putting back the imports.

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
@@ -8,12 +8,14 @@ check_dirs := examples py_src/tokenizers tests
 # Format source code automatically
 style:
 	python stub.py
-	black --line-length 119 --target-version py35 $(check_dirs)
+	ruff check $(check_dirs) --fix
+	ruff format $(check_dirs)
 
 # Check the source code is formatted correctly
 check-style:
 	python stub.py --check
-	black --check --line-length 119 --target-version py35 examples py_src/tokenizers tests
+	ruff check examples py_src/tokenizers tests
+	ruff format --check examples py_src/tokenizers tests
 
 TESTS_RESOURCES = $(DATA_DIR)/small.txt $(DATA_DIR)/roberta.json
 
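The black flags (--line-length 119 --target-version py35) leave the Makefile; ruff reads its equivalents (e.g. line-length, target-version) from the project's [tool.ruff] configuration, which is not part of this excerpt. For running the same checks outside make, a minimal sketch, assuming nothing beyond the commands shown in the targets above:

# A sketch only: invoke the same commands the new Makefile targets run,
# e.g. from a CI helper script. The directory list mirrors `check_dirs`.
import subprocess

CHECK_DIRS = ["examples", "py_src/tokenizers", "tests"]


def check_style() -> None:
    """Equivalent of `make check-style`: stub check, lint, then format check."""
    subprocess.run(["python", "stub.py", "--check"], check=True)
    subprocess.run(["ruff", "check", *CHECK_DIRS], check=True)
    subprocess.run(["ruff", "format", "--check", *CHECK_DIRS], check=True)


if __name__ == "__main__":
    check_style()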
@@ -4,16 +4,15 @@ import time
 
 from tqdm import tqdm
 
-
-logging.getLogger("transformers").disabled = True
-logging.getLogger("transformers.tokenization_utils").disabled = True
-
 from tokenizers import Tokenizer, decoders, pre_tokenizers
 from tokenizers.models import BPE, WordPiece
 from tokenizers.normalizers import BertNormalizer
 from tokenizers.processors import BertProcessing
 from transformers import BertTokenizer, GPT2Tokenizer
 
+logging.getLogger("transformers").disabled = True
+logging.getLogger("transformers.tokenization_utils").disabled = True
+
 
 parser = argparse.ArgumentParser()
 parser.add_argument("--type", default="gpt2", type=str, help="The type of tokenizer (bert|gpt2)")
@@ -51,9 +50,7 @@ Although never is often better than *right* now.
 If the implementation is hard to explain, it's a bad idea.
 If the implementation is easy to explain, it may be a good idea.
 Namespaces are one honking great idea -- let's do more of those!
-""".split(
-    "\n"
-)
+""".split("\n")
 
 if args.type == "gpt2":
     print("Running GPT-2 tokenizer")
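The imports above are what this benchmark uses to build the fast tokenizer it compares against transformers.GPT2Tokenizer / BertTokenizer. A minimal sketch of the GPT-2 branch, assuming placeholder vocab/merges paths (the real script's paths and arguments are not shown in this excerpt):

# Sketch of a GPT-2-style fast tokenizer as benchmarked above; the
# vocab/merges paths are placeholders, not values from the diff.
from tokenizers import Tokenizer, decoders, pre_tokenizers
from tokenizers.models import BPE

tok = Tokenizer(BPE.from_file("gpt2-vocab.json", "gpt2-merges.txt"))
tok.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
tok.decoder = decoders.ByteLevel()

# encode_batch accepts raw text and returns one Encoding per input line
encodings = tok.encode_batch(["Hello, y'all!", "How are you?"])
print(encodings[0].tokens)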
@@ -1,6 +1,6 @@
 import datasets
 
-from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers
+from tokenizers import Tokenizer, models, normalizers, pre_tokenizers
 
 
 # Build a tokenizer
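The trainers import was unused: with trainer=None, Tokenizer.train_from_iterator relies on a default trainer for the model (cf. Model.get_trainer further down), which is presumably why the explicit import could go. A sketch of how a script with these imports typically continues; the dataset name, batching and model choice are placeholders, not taken from the diff:

# Sketch under assumptions: dataset name, batch size and model choice are
# placeholders, not values from this commit.
import datasets

from tokenizers import Tokenizer, models, normalizers, pre_tokenizers

tokenizer = Tokenizer(models.BPE())
tokenizer.normalizer = normalizers.Lowercase()
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1", split="train")


def batch_iterator(batch_size=1000):
    # Yield lists of raw strings; train_from_iterator accepts any iterator.
    for i in range(0, len(dataset), batch_size):
        yield dataset[i : i + batch_size]["text"]


# No explicit trainer: the model's default trainer is used.
tokenizer.train_from_iterator(batch_iterator(), length=len(dataset))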
[Stub hunks for AddedToken, Encoding, NormalizedString, PreTokenizedString, Regex and Tokenizer: every signature and docstring shown is identical on both sides of the rendered diff; the changes are limited to ruff format moving blank lines around docstrings and `pass` bodies. The methods whose stubs are touched include Tokenizer.encode, encode_batch, decode, decode_batch, enable_padding, enable_truncation, add_tokens, add_special_tokens, from_file, from_pretrained, train, train_from_iterator, and the Encoding accessors (ids, tokens, offsets, type_ids, word_ids, char_to_token, token_to_chars, pad, truncate, merge).]
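The Tokenizer/Encoding stubs summarized above document a small round-trip API; a usage sketch of those documented calls (the tokenizer.json path is a placeholder):

# Usage sketch of the stubbed Tokenizer API; "tokenizer.json" is a placeholder.
from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer.json")
tok.enable_padding(pad_id=0, pad_token="[PAD]")
tok.enable_truncation(max_length=128)

enc = tok.encode("Hello, y'all!", "How are you?")
print(enc.tokens)            # List[str]
print(enc.type_ids)          # sequence/type id per token
print(enc.char_to_token(1))  # token covering char 1 of the first sequence
print(tok.decode(enc.ids, skip_special_tokens=True))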
[Stub hunks for the decoders (Decoder, BPEDecoder, ByteFallback, ByteLevel, CTC, Fuse, Metaspace, Replace, Sequence, Strip, WordPiece): the constructor signatures and the shared `decode(tokens)` docstrings are unchanged; the hunks only adjust blank-line placement around the class docstrings.]
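All decoder stubs expose the same decode(tokens) entry point; a sketch chaining several of them with Sequence, using only constructors shown above (the token list is illustrative):

# Sketch composing the stubbed decoders; the token list is illustrative.
from tokenizers import decoders

decoder = decoders.Sequence(
    [
        decoders.Replace("▁", " "),   # undo the Metaspace replacement char
        decoders.ByteFallback(),      # map <0x..> byte tokens back to bytes
        decoders.Fuse(),              # fuse everything into a single string
        decoders.Strip(" ", left=1),  # drop the leading prefix space
    ]
)
print(decoder.decode(["▁Hey", "▁how", "▁are", "▁you"]))  # "Hey how are you"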
[Stub hunks for the models (Model, BPE, Unigram, WordLevel, WordPiece): the from_file/read_file, get_trainer, id_to_token, token_to_id, save and tokenize signatures and docstrings are unchanged; only blank-line placement differs.]
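The model stubs share one surface (from_file/read_file, get_trainer, token_to_id, id_to_token, save, tokenize); a sketch of the WordPiece variant, with placeholder file names:

# Sketch of the stubbed Model API; "vocab.txt" and the output folder are
# placeholders.
import os

from tokenizers import Tokenizer
from tokenizers.models import WordPiece

model = WordPiece.from_file("vocab.txt", unk_token="[UNK]")
tok = Tokenizer(model)

print(model.token_to_id("[UNK]"))   # id lookup
print(model.tokenize("unaffable"))  # a list of Token objects

os.makedirs("serialized", exist_ok=True)
print(model.save("serialized", prefix="wordpiece"))  # the written file names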
[Stub hunks for the normalizers (Normalizer, BertNormalizer, Lowercase, NFC, NFD, NFKC, NFKD): the `normalize(normalized)` and `normalize_str(sequence)` docstrings are unchanged; only blank-line placement differs.]
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def normalize_str(self, sequence):
|
def normalize_str(self, sequence):
|
||||||
"""
|
"""
|
||||||
Normalize the given string
|
Normalize the given string
|
||||||
@ -302,9 +308,9 @@ class Nmt(Normalizer):
|
|||||||
"""
|
"""
|
||||||
Nmt normalizer
|
Nmt normalizer
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def normalize(self, normalized):
|
def normalize(self, normalized):
|
||||||
"""
|
"""
|
||||||
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
||||||
@ -320,6 +326,7 @@ class Nmt(Normalizer):
|
|||||||
:class:`~tokenizers.normalizers.Normalizer`
|
:class:`~tokenizers.normalizers.Normalizer`
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def normalize_str(self, sequence):
|
def normalize_str(self, sequence):
|
||||||
"""
|
"""
|
||||||
Normalize the given string
|
Normalize the given string
|
||||||
@ -343,9 +350,9 @@ class Precompiled(Normalizer):
|
|||||||
Precompiled normalizer
|
Precompiled normalizer
|
||||||
Don't use this manually; it is used for compatibility with SentencePiece.
|
Don't use this manually; it is used for compatibility with SentencePiece.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, precompiled_charsmap):
|
def __init__(self, precompiled_charsmap):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def normalize(self, normalized):
|
def normalize(self, normalized):
|
||||||
"""
|
"""
|
||||||
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
||||||
@ -361,6 +368,7 @@ class Precompiled(Normalizer):
|
|||||||
:class:`~tokenizers.normalizers.Normalizer`
|
:class:`~tokenizers.normalizers.Normalizer`
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def normalize_str(self, sequence):
|
def normalize_str(self, sequence):
|
||||||
"""
|
"""
|
||||||
Normalize the given string
|
Normalize the given string
|
||||||
@ -383,9 +391,9 @@ class Prepend(Normalizer):
|
|||||||
"""
|
"""
|
||||||
Prepend normalizer
|
Prepend normalizer
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, prepend):
|
def __init__(self, prepend):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def normalize(self, normalized):
|
def normalize(self, normalized):
|
||||||
"""
|
"""
|
||||||
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
||||||
@ -401,6 +409,7 @@ class Prepend(Normalizer):
|
|||||||
:class:`~tokenizers.normalizers.Normalizer`
|
:class:`~tokenizers.normalizers.Normalizer`
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def normalize_str(self, sequence):
|
def normalize_str(self, sequence):
|
||||||
"""
|
"""
|
||||||
Normalize the given string
|
Normalize the given string
|
||||||
@ -423,9 +432,9 @@ class Replace(Normalizer):
|
|||||||
"""
|
"""
|
||||||
Replace normalizer
|
Replace normalizer
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, pattern, content):
|
def __init__(self, pattern, content):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def normalize(self, normalized):
|
def normalize(self, normalized):
|
||||||
"""
|
"""
|
||||||
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
||||||
@ -441,6 +450,7 @@ class Replace(Normalizer):
|
|||||||
:class:`~tokenizers.normalizers.Normalizer`
|
:class:`~tokenizers.normalizers.Normalizer`
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def normalize_str(self, sequence):
|
def normalize_str(self, sequence):
|
||||||
"""
|
"""
|
||||||
Normalize the given string
|
Normalize the given string
|
||||||
@ -468,7 +478,6 @@ class Sequence(Normalizer):
|
|||||||
normalizers (:obj:`List[Normalizer]`):
|
normalizers (:obj:`List[Normalizer]`):
|
||||||
A list of Normalizer to be run as a sequence
|
A list of Normalizer to be run as a sequence
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def normalize(self, normalized):
|
def normalize(self, normalized):
|
||||||
"""
|
"""
|
||||||
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
||||||
@ -484,6 +493,7 @@ class Sequence(Normalizer):
|
|||||||
:class:`~tokenizers.normalizers.Normalizer`
|
:class:`~tokenizers.normalizers.Normalizer`
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def normalize_str(self, sequence):
|
def normalize_str(self, sequence):
|
||||||
"""
|
"""
|
||||||
Normalize the given string
|
Normalize the given string
|
||||||
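For reference, a short sketch of chaining the Unicode normalizers stubbed above with `normalizers.Sequence`; the input/output pair follows the library's documented behaviour:

from tokenizers import normalizers
from tokenizers.normalizers import NFD, StripAccents, Lowercase

# NFD decomposes accented characters, StripAccents drops the combining marks, Lowercase folds case
normalizer = normalizers.Sequence([NFD(), StripAccents(), Lowercase()])
print(normalizer.normalize_str("Héllò hôw are ü?"))  # "hello how are u?"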
@ -506,9 +516,9 @@ class Strip(Normalizer):
|
|||||||
"""
|
"""
|
||||||
Strip normalizer
|
Strip normalizer
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, left=True, right=True):
|
def __init__(self, left=True, right=True):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def normalize(self, normalized):
|
def normalize(self, normalized):
|
||||||
"""
|
"""
|
||||||
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
||||||
@ -524,6 +534,7 @@ class Strip(Normalizer):
|
|||||||
:class:`~tokenizers.normalizers.Normalizer`
|
:class:`~tokenizers.normalizers.Normalizer`
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def normalize_str(self, sequence):
|
def normalize_str(self, sequence):
|
||||||
"""
|
"""
|
||||||
Normalize the given string
|
Normalize the given string
|
||||||
@ -546,9 +557,9 @@ class StripAccents(Normalizer):
|
|||||||
"""
|
"""
|
||||||
StripAccents normalizer
|
StripAccents normalizer
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def normalize(self, normalized):
|
def normalize(self, normalized):
|
||||||
"""
|
"""
|
||||||
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
||||||
@ -564,6 +575,7 @@ class StripAccents(Normalizer):
|
|||||||
:class:`~tokenizers.normalizers.Normalizer`
|
:class:`~tokenizers.normalizers.Normalizer`
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def normalize_str(self, sequence):
|
def normalize_str(self, sequence):
|
||||||
"""
|
"""
|
||||||
Normalize the given string
|
Normalize the given string
|
||||||
|
@ -6,7 +6,6 @@ class PreTokenizer:
|
|||||||
This class is not supposed to be instantiated directly. Instead, any implementation of a
|
This class is not supposed to be instantiated directly. Instead, any implementation of a
|
||||||
PreTokenizer will return an instance of this class when instantiated.
|
PreTokenizer will return an instance of this class when instantiated.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def pre_tokenize(self, pretok):
|
def pre_tokenize(self, pretok):
|
||||||
"""
|
"""
|
||||||
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
||||||
@ -23,6 +22,7 @@ class PreTokenizer:
|
|||||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def pre_tokenize_str(self, sequence):
|
def pre_tokenize_str(self, sequence):
|
||||||
"""
|
"""
|
||||||
Pre tokenize the given string
|
Pre tokenize the given string
|
||||||
@ -50,9 +50,9 @@ class BertPreTokenizer(PreTokenizer):
|
|||||||
This pre-tokenizer splits tokens on spaces, and also on punctuation.
|
This pre-tokenizer splits tokens on spaces, and also on punctuation.
|
||||||
Each occurrence of a punctuation character will be treated separately.
|
Each occurrence of a punctuation character will be treated separately.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def pre_tokenize(self, pretok):
|
def pre_tokenize(self, pretok):
|
||||||
"""
|
"""
|
||||||
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
||||||
@ -69,6 +69,7 @@ class BertPreTokenizer(PreTokenizer):
|
|||||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def pre_tokenize_str(self, sequence):
|
def pre_tokenize_str(self, sequence):
|
||||||
"""
|
"""
|
||||||
Pre tokenize the given string
|
Pre tokenize the given string
|
||||||
@ -104,9 +105,9 @@ class ByteLevel(PreTokenizer):
|
|||||||
Set this to :obj:`False` to prevent this `pre_tokenizer` from using
|
Set this to :obj:`False` to prevent this `pre_tokenizer` from using
|
||||||
the GPT2-specific regexp for splitting on whitespace.
|
the GPT2-specific regexp for splitting on whitespace.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, add_prefix_space=True, use_regex=True):
|
def __init__(self, add_prefix_space=True, use_regex=True):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def alphabet():
|
def alphabet():
|
||||||
"""
|
"""
|
||||||
@ -120,6 +121,7 @@ class ByteLevel(PreTokenizer):
|
|||||||
:obj:`List[str]`: A list of characters that compose the alphabet
|
:obj:`List[str]`: A list of characters that compose the alphabet
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def pre_tokenize(self, pretok):
|
def pre_tokenize(self, pretok):
|
||||||
"""
|
"""
|
||||||
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
||||||
@ -136,6 +138,7 @@ class ByteLevel(PreTokenizer):
|
|||||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def pre_tokenize_str(self, sequence):
|
def pre_tokenize_str(self, sequence):
|
||||||
"""
|
"""
|
||||||
Pre tokenize the given string
|
Pre tokenize the given string
|
||||||
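A quick sketch of the ByteLevel pre-tokenizer stubbed above; the printed pieces are illustrative (the `Ġ` character stands in for a leading space in the byte-level alphabet):

from tokenizers.pre_tokenizers import ByteLevel

pre_tok = ByteLevel(add_prefix_space=True, use_regex=True)
print(pre_tok.pre_tokenize_str("Hello world"))
# e.g. [('ĠHello', (0, 5)), ('Ġworld', (5, 11))]
print(len(ByteLevel.alphabet()))  # 256 printable stand-ins, one per byte value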
@ -164,7 +167,6 @@ class CharDelimiterSplit(PreTokenizer):
|
|||||||
delimiter: str:
|
delimiter: str:
|
||||||
The delimiter char that will be used to split input
|
The delimiter char that will be used to split input
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def pre_tokenize(self, pretok):
|
def pre_tokenize(self, pretok):
|
||||||
"""
|
"""
|
||||||
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
||||||
@ -181,6 +183,7 @@ class CharDelimiterSplit(PreTokenizer):
|
|||||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def pre_tokenize_str(self, sequence):
|
def pre_tokenize_str(self, sequence):
|
||||||
"""
|
"""
|
||||||
Pre tokenize the given string
|
Pre tokenize the given string
|
||||||
@ -215,9 +218,9 @@ class Digits(PreTokenizer):
|
|||||||
|
|
||||||
"Call 123 please" -> "Call ", "123", " please"
|
"Call 123 please" -> "Call ", "123", " please"
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, individual_digits=False):
|
def __init__(self, individual_digits=False):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def pre_tokenize(self, pretok):
|
def pre_tokenize(self, pretok):
|
||||||
"""
|
"""
|
||||||
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
||||||
@ -234,6 +237,7 @@ class Digits(PreTokenizer):
|
|||||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def pre_tokenize_str(self, sequence):
|
def pre_tokenize_str(self, sequence):
|
||||||
"""
|
"""
|
||||||
Pre tokenize the given string
|
Pre tokenize the given string
|
||||||
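The "Call 123 please" example from the docstring above, run through `pre_tokenize_str` (offsets shown are illustrative):

from tokenizers.pre_tokenizers import Digits

print(Digits(individual_digits=False).pre_tokenize_str("Call 123 please"))
# e.g. [('Call ', (0, 5)), ('123', (5, 8)), (' please', (8, 15))]
print(Digits(individual_digits=True).pre_tokenize_str("Call 123 please"))
# with individual_digits=True every digit becomes its own piece: ('1', (5, 6)), ('2', (6, 7)), ('3', (7, 8))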
@ -270,9 +274,9 @@ class Metaspace(PreTokenizer):
|
|||||||
Whether to add a space to the first word if there isn't already one. This
|
Whether to add a space to the first word if there isn't already one. This
|
||||||
lets us treat `hello` exactly like `say hello`.
|
lets us treat `hello` exactly like `say hello`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, replacement="_", add_prefix_space=True):
|
def __init__(self, replacement="_", add_prefix_space=True):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def pre_tokenize(self, pretok):
|
def pre_tokenize(self, pretok):
|
||||||
"""
|
"""
|
||||||
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
||||||
@ -289,6 +293,7 @@ class Metaspace(PreTokenizer):
|
|||||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def pre_tokenize_str(self, sequence):
|
def pre_tokenize_str(self, sequence):
|
||||||
"""
|
"""
|
||||||
Pre tokenize the given string
|
Pre tokenize the given string
|
||||||
@ -319,9 +324,9 @@ class Punctuation(PreTokenizer):
|
|||||||
Choices: "removed", "isolated" (default), "merged_with_previous", "merged_with_next",
|
Choices: "removed", "isolated" (default), "merged_with_previous", "merged_with_next",
|
||||||
"contiguous"
|
"contiguous"
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, behavior="isolated"):
|
def __init__(self, behavior="isolated"):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def pre_tokenize(self, pretok):
|
def pre_tokenize(self, pretok):
|
||||||
"""
|
"""
|
||||||
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
||||||
@ -338,6 +343,7 @@ class Punctuation(PreTokenizer):
|
|||||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def pre_tokenize_str(self, sequence):
|
def pre_tokenize_str(self, sequence):
|
||||||
"""
|
"""
|
||||||
Pre tokenize the given string
|
Pre tokenize the given string
|
||||||
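A small sketch of the default "isolated" behavior listed above; the other behaviors merge the punctuation pieces with their neighbours or drop them instead:

from tokenizers.pre_tokenizers import Punctuation

print(Punctuation(behavior="isolated").pre_tokenize_str("Hey friend!"))
# e.g. [('Hey friend', (0, 10)), ('!', (10, 11))]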
@ -362,9 +368,9 @@ class Sequence(PreTokenizer):
|
|||||||
"""
|
"""
|
||||||
This pre-tokenizer composes other pre_tokenizers and applies them in sequence
|
This pre-tokenizer composes other pre_tokenizers and applies them in sequence
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, pretokenizers):
|
def __init__(self, pretokenizers):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def pre_tokenize(self, pretok):
|
def pre_tokenize(self, pretok):
|
||||||
"""
|
"""
|
||||||
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
||||||
@ -381,6 +387,7 @@ class Sequence(PreTokenizer):
|
|||||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def pre_tokenize_str(self, sequence):
|
def pre_tokenize_str(self, sequence):
|
||||||
"""
|
"""
|
||||||
Pre tokenize the given string
|
Pre tokenize the given string
|
||||||
@ -421,9 +428,9 @@ class Split(PreTokenizer):
|
|||||||
invert (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
invert (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||||
Whether to invert the pattern.
|
Whether to invert the pattern.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, pattern, behavior, invert=False):
|
def __init__(self, pattern, behavior, invert=False):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def pre_tokenize(self, pretok):
|
def pre_tokenize(self, pretok):
|
||||||
"""
|
"""
|
||||||
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
||||||
@ -440,6 +447,7 @@ class Split(PreTokenizer):
|
|||||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def pre_tokenize_str(self, sequence):
|
def pre_tokenize_str(self, sequence):
|
||||||
"""
|
"""
|
||||||
Pre tokenize the given string
|
Pre tokenize the given string
|
||||||
@ -467,9 +475,9 @@ class UnicodeScripts(PreTokenizer):
|
|||||||
Actually Hiragana and Katakana are fused with Han, and 0x30FC is Han too.
|
Actually Hiragana and Katakana are fused with Han, and 0x30FC is Han too.
|
||||||
This mimics the SentencePiece Unigram implementation.
|
This mimics the SentencePiece Unigram implementation.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def pre_tokenize(self, pretok):
|
def pre_tokenize(self, pretok):
|
||||||
"""
|
"""
|
||||||
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
||||||
@ -486,6 +494,7 @@ class UnicodeScripts(PreTokenizer):
|
|||||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def pre_tokenize_str(self, sequence):
|
def pre_tokenize_str(self, sequence):
|
||||||
"""
|
"""
|
||||||
Pre tokenize the given string
|
Pre tokenize the given string
|
||||||
@ -510,9 +519,9 @@ class Whitespace(PreTokenizer):
|
|||||||
"""
|
"""
|
||||||
This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
|
This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def pre_tokenize(self, pretok):
|
def pre_tokenize(self, pretok):
|
||||||
"""
|
"""
|
||||||
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
||||||
@ -529,6 +538,7 @@ class Whitespace(PreTokenizer):
|
|||||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def pre_tokenize_str(self, sequence):
|
def pre_tokenize_str(self, sequence):
|
||||||
"""
|
"""
|
||||||
Pre tokenize the given string
|
Pre tokenize the given string
|
||||||
@ -553,9 +563,9 @@ class WhitespaceSplit(PreTokenizer):
|
|||||||
"""
|
"""
|
||||||
This pre-tokenizer simply splits on whitespace. Works like `.split()`
|
This pre-tokenizer simply splits on whitespace. Works like `.split()`
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def pre_tokenize(self, pretok):
|
def pre_tokenize(self, pretok):
|
||||||
"""
|
"""
|
||||||
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
||||||
@ -572,6 +582,7 @@ class WhitespaceSplit(PreTokenizer):
|
|||||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def pre_tokenize_str(self, sequence):
|
def pre_tokenize_str(self, sequence):
|
||||||
"""
|
"""
|
||||||
Pre tokenize the given string
|
Pre tokenize the given string
|
||||||
|
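A side-by-side sketch of the two whitespace pre-tokenizers documented above (outputs are illustrative):

from tokenizers.pre_tokenizers import Whitespace, WhitespaceSplit

text = "Hello, world!"
print(Whitespace().pre_tokenize_str(text))
# the \w+|[^\w\s]+ regex isolates punctuation: [('Hello', (0, 5)), (',', (5, 6)), ('world', (7, 12)), ('!', (12, 13))]
print(WhitespaceSplit().pre_tokenize_str(text))
# plain .split() behaviour: [('Hello,', (0, 6)), ('world!', (7, 13))]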
@ -6,7 +6,6 @@ class PostProcessor:
|
|||||||
This class is not supposed to be instantiated directly. Instead, any implementation of
|
This class is not supposed to be instantiated directly. Instead, any implementation of
|
||||||
a PostProcessor will return an instance of this class when instantiated.
|
a PostProcessor will return an instance of this class when instantiated.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def num_special_tokens_to_add(self, is_pair):
|
def num_special_tokens_to_add(self, is_pair):
|
||||||
"""
|
"""
|
||||||
Return the number of special tokens that would be added for single/pair sentences.
|
Return the number of special tokens that would be added for single/pair sentences.
|
||||||
@ -19,6 +18,7 @@ class PostProcessor:
|
|||||||
:obj:`int`: The number of tokens to add
|
:obj:`int`: The number of tokens to add
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def process(self, encoding, pair=None, add_special_tokens=True):
|
def process(self, encoding, pair=None, add_special_tokens=True):
|
||||||
"""
|
"""
|
||||||
Post-process the given encodings, generating the final one
|
Post-process the given encodings, generating the final one
|
||||||
@ -53,9 +53,9 @@ class BertProcessing(PostProcessor):
|
|||||||
cls (:obj:`Tuple[str, int]`):
|
cls (:obj:`Tuple[str, int]`):
|
||||||
A tuple with the string representation of the CLS token, and its id
|
A tuple with the string representation of the CLS token, and its id
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, sep, cls):
|
def __init__(self, sep, cls):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def num_special_tokens_to_add(self, is_pair):
|
def num_special_tokens_to_add(self, is_pair):
|
||||||
"""
|
"""
|
||||||
Return the number of special tokens that would be added for single/pair sentences.
|
Return the number of special tokens that would be added for single/pair sentences.
|
||||||
@ -68,6 +68,7 @@ class BertProcessing(PostProcessor):
|
|||||||
:obj:`int`: The number of tokens to add
|
:obj:`int`: The number of tokens to add
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def process(self, encoding, pair=None, add_special_tokens=True):
|
def process(self, encoding, pair=None, add_special_tokens=True):
|
||||||
"""
|
"""
|
||||||
Post-process the given encodings, generating the final one
|
Post-process the given encodings, generating the final one
|
||||||
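A minimal sketch of the BertProcessing post-processor stubbed above; the ids 101 and 102 are placeholders for whatever the vocabulary actually assigns to `[CLS]` and `[SEP]`:

from tokenizers.processors import BertProcessing

processor = BertProcessing(("[SEP]", 102), ("[CLS]", 101))  # (sep, cls), each as (token, id)
print(processor.num_special_tokens_to_add(True))   # pair input: [CLS] A [SEP] B [SEP] -> 3
print(processor.num_special_tokens_to_add(False))  # single input: [CLS] A [SEP] -> 2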
@ -98,9 +99,9 @@ class ByteLevel(PostProcessor):
|
|||||||
trim_offsets (:obj:`bool`):
|
trim_offsets (:obj:`bool`):
|
||||||
Whether to trim the whitespaces from the produced offsets.
|
Whether to trim the whitespaces from the produced offsets.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, trim_offsets=True):
|
def __init__(self, trim_offsets=True):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def num_special_tokens_to_add(self, is_pair):
|
def num_special_tokens_to_add(self, is_pair):
|
||||||
"""
|
"""
|
||||||
Return the number of special tokens that would be added for single/pair sentences.
|
Return the number of special tokens that would be added for single/pair sentences.
|
||||||
@ -113,6 +114,7 @@ class ByteLevel(PostProcessor):
|
|||||||
:obj:`int`: The number of tokens to add
|
:obj:`int`: The number of tokens to add
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def process(self, encoding, pair=None, add_special_tokens=True):
|
def process(self, encoding, pair=None, add_special_tokens=True):
|
||||||
"""
|
"""
|
||||||
Post-process the given encodings, generating the final one
|
Post-process the given encodings, generating the final one
|
||||||
@ -159,9 +161,9 @@ class RobertaProcessing(PostProcessor):
|
|||||||
Whether the add_prefix_space option was enabled during pre-tokenization. This
|
Whether the add_prefix_space option was enabled during pre-tokenization. This
|
||||||
is relevant because it defines the way the offsets are trimmed out.
|
is relevant because it defines the way the offsets are trimmed out.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, sep, cls, trim_offsets=True, add_prefix_space=True):
|
def __init__(self, sep, cls, trim_offsets=True, add_prefix_space=True):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def num_special_tokens_to_add(self, is_pair):
|
def num_special_tokens_to_add(self, is_pair):
|
||||||
"""
|
"""
|
||||||
Return the number of special tokens that would be added for single/pair sentences.
|
Return the number of special tokens that would be added for single/pair sentences.
|
||||||
@ -174,6 +176,7 @@ class RobertaProcessing(PostProcessor):
|
|||||||
:obj:`int`: The number of tokens to add
|
:obj:`int`: The number of tokens to add
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def process(self, encoding, pair=None, add_special_tokens=True):
|
def process(self, encoding, pair=None, add_special_tokens=True):
|
||||||
"""
|
"""
|
||||||
Post-process the given encodings, generating the final one
|
Post-process the given encodings, generating the final one
|
||||||
@ -201,9 +204,9 @@ class Sequence(PostProcessor):
|
|||||||
processors (:obj:`List[PostProcessor]`)
|
processors (:obj:`List[PostProcessor]`)
|
||||||
The processors that need to be chained
|
The processors that need to be chained
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, processors):
|
def __init__(self, processors):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def num_special_tokens_to_add(self, is_pair):
|
def num_special_tokens_to_add(self, is_pair):
|
||||||
"""
|
"""
|
||||||
Return the number of special tokens that would be added for single/pair sentences.
|
Return the number of special tokens that would be added for single/pair sentences.
|
||||||
@ -216,6 +219,7 @@ class Sequence(PostProcessor):
|
|||||||
:obj:`int`: The number of tokens to add
|
:obj:`int`: The number of tokens to add
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def process(self, encoding, pair=None, add_special_tokens=True):
|
def process(self, encoding, pair=None, add_special_tokens=True):
|
||||||
"""
|
"""
|
||||||
Post-process the given encodings, generating the final one
|
Post-process the given encodings, generating the final one
|
||||||
@ -302,9 +306,9 @@ class TemplateProcessing(PostProcessor):
|
|||||||
The given dict expects the provided :obj:`ids` and :obj:`tokens` lists to have
|
The given dict expects the provided :obj:`ids` and :obj:`tokens` lists to have
|
||||||
the same length.
|
the same length.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, single, pair, special_tokens):
|
def __init__(self, single, pair, special_tokens):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def num_special_tokens_to_add(self, is_pair):
|
def num_special_tokens_to_add(self, is_pair):
|
||||||
"""
|
"""
|
||||||
Return the number of special tokens that would be added for single/pair sentences.
|
Return the number of special tokens that would be added for single/pair sentences.
|
||||||
@ -317,6 +321,7 @@ class TemplateProcessing(PostProcessor):
|
|||||||
:obj:`int`: The number of tokens to add
|
:obj:`int`: The number of tokens to add
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def process(self, encoding, pair=None, add_special_tokens=True):
|
def process(self, encoding, pair=None, add_special_tokens=True):
|
||||||
"""
|
"""
|
||||||
Post-process the given encodings, generating the final one
|
Post-process the given encodings, generating the final one
|
||||||
|
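For reference, the classic BERT-style template built with the TemplateProcessing API described above; the special-token ids 1 and 2 are placeholders and would normally come from `tokenizer.token_to_id`:

from tokenizers.processors import TemplateProcessing

processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",  # ":1" assigns the second type id to the B sequence
    special_tokens=[("[CLS]", 1), ("[SEP]", 2)],
)
print(processor.num_special_tokens_to_add(True))  # 3 special tokens added for a pair of sequences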
@ -92,7 +92,7 @@ class EncodingVisualizer:
|
|||||||
if default_to_notebook:
|
if default_to_notebook:
|
||||||
try:
|
try:
|
||||||
from IPython.core.display import HTML, display
|
from IPython.core.display import HTML, display
|
||||||
except ImportError as e:
|
except ImportError:
|
||||||
raise Exception(
|
raise Exception(
|
||||||
"""We couldn't import IPython utils for html display.
|
"""We couldn't import IPython utils for html display.
|
||||||
Are you running in a notebook?
|
Are you running in a notebook?
|
||||||
@ -136,7 +136,7 @@ class EncodingVisualizer:
|
|||||||
if final_default_to_notebook:
|
if final_default_to_notebook:
|
||||||
try:
|
try:
|
||||||
from IPython.core.display import HTML, display
|
from IPython.core.display import HTML, display
|
||||||
except ImportError as e:
|
except ImportError:
|
||||||
raise Exception(
|
raise Exception(
|
||||||
"""We couldn't import IPython utils for html display.
|
"""We couldn't import IPython utils for html display.
|
||||||
Are you running in a notebook?"""
|
Are you running in a notebook?"""
|
||||||
@ -170,7 +170,7 @@ class EncodingVisualizer:
|
|||||||
if h_step < 20:
|
if h_step < 20:
|
||||||
h_step = 20
|
h_step = 20
|
||||||
s = 32
|
s = 32
|
||||||
l = 64
|
l = 64 # noqa: E741
|
||||||
h = 10
|
h = 10
|
||||||
colors = {}
|
colors = {}
|
||||||
|
|
||||||
|
@ -80,7 +80,6 @@ class UnigramTrainer(Trainer):
|
|||||||
The number of iterations of the EM algorithm to perform before
|
The number of iterations of the EM algorithm to perform before
|
||||||
pruning the vocabulary.
|
pruning the vocabulary.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
vocab_size=8000,
|
vocab_size=8000,
|
||||||
@ -143,7 +142,6 @@ class WordPieceTrainer(Trainer):
|
|||||||
end_of_word_suffix (:obj:`str`, `optional`):
|
end_of_word_suffix (:obj:`str`, `optional`):
|
||||||
A suffix to be used for every subword that is an end-of-word.
|
A suffix to be used for every subword that is an end-of-word.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
vocab_size=30000,
|
vocab_size=30000,
|
||||||
|
@ -34,7 +34,7 @@ Source = 'https://github.com/huggingface/tokenizers'
|
|||||||
|
|
||||||
|
|
||||||
[project.optional-dependencies]
|
[project.optional-dependencies]
|
||||||
testing = ["pytest", "requests", "numpy", "datasets", "black==22.3"]
|
testing = ["pytest", "requests", "numpy", "datasets", "black==22.3", "ruff"]
|
||||||
docs = ["sphinx", "sphinx_rtd_theme", "setuptools_rust"]
|
docs = ["sphinx", "sphinx_rtd_theme", "setuptools_rust"]
|
||||||
dev = ["tokenizers[testing]"]
|
dev = ["tokenizers[testing]"]
|
||||||
|
|
||||||
@ -52,3 +52,21 @@ features = ["pyo3/extension-module"]
|
|||||||
[tool.black]
|
[tool.black]
|
||||||
line-length = 119
|
line-length = 119
|
||||||
target-version = ['py35']
|
target-version = ['py35']
|
||||||
|
|
||||||
|
[tool.ruff]
|
||||||
|
line-length = 119
|
||||||
|
target-version = "py311"
|
||||||
|
lint.ignore = [
|
||||||
|
# a == None in tests vs is None.
|
||||||
|
"E711",
|
||||||
|
# a == False in tests vs is False.
|
||||||
|
"E712",
|
||||||
|
# try.. import except.. pattern without using the lib.
|
||||||
|
"F401",
|
||||||
|
# Raw type equality is required in asserts
|
||||||
|
"E721",
|
||||||
|
# Import order
|
||||||
|
"E402",
|
||||||
|
# Fixtures unused import
|
||||||
|
"F811",
|
||||||
|
]
|
||||||
|
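For context, hypothetical snippets of the patterns that the ignored rule codes above would normally flag; keeping them in the ignore list lets the existing tests stay unchanged:

# E711 / E712: the tests intentionally compare with == against None / False
result, flag = None, False
assert result == None  # would otherwise have to be "is None"
assert flag == False   # would otherwise have to be "is False"

# E721: raw type equality is required in some asserts
assert type(result) == type(None)

# F401 / E402: try/except import probing and late imports are part of the test setup
try:
    import datasets  # only imported to check availability
except ImportError:
    datasets = None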
@ -80,9 +80,7 @@ class SpmConverter(Converter):
|
|||||||
tokenizer = Tokenizer(Unigram(vocab, unk_id))
|
tokenizer = Tokenizer(Unigram(vocab, unk_id))
|
||||||
elif model_type == 2:
|
elif model_type == 2:
|
||||||
vocab, merges = SentencePieceExtractor(self.original_tokenizer.vocab_file).extract()
|
vocab, merges = SentencePieceExtractor(self.original_tokenizer.vocab_file).extract()
|
||||||
tokenizer = Tokenizer(
|
tokenizer = Tokenizer(BPE(vocab, merges, unk_token=proto.trainer_spec.unk_piece, fuse_unk=True))
|
||||||
BPE(vocab, merges, unk_token=proto.trainer_spec.unk_piece, fuse_unk=True)
|
|
||||||
)
|
|
||||||
else:
|
else:
|
||||||
raise Exception(
|
raise Exception(
|
||||||
"You're trying to run a `Unigram` model but you're file was trained with a different algorithm"
|
"You're trying to run a `Unigram` model but you're file was trained with a different algorithm"
|
||||||
@ -105,12 +103,8 @@ class SpmConverter(Converter):
|
|||||||
|
|
||||||
replacement = "▁"
|
replacement = "▁"
|
||||||
add_prefix_space = True
|
add_prefix_space = True
|
||||||
tokenizer.pre_tokenizer = Metaspace(
|
tokenizer.pre_tokenizer = Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
|
||||||
replacement=replacement, add_prefix_space=add_prefix_space
|
tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
|
||||||
)
|
|
||||||
tokenizer.decoder = decoders.Metaspace(
|
|
||||||
replacement=replacement, add_prefix_space=add_prefix_space
|
|
||||||
)
|
|
||||||
post_processor = self.post_processor(tokenizer)
|
post_processor = self.post_processor(tokenizer)
|
||||||
if post_processor:
|
if post_processor:
|
||||||
tokenizer.post_processor = post_processor
|
tokenizer.post_processor = post_processor
|
||||||
@ -124,9 +118,7 @@ class SpmConverter(Converter):
|
|||||||
class AlbertConverter(SpmConverter):
|
class AlbertConverter(SpmConverter):
|
||||||
def vocab(self, proto):
|
def vocab(self, proto):
|
||||||
return [
|
return [
|
||||||
(piece.piece, piece.score)
|
(piece.piece, piece.score) if check_number_comma(piece.piece) else (piece.piece, piece.score - 100)
|
||||||
if check_number_comma(piece.piece)
|
|
||||||
else (piece.piece, piece.score - 100)
|
|
||||||
for piece in proto.pieces
|
for piece in proto.pieces
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -261,9 +253,7 @@ class XLMRobertaConverter(SpmConverter):
|
|||||||
class XLNetConverter(SpmConverter):
|
class XLNetConverter(SpmConverter):
|
||||||
def vocab(self, proto):
|
def vocab(self, proto):
|
||||||
return [
|
return [
|
||||||
(piece.piece, piece.score)
|
(piece.piece, piece.score) if check_number_comma(piece.piece) else (piece.piece, piece.score - 100)
|
||||||
if check_number_comma(piece.piece)
|
|
||||||
else (piece.piece, piece.score - 100)
|
|
||||||
for piece in proto.pieces
|
for piece in proto.pieces
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -420,9 +410,7 @@ def main():
|
|||||||
print(f"|{'-'*model_len}|{'-'*status_len}|{'-'*speedup_len}|")
|
print(f"|{'-'*model_len}|{'-'*status_len}|{'-'*speedup_len}|")
|
||||||
for pretrained in args.models:
|
for pretrained in args.models:
|
||||||
status, speedup = check(pretrained, args.filename)
|
status, speedup = check(pretrained, args.filename)
|
||||||
print(
|
print(f"|{pretrained:<{model_len}}|{status:^{status_len}}|{speedup:^{speedup_len - 1}.2f}x|")
|
||||||
f"|{pretrained:<{model_len}}|{status:^{status_len}}|{speedup:^{speedup_len - 1}.2f}x|"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
@ -59,7 +59,6 @@ class YouTokenToMeExtractor:
|
|||||||
|
|
||||||
def extract(self) -> Tuple[Dict[str, int], List[Tuple]]:
|
def extract(self) -> Tuple[Dict[str, int], List[Tuple]]:
|
||||||
with open(self._model, "r") as model_f:
|
with open(self._model, "r") as model_f:
|
||||||
|
|
||||||
# Retrieve information
|
# Retrieve information
|
||||||
nb_pieces, nb_merges = map(int, model_f.readline().split())
|
nb_pieces, nb_merges = map(int, model_f.readline().split())
|
||||||
vocab, merges = {}, []
|
vocab, merges = {}, []
|
||||||
@ -97,9 +96,7 @@ if __name__ == "__main__":
|
|||||||
choices=["sentencepiece", "youtokentome"],
|
choices=["sentencepiece", "youtokentome"],
|
||||||
help="Indicate the format of the file.",
|
help="Indicate the format of the file.",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument("--model", type=str, required=True, help="SentencePiece model to extract vocab from.")
|
||||||
"--model", type=str, required=True, help="SentencePiece model to extract vocab from."
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--vocab-output-path",
|
"--vocab-output-path",
|
||||||
type=str,
|
type=str,
|
||||||
@ -128,9 +125,7 @@ if __name__ == "__main__":
|
|||||||
args.model = f.name
|
args.model = f.name
|
||||||
|
|
||||||
# Allocate extractor
|
# Allocate extractor
|
||||||
extractor = (
|
extractor = SentencePieceExtractor if args.provider == "sentencepiece" else YouTokenToMeExtractor
|
||||||
SentencePieceExtractor if args.provider == "sentencepiece" else YouTokenToMeExtractor
|
|
||||||
)
|
|
||||||
extractor = extractor(args.model)
|
extractor = extractor(args.model)
|
||||||
|
|
||||||
logger.info(f"Using {type(extractor).__name__}")
|
logger.info(f"Using {type(extractor).__name__}")
|
||||||
|
@ -121,9 +121,7 @@ def check_train(args):
|
|||||||
break
|
break
|
||||||
|
|
||||||
print(f"Tokenizer used {tokenizer_tokens}, where spm used {spm_tokens}")
|
print(f"Tokenizer used {tokenizer_tokens}, where spm used {spm_tokens}")
|
||||||
assert (
|
assert tokenizer_tokens < spm_tokens, "Our trainer should be at least more efficient than the SPM one"
|
||||||
tokenizer_tokens < spm_tokens
|
|
||||||
), "Our trainer should be at least more efficient than the SPM one"
|
|
||||||
print("Ok our trainer is at least more efficient than the SPM one")
|
print("Ok our trainer is at least more efficient than the SPM one")
|
||||||
|
|
||||||
|
|
||||||
@ -131,9 +129,7 @@ def check_diff(spm_diff, tok_diff, sp, tok):
|
|||||||
if spm_diff == list(reversed(tok_diff)):
|
if spm_diff == list(reversed(tok_diff)):
|
||||||
# AAA -> AA+A vs A+AA case.
|
# AAA -> AA+A vs A+AA case.
|
||||||
return True
|
return True
|
||||||
elif len(spm_diff) == len(tok_diff) and tok.decode(spm_diff) == tok.decode(
|
elif len(spm_diff) == len(tok_diff) and tok.decode(spm_diff) == tok.decode(tok_diff):
|
||||||
tok_diff
|
|
||||||
):
|
|
||||||
# Second order OK
|
# Second order OK
|
||||||
# Barrich -> Barr + ich vs Bar + rich
|
# Barrich -> Barr + ich vs Bar + rich
|
||||||
return True
|
return True
|
||||||
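A condensed sketch of the equivalence rule the two comments above describe, assuming `tok.decode` maps a list of ids back to text:

def ids_equivalent(spm_diff, tok_diff, tok):
    # first-order difference: same pieces merged in the opposite order (AAA -> AA+A vs A+AA)
    if spm_diff == list(reversed(tok_diff)):
        return True
    # second-order difference: different pieces, same decoded surface text
    # (Barrich -> Barr + ich vs Bar + rich)
    return len(spm_diff) == len(tok_diff) and tok.decode(spm_diff) == tok.decode(tok_diff)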
@ -173,24 +169,17 @@ def check_details(line, spm_ids, tok_ids, sp, tok):
|
|||||||
spms = Counter(spm_ids[first:last])
|
spms = Counter(spm_ids[first:last])
|
||||||
toks = Counter(tok_ids[first:last])
|
toks = Counter(tok_ids[first:last])
|
||||||
|
|
||||||
removable_tokens = {
|
removable_tokens = {spm_ for (spm_, si) in spms.items() if toks.get(spm_, 0) == si}
|
||||||
spm_ for (spm_, si) in spms.items() if toks.get(spm_, 0) == si
|
|
||||||
}
|
|
||||||
min_width = 3
|
min_width = 3
|
||||||
for i in range(last - first - min_width):
|
for i in range(last - first - min_width):
|
||||||
if all(
|
if all(spm_ids[first + i + j] in removable_tokens for j in range(min_width)):
|
||||||
spm_ids[first + i + j] in removable_tokens for j in range(min_width)
|
|
||||||
):
|
|
||||||
possible_matches = [
|
possible_matches = [
|
||||||
k
|
k
|
||||||
for k in range(last - first - min_width)
|
for k in range(last - first - min_width)
|
||||||
if tok_ids[first + k : first + k + min_width]
|
if tok_ids[first + k : first + k + min_width] == spm_ids[first + i : first + i + min_width]
|
||||||
== spm_ids[first + i : first + i + min_width]
|
|
||||||
]
|
]
|
||||||
for j in possible_matches:
|
for j in possible_matches:
|
||||||
if check_diff(
|
if check_diff(spm_ids[first : first + i], tok_ids[first : first + j], sp, tok) and check_details(
|
||||||
spm_ids[first : first + i], tok_ids[first : first + j], sp, tok
|
|
||||||
) and check_details(
|
|
||||||
line,
|
line,
|
||||||
spm_ids[first + i : last],
|
spm_ids[first + i : last],
|
||||||
tok_ids[first + j : last],
|
tok_ids[first + j : last],
|
||||||
@ -210,9 +199,7 @@ def check_details(line, spm_ids, tok_ids, sp, tok):
|
|||||||
wrong = tok.decode(spm_ids[first:last])
|
wrong = tok.decode(spm_ids[first:last])
|
||||||
print()
|
print()
|
||||||
if has_color:
|
if has_color:
|
||||||
print(
|
print(f"{colored(ok_start, 'grey')}{colored(wrong, 'red')}{colored(ok_end, 'grey')}")
|
||||||
f"{colored(ok_start, 'grey')}{colored(wrong, 'red')}{colored(ok_end, 'grey')}"
|
|
||||||
)
|
|
||||||
else:
|
else:
|
||||||
print(wrong)
|
print(wrong)
|
||||||
return False
|
return False
|
||||||
@ -251,9 +238,7 @@ def check_encode(args):
|
|||||||
|
|
||||||
if args.verbose:
|
if args.verbose:
|
||||||
if i % 10000 == 0:
|
if i % 10000 == 0:
|
||||||
print(
|
print(f"({perfect} / {imperfect} / {wrong} ----- {perfect + imperfect + wrong})")
|
||||||
f"({perfect} / {imperfect} / {wrong} ----- {perfect + imperfect + wrong})"
|
|
||||||
)
|
|
||||||
print(f"SPM: {spm_total_time} - TOK: {tok_total_time}")
|
print(f"SPM: {spm_total_time} - TOK: {tok_total_time}")
|
||||||
|
|
||||||
if ids != encoded.ids:
|
if ids != encoded.ids:
|
||||||
@ -265,13 +250,13 @@ def check_encode(args):
|
|||||||
else:
|
else:
|
||||||
perfect += 1
|
perfect += 1
|
||||||
|
|
||||||
assert ids == encoded.ids, f"line {i}: {line} : \n\n{ids}\n{encoded.ids}\n{list(zip(encoded.ids, encoded.tokens))}"
|
assert (
|
||||||
|
ids == encoded.ids
|
||||||
|
), f"line {i}: {line} : \n\n{ids}\n{encoded.ids}\n{list(zip(encoded.ids, encoded.tokens))}"
|
||||||
|
|
||||||
print(f"({perfect} / {imperfect} / {wrong} ----- {perfect + imperfect + wrong})")
|
print(f"({perfect} / {imperfect} / {wrong} ----- {perfect + imperfect + wrong})")
|
||||||
total = perfect + imperfect + wrong
|
total = perfect + imperfect + wrong
|
||||||
print(
|
print(f"Accuracy {perfect * 100 / total:.2f} Slowdown : {tok_total_time/ spm_total_time:.2f}")
|
||||||
f"Accuracy {perfect * 100 / total:.2f} Slowdown : {tok_total_time/ spm_total_time:.2f}"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
@ -3,8 +3,6 @@ import inspect
|
|||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import black
|
|
||||||
|
|
||||||
|
|
||||||
INDENT = " " * 4
|
INDENT = " " * 4
|
||||||
GENERATED_COMMENT = "# Generated content DO NOT EDIT\n"
|
GENERATED_COMMENT = "# Generated content DO NOT EDIT\n"
|
||||||
@ -85,7 +83,7 @@ def pyi_file(obj, indent=""):
|
|||||||
body += f"{indent+INDENT}pass\n"
|
body += f"{indent+INDENT}pass\n"
|
||||||
body += "\n"
|
body += "\n"
|
||||||
|
|
||||||
for (name, fn) in fns:
|
for name, fn in fns:
|
||||||
body += pyi_file(fn, indent=indent)
|
body += pyi_file(fn, indent=indent)
|
||||||
|
|
||||||
if not body:
|
if not body:
|
||||||
@ -122,18 +120,17 @@ def py_file(module, origin):
|
|||||||
return string
|
return string
|
||||||
|
|
||||||
|
|
||||||
def do_black(content, is_pyi):
|
import subprocess
|
||||||
mode = black.Mode(
|
from typing import List, Optional, Tuple
|
||||||
target_versions={black.TargetVersion.PY35},
|
|
||||||
line_length=119,
|
|
||||||
is_pyi=is_pyi,
|
def do_ruff(code, is_pyi: bool):
|
||||||
string_normalization=True,
|
command = ["ruff", "format", "--config", "pyproject.toml", "--silent", "-"]
|
||||||
experimental_string_processing=False,
|
if is_pyi:
|
||||||
)
|
command.extend(["--stdin-filename", "test.pyi"])
|
||||||
try:
|
process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
|
||||||
return black.format_file_contents(content, fast=True, mode=mode)
|
stdout, _ = process.communicate(input=code.encode("utf-8"))
|
||||||
except black.NothingChanged:
|
return stdout.decode("utf-8")
|
||||||
return content
|
|
||||||
|
|
||||||
|
|
||||||
def write(module, directory, origin, check=False):
|
def write(module, directory, origin, check=False):
|
||||||
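A quick way to sanity-check the new helper above; this assumes `ruff` is installed and that a `pyproject.toml` is present in the working directory, since the command passes `--config pyproject.toml`:

messy = "def f( x ):\n    return x+1\n"
print(do_ruff(messy, is_pyi=False))
# expected, after ruff format:
# def f(x):
#     return x + 1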
@ -141,7 +138,7 @@ def write(module, directory, origin, check=False):
|
|||||||
|
|
||||||
filename = os.path.join(directory, "__init__.pyi")
|
filename = os.path.join(directory, "__init__.pyi")
|
||||||
pyi_content = pyi_file(module)
|
pyi_content = pyi_file(module)
|
||||||
pyi_content = do_black(pyi_content, is_pyi=True)
|
pyi_content = do_ruff(pyi_content, is_pyi=True)
|
||||||
os.makedirs(directory, exist_ok=True)
|
os.makedirs(directory, exist_ok=True)
|
||||||
if check:
|
if check:
|
||||||
with open(filename, "r") as f:
|
with open(filename, "r") as f:
|
||||||
@ -153,7 +150,7 @@ def write(module, directory, origin, check=False):
|
|||||||
|
|
||||||
filename = os.path.join(directory, "__init__.py")
|
filename = os.path.join(directory, "__init__.py")
|
||||||
py_content = py_file(module, origin)
|
py_content = py_file(module, origin)
|
||||||
py_content = do_black(py_content, is_pyi=False)
|
py_content = do_ruff(py_content, is_pyi=False)
|
||||||
os.makedirs(directory, exist_ok=True)
|
os.makedirs(directory, exist_ok=True)
|
||||||
|
|
||||||
is_auto = False
|
is_auto = False
|
||||||
|
@ -3,7 +3,6 @@ import pickle
|
|||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from tokenizers.models import BPE, Model, WordLevel, WordPiece
|
from tokenizers.models import BPE, Model, WordLevel, WordPiece
|
||||||
|
|
||||||
from ..utils import bert_files, data_dir, roberta_files
|
from ..utils import bert_files, data_dir, roberta_files
|
||||||
|
|
||||||
|
|
||||||
|
@ -2,8 +2,7 @@ import pickle
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from tokenizers import NormalizedString, Tokenizer
|
from tokenizers import NormalizedString
|
||||||
from tokenizers.models import BPE
|
|
||||||
from tokenizers.normalizers import BertNormalizer, Lowercase, Normalizer, Sequence, Strip, Prepend
|
from tokenizers.normalizers import BertNormalizer, Lowercase, Normalizer, Sequence, Strip, Prepend
|
||||||
|
|
||||||
|
|
||||||
|
@ -146,18 +146,18 @@ class TestTemplateProcessing:
|
|||||||
assert isinstance(pickle.loads(pickle.dumps(bert)), TemplateProcessing)
|
assert isinstance(pickle.loads(pickle.dumps(bert)), TemplateProcessing)
|
||||||
|
|
||||||
# It is absolutely legal to have tokens with spaces in the name:
|
# It is absolutely legal to have tokens with spaces in the name:
|
||||||
processor = TemplateProcessing(
|
TemplateProcessing(
|
||||||
single=["[ C L S ]", "Token with space"],
|
single=["[ C L S ]", "Token with space"],
|
||||||
special_tokens=[("[ C L S ]", 0), ("Token with space", 1)],
|
special_tokens=[("[ C L S ]", 0), ("Token with space", 1)],
|
||||||
)
|
)
|
||||||
# Sequence identifiers must be well formed:
|
# Sequence identifiers must be well formed:
|
||||||
with pytest.raises(Exception, match="Cannot build Piece"):
|
with pytest.raises(Exception, match="Cannot build Piece"):
|
||||||
processor = TemplateProcessing(single="[CLS] $$ [SEP]")
|
TemplateProcessing(single="[CLS] $$ [SEP]")
|
||||||
with pytest.raises(Exception, match="Cannot build Piece"):
|
with pytest.raises(Exception, match="Cannot build Piece"):
|
||||||
processor = TemplateProcessing(single="[CLS] $A: [SEP]")
|
TemplateProcessing(single="[CLS] $A: [SEP]")
|
||||||
# Special tokens must be provided when used in template:
|
# Special tokens must be provided when used in template:
|
||||||
with pytest.raises(Exception, match="Missing SpecialToken\\(s\\) with id\\(s\\)"):
|
with pytest.raises(Exception, match="Missing SpecialToken\\(s\\) with id\\(s\\)"):
|
||||||
processor = TemplateProcessing(single=["[CLS]"])
|
TemplateProcessing(single=["[CLS]"])
|
||||||
|
|
||||||
def test_bert_parity(self):
|
def test_bert_parity(self):
|
||||||
tokenizer = Tokenizer(BPE())
|
tokenizer = Tokenizer(BPE())
|
||||||
|
@ -5,10 +5,9 @@ import pytest
|
|||||||
|
|
||||||
from tokenizers import AddedToken, Encoding, Tokenizer
|
from tokenizers import AddedToken, Encoding, Tokenizer
|
||||||
from tokenizers.implementations import BertWordPieceTokenizer
|
from tokenizers.implementations import BertWordPieceTokenizer
|
||||||
from tokenizers.models import BPE, Model, WordPiece, Unigram
|
from tokenizers.models import BPE, Model, Unigram
|
||||||
from tokenizers.normalizers import Lowercase
|
|
||||||
from tokenizers.pre_tokenizers import ByteLevel
|
from tokenizers.pre_tokenizers import ByteLevel
|
||||||
from tokenizers.processors import BertProcessing, RobertaProcessing
|
from tokenizers.processors import RobertaProcessing
|
||||||
|
|
||||||
from ..utils import bert_files, data_dir, multiprocessing_with_parallelism, roberta_files
|
from ..utils import bert_files, data_dir, multiprocessing_with_parallelism, roberta_files
|
||||||
|
|
||||||
|
@ -2,7 +2,6 @@ from tokenizers import Tokenizer
|
|||||||
|
|
||||||
from ..utils import data_dir, doc_pipeline_bert_tokenizer, doc_wiki_tokenizer
|
from ..utils import data_dir, doc_pipeline_bert_tokenizer, doc_wiki_tokenizer
|
||||||
|
|
||||||
|
|
||||||
disable_printing = True
|
disable_printing = True
|
||||||
original_print = print
|
original_print = print
|
||||||
|
|
||||||
|
@ -1,8 +1,4 @@
|
|||||||
from tokenizers import Tokenizer
|
from tokenizers import Tokenizer
|
||||||
from tokenizers.models import BPE
|
|
||||||
from tokenizers.pre_tokenizers import Whitespace
|
|
||||||
from tokenizers.trainers import BpeTrainer
|
|
||||||
|
|
||||||
from ..utils import data_dir, doc_wiki_tokenizer
|
from ..utils import data_dir, doc_wiki_tokenizer
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
# flake8: noqa
|
||||||
import gzip
|
import gzip
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
@ -1,5 +1,3 @@
|
|||||||
import pytest
|
|
||||||
|
|
||||||
from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, processors
|
from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, processors
|
||||||
from tokenizers.implementations import BaseTokenizer
|
from tokenizers.implementations import BaseTokenizer
|
||||||
|
|
||||||
|
@ -1,5 +1,3 @@
|
|||||||
import pytest
|
|
||||||
|
|
||||||
from tokenizers import BertWordPieceTokenizer
|
from tokenizers import BertWordPieceTokenizer
|
||||||
|
|
||||||
from ..utils import bert_files, data_dir, multiprocessing_with_parallelism
|
from ..utils import bert_files, data_dir, multiprocessing_with_parallelism
|
||||||
|
@ -1,5 +1,3 @@
|
|||||||
import pytest
|
|
||||||
|
|
||||||
from tokenizers import ByteLevelBPETokenizer
|
from tokenizers import ByteLevelBPETokenizer
|
||||||
|
|
||||||
from ..utils import data_dir, multiprocessing_with_parallelism, roberta_files
|
from ..utils import data_dir, multiprocessing_with_parallelism, roberta_files
|
||||||
|
@ -1,5 +1,3 @@
|
|||||||
import pytest
|
|
||||||
|
|
||||||
from tokenizers import CharBPETokenizer
|
from tokenizers import CharBPETokenizer
|
||||||
|
|
||||||
from ..utils import data_dir, multiprocessing_with_parallelism, openai_files
|
from ..utils import data_dir, multiprocessing_with_parallelism, openai_files
|
||||||
|
@ -1,5 +1,3 @@
|
|||||||
import os
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from tokenizers import SentencePieceBPETokenizer, SentencePieceUnigramTokenizer
|
from tokenizers import SentencePieceBPETokenizer, SentencePieceUnigramTokenizer
|
||||||
|
@ -6,7 +6,6 @@ import tqdm
|
|||||||
|
|
||||||
from huggingface_hub import HfApi, cached_download, hf_hub_url
|
from huggingface_hub import HfApi, cached_download, hf_hub_url
|
||||||
from tokenizers import Tokenizer
|
from tokenizers import Tokenizer
|
||||||
|
|
||||||
from .utils import albert_base, data_dir
|
from .utils import albert_base, data_dir
|
||||||
|
|
||||||
|
|
||||||
@ -15,7 +14,7 @@ class TestSerialization:
|
|||||||
# Check we can read this file.
|
# Check we can read this file.
|
||||||
# This used to fail because of BufReader that would fail because the
|
# This used to fail because of BufReader that would fail because the
|
||||||
# file exceeds the buffer capacity
|
# file exceeds the buffer capacity
|
||||||
tokenizer = Tokenizer.from_file(albert_base)
|
Tokenizer.from_file(albert_base)
|
||||||
|
|
||||||
|
|
||||||
def check(tokenizer_file) -> bool:
|
def check(tokenizer_file) -> bool:
|
||||||
@ -51,8 +50,6 @@ class TestFullDeserialization(unittest.TestCase):
|
|||||||
# Check we can read this file.
|
# Check we can read this file.
|
||||||
# This used to fail because of BufReader that would fail because the
|
# This used to fail because of BufReader that would fail because the
|
||||||
# file exceeds the buffer capacity
|
# file exceeds the buffer capacity
|
||||||
api = HfApi()
|
|
||||||
|
|
||||||
not_loadable = []
|
not_loadable = []
|
||||||
invalid_pre_tokenizer = []
|
invalid_pre_tokenizer = []
|
||||||
|
|
||||||
@ -77,7 +74,7 @@ class TestFullDeserialization(unittest.TestCase):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"{model_id} is not loadable: {e}")
|
print(f"{model_id} is not loadable: {e}")
|
||||||
not_loadable.append(model_id)
|
not_loadable.append(model_id)
|
||||||
except:
|
except: # noqa: E722
|
||||||
print(f"{model_id} is not loadable: Rust error")
|
print(f"{model_id} is not loadable: Rust error")
|
||||||
not_loadable.append(model_id)
|
not_loadable.append(model_id)
|
||||||
|
|
||||||
|