Automatically stubbing the .pyi files while keeping inspection ability (#509)

* First pass at automatically stubbing our Python files.

* And now modifying all Rust docs to be visible in the .pyi files.

* Better assert failure message.

* Fixing the GitHub workflow.

* Removing types that are no longer exported.

* Fixing `Tokenizer` signature.

* Disabling auto __init__.py.

* Re-enabling some types.

* Don't overwrite non-automated __init__.py.

* Automated most __init__.py.

* Restubbing after rebase.

* Fixing env for tests.

* Install black in the env.

* Use PY35 target in stub.py

Co-authored-by: Anthony MOI <m.anthony.moi@gmail.com>
Author: Nicolas Patry
Date: 2020-11-17 21:13:00 +01:00 (committed by GitHub)
Parent: fff856cff7
Commit: 352c92ad33
25 changed files with 2511 additions and 1426 deletions
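
The gist of the change: the .pyi stubs under py_src/tokenizers are now generated from the compiled extension itself, so the Rust doc comments and #[text_signature] attributes added below feed both runtime introspection (help(), inspect.signature()) and the stub files. The actual stub.py added by this commit is not reproduced in this excerpt; a minimal, hypothetical sketch of that kind of generator looks like:

    import inspect

    def stub_class(cls) -> str:
        """Emit a .pyi-style stub for a compiled class, reusing its Rust docstrings."""
        lines = [f"class {cls.__name__}:"]
        if cls.__doc__:
            lines += ['    """', "    " + cls.__doc__.strip(), '    """']
        for name, member in vars(cls).items():
            if not callable(member) or (name.startswith("_") and name != "__init__"):
                continue
            # PyO3's #[text_signature = "..."] lands in __text_signature__, which is
            # also what keeps inspect.signature() usable on the extension types.
            sig = getattr(member, "__text_signature__", None) or "(self, *args, **kwargs)"
            lines.append(f"    def {name}{sig}:")
            if member.__doc__:
                lines += ['        """', "        " + member.__doc__.strip(), '        """']
            lines.append("        pass")
        return "\n".join(lines)

Running something of this shape over tokenizers.decoders, tokenizers.models, etc. and writing the result under a "# Generated content DO NOT EDIT" header is what produces the files shown below.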

View File

@ -11,26 +11,6 @@ on:
- bindings/node/** - bindings/node/**
jobs: jobs:
code_quality:
name: Check Code Quality
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v1
- name: Install Python
uses: actions/setup-python@v1
with:
python-version: 3.6
architecture: "x64"
- name: Install dependencies
run: pip install black==20.8b1
- name: Check style
working-directory: ./bindings/python
run: make check-style
build_win_32: build_win_32:
name: Check it builds for Windows 32-bit name: Check it builds for Windows 32-bit
runs-on: windows-latest runs-on: windows-latest
@ -115,11 +95,23 @@ jobs:
python-version: 3.6 python-version: 3.6
architecture: "x64" architecture: "x64"
- name: Run tests - name: Install
working-directory: ./bindings/python working-directory: ./bindings/python
run: | run: |
python -m venv .env python -m venv .env
source .env/bin/activate source .env/bin/activate
pip install pytest requests setuptools_rust numpy pip install pytest requests setuptools_rust numpy
python setup.py develop python setup.py develop
- name: Check style
working-directory: ./bindings/python
run: |
source .env/bin/activate
pip install black==20.8b1
make check-style
- name: Run tests
working-directory: ./bindings/python
run: |
source .env/bin/activate
make test make test

View File

@ -6,10 +6,12 @@ dir_guard=@mkdir -p $(@D)
# Format source code automatically # Format source code automatically
style: style:
python stub.py
black --line-length 100 --target-version py35 examples py_src/tokenizers tests black --line-length 100 --target-version py35 examples py_src/tokenizers tests
# Check the source code is formatted correctly # Check the source code is formatted correctly
check-style: check-style:
python stub.py --check
black --check --line-length 100 --target-version py35 examples py_src/tokenizers tests black --check --line-length 100 --target-version py35 examples py_src/tokenizers tests
TESTS_RESOURCES = $(DATA_DIR)/small.txt $(DATA_DIR)/roberta.json TESTS_RESOURCES = $(DATA_DIR)/small.txt $(DATA_DIR)/roberta.json
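
`make style` now regenerates the stubs (`python stub.py`) before running black, and `make check-style` runs `python stub.py --check`. The script itself is not shown in this excerpt, but a plausible shape for the check mode is to regenerate each stub in memory and fail when the file on disk no longer matches; the helper below is an illustrative sketch, not the real implementation:

    import sys
    from pathlib import Path

    def verify_stub(path: Path, generated: str) -> None:
        """Exit non-zero when a generated stub on disk is stale, so `make check-style` fails in CI."""
        if path.read_text() != generated:
            sys.exit(f"{path} is out of date; run `python stub.py` and commit the result")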

File diff suppressed because it is too large.

View File

@ -1,44 +1,52 @@
from typing import List # Generated content DO NOT EDIT
class Decoder: class Decoder:
"""Base class for all decoders """
Base class for all decoders
This class is not supposed to be instantiated directly. Instead, any implementation of This class is not supposed to be instantiated directly. Instead, any implementation of
a Decoder will return an instance of this class when instantiated. a Decoder will return an instance of this class when instantiated.
""" """
def decode(self, tokens: List[str]) -> str: def decode(self, tokens):
""" Decode the given list of string to a final string """ """
Decode the given list of string to a final string
"""
pass
class BPEDecoder(Decoder):
"""
Instantiate a new BPEDecoder
Args:
suffix: str:
The suffix that was used to caracterize an end-of-word. This suffix will
be replaced by whitespaces during the decoding
"""
def __init__(self, suffix="</w>"):
pass
def decode(self, tokens):
"""
Decode the given list of string to a final string
"""
pass pass
class ByteLevel(Decoder): class ByteLevel(Decoder):
""" ByteLevel Decoder """ """
ByteLevel Decoder
"""
def __init__(self) -> None: def __init__(self):
""" Instantiate a new ByteLevel Decoder """
pass pass
def decode(self, tokens):
class WordPiece(Decoder): """
""" WordPiece Decoder """ Decode the given list of string to a final string
@staticmethod
def __init__(self, prefix: str = "##", cleanup: bool = True) -> Decoder:
"""Instantiate a new WordPiece Decoder
Args:
prefix: str:
The prefix to use for subwords that are not a beginning-of-word
cleanup: bool:
Whether to cleanup some tokenization artifacts. Mainly spaces before punctuation,
and some abbreviated english forms.
""" """
pass pass
class Metaspace(Decoder): class Metaspace(Decoder):
""" Metaspace decoder """ """
Instantiate a new Metaspace
def __init__(self, replacement: str = "", add_prefix_space: bool = True) -> None:
"""Instantiate a new Metaspace
Args: Args:
replacement: str: replacement: str:
@ -49,17 +57,31 @@ class Metaspace(Decoder):
Whether to add a space to the first word if there isn't already one. This Whether to add a space to the first word if there isn't already one. This
lets us treat `hello` exactly like `say hello`. lets us treat `hello` exactly like `say hello`.
""" """
def __init__(self, replacement="", add_prefix_space=True):
pass pass
def decode(self, tokens):
class BPEDecoder(Decoder): """
""" BPEDecoder """ Decode the given list of string to a final string
"""
def __init__(self, suffix: str = "</w>") -> None: pass
"""Instantiate a new BPEDecoder
class WordPiece(Decoder):
Args: """
suffix: str: Instantiate a new WordPiece Decoder
The suffix that was used to caracterize an end-of-word. This suffix will
be replaced by whitespaces during the decoding Args:
prefix: str:
The prefix to use for subwords that are not a beginning-of-word
cleanup: bool:
Whether to cleanup some tokenization artifacts. Mainly spaces before punctuation,
and some abbreviated english forms.
"""
def __init__(self, prefix="##", cleanup=True):
pass
def decode(self, tokens):
"""
Decode the given list of string to a final string
""" """
pass pass
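
A quick usage example for the decoders documented above, following the stubbed signatures (the token lists are illustrative):

    from tokenizers import decoders

    wordpiece = decoders.WordPiece(prefix="##", cleanup=True)
    print(wordpiece.decode(["un", "##believ", "##able"]))  # "unbelievable"

    bpe = decoders.BPEDecoder(suffix="</w>")
    print(bpe.decode(["hel", "lo</w>", "friend</w>"]))  # the suffix is turned back into word boundaries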

View File

@ -1,9 +1,8 @@
from typing import List, Tuple # Generated content DO NOT EDIT
from .. import models
from .. import models, Offsets
Model = models.Model Model = models.Model
BPE = models.BPE BPE = models.BPE
WordPiece = models.WordPiece
WordLevel = models.WordLevel
Unigram = models.Unigram Unigram = models.Unigram
WordLevel = models.WordLevel
WordPiece = models.WordPiece
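
These __init__.py files are generated as well and simply re-export the compiled classes, so both import styles refer to the same objects:

    from tokenizers import models
    from tokenizers.models import BPE

    assert BPE is models.BPE  # the generated __init__.py is just `BPE = models.BPE`, etc.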

View File

@ -1,34 +1,37 @@
from .. import Encoding, Offsets, Token # Generated content DO NOT EDIT
from typing import List, Optional, Union, Tuple, Dict
class Model: class Model:
"""Base class for all models """
A Model represents some tokenization algorithm like BPE or Word
This class is not supposed to be instantiated directly. Instead, any implementation of This class cannot be constructed directly. Please use one of the concrete models.
a Model will return a instance of this class when instantiated.
""" """
def tokenize(self, sequence: str) -> List[Token]: def id_to_token(self, id):
""" Tokenize the given sequence """ """
Returns the token associated with the given id
"""
pass pass
def token_to_id(self, token: str) -> Optional[int]: def save(self, folder, name):
""" Returns the id associated with the given token """ """
pass Save the current model
def id_to_token(self, id: int) -> Optional[str]:
""" Returns the token associated with the given id """
pass
def save(self, folder: str, name: Optional[str] = None) -> List[str]:
"""Save the current model
Save the current model in the given folder, using the given name for the various Save the current model in the given folder, using the given name for the various
files that will get created. files that will get created.
Any file with the same name that already exist in this folder will be overwritten. Any file with the same name that already exist in this folder will be overwritten.
""" """
pass pass
def token_to_id(self, tokens):
"""
Returns the id associated with the given token
"""
pass
def tokenize(self, tokens):
"""
Tokenize the given sequence
"""
pass
class BPE(Model): class BPE(Model):
"""BytePairEncoding model class """
Instantiate a BPE Model from the given vocab and merges. Instantiate a BPE Model from the given vocab and merges.
Args: Args:
@ -61,21 +64,18 @@ class BPE(Model):
def __init__( def __init__(
self, self,
vocab: Optional[Union[str, Dict[str, int]]], vocab=None,
merges: Optional[Union[str, List[Tuple[str, str]]]], merges=None,
cache_capacity: Optional[int], cache_capacity=None,
dropout: Optional[float], dropout=None,
unk_token: Optional[str], unk_token=None,
continuing_subword_prefix: Optional[str], continuing_subword_prefix=None,
end_of_word_suffix: Optional[str], end_of_word_suffix=None,
fuse_unk: Optional[bool], fuse_unk=None,
): ):
pass pass
@staticmethod @staticmethod
def read_file(vocab_filename: str, merges_filename: str) -> Tuple[Vocab, Merges]: def from_file(vocab_filename, merge_filename, **kwargs):
pass
@staticmethod
def from_file(vocab_filename: str, merges_filename: str, **kwargs) -> BPE:
""" """
Convenient method to intialize a BPE from files Convenient method to intialize a BPE from files
Roughly equivalent to Roughly equivalent to
@ -85,42 +85,73 @@ class BPE(Model):
return BPE(vocab, merges, **kwargs) return BPE(vocab, merges, **kwargs)
""" """
pass pass
def id_to_token(self, id):
"""
Returns the token associated with the given id
"""
pass
@staticmethod
def read_file(self, vocab_filename, merges_filename):
"""
Read a vocab_filename and merge_filename and stores result in memory
"""
pass
def save(self, folder, name):
"""
Save the current model
class WordPiece(Model): Save the current model in the given folder, using the given name for the various
"""WordPiece model class files that will get created.
Any file with the same name that already exist in this folder will be overwritten.
"""
pass
def token_to_id(self, tokens):
"""
Returns the id associated with the given token
"""
pass
def tokenize(self, tokens):
"""
Tokenize the given sequence
"""
pass
Instantiate a WordPiece Model from the given vocab file. class Unigram(Model):
"""
UnigramEncoding model class
Instantiate a Unigram Model from the given model file.
Args: Args:
vocab: (`optional`) string: vocab: ('`optional`) string:
A dictionnary of string keys and their ids {"am": 0,...} A list of vocabulary items and their relative score [("am", -0.2442),...]
unk_token: (`optional`) str:
The unknown token to be used by the model.
max_input_chars_per_word: (`optional`) int:
The maximum number of characters to authorize in a single word.
""" """
def __init__( def __init__(self, vocab):
self,
vocab: Optional[Union[str, Dict[str, int]]],
unk_token: Optional[str],
max_input_chars_per_word: Optional[int],
):
pass pass
@staticmethod def id_to_token(self, id):
def read_file(vocab_filename: str) -> Vocab:
pass
@staticmethod
def from_file(vocab_filename: str, **kwargs) -> WordPiece:
""" """
Convenient method to intialize a WordPiece from file Returns the token associated with the given id
Roughly equivalent to """
pass
def save(self, folder, name):
"""
Save the current model
def from_file(vocab_filename, **kwargs): Save the current model in the given folder, using the given name for the various
vocab, merges = WordPiece.read_file(vocab_filename) files that will get created.
return WordPiece(vocab, **kwargs) Any file with the same name that already exist in this folder will be overwritten.
"""
pass
def token_to_id(self, tokens):
"""
Returns the id associated with the given token
"""
pass
def tokenize(self, tokens):
"""
Tokenize the given sequence
""" """
pass pass
@ -138,34 +169,89 @@ class WordLevel(Model):
The unknown token to be used by the model. The unknown token to be used by the model.
""" """
def __init__(self, vocab: Optional[Union[str, Dict[str, int]]], unk_token: Optional[str]): def __init__(self, vocab, unk_token):
pass pass
@staticmethod def id_to_token(self, id):
def read_file(vocab_filename: str) -> Vocab:
pass
@staticmethod
def from_file(vocab_filename: str, **kwargs) -> WordLevelg:
""" """
Convenient method to intialize a WordLevelg from file Returns the token associated with the given id
"""
pass
def save(self, folder, name):
"""
Save the current model
Save the current model in the given folder, using the given name for the various
files that will get created.
Any file with the same name that already exist in this folder will be overwritten.
"""
pass
def token_to_id(self, tokens):
"""
Returns the id associated with the given token
"""
pass
def tokenize(self, tokens):
"""
Tokenize the given sequence
"""
pass
class WordPiece(Model):
"""
WordPiece model
Instantiate a WordPiece Model from the given vocab file.
Args:
vocab: (`optional`) string:
A dictionnary of string keys and their ids {"am": 0,...}
unk_token: (`optional`) str:
The unknown token to be used by the model.
max_input_chars_per_word: (`optional`) int:
The maximum number of characters to authorize in a single word.
"""
def __init__(self, vocab, unk_token, max_input_chars_per_word):
pass
@staticmethod
def from_file(vocab_filename, merge_filename, **kwargs):
"""
Convenient method to intialize a WordPiece from files
Roughly equivalent to Roughly equivalent to
def from_file(vocab_filename, **kwargs): def from_file(vocab_filename, **kwargs):
vocab, merges = WordLevelg.read_file(vocab_filename) vocab = WordPiece.read_file(vocab_filename)
return WordLevelg(vocab, **kwargs) return WordPiece(vocab, **kwargs)
""" """
pass pass
def id_to_token(self, id):
class Unigram(Model):
"""UnigramEncoding model class
Instantiate a Unigram Model from the given model file.
Args:
vocab: ('`optional`) string:
A list of vocabulary items and their relative score [("am", -0.2442),...]
""" """
Returns the token associated with the given id
"""
pass
@staticmethod @staticmethod
def __init__(self, vocab: Optional[List[Tuple[str, float]]]): def read_file(vocab_filename):
"""
Read a vocab_filename and stores result in memory
"""
pass
def save(self, folder, name):
"""
Save the current model
Save the current model in the given folder, using the given name for the various
files that will get created.
Any file with the same name that already exist in this folder will be overwritten.
"""
pass
def token_to_id(self, tokens):
"""
Returns the id associated with the given token
"""
pass
def tokenize(self, tokens):
"""
Tokenize the given sequence
"""
pass pass
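
An example of the model API documented above; the vocabulary/merges file names and the output folder are placeholders for your own files:

    from tokenizers.models import BPE

    # Per the from_file docstring, this is roughly:
    #   vocab, merges = BPE.read_file("vocab.json", "merges.txt")
    #   bpe = BPE(vocab, merges, unk_token="<unk>")
    bpe = BPE.from_file("vocab.json", "merges.txt", unk_token="<unk>")

    print(bpe.token_to_id("<unk>"))     # id associated with a token
    print(bpe.id_to_token(0))           # token associated with an id
    print(bpe.tokenize("hello world"))  # list of tokens produced by the model
    bpe.save("output_dir", "my-bpe")    # writes the model files, overwriting same-named ones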

View File

@ -1,35 +1,29 @@
from .. import NormalizedString # Generated content DO NOT EDIT
from typing import Optional, List
class Normalizer: class Normalizer:
"""Base class for all normalizers """
Base class for all normalizers
This class is not supposed to be instantiated directly. Instead, any implementation of a This class is not supposed to be instantiated directly. Instead, any implementation of a
Normalizer will return an instance of this class when instantiated. Normalizer will return an instance of this class when instantiated.
""" """
def normalize(self, normalized: NormalizedString): def normalize(self, normalized):
""" Normalize the given NormalizedString in-place """ """
Normalize the given NormalizedString in-place
"""
pass pass
def normalize_str(self, sequence: str) -> str: def normalize_str(self, sequence):
""" Normalize the given str """ """
Normalize the given str
"""
pass pass
class BertNormalizer(Normalizer): class BertNormalizer(Normalizer):
"""BertNormalizer """
BertNormalizer
Takes care of normalizing raw text before giving it to a Bert model. Takes care of normalizing raw text before giving it to a Bert model.
This includes cleaning the text, handling accents, chinese chars and lowercasing This includes cleaning the text, handling accents, chinese chars and lowercasing
"""
def __init__(
self,
clean_text: Optional[bool] = True,
handle_chinese_chars: Optional[bool] = True,
strip_accents: Optional[bool] = None,
lowercase: Optional[bool] = True,
) -> None:
"""Instantiate a BertNormalizer with the given options.
Args: Args:
clean_text: (`optional`) boolean: clean_text: (`optional`) boolean:
@ -49,92 +43,216 @@ class BertNormalizer(Normalizer):
Returns: Returns:
Normalizer Normalizer
""" """
def __init__(
self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True
):
pass
def normalize(self, normalized):
"""
Normalize the given NormalizedString in-place
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given str
"""
pass pass
class NFD(Normalizer): class Lowercase(Normalizer):
""" NFD Unicode Normalizer """ """
Lowercase Normalizer
"""
def __init__(self) -> None: def __init__(self):
""" Instantiate a new NFD Normalizer """
pass pass
def normalize(self, normalized):
class NFKD(Normalizer): """
""" NFKD Unicode Normalizer """ Normalize the given NormalizedString in-place
"""
def __init__(self) -> None: pass
""" Instantiate a new NFKD Normalizer """ def normalize_str(self, sequence):
"""
Normalize the given str
"""
pass pass
class NFC(Normalizer): class NFC(Normalizer):
""" NFC Unicode Normalizer """ """
NFC Unicode Normalizer
"""
def __init__(self) -> None: def __init__(self):
""" Instantiate a new NFC Normalizer """ pass
def normalize(self, normalized):
"""
Normalize the given NormalizedString in-place
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given str
"""
pass
class NFD(Normalizer):
"""
NFD Unicode Normalizer
"""
def __init__(self):
pass
def normalize(self, normalized):
"""
Normalize the given NormalizedString in-place
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given str
"""
pass pass
class NFKC(Normalizer): class NFKC(Normalizer):
""" NFKC Unicode Normalizer """ """
NFKC Unicode Normalizer
"""
def __init__(self) -> None: def __init__(self):
""" Instantiate a new NFKC Normalizer """ pass
def normalize(self, normalized):
"""
Normalize the given NormalizedString in-place
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given str
"""
pass
class NFKD(Normalizer):
"""
NFKD Unicode Normalizer
"""
def __init__(self):
pass
def normalize(self, normalized):
"""
Normalize the given NormalizedString in-place
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given str
"""
pass
class Nmt(Normalizer):
"""
Nmt normalizer
"""
def __init__(self):
pass
def normalize(self, normalized):
"""
Normalize the given NormalizedString in-place
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given str
"""
pass
class Precompiled(Normalizer):
"""
Precompiled normalizer
Don't use manually it is used for compatiblity for SentencePiece.
"""
def __init__(self, precompiled_charsmap):
pass
def normalize(self, normalized):
"""
Normalize the given NormalizedString in-place
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given str
"""
pass
class Replace(Normalizer):
"""
Replace normalizer
"""
def __init__(self, pattern, content):
pass
def normalize(self, normalized):
"""
Normalize the given NormalizedString in-place
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given str
"""
pass pass
class Sequence(Normalizer): class Sequence(Normalizer):
"""Allows concatenating multiple other Normalizer as a Sequence.
All the normalizers run in sequence in the given order
""" """
Allows concatenating multiple other Normalizer as a Sequence.
def __init__(self, normalizers: List[Normalizer]) -> None: All the normalizers run in sequence in the given order
"""Instantiate a new normalization Sequence using the given normalizers
Args: Args:
normalizers: List[Normalizer]: normalizers: List[Normalizer]:
A list of Normalizer to be run as a sequence A list of Normalizer to be run as a sequence
""" """
def normalize(self, normalized):
"""
Normalize the given NormalizedString in-place
"""
pass pass
def normalize_str(self, sequence):
class Lowercase(Normalizer): """
""" Lowercase Normalizer """ Normalize the given str
"""
def __init__(self) -> None:
""" Instantiate a new Lowercase Normalizer """
pass pass
class Strip(Normalizer): class Strip(Normalizer):
""" Strip normalizer """ """
Strip normalizer
"""
def __init__(self, left: bool = True, right: bool = True) -> Normalizer: def __init__(self, left=True, right=True):
pass
def normalize(self, normalized):
"""
Normalize the given NormalizedString in-place
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given str
"""
pass pass
class StripAccents(Normalizer): class StripAccents(Normalizer):
""" StripAccents normalizer """ def __init__(self):
def __init__(self) -> Normalizer:
pass pass
def normalize(self, normalized):
class Nmt(Normalizer):
""" Nmt normalizer """
def __init__(self) -> Normalizer:
pass
class Precompiled(Normalizer):
""" Precompiled normalizer """
def __init__(self, precompiled_charsmap: bytes) -> Normalizer:
pass
class Replace(Normalizer):
""" Replace normalizer """
def __init__(self, pattern: str, content: str) -> Normalizer:
pass
def unicode_normalizer_from_str(normalizer: str) -> Normalizer:
""" """
Instanciate unicode normalizer from the normalizer name Normalize the given NormalizedString in-place
:param normalizer: Name of the normalizer """
:return: pass
def normalize_str(self, sequence):
"""
Normalize the given str
""" """
pass pass
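
The normalizers above compose naturally; for instance (output shown as an illustration):

    from tokenizers.normalizers import NFD, StripAccents, Lowercase, Sequence

    normalizer = Sequence([NFD(), StripAccents(), Lowercase()])
    print(normalizer.normalize_str("Héllò hôw are ü?"))  # "hello how are u?"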

View File

@ -1,13 +1,14 @@
# Generated content DO NOT EDIT
from .. import pre_tokenizers from .. import pre_tokenizers
PreTokenizer = pre_tokenizers.PreTokenizer PreTokenizer = pre_tokenizers.PreTokenizer
ByteLevel = pre_tokenizers.ByteLevel
Whitespace = pre_tokenizers.Whitespace
Punctuation = pre_tokenizers.Punctuation
Sequence = pre_tokenizers.Sequence
WhitespaceSplit = pre_tokenizers.WhitespaceSplit
BertPreTokenizer = pre_tokenizers.BertPreTokenizer BertPreTokenizer = pre_tokenizers.BertPreTokenizer
Metaspace = pre_tokenizers.Metaspace ByteLevel = pre_tokenizers.ByteLevel
CharDelimiterSplit = pre_tokenizers.CharDelimiterSplit CharDelimiterSplit = pre_tokenizers.CharDelimiterSplit
Digits = pre_tokenizers.Digits Digits = pre_tokenizers.Digits
Metaspace = pre_tokenizers.Metaspace
Punctuation = pre_tokenizers.Punctuation
Sequence = pre_tokenizers.Sequence
UnicodeScripts = pre_tokenizers.UnicodeScripts UnicodeScripts = pre_tokenizers.UnicodeScripts
Whitespace = pre_tokenizers.Whitespace
WhitespaceSplit = pre_tokenizers.WhitespaceSplit

View File

@ -1,31 +1,51 @@
from .. import PreTokenizedString # Generated content DO NOT EDIT
from typing import Optional, List, Tuple
Offsets = Tuple[int, int]
class PreTokenizer: class PreTokenizer:
"""Base class for all pre-tokenizers """
Base class for all pre-tokenizers
This class is not supposed to be instantiated directly. Instead, any implementation of a This class is not supposed to be instantiated directly. Instead, any implementation of a
PreTokenizer will return an instance of this class when instantiated. PreTokenizer will return an instance of this class when instantiated.
""" """
def pre_tokenize(self, pretokenized: PreTokenizedString): def pre_tokenize(self, pretok):
""" Pre tokenize the given PreTokenizedString in-place """ """
Pre tokenize the given PreTokenizedString in-place
"""
pass pass
def pre_tokenize_str(self, sequence: str) -> List[Tuple[str, Offsets]]: def pre_tokenize_str(self, sequence):
""" Pre tokenize the given sequence """ """
Pre tokenize the given sequence
"""
pass
class BertPreTokenizer(PreTokenizer):
"""
BertPreTokenizer
This pre-tokenizer splits tokens on spaces, and also on punctuation.
Each occurence of a punctuation character will be treated separately.
"""
def __init__(self):
pass
def pre_tokenize(self, pretok):
"""
Pre tokenize the given PreTokenizedString in-place
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given sequence
"""
pass pass
class ByteLevel(PreTokenizer): class ByteLevel(PreTokenizer):
"""ByteLevel PreTokenizer """
ByteLevel PreTokenizer
This pre-tokenizer takes care of replacing all bytes of the given string This pre-tokenizer takes care of replacing all bytes of the given string
with a corresponding representation, as well as splitting into words. with a corresponding representation, as well as splitting into words.
"""
def __init__(self, add_prefix_space: bool = True) -> None:
"""Instantiate a new ByteLevel PreTokenizer
Args: Args:
add_prefix_space: (`optional`) boolean: add_prefix_space: (`optional`) boolean:
Whether to add a space to the first word if there isn't already one. This Whether to add a space to the first word if there isn't already one. This
@ -33,58 +53,78 @@ class ByteLevel(PreTokenizer):
Returns: Returns:
PreTokenizer PreTokenizer
""" """
def __init__(self, add_prefix_space=True):
pass pass
@staticmethod @staticmethod
def alphabet() -> List[str]: def alphabet():
"""Returns the alphabet used by this PreTokenizer. """
Returns the alphabet used by this PreTokenizer.
Since the ByteLevel works as its name suggests, at the byte level, it Since the ByteLevel works as its name suggests, at the byte level, it
encodes any byte to one visible character. This means that there is a encodes any byte to one visible character. This means that there is a
total of 256 different characters composing this alphabet. total of 256 different characters composing this alphabet.
""" """
pass pass
def pre_tokenize(self, pretok):
class Whitespace(PreTokenizer): """
"""Whitespace PreTokenizer Pre tokenize the given PreTokenizedString in-place
"""
This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+` pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given sequence
""" """
def __init__(self) -> None:
""" Instantiate a new Whitespace PreTokenizer """
pass pass
class WhitespaceSplit(PreTokenizer): class CharDelimiterSplit(PreTokenizer):
"""Whitespace PreTokenizer """
This pre-tokenizer simply splits on the provided char. Works like `.split(delimiter)`
This pre-tokenizer simply splits on the whitespace. Works like `.split()` Args:
delimiter: str:
The delimiter char that will be used to split input
""" """
def __init__(self) -> None: def pre_tokenize(self, pretok):
""" Instantiate a new WhitespaceSplit PreTokenizer """ """
Pre tokenize the given PreTokenizedString in-place
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given sequence
"""
pass pass
class BertPreTokenizer(PreTokenizer): class Digits(PreTokenizer):
"""BertPreTokenizer """
This pre-tokenizer simply splits using the digits in separate tokens
This pre-tokenizer splits tokens on spaces, and also on punctuation. Args:
Each occurence of a punctuation character will be treated separately. individual_digits: bool:
If set to True, digits will each be separated "Call 123 please" -> "Call ", "1", "2", "3", " please"
If set to False, digits will grouped "Call 123 please" -> "Call ", "123", " please"
""" """
def __init__(self) -> None: def __init__(self, individual_digits=False):
""" Instantiate a new BertPreTokenizer """ pass
def pre_tokenize(self, pretok):
"""
Pre tokenize the given PreTokenizedString in-place
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given sequence
"""
pass pass
class Metaspace(PreTokenizer): class Metaspace(PreTokenizer):
"""Metaspace pre-tokenizer """
Metaspace pre-tokenizer
This pre-tokenizer replaces any whitespace by the provided replacement character. This pre-tokenizer replaces any whitespace by the provided replacement character.
It then tries to split on these spaces. It then tries to split on these spaces.
"""
def __init__(self, replacement: str = "", add_prefix_space: bool = True) -> None:
"""Instantiate a new Metaspace
Args: Args:
replacement: str: replacement: str:
The replacement character. Must be exactly one character. By default we The replacement character. Must be exactly one character. By default we
@ -94,70 +134,109 @@ class Metaspace(PreTokenizer):
Whether to add a space to the first word if there isn't already one. This Whether to add a space to the first word if there isn't already one. This
lets us treat `hello` exactly like `say hello`. lets us treat `hello` exactly like `say hello`.
""" """
def __init__(self, replacement="", add_prefix_space=True):
pass pass
def pre_tokenize(self, pretok):
class CharDelimiterSplit(PreTokenizer):
"""CharDelimiterSplit PreTokenizer
This pre-tokenizer simply splits on the provided char. Works like `.split(delimiter)`
""" """
Pre tokenize the given PreTokenizedString in-place
@staticmethod """
def __init__(self, delimiter: str) -> None: pass
"""Instantiate a new CharDelimiterSplit PreTokenizer def pre_tokenize_str(self, sequence):
"""
Args: Pre tokenize the given sequence
delimiter: str:
The delimiter char that will be used to split input
""" """
pass pass
class Punctuation(PreTokenizer): class Punctuation(PreTokenizer):
"""Punctuation PreTokenizer """
This pre-tokenizer simply splits on punctuation as individual characters.` This pre-tokenizer simply splits on punctuation as individual characters.`
""" """
def __init__(self) -> None: def __init__(self):
""" Instantiate a new Punctuation PreTokenizer """ pass
def pre_tokenize(self, pretok):
"""
Pre tokenize the given PreTokenizedString in-place
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given sequence
"""
pass pass
class Sequence(PreTokenizer): class Sequence(PreTokenizer):
"""Sequence PreTokenizer """
This pre-tokenizer composes other pre_tokenizers and applies them in sequence
This pre-tokenizer composes other pre_tokenizers and applies them in sequence`
""" """
def __init__(self) -> None: def __init__(self, pretokenizers):
""" Instantiate a new Sequence PreTokenizer """
pass pass
def pre_tokenize(self, pretok):
class Digits(PreTokenizer):
"""Digits PreTokenizer
This pre-tokenizer simply splits using the digits in separate tokens
""" """
Pre tokenize the given PreTokenizedString in-place
def __init__(self, individual_digits: bool) -> None: """
"""Instantiate a new Digits pass
def pre_tokenize_str(self, sequence):
Args: """
individual_digits: bool: Pre tokenize the given sequence
If set to True, digits will each be separated "Call 123 please" -> "Call ", "1", "2", "3", " please"
If set to False, digits will grouped "Call 123 please" -> "Call ", "123", " please"
""" """
pass pass
class UnicodeScripts(PreTokenizer): class UnicodeScripts(PreTokenizer):
"""UnicodeScripts PreTokenizer """
This pre-tokenizer splits on characters that belong to different language family This pre-tokenizer splits on characters that belong to different language family
It roughly follows https://github.com/google/sentencepiece/blob/master/data/Scripts.txt It roughly follows https://github.com/google/sentencepiece/blob/master/data/Scripts.txt
Actually Hiragana and Katakana are fused with Han, and 0x30FC is Han too. Actually Hiragana and Katakana are fused with Han, and 0x30FC is Han too.
This mimicks SentencePiece Unigram implementation. This mimicks SentencePiece Unigram implementation.
""" """
def __init__(self) -> None: def __init__(self):
""" Instantiate a new UnicodeScripts """ pass
def pre_tokenize(self, pretok):
"""
Pre tokenize the given PreTokenizedString in-place
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given sequence
"""
pass
class Whitespace(PreTokenizer):
"""
This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
"""
def __init__(self):
pass
def pre_tokenize(self, pretok):
"""
Pre tokenize the given PreTokenizedString in-place
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given sequence
"""
pass
class WhitespaceSplit(PreTokenizer):
"""
This pre-tokenizer simply splits on the whitespace. Works like `.split()`
"""
def __init__(self):
pass
def pre_tokenize(self, pretok):
"""
Pre tokenize the given PreTokenizedString in-place
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given sequence
"""
pass pass
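
And a small example of the pre-tokenizers above; pre_tokenize_str returns (piece, offsets) pairs:

    from tokenizers.pre_tokenizers import Whitespace, Digits

    print(Whitespace().pre_tokenize_str("Hello, world!"))
    # e.g. [('Hello', (0, 5)), (',', (5, 6)), ('world', (7, 12)), ('!', (12, 13))]

    print(Digits(individual_digits=True).pre_tokenize_str("Call 123 please"))
    # each digit becomes its own piece, as described in the Digits docstring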

View File

@ -1,7 +1,8 @@
# Generated content DO NOT EDIT
from .. import processors from .. import processors
PostProcessor = processors.PostProcessor PostProcessor = processors.PostProcessor
BertProcessing = processors.BertProcessing BertProcessing = processors.BertProcessing
RobertaProcessing = processors.RobertaProcessing
ByteLevel = processors.ByteLevel ByteLevel = processors.ByteLevel
RobertaProcessing = processors.RobertaProcessing
TemplateProcessing = processors.TemplateProcessing TemplateProcessing = processors.TemplateProcessing

View File

@ -1,38 +1,31 @@
from .. import Encoding # Generated content DO NOT EDIT
from typing import Tuple, Union, List
class PostProcessor: class PostProcessor:
"""Base class for all post-processors """
Base class for all post-processors
This class is not supposed to be instantiated directly. Instead, any implementation of This class is not supposed to be instantiated directly. Instead, any implementation of
a PostProcessor will return an instance of this class when instantiated. a PostProcessor will return an instance of this class when instantiated.
""" """
def num_special_tokens_to_add(self, is_pair: bool) -> int: def num_special_tokens_to_add(self, is_pair):
""" """
Return the number of special tokens that would be added for single/pair sentences. Return the number of special tokens that would be added for single/pair sentences.
:param is_pair: Boolean indicating if the input would be a single sentence or a pair :param is_pair: Boolean indicating if the input would be a single sentence or a pair
:return: :return:
""" """
pass pass
def process( def process(self, encoding, pair=None, add_special_tokens=True):
self, encoding: Encoding, pair: Optional[Encoding] = None, add_special_tokens: bool = True """
) -> Encoding: Post-process the given encodings, generating the final one
""" Post-process the given encodings, generating the final one """ """
pass pass
class BertProcessing(PostProcessor): class BertProcessing(PostProcessor):
"""BertProcessing """
This post-processor takes care of adding the special tokens needed by This post-processor takes care of adding the special tokens needed by
a Bert model: a Bert model:
- a SEP token - a SEP token
- a CLS token - a CLS token
"""
def __init__(self, sep: Tuple[str, int], cls: Tuple[str, int]) -> None:
"""Instantiate a new BertProcessing with the given tokens
Args: Args:
sep: Tuple[str, int]: sep: Tuple[str, int]:
A tuple with the string representation of the SEP token, and its id A tuple with the string representation of the SEP token, and its id
@ -43,11 +36,50 @@ class BertProcessing(PostProcessor):
Returns: Returns:
PostProcessor PostProcessor
""" """
def __init__(self, sep, cls):
pass
def num_special_tokens_to_add(self, is_pair):
"""
Return the number of special tokens that would be added for single/pair sentences.
:param is_pair: Boolean indicating if the input would be a single sentence or a pair
:return:
"""
pass
def process(self, encoding, pair=None, add_special_tokens=True):
"""
Post-process the given encodings, generating the final one
"""
pass
class ByteLevel(PostProcessor):
"""
This post-processor takes care of trimming the offsets.
By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
want the offsets to include these whitespaces, then this PostProcessor must be used.
Args:
trim_offsets: bool:
Whether to trim the whitespaces from the produced offsets.
"""
def __init__(self, trim_offsets=True):
pass
def num_special_tokens_to_add(self, is_pair):
"""
Return the number of special tokens that would be added for single/pair sentences.
:param is_pair: Boolean indicating if the input would be a single sentence or a pair
:return:
"""
pass
def process(self, encoding, pair=None, add_special_tokens=True):
"""
Post-process the given encodings, generating the final one
"""
pass pass
class RobertaProcessing(PostProcessor): class RobertaProcessing(PostProcessor):
"""RobertaProcessing """
This post-processor takes care of adding the special tokens needed by This post-processor takes care of adding the special tokens needed by
a Roberta model: a Roberta model:
- a SEP token - a SEP token
@ -57,17 +89,6 @@ class RobertaProcessing(PostProcessor):
By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
want the offsets to include these whitespaces, then this PostProcessor should be initialized want the offsets to include these whitespaces, then this PostProcessor should be initialized
with `trim_offsets=True` with `trim_offsets=True`
"""
def __init__(
self,
sep: Tuple[str, int],
cls: Tuple[str, int],
trim_offsets: bool = True,
add_prefix_space: bool = True,
) -> None:
"""Instantiate a new RobertaProcessing with the given tokens
Args: Args:
sep: Tuple[str, int]: sep: Tuple[str, int]:
A tuple with the string representation of the SEP token, and its id A tuple with the string representation of the SEP token, and its id
@ -85,31 +106,24 @@ class RobertaProcessing(PostProcessor):
Returns: Returns:
PostProcessor PostProcessor
""" """
def __init__(self, sep, cls, trim_offsets=True, add_prefix_space=True):
pass pass
def num_special_tokens_to_add(self, is_pair):
class ByteLevel(PostProcessor):
"""ByteLevel Post processing
This post-processor takes care of trimming the offsets.
By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
want the offsets to include these whitespaces, then this PostProcessor must be used.
""" """
Return the number of special tokens that would be added for single/pair sentences.
def __init__(self, trim_offsets: bool = True) -> None: :param is_pair: Boolean indicating if the input would be a single sentence or a pair
"""Instantiate a new ByteLevel :return:
"""
Args: pass
trim_offsets: bool: def process(self, encoding, pair=None, add_special_tokens=True):
Whether to trim the whitespaces from the produced offsets. """
Post-process the given encodings, generating the final one
""" """
pass pass
Template = Union[str, List[str]]
Tokens = List[Union[Tuple[int, str], Tuple[str, int], dict]]
class TemplateProcessing(PostProcessor): class TemplateProcessing(PostProcessor):
"""TemplateProcessing """
Provides a way to specify templates in order to add the special tokens to each Provides a way to specify templates in order to add the special tokens to each
input sequence as relevant. input sequence as relevant.
@ -147,10 +161,6 @@ class TemplateProcessing(PostProcessor):
will be added to the Encoding without any further check. If the given ids correspond will be added to the Encoding without any further check. If the given ids correspond
to something totally different in a `Tokenizer` using this `PostProcessor`, it to something totally different in a `Tokenizer` using this `PostProcessor`, it
might lead to unexpected results. might lead to unexpected results.
"""
def __init__(self, single: Template, pair: Template, special_tokens: Tokens) -> None:
"""Instantiate a new TemplateProcessing
Args: Args:
single: Template single: Template
@ -175,4 +185,18 @@ class TemplateProcessing(PostProcessor):
The given dict expects the provided `ids` and `tokens` lists to have The given dict expects the provided `ids` and `tokens` lists to have
the same length. the same length.
""" """
def __init__(self, single, pair, special_tokens):
pass
def num_special_tokens_to_add(self, is_pair):
"""
Return the number of special tokens that would be added for single/pair sentences.
:param is_pair: Boolean indicating if the input would be a single sentence or a pair
:return:
"""
pass
def process(self, encoding, pair=None, add_special_tokens=True):
"""
Post-process the given encodings, generating the final one
"""
pass pass
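
For the post-processors above, a small example with made-up token ids (use the ids from your own vocabulary):

    from tokenizers.processors import BertProcessing

    processor = BertProcessing(("[SEP]", 102), ("[CLS]", 101))  # (sep, cls) per the stubbed signature

    print(processor.num_special_tokens_to_add(is_pair=False))  # 2: [CLS] ... [SEP]
    print(processor.num_special_tokens_to_add(is_pair=True))   # 3: [CLS] ... [SEP] ... [SEP]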

View File

@ -1,6 +1,7 @@
# Generated content DO NOT EDIT
from .. import trainers from .. import trainers
Trainer = trainers.Trainer Trainer = trainers.Trainer
BpeTrainer = trainers.BpeTrainer BpeTrainer = trainers.BpeTrainer
WordPieceTrainer = trainers.WordPieceTrainer
UnigramTrainer = trainers.UnigramTrainer UnigramTrainer = trainers.UnigramTrainer
WordPieceTrainer = trainers.WordPieceTrainer

View File

@ -1,84 +1,91 @@
from .. import AddedToken # Generated content DO NOT EDIT
from typing import Optional, List, Union
class Trainer: class Trainer:
"""Base class for all trainers """
Base class for all trainers
This class is not supposed to be instantiated directly. Instead, any implementation of a This class is not supposed to be instantiated directly. Instead, any implementation of a
Trainer will return an instance of this class when instantiated. Trainer will return an instance of this class when instantiated.
Args:
vocab_size: unsigned int:
The size of the final vocabulary, including all tokens and alphabet.
min_frequency: unsigned int:
The minimum frequency a pair should have in order to be merged.
show_progress: boolean:
Whether to show progress bars while training.
special_tokens: List[Union[str, AddedToken]]:
A list of special tokens the model should know of.
limit_alphabet: unsigned int:
The maximum different characters to keep in the alphabet.
initial_alphabet: List[str]:
A list of characters to include in the initial alphabet, even
if not seen in the training dataset.
If the strings contain more than one character, only the first one
is kept.
continuing_subword_prefix: Optional[str]:
A prefix to be used for every subword that is not a beginning-of-word.
end_of_word_suffix: Optional[str]:
A suffix to be used for every subword that is a end-of-word.
Returns:
Trainer
""" """
class BpeTrainer(Trainer): def __init__(
"""BpeTrainer self,
vocab_size=30000,
min_frequency=0,
show_progress=True,
special_tokens=[],
limit_alphabet=None,
initial_alphabet=[],
continuing_subword_prefix=None,
end_of_word_suffix=None,
):
pass
class BpeTrainer(Trainer):
"""
Capable of training a BPE model Capable of training a BPE model
""" """
def __init__( class UnigramTrainer(Trainer):
self, """
vocab_size: int = 30000, Capable of training a Unigram model
min_frequency: int = 0,
show_progress: bool = True,
special_tokens: List[Union[str, AddedToken]] = [],
limit_alphabet: Optional[int] = None,
initial_alphabet: List[str] = [],
continuing_subword_prefix: Optional[str] = None,
end_of_word_suffix: Optional[str] = None,
) -> None:
"""Instantiate a new BpeTrainer with the given options:
Args: Args:
vocab_size: unsigned int: vocab_size: unsigned int:
The size of the final vocabulary, including all tokens and alphabet. The size of the final vocabulary, including all tokens and alphabet.
min_frequency: unsigned int:
The minimum frequency a pair should have in order to be merged.
show_progress: boolean: show_progress: boolean:
Whether to show progress bars while training. Whether to show progress bars while training.
special_tokens: List[Union[str, AddedToken]]: special_tokens: List[Union[str, AddedToken]]:
A list of special tokens the model should know of. A list of special tokens the model should know of.
limit_alphabet: unsigned int:
The maximum different characters to keep in the alphabet.
initial_alphabet: List[str]: initial_alphabet: List[str]:
A list of characters to include in the initial alphabet, even A list of characters to include in the initial alphabet, even
if not seen in the training dataset. if not seen in the training dataset.
If the strings contain more than one character, only the first one If the strings contain more than one character, only the first one
is kept. is kept.
continuing_subword_prefix: Optional[str]:
A prefix to be used for every subword that is not a beginning-of-word.
end_of_word_suffix: Optional[str]:
A suffix to be used for every subword that is a end-of-word.
Returns: Returns:
Trainer Trainer
""" """
def __init__(self, vocab_size=8000, show_progress=True, special_tokens=[]):
pass pass
class WordPieceTrainer(Trainer): class WordPieceTrainer(Trainer):
"""WordPieceTrainer
Capable of training a WordPiece model
""" """
Capable of training a WordPiece model
def __init__(
self,
vocab_size: int = 30000,
min_frequency: int = 0,
show_progress: bool = True,
special_tokens: List[Union[str, AddedToken]] = [],
limit_alphabet: Optional[int] = None,
initial_alphabet: List[str] = [],
continuing_subword_prefix: Optional[str] = "##",
end_of_word_suffix: Optional[str] = None,
) -> Trainer:
"""Instantiate a new WordPieceTrainer with the given options:
Args: Args:
vocab_size: unsigned int: vocab_size: unsigned int:
The size of the final vocabulary, including all tokens and alphabet. The size of the final vocabulary, including all tokens and alphabet.
@ -110,39 +117,16 @@ class WordPieceTrainer(Trainer):
Returns: Returns:
Trainer Trainer
""" """
pass
class UnigramTrainer(Trainer):
"""UnigramTrainer
Capable of training a Unigram model
"""
def __init__( def __init__(
self, self,
vocab_size: int = 8000, vocab_size=30000,
show_progress: bool = True, min_frequency=0,
special_tokens: List[Union[str, AddedToken]] = [], show_progress=True,
) -> Trainer: special_tokens=[],
"""Instantiate a new UnigramTrainer with the given options: limit_alphabet=None,
initial_alphabet=[],
Args: continuing_subword_prefix="##",
vocab_size: unsigned int: end_of_word_suffix=None,
The size of the final vocabulary, including all tokens and alphabet. ):
show_progress: boolean:
Whether to show progress bars while training.
special_tokens: List[Union[str, AddedToken]]:
A list of special tokens the model should know of.
initial_alphabet: List[str]:
A list of characters to include in the initial alphabet, even
if not seen in the training dataset.
If the strings contain more than one character, only the first one
is kept.
Returns:
Trainer
"""
pass pass
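
Configuring the trainers above with the options shown in the stubbed signatures (the special tokens are an example choice):

    from tokenizers.trainers import BpeTrainer, UnigramTrainer

    bpe_trainer = BpeTrainer(
        vocab_size=30000,
        min_frequency=2,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    )
    unigram_trainer = UnigramTrainer(vocab_size=8000, show_progress=True)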

View File

@ -15,6 +15,10 @@ use tokenizers as tk;
use super::error::ToPyResult; use super::error::ToPyResult;
/// Base class for all decoders
///
/// This class is not supposed to be instantiated directly. Instead, any implementation of
/// a Decoder will return an instance of this class when instantiated.
#[pyclass(dict, module = "tokenizers.decoders", name=Decoder)] #[pyclass(dict, module = "tokenizers.decoders", name=Decoder)]
#[derive(Clone, Deserialize, Serialize)] #[derive(Clone, Deserialize, Serialize)]
pub struct PyDecoder { pub struct PyDecoder {
@ -82,12 +86,16 @@ impl PyDecoder {
} }
} }
/// Decode the given list of string to a final string
#[text_signature = "(self, tokens)"]
fn decode(&self, tokens: Vec<String>) -> PyResult<String> { fn decode(&self, tokens: Vec<String>) -> PyResult<String> {
ToPyResult(self.decoder.decode(tokens)).into() ToPyResult(self.decoder.decode(tokens)).into()
} }
} }
/// ByteLevel Decoder
#[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name=ByteLevel)] #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name=ByteLevel)]
#[text_signature = "(self)"]
pub struct PyByteLevelDec {} pub struct PyByteLevelDec {}
#[pymethods] #[pymethods]
impl PyByteLevelDec { impl PyByteLevelDec {
@ -97,7 +105,16 @@ impl PyByteLevelDec {
} }
} }
/// Instantiate a new WordPiece Decoder
///
/// Args:
/// prefix: str:
/// The prefix to use for subwords that are not a beginning-of-word
/// cleanup: bool:
/// Whether to cleanup some tokenization artifacts. Mainly spaces before punctuation,
/// and some abbreviated english forms.
#[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name=WordPiece)] #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name=WordPiece)]
#[text_signature = "(self, prefix=\"##\", cleanup=True)"]
pub struct PyWordPieceDec {} pub struct PyWordPieceDec {}
#[pymethods] #[pymethods]
impl PyWordPieceDec { impl PyWordPieceDec {
@ -120,7 +137,18 @@ impl PyWordPieceDec {
} }
} }
/// Instantiate a new Metaspace
///
/// Args:
/// replacement: str:
/// The replacement character. Must be exactly one character. By default we
/// use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
///
/// add_prefix_space: boolean:
/// Whether to add a space to the first word if there isn't already one. This
/// lets us treat `hello` exactly like `say hello`.
#[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name=Metaspace)] #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name=Metaspace)]
#[text_signature = "(self, replacement = \"\", add_prefix_space = True)"]
pub struct PyMetaspaceDec {} pub struct PyMetaspaceDec {}
#[pymethods] #[pymethods]
impl PyMetaspaceDec { impl PyMetaspaceDec {
@ -153,7 +181,14 @@ impl PyMetaspaceDec {
} }
} }
/// Instantiate a new BPEDecoder
///
/// Args:
/// suffix: str:
/// The suffix that was used to caracterize an end-of-word. This suffix will
/// be replaced by whitespaces during the decoding
#[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name=BPEDecoder)] #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name=BPEDecoder)]
#[text_signature = "(self, suffix=\"</w>\")"]
pub struct PyBPEDecoder {} pub struct PyBPEDecoder {}
#[pymethods] #[pymethods]
impl PyBPEDecoder { impl PyBPEDecoder {
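
The payoff of these doc comments and #[text_signature] attributes is that the compiled classes stay introspectable at runtime, which is what "keeping inspection ability" refers to:

    import inspect
    from tokenizers import decoders

    help(decoders.BPEDecoder)                          # shows the Rust doc comment added above
    print(inspect.signature(decoders.Decoder.decode))  # recovered from #[text_signature]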

View File

@ -107,7 +107,7 @@ impl PyEncoding {
/// ///
/// Set the given sequence index for the whole range of tokens contained in this /// Set the given sequence index for the whole range of tokens contained in this
/// :class:`~tokenizers.Encoding`. /// :class:`~tokenizers.Encoding`.
#[text_signature = "($self, sequence_id)"] #[text_signature = "(self, sequence_id)"]
fn set_sequence_id(&mut self, sequence_id: usize) { fn set_sequence_id(&mut self, sequence_id: usize) {
self.encoding.set_sequence_id(sequence_id); self.encoding.set_sequence_id(sequence_id);
} }
@ -269,7 +269,7 @@ impl PyEncoding {
/// Returns: /// Returns:
/// :obj:`Tuple[int, int]`: The range of tokens: :obj:`(first, last + 1)` /// :obj:`Tuple[int, int]`: The range of tokens: :obj:`(first, last + 1)`
#[args(sequence_index = 0)] #[args(sequence_index = 0)]
#[text_signature = "($self, word_index, sequence_index=0)"] #[text_signature = "(self, word_index, sequence_index=0)"]
fn word_to_tokens(&self, word_index: u32, sequence_index: usize) -> Option<(usize, usize)> { fn word_to_tokens(&self, word_index: u32, sequence_index: usize) -> Option<(usize, usize)> {
self.encoding.word_to_tokens(word_index, sequence_index) self.encoding.word_to_tokens(word_index, sequence_index)
} }
@ -285,7 +285,7 @@ impl PyEncoding {
/// Returns: /// Returns:
/// :obj:`Tuple[int, int]`: The range of characters (span) :obj:`(first, last + 1)` /// :obj:`Tuple[int, int]`: The range of characters (span) :obj:`(first, last + 1)`
#[args(sequence_index = 0)] #[args(sequence_index = 0)]
#[text_signature = "($self, word_index, sequence_index=0)"] #[text_signature = "(self, word_index, sequence_index=0)"]
fn word_to_chars(&self, word_index: u32, sequence_index: usize) -> Option<Offsets> { fn word_to_chars(&self, word_index: u32, sequence_index: usize) -> Option<Offsets> {
self.encoding.word_to_chars(word_index, sequence_index) self.encoding.word_to_chars(word_index, sequence_index)
} }
@ -301,7 +301,7 @@ impl PyEncoding {
/// ///
/// Returns: /// Returns:
/// :obj:`int`: The sequence id of the given token /// :obj:`int`: The sequence id of the given token
#[text_signature = "($self, token_index)"] #[text_signature = "(self, token_index)"]
fn token_to_sequence(&self, token_index: usize) -> Option<usize> { fn token_to_sequence(&self, token_index: usize) -> Option<usize> {
self.encoding.token_to_sequence(token_index) self.encoding.token_to_sequence(token_index)
} }
@ -318,7 +318,7 @@ impl PyEncoding {
/// ///
/// Returns: /// Returns:
/// :obj:`Tuple[int, int]`: The token offsets :obj:`(first, last + 1)` /// :obj:`Tuple[int, int]`: The token offsets :obj:`(first, last + 1)`
#[text_signature = "($self, token_index)"] #[text_signature = "(self, token_index)"]
fn token_to_chars(&self, token_index: usize) -> Option<Offsets> { fn token_to_chars(&self, token_index: usize) -> Option<Offsets> {
let (_, offsets) = self.encoding.token_to_chars(token_index)?; let (_, offsets) = self.encoding.token_to_chars(token_index)?;
Some(offsets) Some(offsets)
@ -336,7 +336,7 @@ impl PyEncoding {
/// ///
/// Returns: /// Returns:
/// :obj:`int`: The index of the word in the relevant input sequence. /// :obj:`int`: The index of the word in the relevant input sequence.
#[text_signature = "($self, token_index)"] #[text_signature = "(self, token_index)"]
fn token_to_word(&self, token_index: usize) -> Option<u32> { fn token_to_word(&self, token_index: usize) -> Option<u32> {
let (_, word_idx) = self.encoding.token_to_word(token_index)?; let (_, word_idx) = self.encoding.token_to_word(token_index)?;
Some(word_idx) Some(word_idx)
@ -353,7 +353,7 @@ impl PyEncoding {
/// Returns: /// Returns:
/// :obj:`int`: The index of the token that contains this char in the encoded sequence /// :obj:`int`: The index of the token that contains this char in the encoded sequence
#[args(sequence_index = 0)] #[args(sequence_index = 0)]
#[text_signature = "($self, char_pos, sequence_index=0)"] #[text_signature = "(self, char_pos, sequence_index=0)"]
fn char_to_token(&self, char_pos: usize, sequence_index: usize) -> Option<usize> { fn char_to_token(&self, char_pos: usize, sequence_index: usize) -> Option<usize> {
self.encoding.char_to_token(char_pos, sequence_index) self.encoding.char_to_token(char_pos, sequence_index)
} }
@ -369,7 +369,7 @@ impl PyEncoding {
/// Returns: /// Returns:
/// :obj:`int`: The index of the word that contains this char in the input sequence /// :obj:`int`: The index of the word that contains this char in the input sequence
#[args(sequence_index = 0)] #[args(sequence_index = 0)]
#[text_signature = "($self, char_pos, sequence_index=0)"] #[text_signature = "(self, char_pos, sequence_index=0)"]
fn char_to_word(&self, char_pos: usize, sequence_index: usize) -> Option<u32> { fn char_to_word(&self, char_pos: usize, sequence_index: usize) -> Option<u32> {
self.encoding.char_to_word(char_pos, sequence_index) self.encoding.char_to_word(char_pos, sequence_index)
} }
@ -392,7 +392,7 @@ impl PyEncoding {
/// pad_token (:obj:`str`, defaults to `[PAD]`): /// pad_token (:obj:`str`, defaults to `[PAD]`):
/// The pad token to use /// The pad token to use
#[args(kwargs = "**")] #[args(kwargs = "**")]
#[text_signature = "($self, length, direction='right', pad_id=0, pad_type_id=0, pad_token='[PAD]')"] #[text_signature = "(self, length, direction='right', pad_id=0, pad_type_id=0, pad_token='[PAD]')"]
fn pad(&mut self, length: usize, kwargs: Option<&PyDict>) -> PyResult<()> { fn pad(&mut self, length: usize, kwargs: Option<&PyDict>) -> PyResult<()> {
let mut pad_id = 0; let mut pad_id = 0;
let mut pad_type_id = 0; let mut pad_type_id = 0;
@ -440,7 +440,7 @@ impl PyEncoding {
/// stride (:obj:`int`, defaults to :obj:`0`): /// stride (:obj:`int`, defaults to :obj:`0`):
/// The length of previous content to be included in each overflowing piece /// The length of previous content to be included in each overflowing piece
#[args(stride = "0")] #[args(stride = "0")]
#[text_signature = "($self, max_length, stride=0)"] #[text_signature = "(self, max_length, stride=0)"]
fn truncate(&mut self, max_length: usize, stride: usize) -> PyResult<()> { fn truncate(&mut self, max_length: usize, stride: usize) -> PyResult<()> {
self.encoding.truncate(max_length, stride); self.encoding.truncate(max_length, stride);
Ok(()) Ok(())
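
These hunks change the text_signature strings from "($self, ...)" to "(self, ...)", matching the form used in the generated stubs above (def pad(self, length, ...)). A quick way to see what ends up exposed:

    import inspect
    from tokenizers import Encoding

    print(inspect.signature(Encoding.pad))
    print(inspect.signature(Encoding.truncate))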

View File

@ -106,6 +106,8 @@ impl PyModel {
} }
} }
/// Tokenize the given sequence
#[text_signature = "(self, tokens)"]
fn tokenize(&self, tokens: &str) -> PyResult<Vec<PyToken>> { fn tokenize(&self, tokens: &str) -> PyResult<Vec<PyToken>> {
Ok(ToPyResult(self.model.tokenize(tokens)) Ok(ToPyResult(self.model.tokenize(tokens))
.into_py()? .into_py()?
@ -114,14 +116,24 @@ impl PyModel {
.collect()) .collect())
} }
/// Returns the id associated with the given token
#[text_signature = "(self, tokens)"]
fn token_to_id(&self, token: &str) -> Option<u32> { fn token_to_id(&self, token: &str) -> Option<u32> {
self.model.token_to_id(token) self.model.token_to_id(token)
} }
/// Returns the token associated with the given id
#[text_signature = "(self, id)"]
fn id_to_token(&self, id: u32) -> Option<&str> { fn id_to_token(&self, id: u32) -> Option<&str> {
self.model.id_to_token(id) self.model.id_to_token(id)
} }
/// Save the current model
///
/// Save the current model in the given folder, using the given name for the various
/// files that will get created.
/// Any file with the same name that already exist in this folder will be overwritten.
#[text_signature = "(self, folder, name)"]
fn save(&self, folder: &str, name: Option<&str>) -> PyResult<Vec<String>> { fn save(&self, folder: &str, name: Option<&str>) -> PyResult<Vec<String>> {
let saved: PyResult<Vec<_>> = ToPyResult(self.model.save(Path::new(folder), name)).into(); let saved: PyResult<Vec<_>> = ToPyResult(self.model.save(Path::new(folder), name)).into();
@ -132,9 +144,36 @@ impl PyModel {
} }
} }
/// BPE Model /// Instantiate a BPE Model from the given vocab and merges.
/// Allows the creation of a BPE Model to be used with a Tokenizer ///
/// Args:
/// vocab: ('`optional`) Dict[str, int]:
/// A dictionnary of string keys and their ids {"am": 0,...}
///
/// merges: (`optional`) string:
/// A list of pairs of tokens [("a", "b"),...]
///
/// cache_capacity: (`optional`) int:
/// The number of words that the BPE cache can contain. The cache allows
/// to speed-up the process by keeping the result of the merge operations
/// for a number of words.
///
/// dropout: (`optional`) Optional[float] [0, 1]:
/// The BPE dropout to use. Must be a float between 0 and 1
///
/// unk_token: (`optional`) str:
/// The unknown token to be used by the model.
///
/// continuing_subword_prefix: (`optional`) str:
/// The prefix to attach to subword units that don't represent a beginning of word.
///
/// end_of_word_suffix: (`optional`) str:
/// The suffix to attach to subword units that represent an end of word.
///
/// fuse_unk: (`optional`) bool:
/// Whether multiple unk tokens should get fused into a single one
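///
/// A minimal usage sketch from the Python side (the vocab and merges values below
/// are purely illustrative):
///
///     from tokenizers.models import BPE
///
///     vocab = {"[UNK]": 0, "a": 1, "b": 2, "ab": 3}
///     merges = [("a", "b")]
///     bpe = BPE(vocab, merges, unk_token="[UNK]")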
#[pyclass(extends=PyModel, module = "tokenizers.models", name=BPE)] #[pyclass(extends=PyModel, module = "tokenizers.models", name=BPE)]
#[text_signature = "(self, vocab=None, merges=None, cache_capacity=None, dropout=None, unk_token=None, continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=None)"]
pub struct PyBPE {} pub struct PyBPE {}
impl PyBPE { impl PyBPE {
@ -225,7 +264,9 @@ impl PyBPE {
PyBPE::with_builder(builder, kwargs) PyBPE::with_builder(builder, kwargs)
} }
/// Read a vocab_filename and merges_filename and store the result in memory
#[staticmethod] #[staticmethod]
#[text_signature = "(self, vocab_filename, merges_filename)"]
fn read_file(vocab_filename: &str, merges_filename: &str) -> PyResult<(Vocab, Merges)> { fn read_file(vocab_filename: &str, merges_filename: &str) -> PyResult<(Vocab, Merges)> {
BPE::read_file(vocab_filename, merges_filename).map_err(|e| { BPE::read_file(vocab_filename, merges_filename).map_err(|e| {
exceptions::PyValueError::new_err(format!( exceptions::PyValueError::new_err(format!(
@ -235,8 +276,15 @@ impl PyBPE {
}) })
} }
/// Convenient method to initialize a BPE from files
/// Roughly equivalent to
///
/// def from_file(vocab_filename, merges_filename, **kwargs):
/// vocab, merges = BPE.read_file(vocab_filename, merges_filename)
/// return BPE(vocab, merges, **kwargs)
#[staticmethod] #[staticmethod]
#[args(kwargs = "**")] #[args(kwargs = "**")]
#[text_signature = "(vocab_filename, merge_filename, **kwargs)"]
fn from_file( fn from_file(
py: Python, py: Python,
vocab_filename: &str, vocab_filename: &str,
@ -257,8 +305,20 @@ impl PyBPE {
} }
} }
/// WordPiece Model /// WordPiece model
/// Instantiate a WordPiece Model from the given vocab file.
///
/// Args:
/// vocab: (`optional`) Dict[str, int]:
/// A dictionary of string keys and their ids {"am": 0,...}
///
/// unk_token: (`optional`) str:
/// The unknown token to be used by the model.
///
/// max_input_chars_per_word: (`optional`) int:
/// The maximum number of characters to authorize in a single word.
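///
/// A minimal usage sketch (the vocab below is purely illustrative):
///
///     from tokenizers.models import WordPiece
///
///     vocab = {"[UNK]": 0, "am": 1, "##az": 2, "##ing": 3}
///     wordpiece = WordPiece(vocab, unk_token="[UNK]")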
#[pyclass(extends=PyModel, module = "tokenizers.models", name=WordPiece)] #[pyclass(extends=PyModel, module = "tokenizers.models", name=WordPiece)]
#[text_signature = "(self, vocab, unk_token, max_input_chars_per_word)"]
pub struct PyWordPiece {} pub struct PyWordPiece {}
impl PyWordPiece { impl PyWordPiece {
@ -319,15 +379,24 @@ impl PyWordPiece {
PyWordPiece::with_builder(builder, kwargs) PyWordPiece::with_builder(builder, kwargs)
} }
/// Read a vocab_filename and store the result in memory
#[staticmethod] #[staticmethod]
#[text_signature = "(vocab_filename)"]
fn read_file(vocab_filename: &str) -> PyResult<Vocab> { fn read_file(vocab_filename: &str) -> PyResult<Vocab> {
WordPiece::read_file(vocab_filename).map_err(|e| { WordPiece::read_file(vocab_filename).map_err(|e| {
exceptions::PyValueError::new_err(format!("Error while reading WordPiece file: {}", e)) exceptions::PyValueError::new_err(format!("Error while reading WordPiece file: {}", e))
}) })
} }
/// Convenient method to initialize a WordPiece from files
/// Roughly equivalent to
///
/// def from_file(vocab_filename, **kwargs):
/// vocab = WordPiece.read_file(vocab_filename)
/// return WordPiece(vocab, **kwargs)
#[staticmethod] #[staticmethod]
#[args(kwargs = "**")] #[args(kwargs = "**")]
#[text_signature = "(vocab_filename, merge_filename, **kwargs)"]
fn from_file(py: Python, vocab_filename: &str, kwargs: Option<&PyDict>) -> PyResult<Py<Self>> { fn from_file(py: Python, vocab_filename: &str, kwargs: Option<&PyDict>) -> PyResult<Py<Self>> {
let vocab = WordPiece::read_file(vocab_filename).map_err(|e| { let vocab = WordPiece::read_file(vocab_filename).map_err(|e| {
exceptions::PyValueError::new_err(format!("Error while reading WordPiece file: {}", e)) exceptions::PyValueError::new_err(format!("Error while reading WordPiece file: {}", e))
@ -336,7 +405,18 @@ impl PyWordPiece {
} }
} }
/// The most simple tokenizer model, mapping tokens from a vocab file to their corresponding ids.
///
/// Instantiate a WordLevel Model from the given vocab file.
///
/// Args:
/// vocab: (`optional`) Dict[str, int]:
/// A dictionary of string keys and their ids {"am": 0,...}
///
/// unk_token: str:
/// The unknown token to be used by the model.
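///
/// A minimal usage sketch (the vocab below is purely illustrative, and assumed to be
/// an in-memory mapping as described above):
///
///     from tokenizers.models import WordLevel
///
///     vocab = {"[UNK]": 0, "hello": 1, "world": 2}
///     word_level = WordLevel(vocab, unk_token="[UNK]")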
#[pyclass(extends=PyModel, module = "tokenizers.models", name=WordLevel)] #[pyclass(extends=PyModel, module = "tokenizers.models", name=WordLevel)]
#[text_signature = "(self, vocab, unk_token)"]
pub struct PyWordLevel {} pub struct PyWordLevel {}
impl PyWordLevel { impl PyWordLevel {
@ -411,7 +491,16 @@ impl PyWordLevel {
} }
} }
/// Unigram model class
///
/// Instantiate a Unigram Model from the given model file.
///
/// Args:
/// vocab: (`optional`) List[Tuple[str, float]]:
/// A list of vocabulary items and their relative scores [("am", -0.2442),...]
///
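/// A minimal usage sketch (the vocab below is purely illustrative):
///
///     from tokenizers.models import Unigram
///
///     vocab = [("[UNK]", 0.0), ("am", -2.44), ("az", -3.14)]
///     unigram = Unigram(vocab)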
#[pyclass(extends=PyModel, module = "tokenizers.models", name=Unigram)] #[pyclass(extends=PyModel, module = "tokenizers.models", name=Unigram)]
#[text_signature = "(self, vocab)"]
pub struct PyUnigram {} pub struct PyUnigram {}
#[pymethods] #[pymethods]

View File

@ -15,6 +15,10 @@ use tk::normalizers::{
use tk::{NormalizedString, Normalizer}; use tk::{NormalizedString, Normalizer};
use tokenizers as tk; use tokenizers as tk;
/// Base class for all normalizers
///
/// This class is not supposed to be instantiated directly. Instead, any implementation of a
/// Normalizer will return an instance of this class when instantiated.
#[pyclass(dict, module = "tokenizers.normalizers", name=Normalizer)] #[pyclass(dict, module = "tokenizers.normalizers", name=Normalizer)]
#[derive(Clone, Serialize, Deserialize)] #[derive(Clone, Serialize, Deserialize)]
pub struct PyNormalizer { pub struct PyNormalizer {
@ -105,10 +109,14 @@ impl PyNormalizer {
} }
} }
/// Normalize the given NormalizedString in-place
#[text_signature = "(self, normalized)"]
fn normalize(&self, normalized: &mut PyNormalizedString) -> PyResult<()> { fn normalize(&self, normalized: &mut PyNormalizedString) -> PyResult<()> {
ToPyResult(self.normalizer.normalize(&mut normalized.normalized)).into() ToPyResult(self.normalizer.normalize(&mut normalized.normalized)).into()
} }
/// Normalize the given str
#[text_signature = "(self, sequence)"]
fn normalize_str(&self, sequence: &str) -> PyResult<String> { fn normalize_str(&self, sequence: &str) -> PyResult<String> {
let mut normalized = NormalizedString::from(sequence); let mut normalized = NormalizedString::from(sequence);
ToPyResult(self.normalizer.normalize(&mut normalized)).into_py()?; ToPyResult(self.normalizer.normalize(&mut normalized)).into_py()?;
@ -116,7 +124,30 @@ impl PyNormalizer {
} }
} }
/// BertNormalizer
///
/// Takes care of normalizing raw text before giving it to a Bert model.
/// This includes cleaning the text, handling accents, Chinese characters and lowercasing
///
/// Args:
/// clean_text: (`optional`) boolean:
/// Whether to clean the text, by removing any control characters
/// and replacing all whitespace characters with the classic one.
///
/// handle_chinese_chars: (`optional`) boolean:
/// Whether to handle Chinese characters by putting spaces around them.
///
/// strip_accents: (`optional`) boolean:
/// Whether to strip all accents. If this option is not specified (i.e. == None),
/// then it will be determined by the value for `lowercase` (as in the original Bert).
///
/// lowercase: (`optional`) boolean:
/// Whether to lowercase.
///
/// Returns:
/// Normalizer
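///
/// A minimal usage sketch of normalizing a raw string (the input is illustrative):
///
///     from tokenizers.normalizers import BertNormalizer
///
///     normalizer = BertNormalizer(lowercase=True)
///     print(normalizer.normalize_str("Héllo   WORLD"))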
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=BertNormalizer)] #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=BertNormalizer)]
#[text_signature = "(self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True)"]
pub struct PyBertNormalizer {} pub struct PyBertNormalizer {}
#[pymethods] #[pymethods]
impl PyBertNormalizer { impl PyBertNormalizer {
@ -146,7 +177,9 @@ impl PyBertNormalizer {
} }
} }
/// NFD Unicode Normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=NFD)] #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=NFD)]
#[text_signature = "(self)"]
pub struct PyNFD {} pub struct PyNFD {}
#[pymethods] #[pymethods]
impl PyNFD { impl PyNFD {
@ -156,7 +189,9 @@ impl PyNFD {
} }
} }
/// NFKD Unicode Normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=NFKD)] #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=NFKD)]
#[text_signature = "(self)"]
pub struct PyNFKD {} pub struct PyNFKD {}
#[pymethods] #[pymethods]
impl PyNFKD { impl PyNFKD {
@ -166,7 +201,9 @@ impl PyNFKD {
} }
} }
/// NFC Unicode Normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=NFC)] #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=NFC)]
#[text_signature = "(self)"]
pub struct PyNFC {} pub struct PyNFC {}
#[pymethods] #[pymethods]
impl PyNFC { impl PyNFC {
@ -176,7 +213,9 @@ impl PyNFC {
} }
} }
/// NFKC Unicode Normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=NFKC)] #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=NFKC)]
#[text_signature = "(self)"]
pub struct PyNFKC {} pub struct PyNFKC {}
#[pymethods] #[pymethods]
impl PyNFKC { impl PyNFKC {
@ -186,6 +225,12 @@ impl PyNFKC {
} }
} }
/// Allows concatenating multiple other Normalizers as a Sequence.
/// All the normalizers run in sequence, in the given order.
///
/// Args:
/// normalizers: List[Normalizer]:
/// A list of Normalizer to be run as a sequence
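///
/// A minimal usage sketch, chaining a few of the normalizers defined in this module:
///
///     from tokenizers.normalizers import Sequence, NFD, StripAccents, Lowercase
///
///     normalizer = Sequence([NFD(), StripAccents(), Lowercase()])
///     print(normalizer.normalize_str("Héllo"))  # "hello"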
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Sequence)] #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Sequence)]
pub struct PySequence {} pub struct PySequence {}
#[pymethods] #[pymethods]
@ -211,7 +256,9 @@ impl PySequence {
} }
} }
/// Lowercase Normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Lowercase)] #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Lowercase)]
#[text_signature = "(self)"]
pub struct PyLowercase {} pub struct PyLowercase {}
#[pymethods] #[pymethods]
impl PyLowercase { impl PyLowercase {
@ -221,7 +268,9 @@ impl PyLowercase {
} }
} }
/// Strip normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Strip)] #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Strip)]
#[text_signature = "(self, left=True, right=True)"]
pub struct PyStrip {} pub struct PyStrip {}
#[pymethods] #[pymethods]
impl PyStrip { impl PyStrip {
@ -245,6 +294,7 @@ impl PyStrip {
} }
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=StripAccents)] #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=StripAccents)]
#[text_signature = "(self)"]
pub struct PyStripAccents {} pub struct PyStripAccents {}
#[pymethods] #[pymethods]
impl PyStripAccents { impl PyStripAccents {
@ -389,7 +439,9 @@ impl Normalizer for PyNormalizerWrapper {
} }
} }
/// Nmt normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Nmt)] #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Nmt)]
#[text_signature = "(self)"]
pub struct PyNmt {} pub struct PyNmt {}
#[pymethods] #[pymethods]
impl PyNmt { impl PyNmt {
@ -399,7 +451,10 @@ impl PyNmt {
} }
} }
/// Precompiled normalizer
/// Don't use it manually; it is used for compatibility with SentencePiece.
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Precompiled)] #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Precompiled)]
#[text_signature = "(self, precompiled_charsmap)"]
pub struct PyPrecompiled {} pub struct PyPrecompiled {}
#[pymethods] #[pymethods]
impl PyPrecompiled { impl PyPrecompiled {
@ -420,7 +475,9 @@ impl PyPrecompiled {
} }
} }
/// Replace normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Replace)] #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Replace)]
#[text_signature = "(self, pattern, content)"]
pub struct PyReplace {} pub struct PyReplace {}
#[pymethods] #[pymethods]
impl PyReplace { impl PyReplace {

View File

@ -22,6 +22,10 @@ use tokenizers as tk;
use super::error::ToPyResult; use super::error::ToPyResult;
use super::utils::*; use super::utils::*;
/// Base class for all pre-tokenizers
///
/// This class is not supposed to be instantiated directly. Instead, any implementation of a
/// PreTokenizer will return an instance of this class when instantiated.
#[pyclass(dict, module = "tokenizers.pre_tokenizers", name=PreTokenizer)] #[pyclass(dict, module = "tokenizers.pre_tokenizers", name=PreTokenizer)]
#[derive(Clone, Serialize, Deserialize)] #[derive(Clone, Serialize, Deserialize)]
pub struct PyPreTokenizer { pub struct PyPreTokenizer {
@ -121,10 +125,14 @@ impl PyPreTokenizer {
} }
} }
/// Pre-tokenize the given PreTokenizedString in-place
#[text_signature = "(self, pretok)"]
fn pre_tokenize(&self, pretok: &mut PyPreTokenizedString) -> PyResult<()> { fn pre_tokenize(&self, pretok: &mut PyPreTokenizedString) -> PyResult<()> {
ToPyResult(self.pretok.pre_tokenize(&mut pretok.pretok)).into() ToPyResult(self.pretok.pre_tokenize(&mut pretok.pretok)).into()
} }
/// Pre-tokenize the given sequence
#[text_signature = "(self, sequence)"]
fn pre_tokenize_str(&self, s: &str) -> PyResult<Vec<(String, Offsets)>> { fn pre_tokenize_str(&self, s: &str) -> PyResult<Vec<(String, Offsets)>> {
let mut pretokenized = tk::tokenizer::PreTokenizedString::from(s); let mut pretokenized = tk::tokenizer::PreTokenizedString::from(s);
@ -138,7 +146,19 @@ impl PyPreTokenizer {
} }
} }
/// ByteLevel PreTokenizer
///
/// This pre-tokenizer takes care of replacing all bytes of the given string
/// with a corresponding representation, as well as splitting into words.
///
/// Args:
/// add_prefix_space: (`optional`) boolean:
/// Whether to add a space to the first word if there isn't already one. This
/// lets us treat `hello` exactly like `say hello`.
/// Returns:
/// PreTokenizer
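///
/// A minimal usage sketch:
///
///     from tokenizers.pre_tokenizers import ByteLevel
///
///     pre_tokenizer = ByteLevel(add_prefix_space=True)
///     print(pre_tokenizer.pre_tokenize_str("Hello there"))
///     print(len(ByteLevel.alphabet()))  # 256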
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=ByteLevel)] #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=ByteLevel)]
#[text_signature = "(self, add_prefix_space=True)"]
pub struct PyByteLevel {} pub struct PyByteLevel {}
#[pymethods] #[pymethods]
impl PyByteLevel { impl PyByteLevel {
@ -161,7 +181,13 @@ impl PyByteLevel {
Ok((PyByteLevel {}, byte_level.into())) Ok((PyByteLevel {}, byte_level.into()))
} }
/// Returns the alphabet used by this PreTokenizer.
///
/// Since the ByteLevel works as its name suggests, at the byte level, it
/// encodes any byte to one visible character. This means that there is a
/// total of 256 different characters composing this alphabet.
#[staticmethod] #[staticmethod]
#[text_signature = "()"]
fn alphabet() -> Vec<String> { fn alphabet() -> Vec<String> {
ByteLevel::alphabet() ByteLevel::alphabet()
.into_iter() .into_iter()
@ -170,7 +196,9 @@ impl PyByteLevel {
} }
} }
/// This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=Whitespace)] #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=Whitespace)]
#[text_signature = "(self)"]
pub struct PyWhitespace {} pub struct PyWhitespace {}
#[pymethods] #[pymethods]
impl PyWhitespace { impl PyWhitespace {
@ -180,7 +208,9 @@ impl PyWhitespace {
} }
} }
/// This pre-tokenizer simply splits on whitespace. Works like `.split()`
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=WhitespaceSplit)] #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=WhitespaceSplit)]
#[text_signature = "(self)"]
pub struct PyWhitespaceSplit {} pub struct PyWhitespaceSplit {}
#[pymethods] #[pymethods]
impl PyWhitespaceSplit { impl PyWhitespaceSplit {
@ -190,6 +220,11 @@ impl PyWhitespaceSplit {
} }
} }
/// This pre-tokenizer simply splits on the provided char. Works like `.split(delimiter)`
///
/// Args:
/// delimiter: str:
/// The delimiter char that will be used to split input
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=CharDelimiterSplit)] #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=CharDelimiterSplit)]
pub struct PyCharDelimiterSplit {} pub struct PyCharDelimiterSplit {}
#[pymethods] #[pymethods]
@ -210,7 +245,12 @@ impl PyCharDelimiterSplit {
} }
} }
/// BertPreTokenizer
///
/// This pre-tokenizer splits tokens on spaces, and also on punctuation.
/// Each occurrence of a punctuation character will be treated separately.
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=BertPreTokenizer)] #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=BertPreTokenizer)]
#[text_signature = "(self)"]
pub struct PyBertPreTokenizer {} pub struct PyBertPreTokenizer {}
#[pymethods] #[pymethods]
impl PyBertPreTokenizer { impl PyBertPreTokenizer {
@ -220,7 +260,9 @@ impl PyBertPreTokenizer {
} }
} }
/// This pre-tokenizer simply splits on punctuation as individual characters.
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=Punctuation)] #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=Punctuation)]
#[text_signature = "(self)"]
pub struct PyPunctuation {} pub struct PyPunctuation {}
#[pymethods] #[pymethods]
impl PyPunctuation { impl PyPunctuation {
@ -230,7 +272,9 @@ impl PyPunctuation {
} }
} }
/// This pre-tokenizer composes other pre_tokenizers and applies them in sequence
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=Sequence)] #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=Sequence)]
#[text_signature = "(self, pretokenizers)"]
pub struct PySequence {} pub struct PySequence {}
#[pymethods] #[pymethods]
impl PySequence { impl PySequence {
@ -257,7 +301,20 @@ impl PySequence {
} }
} }
/// Metaspace pre-tokenizer
///
/// This pre-tokenizer replaces any whitespace by the provided replacement character.
/// It then tries to split on these spaces.
/// Args:
/// replacement: str:
/// The replacement character. Must be exactly one character. By default we
/// use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
///
/// add_prefix_space: boolean:
/// Whether to add a space to the first word if there isn't already one. This
/// lets us treat `hello` exactly like `say hello`.
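///
/// A minimal usage sketch (offsets are elided from the expected output):
///
///     from tokenizers.pre_tokenizers import Metaspace
///
///     pre_tokenizer = Metaspace(replacement="▁", add_prefix_space=True)
///     print(pre_tokenizer.pre_tokenize_str("Hello there"))  # [("▁Hello", ...), ("▁there", ...)]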
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=Metaspace)] #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=Metaspace)]
#[text_signature = "(self, replacement=\"\", add_prefix_space=True)"]
pub struct PyMetaspace {} pub struct PyMetaspace {}
#[pymethods] #[pymethods]
impl PyMetaspace { impl PyMetaspace {
@ -290,7 +347,13 @@ impl PyMetaspace {
} }
} }
/// This pre-tokenizer simply splits the digits into separate tokens
/// Args:
/// individual_digits: bool:
/// If set to True, digits will each be separated: "Call 123 please" -> "Call ", "1", "2", "3", " please"
/// If set to False, digits will be grouped: "Call 123 please" -> "Call ", "123", " please"
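///
/// A minimal usage sketch of the two modes (offsets are elided):
///
///     from tokenizers.pre_tokenizers import Digits
///
///     print(Digits(individual_digits=True).pre_tokenize_str("Call 123 please"))
///     print(Digits(individual_digits=False).pre_tokenize_str("Call 123 please"))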
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=Digits)] #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=Digits)]
#[text_signature = "(self, individual_digits=False)"]
pub struct PyDigits {} pub struct PyDigits {}
#[pymethods] #[pymethods]
impl PyDigits { impl PyDigits {
@ -301,7 +364,12 @@ impl PyDigits {
} }
} }
/// This pre-tokenizer splits on characters that belong to different language families
/// It roughly follows https://github.com/google/sentencepiece/blob/master/data/Scripts.txt
/// Actually, Hiragana and Katakana are fused with Han, and 0x30FC is Han too.
/// This mimics the SentencePiece Unigram implementation.
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=UnicodeScripts)] #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=UnicodeScripts)]
#[text_signature = "(self)"]
pub struct PyUnicodeScripts {} pub struct PyUnicodeScripts {}
#[pymethods] #[pymethods]
impl PyUnicodeScripts { impl PyUnicodeScripts {

View File

@ -16,6 +16,10 @@ use tk::processors::PostProcessorWrapper;
use tk::{Encoding, PostProcessor}; use tk::{Encoding, PostProcessor};
use tokenizers as tk; use tokenizers as tk;
/// Base class for all post-processors
///
/// This class is not supposed to be instantiated directly. Instead, any implementation of
/// a PostProcessor will return an instance of this class when instantiated.
#[pyclass(dict, module = "tokenizers.processors", name=PostProcessor)] #[pyclass(dict, module = "tokenizers.processors", name=PostProcessor)]
#[derive(Clone, Deserialize, Serialize)] #[derive(Clone, Deserialize, Serialize)]
pub struct PyPostProcessor { pub struct PyPostProcessor {
@ -88,11 +92,17 @@ impl PyPostProcessor {
} }
} }
/// Return the number of special tokens that would be added for single/pair sentences.
/// :param is_pair: Boolean indicating if the input would be a single sentence or a pair
/// :return: The number of special tokens that would be added
#[text_signature = "(self, is_pair)"]
fn num_special_tokens_to_add(&self, is_pair: bool) -> usize { fn num_special_tokens_to_add(&self, is_pair: bool) -> usize {
self.processor.added_tokens(is_pair) self.processor.added_tokens(is_pair)
} }
/// Post-process the given encodings, generating the final one
#[args(pair = "None", add_special_tokens = "true")] #[args(pair = "None", add_special_tokens = "true")]
#[text_signature = "(self, encoding, pair=None, add_special_tokens=True)"]
fn process( fn process(
&self, &self,
encoding: &PyEncoding, encoding: &PyEncoding,
@ -109,7 +119,21 @@ impl PyPostProcessor {
} }
} }
/// This post-processor takes care of adding the special tokens needed by
/// a Bert model:
/// - a SEP token
/// - a CLS token
/// Args:
/// sep: Tuple[str, int]:
/// A tuple with the string representation of the SEP token, and its id
///
/// cls: Tuple[str, int]:
/// A tuple with the string representation of the CLS token, and its id
///
/// Returns:
/// PostProcessor
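///
/// A minimal usage sketch (the token ids below are illustrative):
///
///     from tokenizers.processors import BertProcessing
///
///     processor = BertProcessing(("[SEP]", 102), ("[CLS]", 101))
///     print(processor.num_special_tokens_to_add(True))  # 3 for a pair of sequences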
#[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name=BertProcessing)] #[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name=BertProcessing)]
#[text_signature = "(self, sep, cls)"]
pub struct PyBertProcessing {} pub struct PyBertProcessing {}
#[pymethods] #[pymethods]
impl PyBertProcessing { impl PyBertProcessing {
@ -126,7 +150,33 @@ impl PyBertProcessing {
} }
} }
/// This post-processor takes care of adding the special tokens needed by
/// a Roberta model:
/// - a SEP token
/// - a CLS token
///
/// It also takes care of trimming the offsets.
/// By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
/// want the offsets to include these whitespaces, then this PostProcessor should be initialized
/// with `trim_offsets=True`
/// Args:
/// sep: Tuple[str, int]:
/// A tuple with the string representation of the SEP token, and its id
///
/// cls: Tuple[str, int]:
/// A tuple with the string representation of the CLS token, and its id
///
/// trim_offsets: bool:
/// Whether to trim the whitespaces from the produced offsets.
///
/// add_prefix_space: bool:
/// Whether the add_prefix_space option was enabled during pre-tokenization. This
/// is relevant because it defines the way the offsets are trimmed out.
///
/// Returns:
/// PostProcessor
#[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name=RobertaProcessing)] #[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name=RobertaProcessing)]
#[text_signature = "(self, sep, cls, trim_offsets=True, add_prefix_space=True)"]
pub struct PyRobertaProcessing {} pub struct PyRobertaProcessing {}
#[pymethods] #[pymethods]
impl PyRobertaProcessing { impl PyRobertaProcessing {
@ -152,7 +202,15 @@ impl PyRobertaProcessing {
} }
} }
/// This post-processor takes care of trimming the offsets.
/// By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
/// want the offsets to include these whitespaces, then this PostProcessor must be used.
///
/// Args:
/// trim_offsets: bool:
/// Whether to trim the whitespaces from the produced offsets.
#[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name=ByteLevel)] #[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name=ByteLevel)]
#[text_signature = "(self, trim_offsets=True)"]
pub struct PyByteLevel {} pub struct PyByteLevel {}
#[pymethods] #[pymethods]
impl PyByteLevel { impl PyByteLevel {
@ -244,7 +302,68 @@ impl FromPyObject<'_> for PyTemplate {
} }
} }
/// Provides a way to specify templates in order to add the special tokens to each
/// input sequence as relevant.
///
/// Let's take the `BERT` tokenizer as an example. It uses two special tokens, used to
/// delimit each sequence. `[CLS]` is always used at the beginning of the first
/// sequence, and `[SEP]` is added at the end of both the first, and the pair
/// sequences. The final result looks like this:
/// - Single sequence: `[CLS] Hello there [SEP]`
/// - Pair sequences: `[CLS] My name is Anthony [SEP] What is my name? [SEP]`
/// With the type ids as following:
/// ```markdown
/// [CLS] ... [SEP] ... [SEP]
/// 0 0 0 1 1
/// ```
///
/// You can achieve such behavior using a TemplateProcessing:
/// ```
/// TemplateProcessing(
/// single="[CLS] $0 [SEP]",
/// pair="[CLS] $A [SEP] $B:1 [SEP]:1",
/// special_tokens=[("[CLS]", 1), ("[SEP]", 0)],
/// )
/// ```
///
/// In this example, each input sequence is identified using a `$` construct. This identifier
/// lets us specify each input sequence, and the type_id to use. When nothing is specified,
/// it uses the default values. Here are the different ways to specify it:
/// - Specifying the sequence, with default `type_id == 0`: `$A` or `$B`
/// - Specifying the `type_id` with default `sequence == A`: `$0`, `$1`, `$2`, ...
/// - Specifying both: `$A:0`, `$B:1`, ...
///
/// The same construct is used for special tokens: `<identifier>(:<type_id>)?`.
///
/// **Warning**: You must ensure that you are giving the correct tokens/ids as these
/// will be added to the Encoding without any further check. If the given ids correspond
/// to something totally different in a `Tokenizer` using this `PostProcessor`, it
/// might lead to unexpected results.
///
/// Args:
/// single: Template
/// The template used for single sequences
///
/// pair: Template:
/// The template used when both sequences are specified
///
/// special_tokens: Tokens:
/// The list of special tokens used in each sequence
///
/// Template: Union[str, List[str]]:
/// - If a `str` is provided, the whitespace is used as delimiter between tokens
/// - If a `List[str]` is provided, a list of tokens
///
/// Tokens: List[Union[Tuple[int, str], Tuple[str, int], dict]]:
/// - A Tuple with both a token and its associated ID, in any order
/// - A dict with the following keys:
/// - "id": str => The special token id, as specified in the Template
/// - "ids": List[int] => The associated IDs
/// - "tokens": List[str] => The associated tokens
/// The given dict expects the provided `ids` and `tokens` lists to have
/// the same length.
#[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name=TemplateProcessing)] #[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name=TemplateProcessing)]
#[text_signature = "(self, single, pair, special_tokens)"]
pub struct PyTemplateProcessing {} pub struct PyTemplateProcessing {}
#[pymethods] #[pymethods]
impl PyTemplateProcessing { impl PyTemplateProcessing {

View File

@ -53,7 +53,7 @@ use crate::processors::PyPostProcessor;
/// Yesterday"``. /// Yesterday"``.
/// ///
#[pyclass(dict, module = "tokenizers", name=AddedToken)] #[pyclass(dict, module = "tokenizers", name=AddedToken)]
#[text_signature = "(content, single_word=False, lstrip=False, rstrip=False, normalized=True)"] #[text_signature = "(self, content, single_word=False, lstrip=False, rstrip=False, normalized=True)"]
pub struct PyAddedToken { pub struct PyAddedToken {
pub content: String, pub content: String,
pub is_special_token: bool, pub is_special_token: bool,
@ -408,7 +408,7 @@ type Tokenizer = TokenizerImpl<PyModel, PyNormalizer, PyPreTokenizer, PyPostProc
/// The core algorithm that this :obj:`Tokenizer` should be using. /// The core algorithm that this :obj:`Tokenizer` should be using.
/// ///
#[pyclass(dict, module = "tokenizers", name=Tokenizer)] #[pyclass(dict, module = "tokenizers", name=Tokenizer)]
#[text_signature = "(model)"] #[text_signature = "(self, model)"]
#[derive(Clone)] #[derive(Clone)]
pub struct PyTokenizer { pub struct PyTokenizer {
tokenizer: Tokenizer, tokenizer: Tokenizer,
@ -523,7 +523,7 @@ impl PyTokenizer {
/// Returns: /// Returns:
/// :obj:`str`: A string representing the serialized Tokenizer /// :obj:`str`: A string representing the serialized Tokenizer
#[args(pretty = false)] #[args(pretty = false)]
#[text_signature = "($self, pretty=False)"] #[text_signature = "(self, pretty=False)"]
fn to_str(&self, pretty: bool) -> PyResult<String> { fn to_str(&self, pretty: bool) -> PyResult<String> {
ToPyResult(self.tokenizer.to_string(pretty)).into() ToPyResult(self.tokenizer.to_string(pretty)).into()
} }
@ -537,11 +537,15 @@ impl PyTokenizer {
/// pretty (:obj:`bool`, defaults to :obj:`False`): /// pretty (:obj:`bool`, defaults to :obj:`False`):
/// Whether the JSON file should be pretty formatted. /// Whether the JSON file should be pretty formatted.
#[args(pretty = false)] #[args(pretty = false)]
#[text_signature = "($self, pretty=False)"] #[text_signature = "(self, pretty=False)"]
fn save(&self, path: &str, pretty: bool) -> PyResult<()> { fn save(&self, path: &str, pretty: bool) -> PyResult<()> {
ToPyResult(self.tokenizer.save(path, pretty)).into() ToPyResult(self.tokenizer.save(path, pretty)).into()
} }
/// Return the number of special tokens that would be added for single/pair sentences.
/// :param is_pair: Boolean indicating if the input would be a single sentence or a pair
/// :return: The number of special tokens that would be added
#[text_signature = "(self, is_pair)"]
fn num_special_tokens_to_add(&self, is_pair: bool) -> PyResult<usize> { fn num_special_tokens_to_add(&self, is_pair: bool) -> PyResult<usize> {
Ok(self Ok(self
.tokenizer .tokenizer
@ -558,7 +562,7 @@ impl PyTokenizer {
/// Returns: /// Returns:
/// :obj:`Dict[str, int]`: The vocabulary /// :obj:`Dict[str, int]`: The vocabulary
#[args(with_added_tokens = true)] #[args(with_added_tokens = true)]
#[text_signature = "($self, with_added_tokens=True)"] #[text_signature = "(self, with_added_tokens=True)"]
fn get_vocab(&self, with_added_tokens: bool) -> PyResult<HashMap<String, u32>> { fn get_vocab(&self, with_added_tokens: bool) -> PyResult<HashMap<String, u32>> {
Ok(self.tokenizer.get_vocab(with_added_tokens)) Ok(self.tokenizer.get_vocab(with_added_tokens))
} }
@ -572,7 +576,7 @@ impl PyTokenizer {
/// Returns: /// Returns:
/// :obj:`int`: The size of the vocabulary /// :obj:`int`: The size of the vocabulary
#[args(with_added_tokens = true)] #[args(with_added_tokens = true)]
#[text_signature = "($self, with_added_tokens=True)"] #[text_signature = "(self, with_added_tokens=True)"]
fn get_vocab_size(&self, with_added_tokens: bool) -> PyResult<usize> { fn get_vocab_size(&self, with_added_tokens: bool) -> PyResult<usize> {
Ok(self.tokenizer.get_vocab_size(with_added_tokens)) Ok(self.tokenizer.get_vocab_size(with_added_tokens))
} }
@ -591,7 +595,7 @@ impl PyTokenizer {
/// The strategy used to truncation. Can be one of ``longest_first``, ``only_first`` or /// The strategy used to truncation. Can be one of ``longest_first``, ``only_first`` or
/// ``only_second``. /// ``only_second``.
#[args(kwargs = "**")] #[args(kwargs = "**")]
#[text_signature = "($self, max_length, stride=0, strategy='longest_first')"] #[text_signature = "(self, max_length, stride=0, strategy='longest_first')"]
fn enable_truncation(&mut self, max_length: usize, kwargs: Option<&PyDict>) -> PyResult<()> { fn enable_truncation(&mut self, max_length: usize, kwargs: Option<&PyDict>) -> PyResult<()> {
let mut params = TruncationParams::default(); let mut params = TruncationParams::default();
params.max_length = max_length; params.max_length = max_length;
@ -626,7 +630,7 @@ impl PyTokenizer {
} }
/// Disable truncation /// Disable truncation
#[text_signature = "($self)"] #[text_signature = "(self)"]
fn no_truncation(&mut self) { fn no_truncation(&mut self) {
self.tokenizer.with_truncation(None); self.tokenizer.with_truncation(None);
} }
@ -675,7 +679,7 @@ impl PyTokenizer {
/// If specified, the length at which to pad. If not specified we pad using the size of /// If specified, the length at which to pad. If not specified we pad using the size of
/// the longest sequence in a batch. /// the longest sequence in a batch.
#[args(kwargs = "**")] #[args(kwargs = "**")]
#[text_signature = "($self, direction='right', pad_id=0, pad_type_id=0, pad_token='[PAD]', length=None, pad_to_multiple_of=None)"] #[text_signature = "(self, direction='right', pad_id=0, pad_type_id=0, pad_token='[PAD]', length=None, pad_to_multiple_of=None)"]
fn enable_padding(&mut self, kwargs: Option<&PyDict>) -> PyResult<()> { fn enable_padding(&mut self, kwargs: Option<&PyDict>) -> PyResult<()> {
let mut params = PaddingParams::default(); let mut params = PaddingParams::default();
@ -733,7 +737,7 @@ impl PyTokenizer {
} }
/// Disable padding /// Disable padding
#[text_signature = "($self)"] #[text_signature = "(self)"]
fn no_padding(&mut self) { fn no_padding(&mut self) {
self.tokenizer.with_padding(None); self.tokenizer.with_padding(None);
} }
@ -802,7 +806,7 @@ impl PyTokenizer {
/// :class:`~tokenizers.Encoding`: The encoded result /// :class:`~tokenizers.Encoding`: The encoded result
/// ///
#[args(pair = "None", is_pretokenized = "false", add_special_tokens = "true")] #[args(pair = "None", is_pretokenized = "false", add_special_tokens = "true")]
#[text_signature = "($self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True)"] #[text_signature = "(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True)"]
fn encode( fn encode(
&self, &self,
sequence: &PyAny, sequence: &PyAny,
@ -867,7 +871,7 @@ impl PyTokenizer {
/// A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch /// A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
/// ///
#[args(is_pretokenized = "false", add_special_tokens = "true")] #[args(is_pretokenized = "false", add_special_tokens = "true")]
#[text_signature = "($self, input, is_pretokenized=False, add_special_tokens=True)"] #[text_signature = "(self, input, is_pretokenized=False, add_special_tokens=True)"]
fn encode_batch( fn encode_batch(
&self, &self,
input: Vec<&PyAny>, input: Vec<&PyAny>,
@ -910,7 +914,7 @@ impl PyTokenizer {
/// Returns: /// Returns:
/// :obj:`str`: The decoded string /// :obj:`str`: The decoded string
#[args(skip_special_tokens = true)] #[args(skip_special_tokens = true)]
#[text_signature = "($self, ids, skip_special_tokens=True)"] #[text_signature = "(self, ids, skip_special_tokens=True)"]
fn decode(&self, ids: Vec<u32>, skip_special_tokens: bool) -> PyResult<String> { fn decode(&self, ids: Vec<u32>, skip_special_tokens: bool) -> PyResult<String> {
ToPyResult(self.tokenizer.decode(ids, skip_special_tokens)).into() ToPyResult(self.tokenizer.decode(ids, skip_special_tokens)).into()
} }
@ -927,7 +931,7 @@ impl PyTokenizer {
/// Returns: /// Returns:
/// :obj:`List[str]`: A list of decoded strings /// :obj:`List[str]`: A list of decoded strings
#[args(skip_special_tokens = true)] #[args(skip_special_tokens = true)]
#[text_signature = "($self, sequences, skip_special_tokens=True)"] #[text_signature = "(self, sequences, skip_special_tokens=True)"]
fn decode_batch( fn decode_batch(
&self, &self,
sequences: Vec<Vec<u32>>, sequences: Vec<Vec<u32>>,
@ -947,7 +951,7 @@ impl PyTokenizer {
/// ///
/// Returns: /// Returns:
/// :obj:`Optional[int]`: An optional id, :obj:`None` if out of vocabulary /// :obj:`Optional[int]`: An optional id, :obj:`None` if out of vocabulary
#[text_signature = "($self, token)"] #[text_signature = "(self, token)"]
fn token_to_id(&self, token: &str) -> Option<u32> { fn token_to_id(&self, token: &str) -> Option<u32> {
self.tokenizer.token_to_id(token) self.tokenizer.token_to_id(token)
} }
@ -960,7 +964,7 @@ impl PyTokenizer {
/// ///
/// Returns: /// Returns:
/// :obj:`Optional[str]`: An optional token, :obj:`None` if out of vocabulary /// :obj:`Optional[str]`: An optional token, :obj:`None` if out of vocabulary
#[text_signature = "($self, id)"] #[text_signature = "(self, id)"]
fn id_to_token(&self, id: u32) -> Option<&str> { fn id_to_token(&self, id: u32) -> Option<&str> {
self.tokenizer.id_to_token(id) self.tokenizer.id_to_token(id)
} }
@ -977,7 +981,7 @@ impl PyTokenizer {
/// ///
/// Returns: /// Returns:
/// :obj:`int`: The number of tokens that were created in the vocabulary /// :obj:`int`: The number of tokens that were created in the vocabulary
#[text_signature = "($self, tokens)"] #[text_signature = "(self, tokens)"]
fn add_tokens(&mut self, tokens: &PyList) -> PyResult<usize> { fn add_tokens(&mut self, tokens: &PyList) -> PyResult<usize> {
let tokens = tokens let tokens = tokens
.into_iter() .into_iter()
@ -1014,7 +1018,7 @@ impl PyTokenizer {
/// ///
/// Returns: /// Returns:
/// :obj:`int`: The number of tokens that were created in the vocabulary /// :obj:`int`: The number of tokens that were created in the vocabulary
#[text_signature = "($self, tokens)"] #[text_signature = "(self, tokens)"]
fn add_special_tokens(&mut self, tokens: &PyList) -> PyResult<usize> { fn add_special_tokens(&mut self, tokens: &PyList) -> PyResult<usize> {
let tokens = tokens let tokens = tokens
.into_iter() .into_iter()
@ -1064,7 +1068,7 @@ impl PyTokenizer {
/// Returns: /// Returns:
/// :class:`~tokenizers.Encoding`: The final post-processed encoding /// :class:`~tokenizers.Encoding`: The final post-processed encoding
#[args(pair = "None", add_special_tokens = true)] #[args(pair = "None", add_special_tokens = true)]
#[text_signature = "($self, encoding, pair=None, add_special_tokens=True)"] #[text_signature = "(self, encoding, pair=None, add_special_tokens=True)"]
fn post_process( fn post_process(
&self, &self,
encoding: &PyEncoding, encoding: &PyEncoding,

View File

@ -11,7 +11,43 @@ use tokenizers as tk;
use crate::models::PyModel; use crate::models::PyModel;
use crate::tokenizer::PyAddedToken; use crate::tokenizer::PyAddedToken;
/// Base class for all trainers
///
/// This class is not supposed to be instantiated directly. Instead, any implementation of a
/// Trainer will return an instance of this class when instantiated.
///
/// Args:
/// vocab_size: unsigned int:
/// The size of the final vocabulary, including all tokens and alphabet.
///
/// min_frequency: unsigned int:
/// The minimum frequency a pair should have in order to be merged.
///
/// show_progress: boolean:
/// Whether to show progress bars while training.
///
/// special_tokens: List[Union[str, AddedToken]]:
/// A list of special tokens the model should know of.
///
/// limit_alphabet: unsigned int:
/// The maximum number of different characters to keep in the alphabet.
///
/// initial_alphabet: List[str]:
/// A list of characters to include in the initial alphabet, even
/// if not seen in the training dataset.
/// If the strings contain more than one character, only the first one
/// is kept.
///
/// continuing_subword_prefix: Optional[str]:
/// A prefix to be used for every subword that is not a beginning-of-word.
///
/// end_of_word_suffix: Optional[str]:
/// A suffix to be used for every subword that is an end-of-word.
///
/// Returns:
/// Trainer
#[pyclass(name=Trainer)] #[pyclass(name=Trainer)]
#[text_signature = "(self, vocab_size=30000, min_frequency=0,show_progress=True, special_tokens=[],limit_alphabet=None, initial_alphabet = [], continuing_subword_prefix=None, end_of_word_suffix=None)"]
pub struct PyTrainer { pub struct PyTrainer {
pub trainer: TrainerWrapper, pub trainer: TrainerWrapper,
} }
@ -41,6 +77,7 @@ impl Trainer for PyTrainer {
} }
} }
/// Capable of training a BPE model
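///
/// A minimal usage sketch (argument values are illustrative, and we assume the class
/// is exposed as `tokenizers.trainers.BpeTrainer`); the resulting trainer is then
/// handed to the `Tokenizer`'s training method:
///
///     from tokenizers.trainers import BpeTrainer
///
///     trainer = BpeTrainer(vocab_size=30000, min_frequency=2, special_tokens=["[UNK]"])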
#[pyclass(extends=PyTrainer, name=BpeTrainer)] #[pyclass(extends=PyTrainer, name=BpeTrainer)]
pub struct PyBpeTrainer {} pub struct PyBpeTrainer {}
#[pymethods] #[pymethods]
@ -105,7 +142,39 @@ impl PyBpeTrainer {
} }
} }
/// Capable of training a WordPiece model
/// Args:
/// vocab_size: unsigned int:
/// The size of the final vocabulary, including all tokens and alphabet.
///
/// min_frequency: unsigned int:
/// The minimum frequency a pair should have in order to be merged.
///
/// show_progress: boolean:
/// Whether to show progress bars while training.
///
/// special_tokens: List[Union[str, AddedToken]]:
/// A list of special tokens the model should know of.
///
/// limit_alphabet: unsigned int:
/// The maximum number of different characters to keep in the alphabet.
///
/// initial_alphabet: List[str]:
/// A list of characters to include in the initial alphabet, even
/// if not seen in the training dataset.
/// If the strings contain more than one character, only the first one
/// is kept.
///
/// continuing_subword_prefix: Optional[str]:
/// A prefix to be used for every subword that is not a beginning-of-word.
///
/// end_of_word_suffix: Optional[str]:
/// A suffix to be used for every subword that is an end-of-word.
///
/// Returns:
/// Trainer
#[pyclass(extends=PyTrainer, name=WordPieceTrainer)] #[pyclass(extends=PyTrainer, name=WordPieceTrainer)]
#[text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet= [],continuing_subword_prefix=\"##\", end_of_word_suffix=None)"]
pub struct PyWordPieceTrainer {} pub struct PyWordPieceTrainer {}
#[pymethods] #[pymethods]
impl PyWordPieceTrainer { impl PyWordPieceTrainer {
@ -173,7 +242,28 @@ impl PyWordPieceTrainer {
} }
} }
/// Capable of training a Unigram model
///
/// Args:
/// vocab_size: unsigned int:
/// The size of the final vocabulary, including all tokens and alphabet.
///
/// show_progress: boolean:
/// Whether to show progress bars while training.
///
/// special_tokens: List[Union[str, AddedToken]]:
/// A list of special tokens the model should know of.
///
/// initial_alphabet: List[str]:
/// A list of characters to include in the initial alphabet, even
/// if not seen in the training dataset.
/// If the strings contain more than one character, only the first one
/// is kept.
///
/// Returns:
/// Trainer
#[pyclass(extends=PyTrainer, name=UnigramTrainer)] #[pyclass(extends=PyTrainer, name=UnigramTrainer)]
#[text_signature = "(self, vocab_size=8000, show_progress=True, special_tokens= [])"]
pub struct PyUnigramTrainer {} pub struct PyUnigramTrainer {}
#[pymethods] #[pymethods]
impl PyUnigramTrainer { impl PyUnigramTrainer {

View File

@ -173,6 +173,15 @@ fn slice(
.flatten()) .flatten())
} }
/// NormalizedString
///
/// A NormalizedString takes care of modifying an "original" string, to obtain a "normalized" one.
/// While making all the requested modifications, it keeps track of the alignment information
/// between the two versions of the string.
///
/// Args:
/// sequence: str:
/// The string sequence used to initialize this NormalizedString
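///
/// A minimal usage sketch, applying a few of the in-place operations listed below:
///
///     from tokenizers import NormalizedString
///
///     n = NormalizedString("  Héllo  ")
///     n.nfkd()
///     n.lowercase()
///     n.strip()
///     print(n.normalized)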
#[pyclass(module = "tokenizers", name=NormalizedString)] #[pyclass(module = "tokenizers", name=NormalizedString)]
#[derive(Clone)] #[derive(Clone)]
pub struct PyNormalizedString { pub struct PyNormalizedString {
@ -186,6 +195,7 @@ impl PyNormalizedString {
NormalizedString::from(s).into() NormalizedString::from(s).into()
} }
/// The normalized part of the string
#[getter] #[getter]
fn get_normalized(&self) -> &str { fn get_normalized(&self) -> &str {
self.normalized.get() self.normalized.get()
@ -196,70 +206,119 @@ impl PyNormalizedString {
self.normalized.get_original() self.normalized.get_original()
} }
/// Runs the NFD normalization
#[text_signature = "(self)"]
fn nfd(&mut self) { fn nfd(&mut self) {
self.normalized.nfd(); self.normalized.nfd();
} }
/// Runs the NFKD normalization
#[text_signature = "(self)"]
fn nfkd(&mut self) { fn nfkd(&mut self) {
self.normalized.nfkd(); self.normalized.nfkd();
} }
/// Runs the NFC normalization
#[text_signature = "(self)"]
fn nfc(&mut self) { fn nfc(&mut self) {
self.normalized.nfc(); self.normalized.nfc();
} }
/// Runs the NFKC normalization
#[text_signature = "(self)"]
fn nfkc(&mut self) { fn nfkc(&mut self) {
self.normalized.nfkc(); self.normalized.nfkc();
} }
/// Lowercase the string
#[text_signature = "(self)"]
fn lowercase(&mut self) { fn lowercase(&mut self) {
self.normalized.lowercase(); self.normalized.lowercase();
} }
/// Uppercase the string
#[text_signature = "(self)"]
fn uppercase(&mut self) { fn uppercase(&mut self) {
self.normalized.uppercase(); self.normalized.uppercase();
} }
/// Prepend the given sequence to the string
#[text_signature = "(self, s)"]
fn prepend(&mut self, s: &str) { fn prepend(&mut self, s: &str) {
self.normalized.prepend(s); self.normalized.prepend(s);
} }
/// Append the given sequence to the string
#[text_signature = "(self, s)"]
fn append(&mut self, s: &str) { fn append(&mut self, s: &str) {
self.normalized.append(s); self.normalized.append(s);
} }
/// Strip the left of the string
#[text_signature = "(self)"]
fn lstrip(&mut self) { fn lstrip(&mut self) {
self.normalized.lstrip(); self.normalized.lstrip();
} }
/// Strip the right of the string
#[text_signature = "(self)"]
fn rstrip(&mut self) { fn rstrip(&mut self) {
self.normalized.rstrip(); self.normalized.rstrip();
} }
/// Strip both ends of the string
#[text_signature = "(self)"]
fn strip(&mut self) { fn strip(&mut self) {
self.normalized.strip(); self.normalized.strip();
} }
/// Clears the string
#[text_signature = "(self)"]
fn clear(&mut self) { fn clear(&mut self) {
self.normalized.clear(); self.normalized.clear();
} }
/// Slice the string using the given range
#[text_signature = "(self, range)"]
fn slice(&self, range: PyRange) -> PyResult<Option<PyNormalizedString>> { fn slice(&self, range: PyRange) -> PyResult<Option<PyNormalizedString>> {
slice(&self.normalized, &range) slice(&self.normalized, &range)
} }
/// Filter each character of the string using the given func
#[text_signature = "(self, func)"]
fn filter(&mut self, func: &PyAny) -> PyResult<()> { fn filter(&mut self, func: &PyAny) -> PyResult<()> {
filter(&mut self.normalized, func) filter(&mut self.normalized, func)
} }
/// Calls the given function for each character of the string
#[text_signature = "(self, func)"]
fn for_each(&self, func: &PyAny) -> PyResult<()> { fn for_each(&self, func: &PyAny) -> PyResult<()> {
for_each(&self.normalized, func) for_each(&self.normalized, func)
} }
/// Calls the given function for each character of the string
///
/// Replaces each character of the string using the returned value. Each
/// returned value **must** be a str of length 1 (i.e. a character).
#[text_signature = "(self, func)"]
fn map(&mut self, func: &PyAny) -> PyResult<()> { fn map(&mut self, func: &PyAny) -> PyResult<()> {
map(&mut self.normalized, func) map(&mut self.normalized, func)
} }
/// Split the NormalizedString using the given pattern and the specified behavior
///
/// Args:
/// pattern: Pattern:
/// A pattern used to split the string. Usually a string or a Regex
///
/// behavior: SplitDelimiterBehavior:
/// The behavior to use when splitting.
/// Choices: "removed", "isolated", "merged_with_previous", "merged_with_next",
/// "contiguous"
///
/// Returns:
/// A list of NormalizedString, representing each split
#[text_signature = "(self, pattern, behavior)"]
fn split( fn split(
&mut self, &mut self,
pattern: PyPattern, pattern: PyPattern,
@ -272,6 +331,15 @@ impl PyNormalizedString {
.collect()) .collect())
} }
/// Replace the content of the given pattern with the provided content
///
/// Args:
/// pattern: Pattern:
/// A pattern used to match the string. Usually a string or a Regex
///
/// content: str:
/// The content to be used as replacement
#[text_signature = "(self, pattern, content)"]
fn replace(&mut self, pattern: PyPattern, content: &str) -> PyResult<()> { fn replace(&mut self, pattern: PyPattern, content: &str) -> PyResult<()> {
ToPyResult(self.normalized.replace(pattern, content)).into() ToPyResult(self.normalized.replace(pattern, content)).into()
} }

View File

@ -65,6 +65,7 @@ fn tokenize(pretok: &mut PreTokenizedString, func: &PyAny) -> PyResult<()> {
} }
} }
/// Offset referential wrapper, built from the Python strings "original" / "normalized"
#[derive(Clone)] #[derive(Clone)]
pub struct PyOffsetReferential(OffsetReferential); pub struct PyOffsetReferential(OffsetReferential);
impl FromPyObject<'_> for PyOffsetReferential { impl FromPyObject<'_> for PyOffsetReferential {
@ -131,7 +132,23 @@ fn to_encoding(
.into()) .into())
} }
/// PreTokenizedString
///
/// Wrapper over a string, that provides a way to normalize, pre-tokenize, tokenize the
/// underlying string, while keeping track of the alignment information (offsets).
///
/// The PreTokenizedString manages what we call `splits`. Each split represents a substring
/// which is a subpart of the original string, with the relevant offsets and tokens.
///
/// When calling one of the methods used to modify the PreTokenizedString (namely one of
/// `split`, `normalize` or `tokenize`), only the `splits` that don't have any associated
/// tokens will get modified.
///
/// Args:
/// sequence: str:
/// The string sequence used to initialize this PreTokenizedString
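///
/// A minimal usage sketch of a custom split, assuming the split behaviors documented
/// on `NormalizedString.split` ("removed", "isolated", ...):
///
///     from tokenizers import PreTokenizedString
///
///     pretok = PreTokenizedString("Hello friend")
///     pretok.split(lambda i, normalized: normalized.split(" ", "removed"))
///     print(pretok.get_splits())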
#[pyclass(module = "tokenizers", name=PreTokenizedString)] #[pyclass(module = "tokenizers", name=PreTokenizedString)]
#[text_signature = "(self, sequence)"]
pub struct PyPreTokenizedString { pub struct PyPreTokenizedString {
pub(crate) pretok: tk::PreTokenizedString, pub(crate) pretok: tk::PreTokenizedString,
} }
@ -155,27 +172,84 @@ impl PyPreTokenizedString {
PreTokenizedString::from(s).into() PreTokenizedString::from(s).into()
} }
/// Split the PreTokenizedString using the given `func`
///
/// Args:
/// func: Callable[[index, NormalizedString], List[NormalizedString]]:
/// The function used to split each underlying split.
/// It is expected to return a list of `NormalizedString`, that represent the new
/// splits. If the given `NormalizedString` does not need any splitting, we can
/// just return it directly.
/// In order for the offsets to be tracked accurately, any returned `NormalizedString`
/// should come from calling either `.split` or `.slice` on the received one.
#[text_signature = "(self, func)"]
fn split(&mut self, func: &PyAny) -> PyResult<()> { fn split(&mut self, func: &PyAny) -> PyResult<()> {
split(&mut self.pretok, func) split(&mut self.pretok, func)
} }
/// Normalize each split of the `PreTokenizedString` using the given `func`
///
/// Args:
/// func: Callable[[NormalizedString], None]:
/// The function used to normalize each underlying split. This function
/// does not need to return anything, just calling the methods on the provided
/// NormalizedString allows its modification.
#[text_signature = "(self, func)"]
fn normalize(&mut self, func: &PyAny) -> PyResult<()> { fn normalize(&mut self, func: &PyAny) -> PyResult<()> {
normalize(&mut self.pretok, func) normalize(&mut self.pretok, func)
} }
/// Tokenize each split of the `PreTokenizedString` using the given `func`
///
/// Args:
/// func: Callable[[str], List[Token]]:
/// The function used to tokenize each underlying split. This function must return
/// a list of Token generated from the input str.
#[text_signature = "(self, func)"]
fn tokenize(&mut self, func: &PyAny) -> PyResult<()> { fn tokenize(&mut self, func: &PyAny) -> PyResult<()> {
tokenize(&mut self.pretok, func) tokenize(&mut self.pretok, func)
} }
/// Return an Encoding generated from this PreTokenizedString
///
/// Args:
/// type_id: int = 0:
/// The type_id to be used on the generated Encoding.
///
/// word_idx: Optional[int] = None:
/// An optional word index to be used for each token of this Encoding. If provided,
/// all the word indices in the generated Encoding will use this value, instead
/// of the one automatically tracked during pre-tokenization.
///
/// Returns:
/// An Encoding
#[args(type_id = "0", word_idx = "None")] #[args(type_id = "0", word_idx = "None")]
#[text_signature = "(self, type_id=0, word_idx=None)"]
fn to_encoding(&self, type_id: u32, word_idx: Option<u32>) -> PyResult<PyEncoding> { fn to_encoding(&self, type_id: u32, word_idx: Option<u32>) -> PyResult<PyEncoding> {
to_encoding(&self.pretok, type_id, word_idx) to_encoding(&self.pretok, type_id, word_idx)
} }
/// Get the splits currently managed by the PreTokenizedString
///
/// Args:
/// offset_referential: :obj:`str`
/// Whether the returned splits should have offsets expressed relative
/// to the original string, or the normalized one. choices: "original", "normalized".
///
/// offset_type: :obj:`str`
/// Whether the returned splits should have offsets expressed in bytes or chars.
/// When slicing an str, we usually want to use chars, which is the default value.
/// Now in some cases it might be interesting to get these offsets expressed in bytes,
/// so it is possible to change this here.
/// choices: "char", "bytes"
///
/// Returns:
/// A list of splits
#[args( #[args(
offset_referential = "PyOffsetReferential(OffsetReferential::Original)", offset_referential = "PyOffsetReferential(OffsetReferential::Original)",
offset_type = "PyOffsetType(OffsetType::Char)" offset_type = "PyOffsetType(OffsetType::Char)"
)] )]
#[text_signature = "(self, offset_referential=\"original\", offset_type=\"char\")"]
fn get_splits( fn get_splits(
&self, &self,
offset_referential: PyOffsetReferential, offset_referential: PyOffsetReferential,

View File

@ -2,7 +2,9 @@ use onig::Regex;
use pyo3::exceptions; use pyo3::exceptions;
use pyo3::prelude::*; use pyo3::prelude::*;
/// Instantiate a new Regex with the given pattern
#[pyclass(module = "tokenizers", name=Regex)] #[pyclass(module = "tokenizers", name=Regex)]
#[text_signature = "(self, pattern)"]
pub struct PyRegex { pub struct PyRegex {
pub inner: Regex, pub inner: Regex,
pub pattern: String, pub pattern: String,

192
bindings/python/stub.py Normal file
View File

@ -0,0 +1,192 @@
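"""Generate the `.pyi` stub files for the `tokenizers` package by inspecting the
compiled extension, and keep the auto-generated `__init__.py` files in sync.

Signatures are taken from `__text_signature__` and docstrings from `__doc__`, which
is why the classes and methods in the Rust bindings carry `#[text_signature]`
attributes and doc comments. Run `python stub.py` to regenerate the stubs, or
`python stub.py --check` to only verify that they are up to date.
"""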
import inspect
import os
import argparse
import black
from pathlib import Path
INDENT = " " * 4
GENERATED_COMMENT = "# Generated content DO NOT EDIT\n"
def do_indent(text: str, indent: str):
return text.replace("\n", f"\n{indent}")
def function(obj, indent, text_signature=None):
if text_signature is None:
text_signature = obj.__text_signature__
string = ""
string += f"{indent}def {obj.__name__}{text_signature}:\n"
indent += INDENT
string += f'{indent}"""\n'
string += f"{indent}{do_indent(obj.__doc__, indent)}\n"
string += f'{indent}"""\n'
string += f"{indent}pass\n"
string += "\n"
string += "\n"
return string
def member_sort(member):
if inspect.isclass(member):
value = 10 + len(inspect.getmro(member))
else:
value = 1
return value
def fn_predicate(obj):
value = inspect.ismethoddescriptor(obj) or inspect.isbuiltin(obj)
if value:
return obj.__doc__ and obj.__text_signature__ and not obj.__name__.startswith("_")
if inspect.isgetsetdescriptor(obj):
return obj.__doc__ and not obj.__name__.startswith("_")
return False
def get_module_members(module):
members = [
member
for name, member in inspect.getmembers(module)
if not name.startswith("_") and not inspect.ismodule(member)
]
members.sort(key=member_sort)
return members
def pyi_file(obj, indent=""):
string = ""
if inspect.ismodule(obj):
string += GENERATED_COMMENT
members = get_module_members(obj)
for member in members:
string += pyi_file(member, indent)
elif inspect.isclass(obj):
indent += INDENT
mro = inspect.getmro(obj)
if len(mro) > 2:
inherit = f"({mro[1].__name__})"
else:
inherit = ""
string += f"class {obj.__name__}{inherit}:\n"
body = ""
if obj.__doc__:
body += f'{indent}"""\n{indent}{do_indent(obj.__doc__, indent)}\n{indent}"""\n'
fns = inspect.getmembers(obj, fn_predicate)
# Init
if obj.__text_signature__:
body += f"{indent}def __init__{obj.__text_signature__}:\n"
body += f"{indent+INDENT}pass\n"
body += "\n"
for (name, fn) in fns:
body += pyi_file(fn, indent=indent)
if not body:
body += f"{indent}pass\n"
string += body
string += "\n\n"
elif inspect.isbuiltin(obj):
string += f"{indent}@staticmethod\n"
string += function(obj, indent)
elif inspect.ismethoddescriptor(obj):
string += function(obj, indent)
elif inspect.isgetsetdescriptor(obj):
# TODO it would be interesting to add the setter maybe?
string += f"{indent}@property\n"
string += function(obj, indent, text_signature="(self)")
else:
raise Exception(f"Object {obj} is not supported")
return string
def py_file(module, origin):
members = get_module_members(module)
string = GENERATED_COMMENT
string += f"from .. import {origin}\n"
string += "\n"
for member in members:
name = member.__name__
string += f"{name} = {origin}.{name}\n"
return string
def do_black(content, is_pyi):
mode = black.Mode(
target_versions={black.TargetVersion.PY35},
line_length=100,
is_pyi=is_pyi,
string_normalization=True,
experimental_string_processing=False,
)
try:
return black.format_file_contents(content, fast=True, mode=mode)
except black.NothingChanged:
return content
def write(module, directory, origin, check=False):
submodules = [
(name, member) for name, member in inspect.getmembers(module) if inspect.ismodule(member)
]
filename = os.path.join(directory, "__init__.pyi")
pyi_content = pyi_file(module)
pyi_content = do_black(pyi_content, is_pyi=True)
os.makedirs(directory, exist_ok=True)
if check:
with open(filename, "r") as f:
data = f.read()
assert (
data == pyi_content
), f"The content of {filename} seems outdated, please run `python stub.py`"
else:
with open(filename, "w") as f:
f.write(pyi_content)
filename = os.path.join(directory, "__init__.py")
py_content = py_file(module, origin)
py_content = do_black(py_content, is_pyi=False)
os.makedirs(directory, exist_ok=True)
is_auto = False
if not os.path.exists(filename):
is_auto = True
else:
with open(filename, "r") as f:
line = f.readline()
if line == GENERATED_COMMENT:
is_auto = True
if is_auto:
if check:
with open(filename, "r") as f:
data = f.read()
assert (
data == py_content
), f"The content of {filename} seems outdated, please run `python stub.py`"
else:
with open(filename, "w") as f:
f.write(py_content)
for name, submodule in submodules:
write(submodule, os.path.join(directory, name), f"{name}", check=check)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--check", action="store_true")
args = parser.parse_args()
import tokenizers
write(tokenizers.tokenizers, "py_src/tokenizers/", "tokenizers", check=args.check)