Mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-22 16:25:30 +00:00)
Automatically stubbing the pyi files while keeping inspecting ability (#509)

* First pass on automatic stubbing our python files.
* And now modifying all rust docs to be visible in Pyi files.
* Better assert fail message.
* Fixing github workflow.
* Removing types not exported anymore.
* Fixing `Tokenizer` signature.
* Disabling auto __init__.py.
* Re-enabling some types.
* Don't overwrite non automated __init__.py
* Automated most __init__.py
* Restubbing after rebase.
* Fixing env for tests.
* Install blakc in the env.
* Use PY35 target in stub.py

Co-authored-by: Anthony MOI <m.anthony.moi@gmail.com>
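Note: the core idea of this commit is to generate the `.pyi` stubs automatically from the compiled module so that the Rust docstrings stay visible to `help()` and IDEs. A minimal, illustrative sketch of that kind of generator follows; it only assumes the standard `inspect` module and is not the actual `stub.py` added by this commit.

```python
# Illustrative sketch of an inspect-based stub generator (hypothetical helpers,
# not the stub.py shipped in this commit).
import inspect


def stub_class(cls) -> str:
    """Emit a stub for one class, keeping its docstrings."""
    lines = [f"class {cls.__name__}:", f'    """{inspect.getdoc(cls) or ""}"""']
    for name, member in inspect.getmembers(cls):
        if name.startswith("_") and name != "__init__":
            continue  # skip private/dunder members except the constructor
        if callable(member):
            lines.append(f"    def {name}(self, *args, **kwargs):")
            lines.append(f'        """{inspect.getdoc(member) or ""}"""')
            lines.append("        pass")
    return "\n".join(lines) + "\n"


def stub_module(module) -> str:
    """Emit the '# Generated content DO NOT EDIT' stub for a whole module."""
    parts = ["# Generated content DO NOT EDIT"]
    for _, cls in inspect.getmembers(module, inspect.isclass):
        parts.append(stub_class(cls))
    return "\n".join(parts)
```

The generated text is then written next to the package sources and formatted with black, which is what the Makefile changes further down wire up.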
34  .github/workflows/python.yml  (vendored)
@@ -11,26 +11,6 @@ on:
- bindings/node/**

jobs:
code_quality:
name: Check Code Quality
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v1

- name: Install Python
uses: actions/setup-python@v1
with:
python-version: 3.6
architecture: "x64"

- name: Install dependencies
run: pip install black==20.8b1

- name: Check style
working-directory: ./bindings/python
run: make check-style

build_win_32:
name: Check it builds for Windows 32-bit
runs-on: windows-latest
@@ -115,11 +95,23 @@ jobs:
python-version: 3.6
architecture: "x64"

- name: Run tests
- name: Install
working-directory: ./bindings/python
run: |
python -m venv .env
source .env/bin/activate
pip install pytest requests setuptools_rust numpy
python setup.py develop

- name: Check style
working-directory: ./bindings/python
run: |
source .env/bin/activate
pip install black==20.8b1
make check-style

- name: Run tests
working-directory: ./bindings/python
run: |
source .env/bin/activate
make test
@@ -6,10 +6,12 @@ dir_guard=@mkdir -p $(@D)

# Format source code automatically
style:
python stub.py
black --line-length 100 --target-version py35 examples py_src/tokenizers tests

# Check the source code is formatted correctly
check-style:
python stub.py --check
black --check --line-length 100 --target-version py35 examples py_src/tokenizers tests

TESTS_RESOURCES = $(DATA_DIR)/small.txt $(DATA_DIR)/roberta.json
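For context, `check-style` now runs `python stub.py --check` before black. A check mode of this kind usually regenerates the stub text in memory and fails with an explicit message when the committed file differs (compare the "Better assert fail message" bullet in the commit message). A hedged sketch, reusing the hypothetical `stub_module` helper sketched earlier:

```python
from pathlib import Path


def check_stub(path: Path, generated: str) -> None:
    # Hypothetical check: fail loudly when the stub on disk is out of date.
    on_disk = path.read_text()
    assert on_disk == generated, (
        f"{path} seems outdated, please run `python stub.py` to regenerate it"
    )
```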
(File diff suppressed because it is too large.)

@@ -1,44 +1,52 @@
from typing import List

# Generated content DO NOT EDIT
class Decoder:
"""Base class for all decoders
"""
Base class for all decoders

This class is not supposed to be instantiated directly. Instead, any implementation of
a Decoder will return an instance of this class when instantiated.
"""

def decode(self, tokens: List[str]) -> str:
""" Decode the given list of string to a final string """
def decode(self, tokens):
"""
Decode the given list of string to a final string
"""
pass

class BPEDecoder(Decoder):
"""
Instantiate a new BPEDecoder

Args:
suffix: str:
The suffix that was used to caracterize an end-of-word. This suffix will
be replaced by whitespaces during the decoding
"""

def __init__(self, suffix="</w>"):
pass
def decode(self, tokens):
"""
Decode the given list of string to a final string
"""
pass

class ByteLevel(Decoder):
""" ByteLevel Decoder """
"""
ByteLevel Decoder
"""

def __init__(self) -> None:
""" Instantiate a new ByteLevel Decoder """
def __init__(self):
pass

class WordPiece(Decoder):
""" WordPiece Decoder """

@staticmethod
def __init__(self, prefix: str = "##", cleanup: bool = True) -> Decoder:
"""Instantiate a new WordPiece Decoder

Args:
prefix: str:
The prefix to use for subwords that are not a beginning-of-word
cleanup: bool:
Whether to cleanup some tokenization artifacts. Mainly spaces before punctuation,
and some abbreviated english forms.
def decode(self, tokens):
"""
Decode the given list of string to a final string
"""
pass

class Metaspace(Decoder):
""" Metaspace decoder """

def __init__(self, replacement: str = "▁", add_prefix_space: bool = True) -> None:
"""Instantiate a new Metaspace
"""
Instantiate a new Metaspace

Args:
replacement: str:
@@ -49,17 +57,31 @@ class Metaspace(Decoder):
Whether to add a space to the first word if there isn't already one. This
lets us treat `hello` exactly like `say hello`.
"""

def __init__(self, replacement="▁", add_prefix_space=True):
pass

class BPEDecoder(Decoder):
""" BPEDecoder """

def __init__(self, suffix: str = "</w>") -> None:
"""Instantiate a new BPEDecoder

Args:
suffix: str:
The suffix that was used to caracterize an end-of-word. This suffix will
be replaced by whitespaces during the decoding
def decode(self, tokens):
"""
Decode the given list of string to a final string
"""
pass

class WordPiece(Decoder):
"""
Instantiate a new WordPiece Decoder

Args:
prefix: str:
The prefix to use for subwords that are not a beginning-of-word
cleanup: bool:
Whether to cleanup some tokenization artifacts. Mainly spaces before punctuation,
and some abbreviated english forms.
"""

def __init__(self, prefix="##", cleanup=True):
pass
def decode(self, tokens):
"""
Decode the given list of string to a final string
"""
pass
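As the stub above shows, every decoder exposes `decode(tokens)`, which turns a list of string tokens back into plain text. A small usage sketch (the token lists are made-up examples):

```python
from tokenizers import decoders

wp = decoders.WordPiece(prefix="##", cleanup=True)
print(wp.decode(["un", "##question", "##ably"]))  # roughly "unquestionably"

ms = decoders.Metaspace(replacement="▁", add_prefix_space=True)
print(ms.decode(["▁Hello", "▁world"]))  # roughly "Hello world"
```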
@@ -1,9 +1,8 @@
from typing import List, Tuple

from .. import models, Offsets
# Generated content DO NOT EDIT
from .. import models

Model = models.Model
BPE = models.BPE
WordPiece = models.WordPiece
WordLevel = models.WordLevel
Unigram = models.Unigram
WordLevel = models.WordLevel
WordPiece = models.WordPiece
@@ -1,34 +1,37 @@
from .. import Encoding, Offsets, Token
from typing import List, Optional, Union, Tuple, Dict

# Generated content DO NOT EDIT
class Model:
"""Base class for all models

This class is not supposed to be instantiated directly. Instead, any implementation of
a Model will return a instance of this class when instantiated.
"""
A Model represents some tokenization algorithm like BPE or Word
This class cannot be constructed directly. Please use one of the concrete models.
"""

def tokenize(self, sequence: str) -> List[Token]:
""" Tokenize the given sequence """
def id_to_token(self, id):
"""
Returns the token associated with the given id
"""
pass
def token_to_id(self, token: str) -> Optional[int]:
""" Returns the id associated with the given token """
pass
def id_to_token(self, id: int) -> Optional[str]:
""" Returns the token associated with the given id """
pass
def save(self, folder: str, name: Optional[str] = None) -> List[str]:
"""Save the current model
def save(self, folder, name):
"""
Save the current model

Save the current model in the given folder, using the given name for the various
files that will get created.
Any file with the same name that already exist in this folder will be overwritten.
"""
pass
def token_to_id(self, tokens):
"""
Returns the id associated with the given token
"""
pass
def tokenize(self, tokens):
"""
Tokenize the given sequence
"""
pass

class BPE(Model):
"""BytePairEncoding model class

"""
Instantiate a BPE Model from the given vocab and merges.

Args:
@@ -61,21 +64,18 @@ class BPE(Model):

def __init__(
self,
vocab: Optional[Union[str, Dict[str, int]]],
merges: Optional[Union[str, List[Tuple[str, str]]]],
cache_capacity: Optional[int],
dropout: Optional[float],
unk_token: Optional[str],
continuing_subword_prefix: Optional[str],
end_of_word_suffix: Optional[str],
fuse_unk: Optional[bool],
vocab=None,
merges=None,
cache_capacity=None,
dropout=None,
unk_token=None,
continuing_subword_prefix=None,
end_of_word_suffix=None,
fuse_unk=None,
):
pass
@staticmethod
def read_file(vocab_filename: str, merges_filename: str) -> Tuple[Vocab, Merges]:
pass
@staticmethod
def from_file(vocab_filename: str, merges_filename: str, **kwargs) -> BPE:
def from_file(vocab_filename, merge_filename, **kwargs):
"""
Convenient method to intialize a BPE from files
Roughly equivalent to
@@ -85,42 +85,73 @@ class BPE(Model):
return BPE(vocab, merges, **kwargs)
"""
pass
def id_to_token(self, id):
"""
Returns the token associated with the given id
"""
pass
@staticmethod
def read_file(self, vocab_filename, merges_filename):
"""
Read a vocab_filename and merge_filename and stores result in memory
"""
pass
def save(self, folder, name):
"""
Save the current model

class WordPiece(Model):
"""WordPiece model class
Save the current model in the given folder, using the given name for the various
files that will get created.
Any file with the same name that already exist in this folder will be overwritten.
"""
pass
def token_to_id(self, tokens):
"""
Returns the id associated with the given token
"""
pass
def tokenize(self, tokens):
"""
Tokenize the given sequence
"""
pass

Instantiate a WordPiece Model from the given vocab file.
class Unigram(Model):
"""
UnigramEncoding model class

Instantiate a Unigram Model from the given model file.

Args:
vocab: (`optional`) string:
A dictionnary of string keys and their ids {"am": 0,...}
vocab: ('`optional`) string:
A list of vocabulary items and their relative score [("am", -0.2442),...]

unk_token: (`optional`) str:
The unknown token to be used by the model.

max_input_chars_per_word: (`optional`) int:
The maximum number of characters to authorize in a single word.
"""

def __init__(
self,
vocab: Optional[Union[str, Dict[str, int]]],
unk_token: Optional[str],
max_input_chars_per_word: Optional[int],
):
def __init__(self, vocab):
pass
@staticmethod
def read_file(vocab_filename: str) -> Vocab:
pass
@staticmethod
def from_file(vocab_filename: str, **kwargs) -> WordPiece:
def id_to_token(self, id):
"""
Convenient method to intialize a WordPiece from file
Roughly equivalent to
Returns the token associated with the given id
"""
pass
def save(self, folder, name):
"""
Save the current model

def from_file(vocab_filename, **kwargs):
vocab, merges = WordPiece.read_file(vocab_filename)
return WordPiece(vocab, **kwargs)
Save the current model in the given folder, using the given name for the various
files that will get created.
Any file with the same name that already exist in this folder will be overwritten.
"""
pass
def token_to_id(self, tokens):
"""
Returns the id associated with the given token
"""
pass
def tokenize(self, tokens):
"""
Tokenize the given sequence
"""
pass

@@ -138,34 +169,89 @@ class WordLevel(Model):
The unknown token to be used by the model.
"""

def __init__(self, vocab: Optional[Union[str, Dict[str, int]]], unk_token: Optional[str]):
def __init__(self, vocab, unk_token):
pass
@staticmethod
def read_file(vocab_filename: str) -> Vocab:
pass
@staticmethod
def from_file(vocab_filename: str, **kwargs) -> WordLevelg:
def id_to_token(self, id):
"""
Convenient method to intialize a WordLevelg from file
Returns the token associated with the given id
"""
pass
def save(self, folder, name):
"""
Save the current model

Save the current model in the given folder, using the given name for the various
files that will get created.
Any file with the same name that already exist in this folder will be overwritten.
"""
pass
def token_to_id(self, tokens):
"""
Returns the id associated with the given token
"""
pass
def tokenize(self, tokens):
"""
Tokenize the given sequence
"""
pass

class WordPiece(Model):
"""
WordPiece model
Instantiate a WordPiece Model from the given vocab file.

Args:
vocab: (`optional`) string:
A dictionnary of string keys and their ids {"am": 0,...}

unk_token: (`optional`) str:
The unknown token to be used by the model.

max_input_chars_per_word: (`optional`) int:
The maximum number of characters to authorize in a single word.
"""

def __init__(self, vocab, unk_token, max_input_chars_per_word):
pass
@staticmethod
def from_file(vocab_filename, merge_filename, **kwargs):
"""
Convenient method to intialize a WordPiece from files
Roughly equivalent to

def from_file(vocab_filename, **kwargs):
vocab, merges = WordLevelg.read_file(vocab_filename)
return WordLevelg(vocab, **kwargs)
vocab = WordPiece.read_file(vocab_filename)
return WordPiece(vocab, **kwargs)
"""
pass

class Unigram(Model):
"""UnigramEncoding model class

Instantiate a Unigram Model from the given model file.

Args:
vocab: ('`optional`) string:
A list of vocabulary items and their relative score [("am", -0.2442),...]

def id_to_token(self, id):
"""

Returns the token associated with the given id
"""
pass
@staticmethod
def __init__(self, vocab: Optional[List[Tuple[str, float]]]):
def read_file(vocab_filename):
"""
Read a vocab_filename and stores result in memory
"""
pass
def save(self, folder, name):
"""
Save the current model

Save the current model in the given folder, using the given name for the various
files that will get created.
Any file with the same name that already exist in this folder will be overwritten.
"""
pass
def token_to_id(self, tokens):
"""
Returns the id associated with the given token
"""
pass
def tokenize(self, tokens):
"""
Tokenize the given sequence
"""
pass
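Per the `BPE.from_file` docstring above, loading a model from files is roughly a `read_file` call followed by the constructor. A sketch with placeholder file names (no such files ship with this commit):

```python
from tokenizers.models import BPE

# One-step helper...
bpe = BPE.from_file("vocab.json", "merges.txt", unk_token="<unk>")

# ...which the docstring describes as roughly equivalent to:
vocab, merges = BPE.read_file("vocab.json", "merges.txt")
bpe = BPE(vocab, merges, unk_token="<unk>")
```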
@@ -1,35 +1,29 @@
from .. import NormalizedString
from typing import Optional, List

# Generated content DO NOT EDIT
class Normalizer:
"""Base class for all normalizers
"""
Base class for all normalizers

This class is not supposed to be instantiated directly. Instead, any implementation of a
Normalizer will return an instance of this class when instantiated.
"""

def normalize(self, normalized: NormalizedString):
""" Normalize the given NormalizedString in-place """
def normalize(self, normalized):
"""
Normalize the given NormalizedString in-place
"""
pass
def normalize_str(self, sequence: str) -> str:
""" Normalize the given str """
def normalize_str(self, sequence):
"""
Normalize the given str
"""
pass

class BertNormalizer(Normalizer):
"""BertNormalizer
"""
BertNormalizer

Takes care of normalizing raw text before giving it to a Bert model.
This includes cleaning the text, handling accents, chinese chars and lowercasing
"""

def __init__(
self,
clean_text: Optional[bool] = True,
handle_chinese_chars: Optional[bool] = True,
strip_accents: Optional[bool] = None,
lowercase: Optional[bool] = True,
) -> None:
"""Instantiate a BertNormalizer with the given options.

Args:
clean_text: (`optional`) boolean:
@@ -49,92 +43,216 @@ class BertNormalizer(Normalizer):
Returns:
Normalizer
"""

def __init__(
self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True
):
pass
def normalize(self, normalized):
"""
Normalize the given NormalizedString in-place
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given str
"""
pass

class NFD(Normalizer):
""" NFD Unicode Normalizer """
class Lowercase(Normalizer):
"""
Lowercase Normalizer
"""

def __init__(self) -> None:
""" Instantiate a new NFD Normalizer """
def __init__(self):
pass

class NFKD(Normalizer):
""" NFKD Unicode Normalizer """

def __init__(self) -> None:
""" Instantiate a new NFKD Normalizer """
def normalize(self, normalized):
"""
Normalize the given NormalizedString in-place
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given str
"""
pass

class NFC(Normalizer):
""" NFC Unicode Normalizer """
"""
NFC Unicode Normalizer
"""

def __init__(self) -> None:
""" Instantiate a new NFC Normalizer """
def __init__(self):
pass
def normalize(self, normalized):
"""
Normalize the given NormalizedString in-place
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given str
"""
pass

class NFD(Normalizer):
"""
NFD Unicode Normalizer
"""

def __init__(self):
pass
def normalize(self, normalized):
"""
Normalize the given NormalizedString in-place
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given str
"""
pass

class NFKC(Normalizer):
""" NFKC Unicode Normalizer """
"""
NFKC Unicode Normalizer
"""

def __init__(self) -> None:
""" Instantiate a new NFKC Normalizer """
def __init__(self):
pass
def normalize(self, normalized):
"""
Normalize the given NormalizedString in-place
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given str
"""
pass

class NFKD(Normalizer):
"""
NFKD Unicode Normalizer
"""

def __init__(self):
pass
def normalize(self, normalized):
"""
Normalize the given NormalizedString in-place
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given str
"""
pass

class Nmt(Normalizer):
"""
Nmt normalizer
"""

def __init__(self):
pass
def normalize(self, normalized):
"""
Normalize the given NormalizedString in-place
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given str
"""
pass

class Precompiled(Normalizer):
"""
Precompiled normalizer
Don't use manually it is used for compatiblity for SentencePiece.
"""

def __init__(self, precompiled_charsmap):
pass
def normalize(self, normalized):
"""
Normalize the given NormalizedString in-place
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given str
"""
pass

class Replace(Normalizer):
"""
Replace normalizer
"""

def __init__(self, pattern, content):
pass
def normalize(self, normalized):
"""
Normalize the given NormalizedString in-place
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given str
"""
pass

class Sequence(Normalizer):
"""Allows concatenating multiple other Normalizer as a Sequence.

All the normalizers run in sequence in the given order
"""

def __init__(self, normalizers: List[Normalizer]) -> None:
"""Instantiate a new normalization Sequence using the given normalizers
Allows concatenating multiple other Normalizer as a Sequence.
All the normalizers run in sequence in the given order

Args:
normalizers: List[Normalizer]:
A list of Normalizer to be run as a sequence
"""

def normalize(self, normalized):
"""
Normalize the given NormalizedString in-place
"""
pass

class Lowercase(Normalizer):
""" Lowercase Normalizer """

def __init__(self) -> None:
""" Instantiate a new Lowercase Normalizer """
def normalize_str(self, sequence):
"""
Normalize the given str
"""
pass

class Strip(Normalizer):
""" Strip normalizer """
"""
Strip normalizer
"""

def __init__(self, left: bool = True, right: bool = True) -> Normalizer:
def __init__(self, left=True, right=True):
pass
def normalize(self, normalized):
"""
Normalize the given NormalizedString in-place
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given str
"""
pass

class StripAccents(Normalizer):
""" StripAccents normalizer """

def __init__(self) -> Normalizer:
def __init__(self):
pass

class Nmt(Normalizer):
""" Nmt normalizer """

def __init__(self) -> Normalizer:
pass

class Precompiled(Normalizer):
""" Precompiled normalizer """

def __init__(self, precompiled_charsmap: bytes) -> Normalizer:
pass

class Replace(Normalizer):
""" Replace normalizer """

def __init__(self, pattern: str, content: str) -> Normalizer:
pass

def unicode_normalizer_from_str(normalizer: str) -> Normalizer:
def normalize(self, normalized):
"""
Instanciate unicode normalizer from the normalizer name
:param normalizer: Name of the normalizer
:return:
Normalize the given NormalizedString in-place
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given str
"""
pass
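Each normalizer in the stub has both `normalize(normalized)`, which works in place on a `NormalizedString`, and `normalize_str(sequence)`, which is the easiest one to try out. For example, composing a few of them with `Sequence` (the input string is just an example):

```python
from tokenizers.normalizers import NFD, Lowercase, Sequence, StripAccents

norm = Sequence([NFD(), Lowercase(), StripAccents()])
print(norm.normalize_str("Héllò hôw are ü?"))  # roughly "hello how are u?"
```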
@@ -1,13 +1,14 @@
# Generated content DO NOT EDIT
from .. import pre_tokenizers

PreTokenizer = pre_tokenizers.PreTokenizer
ByteLevel = pre_tokenizers.ByteLevel
Whitespace = pre_tokenizers.Whitespace
Punctuation = pre_tokenizers.Punctuation
Sequence = pre_tokenizers.Sequence
WhitespaceSplit = pre_tokenizers.WhitespaceSplit
BertPreTokenizer = pre_tokenizers.BertPreTokenizer
Metaspace = pre_tokenizers.Metaspace
ByteLevel = pre_tokenizers.ByteLevel
CharDelimiterSplit = pre_tokenizers.CharDelimiterSplit
Digits = pre_tokenizers.Digits
Metaspace = pre_tokenizers.Metaspace
Punctuation = pre_tokenizers.Punctuation
Sequence = pre_tokenizers.Sequence
UnicodeScripts = pre_tokenizers.UnicodeScripts
Whitespace = pre_tokenizers.Whitespace
WhitespaceSplit = pre_tokenizers.WhitespaceSplit
@@ -1,31 +1,51 @@
from .. import PreTokenizedString
from typing import Optional, List, Tuple

Offsets = Tuple[int, int]

# Generated content DO NOT EDIT
class PreTokenizer:
"""Base class for all pre-tokenizers
"""
Base class for all pre-tokenizers

This class is not supposed to be instantiated directly. Instead, any implementation of a
PreTokenizer will return an instance of this class when instantiated.
"""

def pre_tokenize(self, pretokenized: PreTokenizedString):
""" Pre tokenize the given PreTokenizedString in-place """
def pre_tokenize(self, pretok):
"""
Pre tokenize the given PreTokenizedString in-place
"""
pass
def pre_tokenize_str(self, sequence: str) -> List[Tuple[str, Offsets]]:
""" Pre tokenize the given sequence """
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given sequence
"""
pass

class BertPreTokenizer(PreTokenizer):
"""
BertPreTokenizer

This pre-tokenizer splits tokens on spaces, and also on punctuation.
Each occurence of a punctuation character will be treated separately.
"""

def __init__(self):
pass
def pre_tokenize(self, pretok):
"""
Pre tokenize the given PreTokenizedString in-place
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given sequence
"""
pass

class ByteLevel(PreTokenizer):
"""ByteLevel PreTokenizer
"""
ByteLevel PreTokenizer

This pre-tokenizer takes care of replacing all bytes of the given string
with a corresponding representation, as well as splitting into words.
"""

def __init__(self, add_prefix_space: bool = True) -> None:
"""Instantiate a new ByteLevel PreTokenizer
Args:
add_prefix_space: (`optional`) boolean:
Whether to add a space to the first word if there isn't already one. This
@@ -33,58 +53,78 @@ class ByteLevel(PreTokenizer):
Returns:
PreTokenizer
"""

def __init__(self, add_prefix_space=True):
pass
@staticmethod
def alphabet() -> List[str]:
"""Returns the alphabet used by this PreTokenizer.
def alphabet():
"""
Returns the alphabet used by this PreTokenizer.

Since the ByteLevel works as its name suggests, at the byte level, it
encodes any byte to one visible character. This means that there is a
total of 256 different characters composing this alphabet.
"""
pass

class Whitespace(PreTokenizer):
"""Whitespace PreTokenizer

This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
def pre_tokenize(self, pretok):
"""
Pre tokenize the given PreTokenizedString in-place
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given sequence
"""

def __init__(self) -> None:
""" Instantiate a new Whitespace PreTokenizer """
pass

class WhitespaceSplit(PreTokenizer):
"""Whitespace PreTokenizer
class CharDelimiterSplit(PreTokenizer):
"""
This pre-tokenizer simply splits on the provided char. Works like `.split(delimiter)`

This pre-tokenizer simply splits on the whitespace. Works like `.split()`
Args:
delimiter: str:
The delimiter char that will be used to split input
"""

def __init__(self) -> None:
""" Instantiate a new WhitespaceSplit PreTokenizer """
def pre_tokenize(self, pretok):
"""
Pre tokenize the given PreTokenizedString in-place
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given sequence
"""
pass

class BertPreTokenizer(PreTokenizer):
"""BertPreTokenizer

This pre-tokenizer splits tokens on spaces, and also on punctuation.
Each occurence of a punctuation character will be treated separately.
class Digits(PreTokenizer):
"""
This pre-tokenizer simply splits using the digits in separate tokens
Args:
individual_digits: bool:
If set to True, digits will each be separated "Call 123 please" -> "Call ", "1", "2", "3", " please"
If set to False, digits will grouped "Call 123 please" -> "Call ", "123", " please"
"""

def __init__(self) -> None:
""" Instantiate a new BertPreTokenizer """
def __init__(self, individual_digits=False):
pass
def pre_tokenize(self, pretok):
"""
Pre tokenize the given PreTokenizedString in-place
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given sequence
"""
pass

class Metaspace(PreTokenizer):
"""Metaspace pre-tokenizer
"""
Metaspace pre-tokenizer

This pre-tokenizer replaces any whitespace by the provided replacement character.
It then tries to split on these spaces.
"""

def __init__(self, replacement: str = "▁", add_prefix_space: bool = True) -> None:
"""Instantiate a new Metaspace

Args:
replacement: str:
The replacement character. Must be exactly one character. By default we
@@ -94,70 +134,109 @@ class Metaspace(PreTokenizer):
Whether to add a space to the first word if there isn't already one. This
lets us treat `hello` exactly like `say hello`.
"""

def __init__(self, replacement="▁", add_prefix_space=True):
pass

class CharDelimiterSplit(PreTokenizer):
"""CharDelimiterSplit PreTokenizer

This pre-tokenizer simply splits on the provided char. Works like `.split(delimiter)`
def pre_tokenize(self, pretok):
"""

@staticmethod
def __init__(self, delimiter: str) -> None:
"""Instantiate a new CharDelimiterSplit PreTokenizer

Args:
delimiter: str:
The delimiter char that will be used to split input
Pre tokenize the given PreTokenizedString in-place
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given sequence
"""
pass

class Punctuation(PreTokenizer):
"""Punctuation PreTokenizer

"""
This pre-tokenizer simply splits on punctuation as individual characters.`
"""

def __init__(self) -> None:
""" Instantiate a new Punctuation PreTokenizer """
def __init__(self):
pass
def pre_tokenize(self, pretok):
"""
Pre tokenize the given PreTokenizedString in-place
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given sequence
"""
pass

class Sequence(PreTokenizer):
"""Sequence PreTokenizer

This pre-tokenizer composes other pre_tokenizers and applies them in sequence`
"""
This pre-tokenizer composes other pre_tokenizers and applies them in sequence
"""

def __init__(self) -> None:
""" Instantiate a new Sequence PreTokenizer """
def __init__(self, pretokenizers):
pass

class Digits(PreTokenizer):
"""Digits PreTokenizer

This pre-tokenizer simply splits using the digits in separate tokens
def pre_tokenize(self, pretok):
"""

def __init__(self, individual_digits: bool) -> None:
"""Instantiate a new Digits

Args:
individual_digits: bool:
If set to True, digits will each be separated "Call 123 please" -> "Call ", "1", "2", "3", " please"
If set to False, digits will grouped "Call 123 please" -> "Call ", "123", " please"

Pre tokenize the given PreTokenizedString in-place
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given sequence
"""
pass

class UnicodeScripts(PreTokenizer):
"""UnicodeScripts PreTokenizer

"""
This pre-tokenizer splits on characters that belong to different language family
It roughly follows https://github.com/google/sentencepiece/blob/master/data/Scripts.txt
Actually Hiragana and Katakana are fused with Han, and 0x30FC is Han too.
This mimicks SentencePiece Unigram implementation.
"""

def __init__(self) -> None:
""" Instantiate a new UnicodeScripts """
def __init__(self):
pass
def pre_tokenize(self, pretok):
"""
Pre tokenize the given PreTokenizedString in-place
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given sequence
"""
pass

class Whitespace(PreTokenizer):
"""
This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
"""

def __init__(self):
pass
def pre_tokenize(self, pretok):
"""
Pre tokenize the given PreTokenizedString in-place
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given sequence
"""
pass

class WhitespaceSplit(PreTokenizer):
"""
This pre-tokenizer simply splits on the whitespace. Works like `.split()`
"""

def __init__(self):
pass
def pre_tokenize(self, pretok):
"""
Pre tokenize the given PreTokenizedString in-place
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given sequence
"""
pass
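The `pre_tokenize_str(sequence)` method returns `(piece, (start, end))` tuples, so the behavior described in the `Digits` docstring can be checked directly (the offsets in the comments are approximate, for illustration):

```python
from tokenizers.pre_tokenizers import Digits, Whitespace

print(Digits(individual_digits=True).pre_tokenize_str("Call 123 please"))
# roughly [("Call ", (0, 5)), ("1", (5, 6)), ("2", (6, 7)), ("3", (7, 8)), (" please", (8, 15))]

print(Whitespace().pre_tokenize_str("Hello, world!"))
# roughly [("Hello", (0, 5)), (",", (5, 6)), ("world", (7, 12)), ("!", (12, 13))]
```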
@@ -1,7 +1,8 @@
# Generated content DO NOT EDIT
from .. import processors

PostProcessor = processors.PostProcessor
BertProcessing = processors.BertProcessing
RobertaProcessing = processors.RobertaProcessing
ByteLevel = processors.ByteLevel
RobertaProcessing = processors.RobertaProcessing
TemplateProcessing = processors.TemplateProcessing
@@ -1,38 +1,31 @@
from .. import Encoding
from typing import Tuple, Union, List

# Generated content DO NOT EDIT
class PostProcessor:
"""Base class for all post-processors
"""
Base class for all post-processors

This class is not supposed to be instantiated directly. Instead, any implementation of
a PostProcessor will return an instance of this class when instantiated.
"""

def num_special_tokens_to_add(self, is_pair: bool) -> int:
def num_special_tokens_to_add(self, is_pair):
"""
Return the number of special tokens that would be added for single/pair sentences.
:param is_pair: Boolean indicating if the input would be a single sentence or a pair
:return:
"""
pass
def process(
self, encoding: Encoding, pair: Optional[Encoding] = None, add_special_tokens: bool = True
) -> Encoding:
""" Post-process the given encodings, generating the final one """
def process(self, encoding, pair=None, add_special_tokens=True):
"""
Post-process the given encodings, generating the final one
"""
pass

class BertProcessing(PostProcessor):
"""BertProcessing

"""
This post-processor takes care of adding the special tokens needed by
a Bert model:
- a SEP token
- a CLS token
"""

def __init__(self, sep: Tuple[str, int], cls: Tuple[str, int]) -> None:
"""Instantiate a new BertProcessing with the given tokens

Args:
sep: Tuple[str, int]:
A tuple with the string representation of the SEP token, and its id
@@ -43,11 +36,50 @@ class BertProcessing(PostProcessor):
Returns:
PostProcessor
"""

def __init__(self, sep, cls):
pass
def num_special_tokens_to_add(self, is_pair):
"""
Return the number of special tokens that would be added for single/pair sentences.
:param is_pair: Boolean indicating if the input would be a single sentence or a pair
:return:
"""
pass
def process(self, encoding, pair=None, add_special_tokens=True):
"""
Post-process the given encodings, generating the final one
"""
pass

class ByteLevel(PostProcessor):
"""
This post-processor takes care of trimming the offsets.
By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
want the offsets to include these whitespaces, then this PostProcessor must be used.

Args:
trim_offsets: bool:
Whether to trim the whitespaces from the produced offsets.
"""

def __init__(self, trim_offsets=True):
pass
def num_special_tokens_to_add(self, is_pair):
"""
Return the number of special tokens that would be added for single/pair sentences.
:param is_pair: Boolean indicating if the input would be a single sentence or a pair
:return:
"""
pass
def process(self, encoding, pair=None, add_special_tokens=True):
"""
Post-process the given encodings, generating the final one
"""
pass

class RobertaProcessing(PostProcessor):
"""RobertaProcessing

"""
This post-processor takes care of adding the special tokens needed by
a Roberta model:
- a SEP token
@@ -57,17 +89,6 @@ class RobertaProcessing(PostProcessor):
By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
want the offsets to include these whitespaces, then this PostProcessor should be initialized
with `trim_offsets=True`
"""

def __init__(
self,
sep: Tuple[str, int],
cls: Tuple[str, int],
trim_offsets: bool = True,
add_prefix_space: bool = True,
) -> None:
"""Instantiate a new RobertaProcessing with the given tokens

Args:
sep: Tuple[str, int]:
A tuple with the string representation of the SEP token, and its id
@@ -85,31 +106,24 @@ class RobertaProcessing(PostProcessor):
Returns:
PostProcessor
"""

def __init__(self, sep, cls, trim_offsets=True, add_prefix_space=True):
pass

class ByteLevel(PostProcessor):
"""ByteLevel Post processing

This post-processor takes care of trimming the offsets.
By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
want the offsets to include these whitespaces, then this PostProcessor must be used.
def num_special_tokens_to_add(self, is_pair):
"""

def __init__(self, trim_offsets: bool = True) -> None:
"""Instantiate a new ByteLevel

Args:
trim_offsets: bool:
Whether to trim the whitespaces from the produced offsets.
Return the number of special tokens that would be added for single/pair sentences.
:param is_pair: Boolean indicating if the input would be a single sentence or a pair
:return:
"""
pass
def process(self, encoding, pair=None, add_special_tokens=True):
"""
Post-process the given encodings, generating the final one
"""
pass

Template = Union[str, List[str]]
Tokens = List[Union[Tuple[int, str], Tuple[str, int], dict]]

class TemplateProcessing(PostProcessor):
"""TemplateProcessing

"""
Provides a way to specify templates in order to add the special tokens to each
input sequence as relevant.

@@ -147,10 +161,6 @@ class TemplateProcessing(PostProcessor):
will be added to the Encoding without any further check. If the given ids correspond
to something totally different in a `Tokenizer` using this `PostProcessor`, it
might lead to unexpected results.
"""

def __init__(self, single: Template, pair: Template, special_tokens: Tokens) -> None:
"""Instantiate a new TemplateProcessing

Args:
single: Template
@@ -175,4 +185,18 @@ class TemplateProcessing(PostProcessor):
The given dict expects the provided `ids` and `tokens` lists to have
the same length.
"""

def __init__(self, single, pair, special_tokens):
pass
def num_special_tokens_to_add(self, is_pair):
"""
Return the number of special tokens that would be added for single/pair sentences.
:param is_pair: Boolean indicating if the input would be a single sentence or a pair
:return:
"""
pass
def process(self, encoding, pair=None, add_special_tokens=True):
"""
Post-process the given encodings, generating the final one
"""
pass
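A post-processor is built with the special tokens it needs and then applied to an `Encoding` via `process`. For `BertProcessing` the constructor takes `(token, id)` tuples for SEP and CLS; the ids below are placeholders, not values from this repository:

```python
from tokenizers.processors import BertProcessing

post = BertProcessing(sep=("[SEP]", 102), cls=("[CLS]", 101))

# For a pair of sequences, [CLS] A [SEP] B [SEP] adds three special tokens.
print(post.num_special_tokens_to_add(is_pair=True))  # 3
```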
@@ -1,6 +1,7 @@
# Generated content DO NOT EDIT
from .. import trainers

Trainer = trainers.Trainer
BpeTrainer = trainers.BpeTrainer
WordPieceTrainer = trainers.WordPieceTrainer
UnigramTrainer = trainers.UnigramTrainer
WordPieceTrainer = trainers.WordPieceTrainer
@@ -1,84 +1,91 @@
from .. import AddedToken
from typing import Optional, List, Union

# Generated content DO NOT EDIT
class Trainer:
"""Base class for all trainers
"""
Base class for all trainers

This class is not supposed to be instantiated directly. Instead, any implementation of a
Trainer will return an instance of this class when instantiated.

Args:
vocab_size: unsigned int:
The size of the final vocabulary, including all tokens and alphabet.

min_frequency: unsigned int:
The minimum frequency a pair should have in order to be merged.

show_progress: boolean:
Whether to show progress bars while training.

special_tokens: List[Union[str, AddedToken]]:
A list of special tokens the model should know of.

limit_alphabet: unsigned int:
The maximum different characters to keep in the alphabet.

initial_alphabet: List[str]:
A list of characters to include in the initial alphabet, even
if not seen in the training dataset.
If the strings contain more than one character, only the first one
is kept.

continuing_subword_prefix: Optional[str]:
A prefix to be used for every subword that is not a beginning-of-word.

end_of_word_suffix: Optional[str]:
A suffix to be used for every subword that is a end-of-word.

Returns:
Trainer
"""

class BpeTrainer(Trainer):
"""BpeTrainer
def __init__(
self,
vocab_size=30000,
min_frequency=0,
show_progress=True,
special_tokens=[],
limit_alphabet=None,
initial_alphabet=[],
continuing_subword_prefix=None,
end_of_word_suffix=None,
):
pass

class BpeTrainer(Trainer):
"""
Capable of training a BPE model
"""

def __init__(
self,
vocab_size: int = 30000,
min_frequency: int = 0,
show_progress: bool = True,
special_tokens: List[Union[str, AddedToken]] = [],
limit_alphabet: Optional[int] = None,
initial_alphabet: List[str] = [],
continuing_subword_prefix: Optional[str] = None,
end_of_word_suffix: Optional[str] = None,
) -> None:
"""Instantiate a new BpeTrainer with the given options:
class UnigramTrainer(Trainer):
"""
Capable of training a Unigram model

Args:
vocab_size: unsigned int:
The size of the final vocabulary, including all tokens and alphabet.

min_frequency: unsigned int:
The minimum frequency a pair should have in order to be merged.

show_progress: boolean:
Whether to show progress bars while training.

special_tokens: List[Union[str, AddedToken]]:
A list of special tokens the model should know of.

limit_alphabet: unsigned int:
The maximum different characters to keep in the alphabet.

initial_alphabet: List[str]:
A list of characters to include in the initial alphabet, even
if not seen in the training dataset.
If the strings contain more than one character, only the first one
is kept.

continuing_subword_prefix: Optional[str]:
A prefix to be used for every subword that is not a beginning-of-word.

end_of_word_suffix: Optional[str]:
A suffix to be used for every subword that is a end-of-word.

Returns:
Trainer
"""

def __init__(self, vocab_size=8000, show_progress=True, special_tokens=[]):
pass

class WordPieceTrainer(Trainer):
"""WordPieceTrainer

Capable of training a WordPiece model
"""

def __init__(
self,
vocab_size: int = 30000,
min_frequency: int = 0,
show_progress: bool = True,
special_tokens: List[Union[str, AddedToken]] = [],
limit_alphabet: Optional[int] = None,
initial_alphabet: List[str] = [],
continuing_subword_prefix: Optional[str] = "##",
end_of_word_suffix: Optional[str] = None,
) -> Trainer:
"""Instantiate a new WordPieceTrainer with the given options:

Capable of training a WordPiece model
Args:
vocab_size: unsigned int:
The size of the final vocabulary, including all tokens and alphabet.
@@ -110,39 +117,16 @@ class WordPieceTrainer(Trainer):
Returns:
Trainer
"""
pass

class UnigramTrainer(Trainer):
"""UnigramTrainer

Capable of training a Unigram model
"""

def __init__(
self,
vocab_size: int = 8000,
show_progress: bool = True,
special_tokens: List[Union[str, AddedToken]] = [],
) -> Trainer:
"""Instantiate a new UnigramTrainer with the given options:

Args:
vocab_size: unsigned int:
The size of the final vocabulary, including all tokens and alphabet.

show_progress: boolean:
Whether to show progress bars while training.

special_tokens: List[Union[str, AddedToken]]:
A list of special tokens the model should know of.

initial_alphabet: List[str]:
A list of characters to include in the initial alphabet, even
if not seen in the training dataset.
If the strings contain more than one character, only the first one
is kept.

Returns:
Trainer
"""
vocab_size=30000,
min_frequency=0,
show_progress=True,
special_tokens=[],
limit_alphabet=None,
initial_alphabet=[],
continuing_subword_prefix="##",
end_of_word_suffix=None,
):
pass
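The trainer stubs now expose the training options directly on `__init__`, so constructing one is just keyword arguments (the sizes and tokens below are example values); the resulting trainer is then handed to a `Tokenizer`'s training entry point:

```python
from tokenizers.trainers import BpeTrainer, UnigramTrainer

bpe_trainer = BpeTrainer(
    vocab_size=30000,
    min_frequency=2,
    special_tokens=["<unk>", "<pad>", "<s>", "</s>"],
)
unigram_trainer = UnigramTrainer(vocab_size=8000, special_tokens=["<unk>"])
```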
@@ -15,6 +15,10 @@ use tokenizers as tk;

use super::error::ToPyResult;

/// Base class for all decoders
///
/// This class is not supposed to be instantiated directly. Instead, any implementation of
/// a Decoder will return an instance of this class when instantiated.
#[pyclass(dict, module = "tokenizers.decoders", name=Decoder)]
#[derive(Clone, Deserialize, Serialize)]
pub struct PyDecoder {
@@ -82,12 +86,16 @@ impl PyDecoder {
}
}

/// Decode the given list of string to a final string
#[text_signature = "(self, tokens)"]
fn decode(&self, tokens: Vec<String>) -> PyResult<String> {
ToPyResult(self.decoder.decode(tokens)).into()
}
}

/// ByteLevel Decoder
#[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name=ByteLevel)]
#[text_signature = "(self)"]
pub struct PyByteLevelDec {}
#[pymethods]
impl PyByteLevelDec {
@@ -97,7 +105,16 @@ impl PyByteLevelDec {
}
}

/// Instantiate a new WordPiece Decoder
///
/// Args:
///     prefix: str:
///         The prefix to use for subwords that are not a beginning-of-word
///     cleanup: bool:
///         Whether to cleanup some tokenization artifacts. Mainly spaces before punctuation,
///         and some abbreviated english forms.
#[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name=WordPiece)]
#[text_signature = "(self, prefix=\"##\", cleanup=True)"]
pub struct PyWordPieceDec {}
#[pymethods]
impl PyWordPieceDec {
@@ -120,7 +137,18 @@ impl PyWordPieceDec {
}
}

/// Instantiate a new Metaspace
///
/// Args:
///     replacement: str:
///         The replacement character. Must be exactly one character. By default we
///         use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
///
///     add_prefix_space: boolean:
///         Whether to add a space to the first word if there isn't already one. This
///         lets us treat `hello` exactly like `say hello`.
#[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name=Metaspace)]
#[text_signature = "(self, replacement = \"▁\", add_prefix_space = True)"]
pub struct PyMetaspaceDec {}
#[pymethods]
impl PyMetaspaceDec {
@@ -153,7 +181,14 @@ impl PyMetaspaceDec {
}
}

/// Instantiate a new BPEDecoder
///
/// Args:
///     suffix: str:
///         The suffix that was used to caracterize an end-of-word. This suffix will
///         be replaced by whitespaces during the decoding
#[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name=BPEDecoder)]
#[text_signature = "(self, suffix=\"</w>\")"]
pub struct PyBPEDecoder {}
#[pymethods]
impl PyBPEDecoder {
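Moving the documentation into the Rust `///` comments and `#[text_signature]` attributes, as in the hunk above, keeps the compiled classes inspectable from Python, and the stub generator reads the very same text back out. For instance:

```python
from tokenizers import decoders

# The docstring comes straight from the Rust /// comments.
print(decoders.WordPiece.__doc__)

# help() also shows the signature declared via #[text_signature].
help(decoders.Metaspace)
```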
@@ -107,7 +107,7 @@ impl PyEncoding {
///
/// Set the given sequence index for the whole range of tokens contained in this
/// :class:`~tokenizers.Encoding`.
#[text_signature = "($self, sequence_id)"]
#[text_signature = "(self, sequence_id)"]
fn set_sequence_id(&mut self, sequence_id: usize) {
self.encoding.set_sequence_id(sequence_id);
}
@@ -269,7 +269,7 @@ impl PyEncoding {
/// Returns:
/// :obj:`Tuple[int, int]`: The range of tokens: :obj:`(first, last + 1)`
#[args(sequence_index = 0)]
#[text_signature = "($self, word_index, sequence_index=0)"]
#[text_signature = "(self, word_index, sequence_index=0)"]
fn word_to_tokens(&self, word_index: u32, sequence_index: usize) -> Option<(usize, usize)> {
self.encoding.word_to_tokens(word_index, sequence_index)
}
@@ -285,7 +285,7 @@ impl PyEncoding {
/// Returns:
/// :obj:`Tuple[int, int]`: The range of characters (span) :obj:`(first, last + 1)`
#[args(sequence_index = 0)]
#[text_signature = "($self, word_index, sequence_index=0)"]
#[text_signature = "(self, word_index, sequence_index=0)"]
fn word_to_chars(&self, word_index: u32, sequence_index: usize) -> Option<Offsets> {
self.encoding.word_to_chars(word_index, sequence_index)
}
@@ -301,7 +301,7 @@ impl PyEncoding {
///
/// Returns:
/// :obj:`int`: The sequence id of the given token
#[text_signature = "($self, token_index)"]
#[text_signature = "(self, token_index)"]
fn token_to_sequence(&self, token_index: usize) -> Option<usize> {
self.encoding.token_to_sequence(token_index)
}
@@ -318,7 +318,7 @@ impl PyEncoding {
///
/// Returns:
/// :obj:`Tuple[int, int]`: The token offsets :obj:`(first, last + 1)`
#[text_signature = "($self, token_index)"]
#[text_signature = "(self, token_index)"]
fn token_to_chars(&self, token_index: usize) -> Option<Offsets> {
let (_, offsets) = self.encoding.token_to_chars(token_index)?;
Some(offsets)
@@ -336,7 +336,7 @@ impl PyEncoding {
///
/// Returns:
/// :obj:`int`: The index of the word in the relevant input sequence.
#[text_signature = "($self, token_index)"]
#[text_signature = "(self, token_index)"]
fn token_to_word(&self, token_index: usize) -> Option<u32> {
let (_, word_idx) = self.encoding.token_to_word(token_index)?;
Some(word_idx)
@@ -353,7 +353,7 @@ impl PyEncoding {
/// Returns:
/// :obj:`int`: The index of the token that contains this char in the encoded sequence
#[args(sequence_index = 0)]
#[text_signature = "($self, char_pos, sequence_index=0)"]
#[text_signature = "(self, char_pos, sequence_index=0)"]
fn char_to_token(&self, char_pos: usize, sequence_index: usize) -> Option<usize> {
self.encoding.char_to_token(char_pos, sequence_index)
}
@@ -369,7 +369,7 @@ impl PyEncoding {
/// Returns:
/// :obj:`int`: The index of the word that contains this char in the input sequence
#[args(sequence_index = 0)]
#[text_signature = "($self, char_pos, sequence_index=0)"]
#[text_signature = "(self, char_pos, sequence_index=0)"]
fn char_to_word(&self, char_pos: usize, sequence_index: usize) -> Option<u32> {
self.encoding.char_to_word(char_pos, sequence_index)
}
@@ -392,7 +392,7 @@ impl PyEncoding {
/// pad_token (:obj:`str`, defaults to `[PAD]`):
/// The pad token to use
#[args(kwargs = "**")]
#[text_signature = "($self, length, direction='right', pad_id=0, pad_type_id=0, pad_token='[PAD]')"]
#[text_signature = "(self, length, direction='right', pad_id=0, pad_type_id=0, pad_token='[PAD]')"]
fn pad(&mut self, length: usize, kwargs: Option<&PyDict>) -> PyResult<()> {
let mut pad_id = 0;
let mut pad_type_id = 0;
@@ -440,7 +440,7 @@ impl PyEncoding {
/// stride (:obj:`int`, defaults to :obj:`0`):
/// The length of previous content to be included in each overflowing piece
#[args(stride = "0")]
#[text_signature = "($self, max_length, stride=0)"]
#[text_signature = "(self, max_length, stride=0)"]
fn truncate(&mut self, max_length: usize, stride: usize) -> PyResult<()> {
self.encoding.truncate(max_length, stride);
Ok(())
@ -106,6 +106,8 @@ impl PyModel {
|
||||
}
|
||||
}
|
||||
|
||||
/// Tokenize the given sequence
|
||||
#[text_signature = "(self, tokens)"]
|
||||
fn tokenize(&self, tokens: &str) -> PyResult<Vec<PyToken>> {
|
||||
Ok(ToPyResult(self.model.tokenize(tokens))
|
||||
.into_py()?
|
||||
@ -114,14 +116,24 @@ impl PyModel {
|
||||
.collect())
|
||||
}
|
||||
|
||||
/// Returns the id associated with the given token
|
||||
#[text_signature = "(self, tokens)"]
|
||||
fn token_to_id(&self, token: &str) -> Option<u32> {
|
||||
self.model.token_to_id(token)
|
||||
}
|
||||
|
||||
/// Returns the token associated with the given id
|
||||
#[text_signature = "(self, id)"]
|
||||
fn id_to_token(&self, id: u32) -> Option<&str> {
|
||||
self.model.id_to_token(id)
|
||||
}
|
||||
|
||||
/// Save the current model
|
||||
///
|
||||
/// Save the current model in the given folder, using the given name for the various
|
||||
/// files that will get created.
|
||||
/// Any file with the same name that already exist in this folder will be overwritten.
|
||||
#[text_signature = "(self, folder, name)"]
|
||||
fn save(&self, folder: &str, name: Option<&str>) -> PyResult<Vec<String>> {
|
||||
let saved: PyResult<Vec<_>> = ToPyResult(self.model.save(Path::new(folder), name)).into();
|
||||
|
||||
@ -132,9 +144,36 @@ impl PyModel {
|
||||
}
|
||||
}
|
||||
|
||||
/// BPE Model
|
||||
/// Allows the creation of a BPE Model to be used with a Tokenizer
|
||||
/// Instantiate a BPE Model from the given vocab and merges.
|
||||
///
|
||||
/// Args:
|
||||
/// vocab: (`optional`) Dict[str, int]:
|
||||
/// A dictionary of string keys and their ids {"am": 0,...}
|
||||
///
|
||||
/// merges: (`optional`) string:
|
||||
/// A list of pairs of tokens [("a", "b"),...]
|
||||
///
|
||||
/// cache_capacity: (`optional`) int:
|
||||
/// The number of words that the BPE cache can contain. The cache allows
|
||||
/// us to speed up the process by keeping the result of the merge operations
|
||||
/// for a number of words.
|
||||
///
|
||||
/// dropout: (`optional`) Optional[float] [0, 1]:
|
||||
/// The BPE dropout to use. Must be a float between 0 and 1
|
||||
///
|
||||
/// unk_token: (`optional`) str:
|
||||
/// The unknown token to be used by the model.
|
||||
///
|
||||
/// continuing_subword_prefix: (`optional`) str:
|
||||
/// The prefix to attach to subword units that don't represent a beginning of word.
|
||||
///
|
||||
/// end_of_word_suffix: (`optional`) str:
|
||||
/// The suffix to attach to subword units that represent an end of word.
|
||||
///
|
||||
/// fuse_unk: (`optional`) bool:
|
||||
/// Multiple unk tokens get fused into only 1
|
||||
#[pyclass(extends=PyModel, module = "tokenizers.models", name=BPE)]
|
||||
#[text_signature = "(self, vocab=None, merges=None, cache_capacity=None, dropout=None, unk_token=None, continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=None)"]
|
||||
pub struct PyBPE {}
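As an illustration of the arguments documented above (a sketch, not part of the diff), a BPE model can be built directly from in-memory vocab/merges; the values below are made up:

```python
from tokenizers import Tokenizer
from tokenizers.models import BPE

# Made-up vocab and merges, only to show the expected shapes
vocab = {"h": 0, "e": 1, "l": 2, "o": 3, "he": 4, "ll": 5, "hell": 6, "hello": 7}
merges = [("h", "e"), ("l", "l"), ("he", "ll"), ("hell", "o")]

tokenizer = Tokenizer(BPE(vocab=vocab, merges=merges))
print(tokenizer.encode("hello").tokens)  # expected ['hello']
```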
|
||||
|
||||
impl PyBPE {
|
||||
@ -225,7 +264,9 @@ impl PyBPE {
|
||||
PyBPE::with_builder(builder, kwargs)
|
||||
}
|
||||
|
||||
/// Read a vocab_filename and merges_filename and store the result in memory
|
||||
#[staticmethod]
|
||||
#[text_signature = "(self, vocab_filename, merges_filename)"]
|
||||
fn read_file(vocab_filename: &str, merges_filename: &str) -> PyResult<(Vocab, Merges)> {
|
||||
BPE::read_file(vocab_filename, merges_filename).map_err(|e| {
|
||||
exceptions::PyValueError::new_err(format!(
|
||||
@ -235,8 +276,15 @@ impl PyBPE {
|
||||
})
|
||||
}
|
||||
|
||||
/// Convenient method to initialize a BPE from files
|
||||
/// Roughly equivalent to
|
||||
///
|
||||
/// def from_file(vocab_filename, merges_filename, **kwargs):
|
||||
/// vocab, merges = BPE.read_file(vocab_filename, merges_filename)
|
||||
/// return BPE(vocab, merges, **kwargs)
|
||||
#[staticmethod]
|
||||
#[args(kwargs = "**")]
|
||||
#[text_signature = "(vocab_filename, merge_filename, **kwargs)"]
|
||||
fn from_file(
|
||||
py: Python,
|
||||
vocab_filename: &str,
|
||||
@ -257,8 +305,20 @@ impl PyBPE {
|
||||
}
|
||||
}
|
||||
|
||||
/// WordPiece Model
|
||||
/// WordPiece model
|
||||
/// Instantiate a WordPiece Model from the given vocab file.
|
||||
///
|
||||
/// Args:
|
||||
/// vocab: (`optional`) string:
|
||||
/// A dictionary of string keys and their ids {"am": 0,...}
|
||||
///
|
||||
/// unk_token: (`optional`) str:
|
||||
/// The unknown token to be used by the model.
|
||||
///
|
||||
/// max_input_chars_per_word: (`optional`) int:
|
||||
/// The maximum number of characters to authorize in a single word.
|
||||
#[pyclass(extends=PyModel, module = "tokenizers.models", name=WordPiece)]
|
||||
#[text_signature = "(self, vocab, unk_token, max_input_chars_per_word)"]
|
||||
pub struct PyWordPiece {}
|
||||
|
||||
impl PyWordPiece {
|
||||
@ -319,15 +379,24 @@ impl PyWordPiece {
|
||||
PyWordPiece::with_builder(builder, kwargs)
|
||||
}
|
||||
|
||||
/// Read a vocab_filename and store the result in memory
|
||||
#[staticmethod]
|
||||
#[text_signature = "(vocab_filename)"]
|
||||
fn read_file(vocab_filename: &str) -> PyResult<Vocab> {
|
||||
WordPiece::read_file(vocab_filename).map_err(|e| {
|
||||
exceptions::PyValueError::new_err(format!("Error while reading WordPiece file: {}", e))
|
||||
})
|
||||
}
|
||||
|
||||
/// Convenient method to initialize a WordPiece from files
|
||||
/// Roughly equivalent to
|
||||
///
|
||||
/// def from_file(vocab_filename, **kwargs):
|
||||
/// vocab = WordPiece.read_file(vocab_filename)
|
||||
/// return WordPiece(vocab, **kwargs)
|
||||
#[staticmethod]
|
||||
#[args(kwargs = "**")]
|
||||
#[text_signature = "(vocab_filename, merge_filename, **kwargs)"]
|
||||
fn from_file(py: Python, vocab_filename: &str, kwargs: Option<&PyDict>) -> PyResult<Py<Self>> {
|
||||
let vocab = WordPiece::read_file(vocab_filename).map_err(|e| {
|
||||
exceptions::PyValueError::new_err(format!("Error while reading WordPiece file: {}", e))
|
||||
@ -336,7 +405,18 @@ impl PyWordPiece {
|
||||
}
|
||||
}
|
||||
|
||||
/// Most simple tokenizer model based on mapping token from a vocab file to their corresponding id.
|
||||
///
|
||||
/// Instantiate a WordLevel Model from the given vocab file.
|
||||
///
|
||||
/// Args:
|
||||
/// vocab: (`optional`) string:
|
||||
/// A dictionary of string keys and their ids {"am": 0,...}
|
||||
///
|
||||
/// unk_token: str:
|
||||
/// The unknown token to be used by the model.
|
||||
#[pyclass(extends=PyModel, module = "tokenizers.models", name=WordLevel)]
|
||||
#[text_signature = "(self, vocab, unk_token)"]
|
||||
pub struct PyWordLevel {}
|
||||
|
||||
impl PyWordLevel {
|
||||
@ -411,7 +491,16 @@ impl PyWordLevel {
|
||||
}
|
||||
}
|
||||
|
||||
/// UnigramEncoding model class
|
||||
///
|
||||
/// Instantiate a Unigram Model from the given model file.
|
||||
///
|
||||
/// Args:
|
||||
/// vocab: (`optional`) string:
|
||||
/// A list of vocabulary items and their relative score [("am", -0.2442),...]
|
||||
///
|
||||
#[pyclass(extends=PyModel, module = "tokenizers.models", name=Unigram)]
|
||||
#[text_signature = "(self, vocab)"]
|
||||
pub struct PyUnigram {}
|
||||
|
||||
#[pymethods]
|
||||
|
@ -15,6 +15,10 @@ use tk::normalizers::{
|
||||
use tk::{NormalizedString, Normalizer};
|
||||
use tokenizers as tk;
|
||||
|
||||
/// Base class for all normalizers
|
||||
///
|
||||
/// This class is not supposed to be instantiated directly. Instead, any implementation of a
|
||||
/// Normalizer will return an instance of this class when instantiated.
|
||||
#[pyclass(dict, module = "tokenizers.normalizers", name=Normalizer)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub struct PyNormalizer {
|
||||
@ -105,10 +109,14 @@ impl PyNormalizer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Normalize the given NormalizedString in-place
|
||||
#[text_signature = "(self, normalized)"]
|
||||
fn normalize(&self, normalized: &mut PyNormalizedString) -> PyResult<()> {
|
||||
ToPyResult(self.normalizer.normalize(&mut normalized.normalized)).into()
|
||||
}
|
||||
|
||||
/// Normalize the given str
|
||||
#[text_signature = "(self, sequence)"]
|
||||
fn normalize_str(&self, sequence: &str) -> PyResult<String> {
|
||||
let mut normalized = NormalizedString::from(sequence);
|
||||
ToPyResult(self.normalizer.normalize(&mut normalized)).into_py()?;
|
||||
@ -116,7 +124,30 @@ impl PyNormalizer {
|
||||
}
|
||||
}
|
||||
|
||||
/// BertNormalizer
|
||||
///
|
||||
/// Takes care of normalizing raw text before giving it to a Bert model.
|
||||
/// This includes cleaning the text, handling accents, chinese chars and lowercasing
|
||||
///
|
||||
/// Args:
|
||||
/// clean_text: (`optional`) boolean:
|
||||
/// Whether to clean the text, by removing any control characters
|
||||
/// and replacing all whitespaces by the classic one.
|
||||
///
|
||||
/// handle_chinese_chars: (`optional`) boolean:
|
||||
/// Whether to handle chinese chars by putting spaces around them.
|
||||
///
|
||||
/// strip_accents: (`optional`) boolean:
|
||||
/// Whether to strip all accents. If this option is not specified (ie == None),
|
||||
/// then it will be determined by the value for `lowercase` (as in the original Bert).
|
||||
///
|
||||
/// lowercase: (`optional`) boolean:
|
||||
/// Whether to lowercase.
|
||||
///
|
||||
/// Returns:
|
||||
/// Normalizer
|
||||
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=BertNormalizer)]
|
||||
#[text_signature = "(self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True)"]
|
||||
pub struct PyBertNormalizer {}
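A short usage sketch for the options above (illustrative, not part of the diff):

```python
from tokenizers.normalizers import BertNormalizer

normalizer = BertNormalizer(clean_text=True, strip_accents=True, lowercase=True)
# Accents are stripped and the text lowercased; expected output: "hello world"
print(normalizer.normalize_str("Héllo WORLD"))
```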
|
||||
#[pymethods]
|
||||
impl PyBertNormalizer {
|
||||
@ -146,7 +177,9 @@ impl PyBertNormalizer {
|
||||
}
|
||||
}
|
||||
|
||||
/// NFD Unicode Normalizer
|
||||
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=NFD)]
|
||||
#[text_signature = "(self)"]
|
||||
pub struct PyNFD {}
|
||||
#[pymethods]
|
||||
impl PyNFD {
|
||||
@ -156,7 +189,9 @@ impl PyNFD {
|
||||
}
|
||||
}
|
||||
|
||||
/// NFKD Unicode Normalizer
|
||||
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=NFKD)]
|
||||
#[text_signature = "(self)"]
|
||||
pub struct PyNFKD {}
|
||||
#[pymethods]
|
||||
impl PyNFKD {
|
||||
@ -166,7 +201,9 @@ impl PyNFKD {
|
||||
}
|
||||
}
|
||||
|
||||
/// NFC Unicode Normalizer
|
||||
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=NFC)]
|
||||
#[text_signature = "(self)"]
|
||||
pub struct PyNFC {}
|
||||
#[pymethods]
|
||||
impl PyNFC {
|
||||
@ -176,7 +213,9 @@ impl PyNFC {
|
||||
}
|
||||
}
|
||||
|
||||
/// NFKC Unicode Normalizer
|
||||
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=NFKC)]
|
||||
#[text_signature = "(self)"]
|
||||
pub struct PyNFKC {}
|
||||
#[pymethods]
|
||||
impl PyNFKC {
|
||||
@ -186,6 +225,12 @@ impl PyNFKC {
|
||||
}
|
||||
}
|
||||
|
||||
/// Allows concatenating multiple other Normalizer as a Sequence.
|
||||
/// All the normalizers run in sequence in the given order
|
||||
///
|
||||
/// Args:
|
||||
/// normalizers: List[Normalizer]:
|
||||
/// A list of Normalizer to be run as a sequence
|
||||
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Sequence)]
|
||||
pub struct PySequence {}
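For example (an illustrative sketch, not part of the diff), the classic NFD + StripAccents + Lowercase pipeline can be expressed as:

```python
from tokenizers.normalizers import Sequence, NFD, StripAccents, Lowercase

# The normalizers run left to right, in the given order
normalizer = Sequence([NFD(), StripAccents(), Lowercase()])
print(normalizer.normalize_str("Héllo"))  # expected "hello"
```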
|
||||
#[pymethods]
|
||||
@ -211,7 +256,9 @@ impl PySequence {
|
||||
}
|
||||
}
|
||||
|
||||
/// Lowercase Normalizer
|
||||
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Lowercase)]
|
||||
#[text_signature = "(self)"]
|
||||
pub struct PyLowercase {}
|
||||
#[pymethods]
|
||||
impl PyLowercase {
|
||||
@ -221,7 +268,9 @@ impl PyLowercase {
|
||||
}
|
||||
}
|
||||
|
||||
/// Strip normalizer
|
||||
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Strip)]
|
||||
#[text_signature = "(self, left=True, right=True)"]
|
||||
pub struct PyStrip {}
|
||||
#[pymethods]
|
||||
impl PyStrip {
|
||||
@ -245,6 +294,7 @@ impl PyStrip {
|
||||
}
|
||||
|
||||
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=StripAccents)]
|
||||
#[text_signature = "(self)"]
|
||||
pub struct PyStripAccents {}
|
||||
#[pymethods]
|
||||
impl PyStripAccents {
|
||||
@ -389,7 +439,9 @@ impl Normalizer for PyNormalizerWrapper {
|
||||
}
|
||||
}
|
||||
|
||||
/// Nmt normalizer
|
||||
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Nmt)]
|
||||
#[text_signature = "(self)"]
|
||||
pub struct PyNmt {}
|
||||
#[pymethods]
|
||||
impl PyNmt {
|
||||
@ -399,7 +451,10 @@ impl PyNmt {
|
||||
}
|
||||
}
|
||||
|
||||
/// Precompiled normalizer
|
||||
/// Don't use it manually; it is used for compatibility with SentencePiece.
|
||||
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Precompiled)]
|
||||
#[text_signature = "(self, precompiled_charsmap)"]
|
||||
pub struct PyPrecompiled {}
|
||||
#[pymethods]
|
||||
impl PyPrecompiled {
|
||||
@ -420,7 +475,9 @@ impl PyPrecompiled {
|
||||
}
|
||||
}
|
||||
|
||||
/// Replace normalizer
|
||||
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Replace)]
|
||||
#[text_signature = "(self, pattern, content)"]
|
||||
pub struct PyReplace {}
|
||||
#[pymethods]
|
||||
impl PyReplace {
|
||||
|
@ -22,6 +22,10 @@ use tokenizers as tk;
|
||||
use super::error::ToPyResult;
|
||||
use super::utils::*;
|
||||
|
||||
/// Base class for all pre-tokenizers
|
||||
///
|
||||
/// This class is not supposed to be instantiated directly. Instead, any implementation of a
|
||||
/// PreTokenizer will return an instance of this class when instantiated.
|
||||
#[pyclass(dict, module = "tokenizers.pre_tokenizers", name=PreTokenizer)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub struct PyPreTokenizer {
|
||||
@ -121,10 +125,14 @@ impl PyPreTokenizer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Pre tokenize the given PreTokenizedString in-place
|
||||
#[text_signature = "(self, pretok)"]
|
||||
fn pre_tokenize(&self, pretok: &mut PyPreTokenizedString) -> PyResult<()> {
|
||||
ToPyResult(self.pretok.pre_tokenize(&mut pretok.pretok)).into()
|
||||
}
|
||||
|
||||
/// Pre tokenize the given sequence
|
||||
#[text_signature = "(self, sequence)"]
|
||||
fn pre_tokenize_str(&self, s: &str) -> PyResult<Vec<(String, Offsets)>> {
|
||||
let mut pretokenized = tk::tokenizer::PreTokenizedString::from(s);
|
||||
|
||||
@ -138,7 +146,19 @@ impl PyPreTokenizer {
|
||||
}
|
||||
}
|
||||
|
||||
/// ByteLevel PreTokenizer
|
||||
///
|
||||
/// This pre-tokenizer takes care of replacing all bytes of the given string
|
||||
/// with a corresponding representation, as well as splitting into words.
|
||||
///
|
||||
/// Args:
|
||||
/// add_prefix_space: (`optional`) boolean:
|
||||
/// Whether to add a space to the first word if there isn't already one. This
|
||||
/// lets us treat `hello` exactly like `say hello`.
|
||||
/// Returns:
|
||||
/// PreTokenizer
|
||||
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=ByteLevel)]
|
||||
#[text_signature = "(self, add_prefix_space=True)"]
|
||||
pub struct PyByteLevel {}
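A small sketch of the byte-level behaviour (illustrative, not part of the diff); the exact visible characters come from the byte-to-unicode mapping:

```python
from tokenizers.pre_tokenizers import ByteLevel

pre_tokenizer = ByteLevel(add_prefix_space=True)
# Spaces are encoded as a visible character (e.g. 'Ġ') in the produced pieces
print(pre_tokenizer.pre_tokenize_str("Hello there"))
print(len(ByteLevel.alphabet()))  # expected 256
```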
|
||||
#[pymethods]
|
||||
impl PyByteLevel {
|
||||
@ -161,7 +181,13 @@ impl PyByteLevel {
|
||||
Ok((PyByteLevel {}, byte_level.into()))
|
||||
}
|
||||
|
||||
/// Returns the alphabet used by this PreTokenizer.
|
||||
///
|
||||
/// Since the ByteLevel works as its name suggests, at the byte level, it
|
||||
/// encodes any byte to one visible character. This means that there is a
|
||||
/// total of 256 different characters composing this alphabet.
|
||||
#[staticmethod]
|
||||
#[text_signature = "()"]
|
||||
fn alphabet() -> Vec<String> {
|
||||
ByteLevel::alphabet()
|
||||
.into_iter()
|
||||
@ -170,7 +196,9 @@ impl PyByteLevel {
|
||||
}
|
||||
}
|
||||
|
||||
/// This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
|
||||
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=Whitespace)]
|
||||
#[text_signature = "(self)"]
|
||||
pub struct PyWhitespace {}
|
||||
#[pymethods]
|
||||
impl PyWhitespace {
|
||||
@ -180,7 +208,9 @@ impl PyWhitespace {
|
||||
}
|
||||
}
|
||||
|
||||
/// This pre-tokenizer simply splits on the whitespace. Works like `.split()`
|
||||
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=WhitespaceSplit)]
|
||||
#[text_signature = "(self)"]
|
||||
pub struct PyWhitespaceSplit {}
|
||||
#[pymethods]
|
||||
impl PyWhitespaceSplit {
|
||||
@ -190,6 +220,11 @@ impl PyWhitespaceSplit {
|
||||
}
|
||||
}
|
||||
|
||||
/// This pre-tokenizer simply splits on the provided char. Works like `.split(delimiter)`
|
||||
///
|
||||
/// Args:
|
||||
/// delimiter: str:
|
||||
/// The delimiter char that will be used to split input
|
||||
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=CharDelimiterSplit)]
|
||||
pub struct PyCharDelimiterSplit {}
|
||||
#[pymethods]
|
||||
@ -210,7 +245,12 @@ impl PyCharDelimiterSplit {
|
||||
}
|
||||
}
|
||||
|
||||
/// BertPreTokenizer
|
||||
///
|
||||
/// This pre-tokenizer splits tokens on spaces, and also on punctuation.
|
||||
/// Each occurrence of a punctuation character will be treated separately.
|
||||
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=BertPreTokenizer)]
|
||||
#[text_signature = "(self)"]
|
||||
pub struct PyBertPreTokenizer {}
|
||||
#[pymethods]
|
||||
impl PyBertPreTokenizer {
|
||||
@ -220,7 +260,9 @@ impl PyBertPreTokenizer {
|
||||
}
|
||||
}
|
||||
|
||||
/// This pre-tokenizer simply splits on punctuation as individual characters.
|
||||
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=Punctuation)]
|
||||
#[text_signature = "(self)"]
|
||||
pub struct PyPunctuation {}
|
||||
#[pymethods]
|
||||
impl PyPunctuation {
|
||||
@ -230,7 +272,9 @@ impl PyPunctuation {
|
||||
}
|
||||
}
|
||||
|
||||
/// This pre-tokenizer composes other pre_tokenizers and applies them in sequence
|
||||
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=Sequence)]
|
||||
#[text_signature = "(self, pretokenizers)"]
|
||||
pub struct PySequence {}
|
||||
#[pymethods]
|
||||
impl PySequence {
|
||||
@ -257,7 +301,20 @@ impl PySequence {
|
||||
}
|
||||
}
|
||||
|
||||
/// Metaspace pre-tokenizer
|
||||
///
|
||||
/// This pre-tokenizer replaces any whitespace by the provided replacement character.
|
||||
/// It then tries to split on these spaces.
|
||||
/// Args:
|
||||
/// replacement: str:
|
||||
/// The replacement character. Must be exactly one character. By default we
|
||||
/// use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
|
||||
///
|
||||
/// add_prefix_space: boolean:
|
||||
/// Whether to add a space to the first word if there isn't already one. This
|
||||
/// lets us treat `hello` exactly like `say hello`.
|
||||
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=Metaspace)]
|
||||
#[text_signature = "(self, replacement=\"▁\", add_prefix_space=True)"]
|
||||
pub struct PyMetaspace {}
|
||||
#[pymethods]
|
||||
impl PyMetaspace {
|
||||
@ -290,7 +347,13 @@ impl PyMetaspace {
|
||||
}
|
||||
}
|
||||
|
||||
/// This pre-tokenizer simply splits digits into separate tokens
|
||||
/// Args:
|
||||
/// individual_digits: bool:
|
||||
/// If set to True, digits will each be separated "Call 123 please" -> "Call ", "1", "2", "3", " please"
|
||||
/// If set to False, digits will be grouped "Call 123 please" -> "Call ", "123", " please"
|
||||
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=Digits)]
|
||||
#[text_signature = "(self, individual_digits=False)"]
|
||||
pub struct PyDigits {}
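The two modes described above, as a quick sketch (not part of the diff):

```python
from tokenizers.pre_tokenizers import Digits

print(Digits(individual_digits=True).pre_tokenize_str("Call 123 please"))
# expected pieces: "Call ", "1", "2", "3", " please"
print(Digits(individual_digits=False).pre_tokenize_str("Call 123 please"))
# expected pieces: "Call ", "123", " please"
```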
|
||||
#[pymethods]
|
||||
impl PyDigits {
|
||||
@ -301,7 +364,12 @@ impl PyDigits {
|
||||
}
|
||||
}
|
||||
|
||||
/// This pre-tokenizer splits on characters that belong to different language families
|
||||
/// It roughly follows https://github.com/google/sentencepiece/blob/master/data/Scripts.txt
|
||||
/// Actually Hiragana and Katakana are fused with Han, and 0x30FC is Han too.
|
||||
/// This mimics the SentencePiece Unigram implementation.
|
||||
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=UnicodeScripts)]
|
||||
#[text_signature = "(self)"]
|
||||
pub struct PyUnicodeScripts {}
|
||||
#[pymethods]
|
||||
impl PyUnicodeScripts {
|
||||
|
@ -16,6 +16,10 @@ use tk::processors::PostProcessorWrapper;
|
||||
use tk::{Encoding, PostProcessor};
|
||||
use tokenizers as tk;
|
||||
|
||||
/// Base class for all post-processors
|
||||
///
|
||||
/// This class is not supposed to be instantiated directly. Instead, any implementation of
|
||||
/// a PostProcessor will return an instance of this class when instantiated.
|
||||
#[pyclass(dict, module = "tokenizers.processors", name=PostProcessor)]
|
||||
#[derive(Clone, Deserialize, Serialize)]
|
||||
pub struct PyPostProcessor {
|
||||
@ -88,11 +92,17 @@ impl PyPostProcessor {
|
||||
}
|
||||
}
|
||||
|
||||
/// Return the number of special tokens that would be added for single/pair sentences.
|
||||
/// :param is_pair: Boolean indicating if the input would be a single sentence or a pair
|
||||
/// :return:
|
||||
#[text_signature = "(self, is_pair)"]
|
||||
fn num_special_tokens_to_add(&self, is_pair: bool) -> usize {
|
||||
self.processor.added_tokens(is_pair)
|
||||
}
|
||||
|
||||
/// Post-process the given encodings, generating the final one
|
||||
#[args(pair = "None", add_special_tokens = "true")]
|
||||
#[text_signature = "(self, encoding, pair=None, add_special_tokens=True)"]
|
||||
fn process(
|
||||
&self,
|
||||
encoding: &PyEncoding,
|
||||
@ -109,7 +119,21 @@ impl PyPostProcessor {
|
||||
}
|
||||
}
|
||||
|
||||
/// This post-processor takes care of adding the special tokens needed by
|
||||
/// a Bert model:
|
||||
/// - a SEP token
|
||||
/// - a CLS token
|
||||
/// Args:
|
||||
/// sep: Tuple[str, int]:
|
||||
/// A tuple with the string representation of the SEP token, and its id
|
||||
///
|
||||
/// cls: Tuple[str, int]:
|
||||
/// A tuple with the string representation of the CLS token, and its id
|
||||
///
|
||||
/// Returns:
|
||||
/// PostProcessor
|
||||
#[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name=BertProcessing)]
|
||||
#[text_signature = "(self, sep, cls)"]
|
||||
pub struct PyBertProcessing {}
|
||||
#[pymethods]
|
||||
impl PyBertProcessing {
|
||||
@ -126,7 +150,33 @@ impl PyBertProcessing {
|
||||
}
|
||||
}
|
||||
|
||||
/// This post-processor takes care of adding the special tokens needed by
|
||||
/// a Roberta model:
|
||||
/// - a SEP token
|
||||
/// - a CLS token
|
||||
///
|
||||
/// It also takes care of trimming the offsets.
|
||||
/// By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
|
||||
/// want the offsets to include these whitespaces, then this PostProcessor should be initialized
|
||||
/// with `trim_offsets=True`
|
||||
/// Args:
|
||||
/// sep: Tuple[str, int]:
|
||||
/// A tuple with the string representation of the SEP token, and its id
|
||||
///
|
||||
/// cls: Tuple[str, int]:
|
||||
/// A tuple with the string representation of the CLS token, and its id
|
||||
///
|
||||
/// trim_offsets: bool:
|
||||
/// Whether to trim the whitespaces from the produced offsets.
|
||||
///
|
||||
/// add_prefix_space: bool:
|
||||
/// Whether the add_prefix_space option was enabled during pre-tokenization. This
|
||||
/// is relevant because it defines the way the offsets are trimmed out.
|
||||
///
|
||||
/// Returns:
|
||||
/// PostProcessor
|
||||
#[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name=RobertaProcessing)]
|
||||
#[text_signature = "(self, sep, cls, trim_offsets=True, add_prefix_space=True)"]
|
||||
pub struct PyRobertaProcessing {}
|
||||
#[pymethods]
|
||||
impl PyRobertaProcessing {
|
||||
@ -152,7 +202,15 @@ impl PyRobertaProcessing {
|
||||
}
|
||||
}
|
||||
|
||||
/// This post-processor takes care of trimming the offsets.
|
||||
/// By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
|
||||
/// want the offsets to include these whitespaces, then this PostProcessor must be used.
|
||||
///
|
||||
/// Args:
|
||||
/// trim_offsets: bool:
|
||||
/// Whether to trim the whitespaces from the produced offsets.
|
||||
#[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name=ByteLevel)]
|
||||
#[text_signature = "(self, trim_offsets=True)"]
|
||||
pub struct PyByteLevel {}
|
||||
#[pymethods]
|
||||
impl PyByteLevel {
|
||||
@ -244,7 +302,68 @@ impl FromPyObject<'_> for PyTemplate {
|
||||
}
|
||||
}
|
||||
|
||||
/// Provides a way to specify templates in order to add the special tokens to each
|
||||
/// input sequence as relevant.
|
||||
///
|
||||
/// Let's take `BERT` tokenizer as an example. It uses two special tokens, used to
|
||||
/// delimit each sequence. `[CLS]` is always used at the beginning of the first
|
||||
/// sequence, and `[SEP]` is added at the end of both the first, and the pair
|
||||
/// sequences. The final result looks like this:
|
||||
/// - Single sequence: `[CLS] Hello there [SEP]`
|
||||
/// - Pair sequences: `[CLS] My name is Anthony [SEP] What is my name? [SEP]`
|
||||
/// With the type ids as following:
|
||||
/// ```markdown
|
||||
/// [CLS] ... [SEP] ... [SEP]
|
||||
/// 0 0 0 1 1
|
||||
/// ```
|
||||
///
|
||||
/// You can achieve such behavior using a TemplateProcessing:
|
||||
/// ```
|
||||
/// TemplateProcessing(
|
||||
/// single="[CLS] $0 [SEP]",
|
||||
/// pair="[CLS] $A [SEP] $B:1 [SEP]:1",
|
||||
/// special_tokens=[("[CLS]", 1), ("[SEP]", 0)],
|
||||
/// )
|
||||
/// ```
|
||||
///
|
||||
/// In this example, each input sequence is identified using a `$` construct. This identifier
|
||||
/// lets us specify each input sequence, and the type_id to use. When nothing is specified,
|
||||
/// it uses the default values. Here are the different ways to specify it:
|
||||
/// - Specifying the sequence, with default `type_id == 0`: `$A` or `$B`
|
||||
/// - Specifying the `type_id` with default `sequence == A`: `$0`, `$1`, `$2`, ...
|
||||
/// - Specifying both: `$A:0`, `$B:1`, ...
|
||||
///
|
||||
/// The same construct is used for special tokens: `<identifier>(:<type_id>)?`.
|
||||
///
|
||||
/// **Warning**: You must ensure that you are giving the correct tokens/ids as these
|
||||
/// will be added to the Encoding without any further check. If the given ids correspond
|
||||
/// to something totally different in a `Tokenizer` using this `PostProcessor`, it
|
||||
/// might lead to unexpected results.
|
||||
///
|
||||
/// Args:
|
||||
/// single: Template
|
||||
/// The template used for single sequences
|
||||
///
|
||||
/// pair: Template:
|
||||
/// The template used when both sequences are specified
|
||||
///
|
||||
/// special_tokens: Tokens:
|
||||
/// The list of special tokens used in each sequence
|
||||
///
|
||||
/// Template: Union[str, List[str]]:
|
||||
/// - If a `str` is provided, the whitespace is used as delimiter between tokens
|
||||
/// - If a `List[str]` is provided, a list of tokens
|
||||
///
|
||||
/// Tokens: List[Union[Tuple[int, str], Tuple[str, int], dict]]:
|
||||
/// - A Tuple with both a token and its associated ID, in any order
|
||||
/// - A dict with the following keys:
|
||||
/// - "id": str => The special token id, as specified in the Template
|
||||
/// - "ids": List[int] => The associated IDs
|
||||
/// - "tokens": List[str] => The associated tokens
|
||||
/// The given dict expects the provided `ids` and `tokens` lists to have
|
||||
/// the same length.
|
||||
#[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name=TemplateProcessing)]
|
||||
#[text_signature = "(self, single, pair, special_tokens)"]
|
||||
pub struct PyTemplateProcessing {}
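To make the template syntax above concrete, here is a minimal sketch (not part of the diff) wiring a TemplateProcessing into a tokenizer; the vocab and ids are made up, but the special token ids must match the model's vocabulary:

```python
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing

vocab = {"[UNK]": 0, "[CLS]": 1, "[SEP]": 2, "hello": 3, "there": 4}
tokenizer = Tokenizer(WordLevel(vocab, unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[("[CLS]", 1), ("[SEP]", 2)],
)

enc = tokenizer.encode("hello there")
print(enc.tokens)    # expected ['[CLS]', 'hello', 'there', '[SEP]']
print(enc.type_ids)  # expected [0, 0, 0, 0]
```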
|
||||
#[pymethods]
|
||||
impl PyTemplateProcessing {
|
||||
|
@ -53,7 +53,7 @@ use crate::processors::PyPostProcessor;
|
||||
/// Yesterday"``.
|
||||
///
|
||||
#[pyclass(dict, module = "tokenizers", name=AddedToken)]
|
||||
#[text_signature = "(content, single_word=False, lstrip=False, rstrip=False, normalized=True)"]
|
||||
#[text_signature = "(self, content, single_word=False, lstrip=False, rstrip=False, normalized=True)"]
|
||||
pub struct PyAddedToken {
|
||||
pub content: String,
|
||||
pub is_special_token: bool,
|
||||
@ -408,7 +408,7 @@ type Tokenizer = TokenizerImpl<PyModel, PyNormalizer, PyPreTokenizer, PyPostProc
|
||||
/// The core algorithm that this :obj:`Tokenizer` should be using.
|
||||
///
|
||||
#[pyclass(dict, module = "tokenizers", name=Tokenizer)]
|
||||
#[text_signature = "(model)"]
|
||||
#[text_signature = "(self, model)"]
|
||||
#[derive(Clone)]
|
||||
pub struct PyTokenizer {
|
||||
tokenizer: Tokenizer,
|
||||
@ -523,7 +523,7 @@ impl PyTokenizer {
|
||||
/// Returns:
|
||||
/// :obj:`str`: A string representing the serialized Tokenizer
|
||||
#[args(pretty = false)]
|
||||
#[text_signature = "($self, pretty=False)"]
|
||||
#[text_signature = "(self, pretty=False)"]
|
||||
fn to_str(&self, pretty: bool) -> PyResult<String> {
|
||||
ToPyResult(self.tokenizer.to_string(pretty)).into()
|
||||
}
|
||||
@ -537,11 +537,15 @@ impl PyTokenizer {
|
||||
/// pretty (:obj:`bool`, defaults to :obj:`False`):
|
||||
/// Whether the JSON file should be pretty formatted.
|
||||
#[args(pretty = false)]
|
||||
#[text_signature = "($self, pretty=False)"]
|
||||
#[text_signature = "(self, pretty=False)"]
|
||||
fn save(&self, path: &str, pretty: bool) -> PyResult<()> {
|
||||
ToPyResult(self.tokenizer.save(path, pretty)).into()
|
||||
}
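A quick sketch of the serialization round trip (illustrative, not part of the diff; `Tokenizer.from_file` is assumed to be available in this version):

```python
from tokenizers import Tokenizer
from tokenizers.models import BPE

tokenizer = Tokenizer(BPE())                       # empty model, just for the example
tokenizer.save("tokenizer.json", pretty=True)      # writes a single JSON file
restored = Tokenizer.from_file("tokenizer.json")   # assumed static constructor
print(restored.to_str()[:60])                      # serialized form as a string
```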
|
||||
|
||||
/// Return the number of special tokens that would be added for single/pair sentences.
|
||||
/// :param is_pair: Boolean indicating if the input would be a single sentence or a pair
|
||||
/// :return:
|
||||
#[text_signature = "(self, is_pair)"]
|
||||
fn num_special_tokens_to_add(&self, is_pair: bool) -> PyResult<usize> {
|
||||
Ok(self
|
||||
.tokenizer
|
||||
@ -558,7 +562,7 @@ impl PyTokenizer {
|
||||
/// Returns:
|
||||
/// :obj:`Dict[str, int]`: The vocabulary
|
||||
#[args(with_added_tokens = true)]
|
||||
#[text_signature = "($self, with_added_tokens=True)"]
|
||||
#[text_signature = "(self, with_added_tokens=True)"]
|
||||
fn get_vocab(&self, with_added_tokens: bool) -> PyResult<HashMap<String, u32>> {
|
||||
Ok(self.tokenizer.get_vocab(with_added_tokens))
|
||||
}
|
||||
@ -572,7 +576,7 @@ impl PyTokenizer {
|
||||
/// Returns:
|
||||
/// :obj:`int`: The size of the vocabulary
|
||||
#[args(with_added_tokens = true)]
|
||||
#[text_signature = "($self, with_added_tokens=True)"]
|
||||
#[text_signature = "(self, with_added_tokens=True)"]
|
||||
fn get_vocab_size(&self, with_added_tokens: bool) -> PyResult<usize> {
|
||||
Ok(self.tokenizer.get_vocab_size(with_added_tokens))
|
||||
}
|
||||
@ -591,7 +595,7 @@ impl PyTokenizer {
|
||||
/// The strategy used to truncation. Can be one of ``longest_first``, ``only_first`` or
|
||||
/// ``only_second``.
|
||||
#[args(kwargs = "**")]
|
||||
#[text_signature = "($self, max_length, stride=0, strategy='longest_first')"]
|
||||
#[text_signature = "(self, max_length, stride=0, strategy='longest_first')"]
|
||||
fn enable_truncation(&mut self, max_length: usize, kwargs: Option<&PyDict>) -> PyResult<()> {
|
||||
let mut params = TruncationParams::default();
|
||||
params.max_length = max_length;
|
||||
@ -626,7 +630,7 @@ impl PyTokenizer {
|
||||
}
|
||||
|
||||
/// Disable truncation
|
||||
#[text_signature = "($self)"]
|
||||
#[text_signature = "(self)"]
|
||||
fn no_truncation(&mut self) {
|
||||
self.tokenizer.with_truncation(None);
|
||||
}
|
||||
@ -675,7 +679,7 @@ impl PyTokenizer {
|
||||
/// If specified, the length at which to pad. If not specified we pad using the size of
|
||||
/// the longest sequence in a batch.
|
||||
#[args(kwargs = "**")]
|
||||
#[text_signature = "($self, direction='right', pad_id=0, pad_type_id=0, pad_token='[PAD]', length=None, pad_to_multiple_of=None)"]
|
||||
#[text_signature = "(self, direction='right', pad_id=0, pad_type_id=0, pad_token='[PAD]', length=None, pad_to_multiple_of=None)"]
|
||||
fn enable_padding(&mut self, kwargs: Option<&PyDict>) -> PyResult<()> {
|
||||
let mut params = PaddingParams::default();
|
||||
|
||||
@ -733,7 +737,7 @@ impl PyTokenizer {
|
||||
}
|
||||
|
||||
/// Disable padding
|
||||
#[text_signature = "($self)"]
|
||||
#[text_signature = "(self)"]
|
||||
fn no_padding(&mut self) {
|
||||
self.tokenizer.with_padding(None);
|
||||
}
|
||||
@ -802,7 +806,7 @@ impl PyTokenizer {
|
||||
/// :class:`~tokenizers.Encoding`: The encoded result
|
||||
///
|
||||
#[args(pair = "None", is_pretokenized = "false", add_special_tokens = "true")]
|
||||
#[text_signature = "($self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True)"]
|
||||
#[text_signature = "(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True)"]
|
||||
fn encode(
|
||||
&self,
|
||||
sequence: &PyAny,
|
||||
@ -867,7 +871,7 @@ impl PyTokenizer {
|
||||
/// A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
|
||||
///
|
||||
#[args(is_pretokenized = "false", add_special_tokens = "true")]
|
||||
#[text_signature = "($self, input, is_pretokenized=False, add_special_tokens=True)"]
|
||||
#[text_signature = "(self, input, is_pretokenized=False, add_special_tokens=True)"]
|
||||
fn encode_batch(
|
||||
&self,
|
||||
input: Vec<&PyAny>,
|
||||
@ -910,7 +914,7 @@ impl PyTokenizer {
|
||||
/// Returns:
|
||||
/// :obj:`str`: The decoded string
|
||||
#[args(skip_special_tokens = true)]
|
||||
#[text_signature = "($self, ids, skip_special_tokens=True)"]
|
||||
#[text_signature = "(self, ids, skip_special_tokens=True)"]
|
||||
fn decode(&self, ids: Vec<u32>, skip_special_tokens: bool) -> PyResult<String> {
|
||||
ToPyResult(self.tokenizer.decode(ids, skip_special_tokens)).into()
|
||||
}
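A small round-trip sketch for `decode`, `token_to_id` and `id_to_token` (illustrative, not part of the diff; the vocab is made up):

```python
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace

tokenizer = Tokenizer(WordLevel({"[UNK]": 0, "my": 1, "name": 2}, unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

ids = tokenizer.encode("my name").ids   # expected [1, 2]
print(tokenizer.decode(ids))            # expected "my name"
print(tokenizer.token_to_id("name"))    # expected 2
print(tokenizer.id_to_token(1))         # expected "my"
```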
|
||||
@ -927,7 +931,7 @@ impl PyTokenizer {
|
||||
/// Returns:
|
||||
/// :obj:`List[str]`: A list of decoded strings
|
||||
#[args(skip_special_tokens = true)]
|
||||
#[text_signature = "($self, sequences, skip_special_tokens=True)"]
|
||||
#[text_signature = "(self, sequences, skip_special_tokens=True)"]
|
||||
fn decode_batch(
|
||||
&self,
|
||||
sequences: Vec<Vec<u32>>,
|
||||
@ -947,7 +951,7 @@ impl PyTokenizer {
|
||||
///
|
||||
/// Returns:
|
||||
/// :obj:`Optional[int]`: An optional id, :obj:`None` if out of vocabulary
|
||||
#[text_signature = "($self, token)"]
|
||||
#[text_signature = "(self, token)"]
|
||||
fn token_to_id(&self, token: &str) -> Option<u32> {
|
||||
self.tokenizer.token_to_id(token)
|
||||
}
|
||||
@ -960,7 +964,7 @@ impl PyTokenizer {
|
||||
///
|
||||
/// Returns:
|
||||
/// :obj:`Optional[str]`: An optional token, :obj:`None` if out of vocabulary
|
||||
#[text_signature = "($self, id)"]
|
||||
#[text_signature = "(self, id)"]
|
||||
fn id_to_token(&self, id: u32) -> Option<&str> {
|
||||
self.tokenizer.id_to_token(id)
|
||||
}
|
||||
@ -977,7 +981,7 @@ impl PyTokenizer {
|
||||
///
|
||||
/// Returns:
|
||||
/// :obj:`int`: The number of tokens that were created in the vocabulary
|
||||
#[text_signature = "($self, tokens)"]
|
||||
#[text_signature = "(self, tokens)"]
|
||||
fn add_tokens(&mut self, tokens: &PyList) -> PyResult<usize> {
|
||||
let tokens = tokens
|
||||
.into_iter()
|
||||
@ -1014,7 +1018,7 @@ impl PyTokenizer {
|
||||
///
|
||||
/// Returns:
|
||||
/// :obj:`int`: The number of tokens that were created in the vocabulary
|
||||
#[text_signature = "($self, tokens)"]
|
||||
#[text_signature = "(self, tokens)"]
|
||||
fn add_special_tokens(&mut self, tokens: &PyList) -> PyResult<usize> {
|
||||
let tokens = tokens
|
||||
.into_iter()
|
||||
@ -1064,7 +1068,7 @@ impl PyTokenizer {
|
||||
/// Returns:
|
||||
/// :class:`~tokenizers.Encoding`: The final post-processed encoding
|
||||
#[args(pair = "None", add_special_tokens = true)]
|
||||
#[text_signature = "($self, encoding, pair=None, add_special_tokens=True)"]
|
||||
#[text_signature = "(self, encoding, pair=None, add_special_tokens=True)"]
|
||||
fn post_process(
|
||||
&self,
|
||||
encoding: &PyEncoding,
|
||||
|
@ -11,7 +11,43 @@ use tokenizers as tk;
|
||||
use crate::models::PyModel;
|
||||
use crate::tokenizer::PyAddedToken;
|
||||
|
||||
/// Base class for all trainers
|
||||
///
|
||||
/// This class is not supposed to be instantiated directly. Instead, any implementation of a
|
||||
/// Trainer will return an instance of this class when instantiated.
|
||||
///
|
||||
/// Args:
|
||||
/// vocab_size: unsigned int:
|
||||
/// The size of the final vocabulary, including all tokens and alphabet.
|
||||
///
|
||||
/// min_frequency: unsigned int:
|
||||
/// The minimum frequency a pair should have in order to be merged.
|
||||
///
|
||||
/// show_progress: boolean:
|
||||
/// Whether to show progress bars while training.
|
||||
///
|
||||
/// special_tokens: List[Union[str, AddedToken]]:
|
||||
/// A list of special tokens the model should know of.
|
||||
///
|
||||
/// limit_alphabet: unsigned int:
|
||||
/// The maximum different characters to keep in the alphabet.
|
||||
///
|
||||
/// initial_alphabet: List[str]:
|
||||
/// A list of characters to include in the initial alphabet, even
|
||||
/// if not seen in the training dataset.
|
||||
/// If the strings contain more than one character, only the first one
|
||||
/// is kept.
|
||||
///
|
||||
/// continuing_subword_prefix: Optional[str]:
|
||||
/// A prefix to be used for every subword that is not a beginning-of-word.
|
||||
///
|
||||
/// end_of_word_suffix: Optional[str]:
|
||||
/// A suffix to be used for every subword that is an end-of-word.
|
||||
///
|
||||
/// Returns:
|
||||
/// Trainer
|
||||
#[pyclass(name=Trainer)]
|
||||
#[text_signature = "(self, vocab_size=30000, min_frequency=0,show_progress=True, special_tokens=[],limit_alphabet=None, initial_alphabet = [], continuing_subword_prefix=None, end_of_word_suffix=None)"]
|
||||
pub struct PyTrainer {
|
||||
pub trainer: TrainerWrapper,
|
||||
}
|
||||
@ -41,6 +77,7 @@ impl Trainer for PyTrainer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Capable of training a BPE model
|
||||
#[pyclass(extends=PyTrainer, name=BpeTrainer)]
|
||||
pub struct PyBpeTrainer {}
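The trainer arguments documented on the base class above apply here as well; a construction sketch with illustrative values (not part of the diff):

```python
from tokenizers.trainers import BpeTrainer

trainer = BpeTrainer(
    vocab_size=30000,
    min_frequency=2,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    limit_alphabet=1000,
)
# The trainer is then handed to `Tokenizer.train` together with the training files;
# the exact argument order of `train` has varied between releases, so check your version.
```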
|
||||
#[pymethods]
|
||||
@ -105,7 +142,39 @@ impl PyBpeTrainer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Capable of training a WordPiece model
|
||||
/// Args:
|
||||
/// vocab_size: unsigned int:
|
||||
/// The size of the final vocabulary, including all tokens and alphabet.
|
||||
///
|
||||
/// min_frequency: unsigned int:
|
||||
/// The minimum frequency a pair should have in order to be merged.
|
||||
///
|
||||
/// show_progress: boolean:
|
||||
/// Whether to show progress bars while training.
|
||||
///
|
||||
/// special_tokens: List[Union[str, AddedToken]]:
|
||||
/// A list of special tokens the model should know of.
|
||||
///
|
||||
/// limit_alphabet: unsigned int:
|
||||
/// The maximum different characters to keep in the alphabet.
|
||||
///
|
||||
/// initial_alphabet: List[str]:
|
||||
/// A list of characters to include in the initial alphabet, even
|
||||
/// if not seen in the training dataset.
|
||||
/// If the strings contain more than one character, only the first one
|
||||
/// is kept.
|
||||
///
|
||||
/// continuing_subword_prefix: Optional[str]:
|
||||
/// A prefix to be used for every subword that is not a beginning-of-word.
|
||||
///
|
||||
/// end_of_word_suffix: Optional[str]:
|
||||
/// A suffix to be used for every subword that is an end-of-word.
|
||||
///
|
||||
/// Returns:
|
||||
/// Trainer
|
||||
#[pyclass(extends=PyTrainer, name=WordPieceTrainer)]
|
||||
#[text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet= [],continuing_subword_prefix=\"##\", end_of_word_suffix=None)"]
|
||||
pub struct PyWordPieceTrainer {}
|
||||
#[pymethods]
|
||||
impl PyWordPieceTrainer {
|
||||
@ -173,7 +242,28 @@ impl PyWordPieceTrainer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Capable of training a Unigram model
|
||||
///
|
||||
/// Args:
|
||||
/// vocab_size: unsigned int:
|
||||
/// The size of the final vocabulary, including all tokens and alphabet.
|
||||
///
|
||||
/// show_progress: boolean:
|
||||
/// Whether to show progress bars while training.
|
||||
///
|
||||
/// special_tokens: List[Union[str, AddedToken]]:
|
||||
/// A list of special tokens the model should know of.
|
||||
///
|
||||
/// initial_alphabet: List[str]:
|
||||
/// A list of characters to include in the initial alphabet, even
|
||||
/// if not seen in the training dataset.
|
||||
/// If the strings contain more than one character, only the first one
|
||||
/// is kept.
|
||||
///
|
||||
/// Returns:
|
||||
/// Trainer
|
||||
#[pyclass(extends=PyTrainer, name=UnigramTrainer)]
|
||||
#[text_signature = "(self, vocab_size=8000, show_progress=True, special_tokens= [])"]
|
||||
pub struct PyUnigramTrainer {}
|
||||
#[pymethods]
|
||||
impl PyUnigramTrainer {
|
||||
|
@ -173,6 +173,15 @@ fn slice(
|
||||
.flatten())
|
||||
}
|
||||
|
||||
/// NormalizedString
|
||||
///
|
||||
/// A NormalizedString takes care of modifying an "original" string, to obtain a "normalized" one.
|
||||
/// While making all the requested modifications, it keeps track of the alignment information
|
||||
/// between the two versions of the string.
|
||||
///
|
||||
/// Args:
|
||||
/// sequence: str:
|
||||
/// The string sequence used to initialize this NormalizedString
|
||||
#[pyclass(module = "tokenizers", name=NormalizedString)]
|
||||
#[derive(Clone)]
|
||||
pub struct PyNormalizedString {
|
||||
@ -186,6 +195,7 @@ impl PyNormalizedString {
|
||||
NormalizedString::from(s).into()
|
||||
}
|
||||
|
||||
/// The normalized part of the string
|
||||
#[getter]
|
||||
fn get_normalized(&self) -> &str {
|
||||
self.normalized.get()
|
||||
@ -196,70 +206,119 @@ impl PyNormalizedString {
|
||||
self.normalized.get_original()
|
||||
}
|
||||
|
||||
/// Runs the NFD normalization
|
||||
#[text_signature = "(self)"]
|
||||
fn nfd(&mut self) {
|
||||
self.normalized.nfd();
|
||||
}
|
||||
|
||||
/// Runs the NFKD normalization
|
||||
#[text_signature = "(self)"]
|
||||
fn nfkd(&mut self) {
|
||||
self.normalized.nfkd();
|
||||
}
|
||||
|
||||
/// Runs the NFC normalization
|
||||
#[text_signature = "(self)"]
|
||||
fn nfc(&mut self) {
|
||||
self.normalized.nfc();
|
||||
}
|
||||
|
||||
/// Runs the NFKC normalization
|
||||
#[text_signature = "(self)"]
|
||||
fn nfkc(&mut self) {
|
||||
self.normalized.nfkc();
|
||||
}
|
||||
|
||||
/// Lowercase the string
|
||||
#[text_signature = "(self)"]
|
||||
fn lowercase(&mut self) {
|
||||
self.normalized.lowercase();
|
||||
}
|
||||
|
||||
/// Uppercase the string
|
||||
#[text_signature = "(self)"]
|
||||
fn uppercase(&mut self) {
|
||||
self.normalized.uppercase();
|
||||
}
|
||||
|
||||
/// Prepend the given sequence to the string
|
||||
#[text_signature = "(self, s)"]
|
||||
fn prepend(&mut self, s: &str) {
|
||||
self.normalized.prepend(s);
|
||||
}
|
||||
|
||||
/// Append the given sequence to the string
|
||||
#[text_signature = "(self, s)"]
|
||||
fn append(&mut self, s: &str) {
|
||||
self.normalized.append(s);
|
||||
}
|
||||
|
||||
/// Strip the left of the string
|
||||
#[text_signature = "(self)"]
|
||||
fn lstrip(&mut self) {
|
||||
self.normalized.lstrip();
|
||||
}
|
||||
|
||||
/// Strip the right of the string
|
||||
#[text_signature = "(self)"]
|
||||
fn rstrip(&mut self) {
|
||||
self.normalized.rstrip();
|
||||
}
|
||||
|
||||
/// Strip both ends of the string
|
||||
#[text_signature = "(self)"]
|
||||
fn strip(&mut self) {
|
||||
self.normalized.strip();
|
||||
}
|
||||
|
||||
/// Clears the string
|
||||
#[text_signature = "(self)"]
|
||||
fn clear(&mut self) {
|
||||
self.normalized.clear();
|
||||
}
|
||||
|
||||
/// Slice the string using the given range
|
||||
#[text_signature = "(self, range)"]
|
||||
fn slice(&self, range: PyRange) -> PyResult<Option<PyNormalizedString>> {
|
||||
slice(&self.normalized, &range)
|
||||
}
|
||||
|
||||
/// Filter each character of the string using the given func
|
||||
#[text_signature = "(self, func)"]
|
||||
fn filter(&mut self, func: &PyAny) -> PyResult<()> {
|
||||
filter(&mut self.normalized, func)
|
||||
}
|
||||
|
||||
/// Calls the given function for each character of the string
|
||||
#[text_signature = "(self, func)"]
|
||||
fn for_each(&self, func: &PyAny) -> PyResult<()> {
|
||||
for_each(&self.normalized, func)
|
||||
}
|
||||
|
||||
/// Calls the given function for each character of the string
|
||||
///
|
||||
/// Replaces each character of the string using the returned value. Each
|
||||
/// returned value **must** be a str of length 1 (ie a character).
|
||||
#[text_signature = "(self, func)"]
|
||||
fn map(&mut self, func: &PyAny) -> PyResult<()> {
|
||||
map(&mut self.normalized, func)
|
||||
}
|
||||
|
||||
/// Split the NormalizedString using the given pattern and the specified behavior
|
||||
///
|
||||
/// Args:
|
||||
/// pattern: Pattern:
|
||||
/// A pattern used to split the string. Usually a string or a Regex
|
||||
///
|
||||
/// behavior: SplitDelimiterBehavior:
|
||||
/// The behavior to use when splitting.
|
||||
/// Choices: "removed", "isolated", "merged_with_previous", "merged_with_next",
|
||||
/// "contiguous"
|
||||
///
|
||||
/// Returns:
|
||||
/// A list of NormalizedString, representing each split
|
||||
#[text_signature = "(self, pattern, behavior)"]
|
||||
fn split(
|
||||
&mut self,
|
||||
pattern: PyPattern,
|
||||
@ -272,6 +331,15 @@ impl PyNormalizedString {
|
||||
.collect())
|
||||
}
|
||||
|
||||
/// Replace the content of the given pattern with the provided content
|
||||
///
|
||||
/// Args:
|
||||
/// pattern: Pattern:
|
||||
/// A pattern used to match the string. Usually a string or a Regex
|
||||
///
|
||||
/// content: str:
|
||||
/// The content to be used as replacement
|
||||
#[text_signature = "(self, pattern, content)"]
|
||||
fn replace(&mut self, pattern: PyPattern, content: &str) -> PyResult<()> {
|
||||
ToPyResult(self.normalized.replace(pattern, content)).into()
|
||||
}
|
||||
|
@ -65,6 +65,7 @@ fn tokenize(pretok: &mut PreTokenizedString, func: &PyAny) -> PyResult<()> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Wrapper exposing the `OffsetReferential` enum to Python
|
||||
#[derive(Clone)]
|
||||
pub struct PyOffsetReferential(OffsetReferential);
|
||||
impl FromPyObject<'_> for PyOffsetReferential {
|
||||
@ -131,7 +132,23 @@ fn to_encoding(
|
||||
.into())
|
||||
}
|
||||
|
||||
/// PreTokenizedString
|
||||
///
|
||||
/// Wrapper over a string, that provides a way to normalize, pre-tokenize, tokenize the
|
||||
/// underlying string, while keeping track of the alignment information (offsets).
|
||||
///
|
||||
/// The PreTokenizedString manages what we call `splits`. Each split represents a substring
|
||||
/// which is a subpart of the original string, with the relevant offsets and tokens.
|
||||
///
|
||||
/// When calling one of the methods used to modify the PreTokenizedString (namely one of
|
||||
/// `split`, `normalize` or `tokenize`), only the `splits` that don't have any associated
|
||||
/// tokens will get modified.
|
||||
///
|
||||
/// Args:
|
||||
/// sequence: str:
|
||||
/// The string sequence used to initialize this PreTokenizedString
|
||||
#[pyclass(module = "tokenizers", name=PreTokenizedString)]
|
||||
#[text_signature = "(self, sequence)"]
|
||||
pub struct PyPreTokenizedString {
|
||||
pub(crate) pretok: tk::PreTokenizedString,
|
||||
}
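A minimal sketch of driving a PreTokenizedString by hand (illustrative, not part of the diff; `PreTokenizedString` is assumed to be exported at the package root):

```python
from tokenizers import PreTokenizedString

def split_on_space(i, normalized):
    # Each returned piece must come from `.split` or `.slice` on the received NormalizedString
    return normalized.split(" ", "removed")

pretok = PreTokenizedString("Hello there friend")
pretok.split(split_on_space)
for piece in pretok.get_splits():
    print(piece)  # each remaining split, with its offsets
```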
|
||||
@ -155,27 +172,84 @@ impl PyPreTokenizedString {
|
||||
PreTokenizedString::from(s).into()
|
||||
}
|
||||
|
||||
/// Split the PreTokenizedString using the given `func`
|
||||
///
|
||||
/// Args:
|
||||
/// func: Callable[[index, NormalizedString], List[NormalizedString]]:
|
||||
/// The function used to split each underlying split.
|
||||
/// It is expected to return a list of `NormalizedString`, that represent the new
|
||||
/// splits. If the given `NormalizedString` does not need any splitting, we can
|
||||
/// just return it directly.
|
||||
/// In order for the offsets to be tracked accurately, any returned `NormalizedString`
|
||||
/// should come from calling either `.split` or `.slice` on the received one.
|
||||
#[text_signature = "(self, func)"]
|
||||
fn split(&mut self, func: &PyAny) -> PyResult<()> {
|
||||
split(&mut self.pretok, func)
|
||||
}
|
||||
|
||||
/// Normalize each split of the `PreTokenizedString` using the given `func`
|
||||
///
|
||||
/// Args:
|
||||
/// func: Callable[[NormalizedString], None]:
|
||||
/// The function used to normalize each underlying split. This function
|
||||
/// does not need to return anything, just calling the methods on the provided
|
||||
/// NormalizedString allows its modification.
|
||||
#[text_signature = "(self, func)"]
|
||||
fn normalize(&mut self, func: &PyAny) -> PyResult<()> {
|
||||
normalize(&mut self.pretok, func)
|
||||
}
|
||||
|
||||
/// Tokenize each split of the `PreTokenizedString` using the given `func`
|
||||
///
|
||||
/// Args:
|
||||
/// func: Callable[[str], List[Token]]:
|
||||
/// The function used to tokenize each underlying split. This function must return
|
||||
/// a list of Token generated from the input str.
|
||||
#[text_signature = "(self, func)"]
|
||||
fn tokenize(&mut self, func: &PyAny) -> PyResult<()> {
|
||||
tokenize(&mut self.pretok, func)
|
||||
}
|
||||
|
||||
/// Return an Encoding generated from this PreTokenizedString
|
||||
///
|
||||
/// Args:
|
||||
/// type_id: int = 0:
|
||||
/// The type_id to be used on the generated Encoding.
|
||||
///
|
||||
/// word_idx: Optional[int] = None:
|
||||
/// An optional word index to be used for each token of this Encoding. If provided,
|
||||
/// all the word indices in the generated Encoding will use this value, instead
|
||||
/// of the one automatically tracked during pre-tokenization.
|
||||
///
|
||||
/// Returns:
|
||||
/// An Encoding
|
||||
#[args(type_id = "0", word_idx = "None")]
|
||||
#[text_signature = "(self, type_id=0, word_idx=None)"]
|
||||
fn to_encoding(&self, type_id: u32, word_idx: Option<u32>) -> PyResult<PyEncoding> {
|
||||
to_encoding(&self.pretok, type_id, word_idx)
|
||||
}
|
||||
|
||||
/// Get the splits currently managed by the PreTokenizedString
|
||||
///
|
||||
/// Args:
|
||||
/// offset_referential: :obj:`str`
|
||||
/// Whether the returned splits should have offsets expressed relative
|
||||
/// to the original string, or the normalized one. choices: "original", "normalized".
|
||||
///
|
||||
/// offset_type: :obj:`str`
|
||||
/// Whether the returned splits should have offsets expressed in bytes or chars.
|
||||
/// When slicing an str, we usually want to use chars, which is the default value.
|
||||
/// Now in some cases it might be interesting to get these offsets expressed in bytes,
|
||||
/// so it is possible to change this here.
|
||||
/// choices: "char", "bytes"
|
||||
///
|
||||
/// Returns:
|
||||
/// A list of splits
|
||||
#[args(
|
||||
offset_referential = "PyOffsetReferential(OffsetReferential::Original)",
|
||||
offset_type = "PyOffsetType(OffsetType::Char)"
|
||||
)]
|
||||
#[text_signature = "(self, offset_referential=\"original\", offset_type=\"char\")"]
|
||||
fn get_splits(
|
||||
&self,
|
||||
offset_referential: PyOffsetReferential,
|
||||
|
@ -2,7 +2,9 @@ use onig::Regex;
|
||||
use pyo3::exceptions;
|
||||
use pyo3::prelude::*;
|
||||
|
||||
/// Instantiate a new Regex with the given pattern
|
||||
#[pyclass(module = "tokenizers", name=Regex)]
|
||||
#[text_signature = "(self, pattern)"]
|
||||
pub struct PyRegex {
|
||||
pub inner: Regex,
|
||||
pub pattern: String,
|
||||
|
192
bindings/python/stub.py
Normal file
@ -0,0 +1,192 @@
|
||||
import inspect
|
||||
import os
|
||||
import argparse
|
||||
import black
|
||||
from pathlib import Path
|
||||
|
||||
INDENT = " " * 4
|
||||
GENERATED_COMMENT = "# Generated content DO NOT EDIT\n"
|
||||
|
||||
|
||||
def do_indent(text: str, indent: str):
|
||||
return text.replace("\n", f"\n{indent}")
|
||||
|
||||
|
||||
def function(obj, indent, text_signature=None):
|
||||
if text_signature is None:
|
||||
text_signature = obj.__text_signature__
|
||||
string = ""
|
||||
string += f"{indent}def {obj.__name__}{text_signature}:\n"
|
||||
indent += INDENT
|
||||
string += f'{indent}"""\n'
|
||||
string += f"{indent}{do_indent(obj.__doc__, indent)}\n"
|
||||
string += f'{indent}"""\n'
|
||||
string += f"{indent}pass\n"
|
||||
string += "\n"
|
||||
string += "\n"
|
||||
return string
|
||||
|
||||
|
||||
def member_sort(member):
|
||||
if inspect.isclass(member):
|
||||
value = 10 + len(inspect.getmro(member))
|
||||
else:
|
||||
value = 1
|
||||
return value
|
||||
|
||||
|
||||
def fn_predicate(obj):
|
||||
value = inspect.ismethoddescriptor(obj) or inspect.isbuiltin(obj)
|
||||
if value:
|
||||
return obj.__doc__ and obj.__text_signature__ and not obj.__name__.startswith("_")
|
||||
if inspect.isgetsetdescriptor(obj):
|
||||
return obj.__doc__ and not obj.__name__.startswith("_")
|
||||
return False
|
||||
|
||||
|
||||
def get_module_members(module):
|
||||
members = [
|
||||
member
|
||||
for name, member in inspect.getmembers(module)
|
||||
if not name.startswith("_") and not inspect.ismodule(member)
|
||||
]
|
||||
members.sort(key=member_sort)
|
||||
return members
|
||||
|
||||
|
||||
def pyi_file(obj, indent=""):
|
||||
string = ""
|
||||
if inspect.ismodule(obj):
|
||||
string += GENERATED_COMMENT
|
||||
members = get_module_members(obj)
|
||||
for member in members:
|
||||
string += pyi_file(member, indent)
|
||||
|
||||
elif inspect.isclass(obj):
|
||||
indent += INDENT
|
||||
mro = inspect.getmro(obj)
|
||||
if len(mro) > 2:
|
||||
inherit = f"({mro[1].__name__})"
|
||||
else:
|
||||
inherit = ""
|
||||
string += f"class {obj.__name__}{inherit}:\n"
|
||||
|
||||
body = ""
|
||||
if obj.__doc__:
|
||||
body += f'{indent}"""\n{indent}{do_indent(obj.__doc__, indent)}\n{indent}"""\n'
|
||||
|
||||
fns = inspect.getmembers(obj, fn_predicate)
|
||||
|
||||
# Init
|
||||
if obj.__text_signature__:
|
||||
body += f"{indent}def __init__{obj.__text_signature__}:\n"
|
||||
body += f"{indent+INDENT}pass\n"
|
||||
body += "\n"
|
||||
|
||||
for (name, fn) in fns:
|
||||
body += pyi_file(fn, indent=indent)
|
||||
|
||||
if not body:
|
||||
body += f"{indent}pass\n"
|
||||
|
||||
string += body
|
||||
string += "\n\n"
|
||||
|
||||
elif inspect.isbuiltin(obj):
|
||||
string += f"{indent}@staticmethod\n"
|
||||
string += function(obj, indent)
|
||||
|
||||
elif inspect.ismethoddescriptor(obj):
|
||||
string += function(obj, indent)
|
||||
|
||||
elif inspect.isgetsetdescriptor(obj):
|
||||
# TODO it would be interesting to add the setter maybe?
|
||||
string += f"{indent}@property\n"
|
||||
string += function(obj, indent, text_signature="(self)")
|
||||
else:
|
||||
raise Exception(f"Object {obj} is not supported")
|
||||
return string
|
||||
|
||||
|
||||
def py_file(module, origin):
|
||||
members = get_module_members(module)
|
||||
|
||||
string = GENERATED_COMMENT
|
||||
string += f"from .. import {origin}\n"
|
||||
string += "\n"
|
||||
for member in members:
|
||||
name = member.__name__
|
||||
string += f"{name} = {origin}.{name}\n"
|
||||
return string
|
||||
|
||||
|
||||
def do_black(content, is_pyi):
|
||||
mode = black.Mode(
|
||||
target_versions={black.TargetVersion.PY35},
|
||||
line_length=100,
|
||||
is_pyi=is_pyi,
|
||||
string_normalization=True,
|
||||
experimental_string_processing=False,
|
||||
)
|
||||
try:
|
||||
return black.format_file_contents(content, fast=True, mode=mode)
|
||||
except black.NothingChanged:
|
||||
return content
|
||||
|
||||
|
||||
def write(module, directory, origin, check=False):
|
||||
submodules = [
|
||||
(name, member) for name, member in inspect.getmembers(module) if inspect.ismodule(member)
|
||||
]
|
||||
|
||||
filename = os.path.join(directory, "__init__.pyi")
|
||||
pyi_content = pyi_file(module)
|
||||
pyi_content = do_black(pyi_content, is_pyi=True)
|
||||
os.makedirs(directory, exist_ok=True)
|
||||
if check:
|
||||
with open(filename, "r") as f:
|
||||
data = f.read()
|
||||
assert (
|
||||
data == pyi_content
|
||||
), f"The content of {filename} seems outdated, please run `python stub.py`"
|
||||
else:
|
||||
with open(filename, "w") as f:
|
||||
f.write(pyi_content)
|
||||
|
||||
filename = os.path.join(directory, "__init__.py")
|
||||
py_content = py_file(module, origin)
|
||||
py_content = do_black(py_content, is_pyi=False)
|
||||
os.makedirs(directory, exist_ok=True)
|
||||
|
||||
is_auto = False
|
||||
if not os.path.exists(filename):
|
||||
is_auto = True
|
||||
else:
|
||||
with open(filename, "r") as f:
|
||||
line = f.readline()
|
||||
if line == GENERATED_COMMENT:
|
||||
is_auto = True
|
||||
|
||||
if is_auto:
|
||||
if check:
|
||||
with open(filename, "r") as f:
|
||||
data = f.read()
|
||||
assert (
|
||||
data == py_content
|
||||
), f"The content of {filename} seems outdated, please run `python stub.py`"
|
||||
else:
|
||||
with open(filename, "w") as f:
|
||||
f.write(py_content)
|
||||
|
||||
for name, submodule in submodules:
|
||||
write(submodule, os.path.join(directory, name), f"{name}", check=check)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--check", action="store_true")
|
||||
|
||||
args = parser.parse_args()
|
||||
import tokenizers
|
||||
|
||||
write(tokenizers.tokenizers, "py_src/tokenizers/", "tokenizers", check=args.check)
|