Temp work to make the APIs uniform (build from memory by default).

Nicolas Patry
2020-09-22 09:41:07 +02:00
parent b24a2fc178
commit 98a30eead1
16 changed files with 438 additions and 162 deletions

View File

@@ -5,7 +5,7 @@ from tokenizers.pre_tokenizers import BertPreTokenizer
from tokenizers.processors import BertProcessing
from .base_tokenizer import BaseTokenizer
from typing import Optional, List, Union
from typing import Optional, List, Union, Dict
class BertWordPieceTokenizer(BaseTokenizer):
@@ -13,7 +13,7 @@ class BertWordPieceTokenizer(BaseTokenizer):
def __init__(
self,
vocab_file: Optional[str] = None,
vocab: Optional[Union[str, Dict[str, int]]] = None,
unk_token: Union[str, AddedToken] = "[UNK]",
sep_token: Union[str, AddedToken] = "[SEP]",
cls_token: Union[str, AddedToken] = "[CLS]",
@@ -26,8 +26,8 @@ class BertWordPieceTokenizer(BaseTokenizer):
wordpieces_prefix: str = "##",
):
if vocab_file is not None:
tokenizer = Tokenizer(WordPiece(vocab_file, unk_token=str(unk_token)))
if vocab is not None:
tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(unk_token)))
else:
tokenizer = Tokenizer(WordPiece(unk_token=str(unk_token)))
@@ -51,7 +51,7 @@ class BertWordPieceTokenizer(BaseTokenizer):
)
tokenizer.pre_tokenizer = BertPreTokenizer()
if vocab_file is not None:
if vocab is not None:
sep_token_id = tokenizer.token_to_id(str(sep_token))
if sep_token_id is None:
raise TypeError("sep_token not found in the vocabulary")
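With this change a BertWordPieceTokenizer can be built straight from an in-memory dict instead of a vocab file. A minimal sketch with a made-up toy vocabulary (assuming the renamed `vocab` parameter is fully wired through in this temp work; the special tokens have to be in the vocab or the lookups above raise TypeError):

from tokenizers import BertWordPieceTokenizer

# Made-up toy vocabulary, for illustration only.
vocab = {"[UNK]": 0, "[SEP]": 1, "[CLS]": 2, "[PAD]": 3, "[MASK]": 4, "my": 5, "name": 6}

tokenizer = BertWordPieceTokenizer(vocab=vocab)
# Should print ['[CLS]', 'my', 'name', '[SEP]']
print(tokenizer.encode("my name").tokens)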

View File

@@ -1,21 +1,28 @@
from tokenizers import Tokenizer, AddedToken, pre_tokenizers, decoders, trainers, processors
from tokenizers import (
Tokenizer,
AddedToken,
pre_tokenizers,
decoders,
trainers,
processors,
)
from tokenizers.models import BPE
from tokenizers.normalizers import unicode_normalizer_from_str, Lowercase, Sequence
from .base_tokenizer import BaseTokenizer
from typing import Optional, List, Union
from typing import Optional, List, Union, Dict, Tuple
class ByteLevelBPETokenizer(BaseTokenizer):
""" ByteLevelBPETokenizer
"""ByteLevelBPETokenizer
Represents a Byte-level BPE as introduced by OpenAI with their GPT-2 model
"""
def __init__(
self,
vocab_file: Optional[str] = None,
merges_file: Optional[str] = None,
vocab: Optional[Union[str, Dict[str, int]]] = None,
merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
add_prefix_space: bool = False,
lowercase: bool = False,
dropout: Optional[float] = None,
@@ -24,11 +31,11 @@ class ByteLevelBPETokenizer(BaseTokenizer):
end_of_word_suffix: Optional[str] = None,
trim_offsets: bool = False,
):
if vocab_file is not None and merges_file is not None:
if vocab is not None and merges is not None:
tokenizer = Tokenizer(
BPE(
vocab_file,
merges_file,
vocab,
merges,
dropout=dropout,
continuing_subword_prefix=continuing_subword_prefix or "",
end_of_word_suffix=end_of_word_suffix or "",
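For callers loading from disk, only the keyword names change; paths are still accepted since the new parameters keep `str` in their Union. A sketch with placeholder file paths (not files from this repo):

from tokenizers import ByteLevelBPETokenizer

# Before: ByteLevelBPETokenizer(vocab_file="vocab.json", merges_file="merges.txt")
tokenizer = ByteLevelBPETokenizer(vocab="vocab.json", merges="merges.txt")

# The new annotations suggest in-memory objects are accepted as well:
#   vocab:  Dict[str, int]
#   merges: Dict[Tuple[int, int], Tuple[int, int]]  (transitional format in this commit)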

View File

@@ -1,31 +1,36 @@
from .. import Tokenizer, AddedToken, pre_tokenizers, decoders, trainers
from ..models import BPE
from ..normalizers import Sequence, Lowercase, unicode_normalizer_from_str, BertNormalizer
from ..normalizers import (
Sequence,
Lowercase,
unicode_normalizer_from_str,
BertNormalizer,
)
from .base_tokenizer import BaseTokenizer
from typing import Optional, List, Union
from typing import Optional, List, Union, Dict, Tuple
class CharBPETokenizer(BaseTokenizer):
""" Original BPE Tokenizer
"""Original BPE Tokenizer
Represents the BPE algorithm, as introduced by Rico Sennrich
(https://arxiv.org/abs/1508.07909)
The default settings correspond to the OpenAI GPT BPE tokenizer and differ from the original
Sennrich subword-nmt implementation in the following options, which you can deactivate:
- adding a normalizer to clean up the text (deactivate with `bert_normalizer=False`) by:
* removing any control characters and replacing all whitespace with the classic one.
* handling Chinese chars by putting spaces around them.
* stripping all accents.
- splitting on punctuation in addition to whitespace (deactivate it with
`split_on_whitespace_only=True`)
"""
def __init__(
self,
vocab_file: Optional[str] = None,
merges_file: Optional[str] = None,
vocab: Optional[Union[str, Dict[str, int]]] = None,
merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
unk_token: Union[str, AddedToken] = "<unk>",
suffix: str = "</w>",
dropout: Optional[float] = None,
@@ -34,11 +39,11 @@ class CharBPETokenizer(BaseTokenizer):
bert_normalizer: bool = True,
split_on_whitespace_only: bool = False,
):
if vocab_file is not None and merges_file is not None:
if vocab is not None and merges is not None:
tokenizer = Tokenizer(
BPE(
vocab_file,
merges_file,
vocab,
merges,
dropout=dropout,
unk_token=str(unk_token),
end_of_word_suffix=suffix,
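When neither `vocab` nor `merges` is given, the wrapper still falls back to an empty BPE() that can be trained. A sketch of that path, with a placeholder corpus file:

from tokenizers import CharBPETokenizer

# No vocab/merges: the `else` branch builds Tokenizer(BPE()).
tokenizer = CharBPETokenizer()

# "corpus.txt" is a placeholder path; train() is the existing wrapper API.
tokenizer.train(["corpus.txt"], vocab_size=5000, special_tokens=["<unk>"])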

View File

@@ -3,28 +3,26 @@ from tokenizers.models import BPE
from tokenizers.normalizers import NFKC
from .base_tokenizer import BaseTokenizer
from typing import Optional, List, Union
from typing import Optional, List, Union, Dict, Tuple
class SentencePieceBPETokenizer(BaseTokenizer):
""" SentencePiece BPE Tokenizer
"""SentencePiece BPE Tokenizer
Represents the BPE algorithm, with the pretokenization used by SentencePiece
"""
def __init__(
self,
vocab_file: Optional[str] = None,
merges_file: Optional[str] = None,
vocab: Optional[Union[str, Dict[str, int]]] = None,
merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
unk_token: Union[str, AddedToken] = "<unk>",
replacement: str = "▁",
add_prefix_space: bool = True,
dropout: Optional[float] = None,
):
if vocab_file is not None and merges_file is not None:
tokenizer = Tokenizer(
BPE(vocab_file, merges_file, dropout=dropout, unk_token=unk_token)
)
if vocab is not None and merges is not None:
tokenizer = Tokenizer(BPE(vocab, merges, dropout=dropout, unk_token=unk_token))
else:
tokenizer = Tokenizer(BPE())
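Positional callers are unaffected by the rename; only keyword callers need to switch from vocab_file=/merges_file= to vocab=/merges=. A sketch with placeholder paths:

from tokenizers import SentencePieceBPETokenizer

tokenizer = SentencePieceBPETokenizer("vocab.json", "merges.txt")                # still fine
tokenizer = SentencePieceBPETokenizer(vocab="vocab.json", merges="merges.txt")   # new keywords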

View File

@@ -92,19 +92,10 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
"You're trying to run a `Unigram` model but you're file was trained with a different algorithm"
)
data = {"unk_id": unk_id, "vocab": vocab}
replacement = "▁"
add_prefix_space = True
out_vocab_filename = f"{filename}.json"
try:
with open(out_vocab_filename, "w") as f:
json.dump(data, f, indent=4)
tokenizer = Tokenizer(Unigram(out_vocab_filename))
finally:
os.remove(out_vocab_filename)
tokenizer = Tokenizer(Unigram(vocab, unk_id))
tokenizer.normalizer = normalizers.Precompiled(precompiled_charsmap)
tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
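The temporary-JSON round trip is gone: the Unigram model is built directly from the in-memory (piece, score) list extracted from the SentencePiece proto. Schematically, with made-up pieces (the real ones come from the .model file being converted):

from tokenizers import Tokenizer
from tokenizers.models import Unigram

vocab = [("<unk>", 0.0), ("▁hello", -2.0), ("▁world", -3.0)]  # invented scores
unk_id = 0
tokenizer = Tokenizer(Unigram(vocab, unk_id))  # mirrors the call in the hunk above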

View File

@@ -1,5 +1,5 @@
from .. import Encoding, Offsets, Token
from typing import List, Optional, Union, Tuple
from typing import List, Optional, Union, Tuple, Dict
class Model:
""" Base class for all models
@@ -32,11 +32,15 @@ class BPE(Model):
Instantiate a BPE Model from the given vocab and merges files.
Args:
vocab: ('`optional`) string:
Path to a vocabulary JSON file.
vocab: ('`optional`) Dict[str, int]:
A dictionary of string keys and their ids, e.g. {"am": 0, ...}
merges: (`optional`) string:
Path to a merge file.
A dictionary with pairs of ids as keys and their merge correspondence as values:
{(id_left, id_right): (importance, id_merged), ...}
With vocab {"a": 0, "b": 1, ..., "ab": 4}, the merge
{(0, 1): (0, 4), ...}
corresponds to the "ab" merge, i.e. the most likely merge (importance 0)
cache_capacity: (`optional`) int:
The number of words that the BPE cache can contain. The cache allows
@@ -62,8 +66,8 @@ class BPE(Model):
@staticmethod
def __init__(
self,
vocab: Optional[str],
merges: Optional[str],
vocab: Optional[Union[str, Dict[str, int]]],
merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]],
cache_capacity: Optional[int],
dropout: Optional[float],
unk_token: Optional[str],
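Putting the docstring and the new signature together, a tiny in-memory BPE could look like the sketch below; the vocab/merges values are made up, and the {(id_left, id_right): (importance, id_merged)} merge format is the transitional one documented above:

from tokenizers import Tokenizer
from tokenizers.models import BPE

vocab = {"a": 0, "b": 1, "c": 2, "ab": 3}
merges = {(0, 1): (0, 3)}  # merge ids 0+1 ("a"+"b") into id 3 ("ab"), importance 0

tokenizer = Tokenizer(BPE(vocab, merges))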
@@ -80,7 +84,7 @@ class WordPiece(Model):
Args:
vocab: (`optional`) string:
Path to a vocabulary file.
A dictionary of string keys and their ids, e.g. {"am": 0, ...}
unk_token: (`optional`) str:
The unknown token to be used by the model.
@@ -91,7 +95,7 @@ class WordPiece(Model):
def __init__(
self,
vocab: Optional[str],
vocab: Optional[Union[str, Dict[str, int]]],
unk_token: Optional[str],
max_input_chars_per_word: Optional[int],
):
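A minimal in-memory WordPiece with a made-up three-entry vocab, matching the call already used by BertWordPieceTokenizer above:

from tokenizers import Tokenizer
from tokenizers.models import WordPiece

vocab = {"[UNK]": 0, "run": 1, "##ning": 2}
tokenizer = Tokenizer(WordPiece(vocab, unk_token="[UNK]"))

# With no pre-tokenizer the whole input is one "word"; this should give ["run", "##ning"].
print(tokenizer.encode("running").tokens)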
@@ -105,13 +109,13 @@ class WordLevel(Model):
Args:
vocab: (`optional`) string:
Path to a vocabulary file.
A dictionary of string keys and their ids, e.g. {"am": 0, ...}
unk_token: str:
The unknown token to be used by the model.
"""
def __init__(self, vocab: Optional[str], unk_token: Optional[str]):
def __init__(self, vocab: Optional[Union[str, Dict[str, int]]], unk_token: Optional[str]):
pass
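Likewise for WordLevel, a sketch with a made-up vocab and an explicit whitespace pre-tokenizer so the model sees individual words:

from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace

vocab = {"<unk>": 0, "hello": 1, "world": 2}
tokenizer = Tokenizer(WordLevel(vocab, unk_token="<unk>"))
tokenizer.pre_tokenizer = Whitespace()

print(tokenizer.encode("hello world").tokens)  # should be ["hello", "world"]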
class Unigram(Model):
@@ -121,10 +125,10 @@ class Unigram(Model):
Args:
vocab: ('`optional`) string:
Path to a vocabulary JSON file.
A list of vocabulary items and their relative scores, e.g. [("am", -0.2442), ...]
"""
@staticmethod
def __init__(self, vocab: Optional[str]):
def __init__(self, vocab: Optional[List[Tuple[str, float]]]):
pass
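Per this stub, Unigram takes the (token, score) list directly; note the implementation hunk further up also passes an unk_id, so the exact signature is still settling in this temp work. A made-up example:

from tokenizers import Tokenizer
from tokenizers.models import Unigram

# Scores are relative log-probabilities; values invented for illustration.
vocab = [("<unk>", 0.0), ("hello", -2.3), ("world", -2.7)]
tokenizer = Tokenizer(Unigram(vocab))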