mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-25 01:29:23 +00:00
Python - Black auto formatting
This commit is contained in:
@ -7,28 +7,33 @@ parser.add_argument("--vocab", default=None, type=str, required=True, help="The
|
||||
# CLI: path to the BPE merges file produced during training (merges.txt).
# NOTE(review): a matching --vocab argument is defined just above this hunk.
parser.add_argument("--merges", default=None, type=str, required=True, help="The merges.txt file")

# Parse the command-line arguments; exits with usage on missing required args.
args = parser.parse_args()
class GoodCustom:
    """GoodCustom

    This class represents a good custom PreTokenizer that will be called
    by `tokenizers` when needed
    """

    def pre_tokenize(self, sentence):
        # Whitespace-delimited split: one piece per space-separated chunk.
        pieces = sentence.split(" ")
        return pieces

    def decode(self, tokens):
        # Reassemble tokens into a single comma-separated string.
        joined = ", ".join(tokens)
        return joined
||||
class BadCustom:
    """Bad Pretok

    This class represents a bad custom PreTokenizer that will trigger an exception
    when called by `tokenizers`
    """

    def pre_tokenize(self, sentence):
        # Deliberately returns None instead of a list of pieces,
        # which `tokenizers` rejects at the binding layer.
        result = None
        return result

    def decode(self, tokens):
        # Deliberately returns None instead of a string.
        result = None
        return result
||||
def tokenize(sentence):
    """Encode *sentence* with the module-level `tokenizer` and print its tokens."""
    tokens = tokenizer.encode(sentence).tokens
    print(f"`{sentence}` tokenized to {tokens}")
@ -66,4 +71,3 @@ try:
|
||||
encoding = tokenizer.encode("Hey friend!")
|
||||
except:
|
||||
print("Bad tokenizer didn't work")
|
||||
|
||||
|
Reference in New Issue
Block a user