Python - Add Encoding repr + improve example
@@ -14,6 +14,7 @@ parser.add_argument("--type", default="gpt2", type=str, help="The type of tokeni
 parser.add_argument("--file", default=None, type=str, help="The file to encode")
 parser.add_argument("--vocab", default=None, type=str, required=True, help="The vocab file")
 parser.add_argument("--merges", default=None, type=str, help="The merges.txt file")
+parser.add_argument("--debug", default=False, type=bool, help="Verbose output")
 args = parser.parse_args()
 
 if args.type == "gpt2" and args.merges is None:
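The hunk above is in the Python benchmarking example; the later hunks touch the Rust sources of the Python bindings (the Encoding, Model, and tokenizer modules). One caveat on the new flag: argparse's type=bool simply calls bool() on the raw string, so any non-empty value, including --debug False, still enables debug output. A minimal sketch of the more conventional pattern (not what this commit does):

import argparse

parser = argparse.ArgumentParser()
# bool("False") is True, so type=bool cannot switch the flag off from the CLI;
# action="store_true" gives a real on/off switch.
parser.add_argument("--debug", action="store_true", help="Verbose output")
args = parser.parse_args()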
@@ -47,6 +48,7 @@ Namespaces are one honking great idea -- let's do more of those!
 """.split("\n")
 
 if args.type == "gpt2":
+    print("Running GPT-2 tokenizer")
     tok_p = GPT2Tokenizer.from_pretrained('gpt2')
 
     # Create a Tokenizer using BPE
@@ -56,6 +58,7 @@ if args.type == "gpt2":
     # Use ByteLevel Decoder
     tok_r.with_decoder(decoders.ByteLevel.new())
 elif args.type == "bert":
+    print("Running Bert tokenizer")
     tok_p = BertTokenizer.from_pretrained('bert-base-uncased')
 
     tok_r = Tokenizer(models.WordPiece.from_files(args.vocab))
@@ -65,7 +68,6 @@ else:
     raise Exception(f"Unknown type {args.type}")
 
 def tokenize_r():
-    # return [ tok_r.encode(sentence) for sentence in text]
     return tok_r.encode_batch(text);
 
 def tokenize_p():
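The next hunk's header references time_r and time_p, which are computed in a part of the script this diff does not show. A hypothetical reconstruction of that timing harness, only to make the comparison below readable (tokenize_r and tokenize_p are the functions defined above; the exact mechanism in the real script may differ):

import time

start = time.time()
encoded_r = tokenize_r()   # Rust-backed tokenizer
time_r = time.time() - start

start = time.time()
encoded_p = tokenize_p()   # transformers (Python) tokenizer
time_p = time.time() - start

print(f"Rust tokenizer took: {time_r} sec")
print(f"Transformer tokenizer took: {time_p} sec")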
@@ -89,28 +91,27 @@ print(f"Transformer tokenizer took: {time_p} sec")
 
 print(f"SpeedUp Ratio: {time_p / time_r}")
 
-ids_r = [ [ token.id for token in sentence ] for sentence in encoded_r ]
-diff = 0
-for i in range(0, len(ids_r)):
-    if ids_r[i] != encoded_p[i]:
-        diff += 1
-        print("".join([ token.value for token in encoded_r[i] ]))
-        print("".join(tok_p.tokenize(text[i])))
-        print(text[i])
-        print("")
-        #print(ids_r[i])
-        #print(encoded_p[i])
-print(f"DIFF: {diff}")
-assert(ids_r == encoded_p)
+ids_r = [ sentence.ids for sentence in encoded_r ]
+diff_ids = 0
+for i in range(0, len(encoded_r)):
+    if encoded_r[i].ids != encoded_p[i]:
+        diff_ids += 1
+        if args.debug:
+            print("".join([ token.value for token in encoded_r[i] ]))
+            print("".join(tok_p.tokenize(text[i])))
+            print(text[i])
+            print("")
+print(f"Ids differences: {diff_ids}")
 
-exit()
-decoded_r = tok_r.decode_batch(ids_r)
+decoded_r = tok_r.decode_batch([ sentence.ids for sentence in encoded_r ])
 decoded_p = [ tok_p.decode(en) for en in encoded_p ]
-
+diff_decoded = 0
 for i in range(0, len(text)):
-    if decoded_r[i] != decoded_p[i]: #text[i]:
-        print(decoded_r[i])
-        print(decoded_p[i])
-        #print(text[i])
-        print("")
-assert(decoded_r == text)
+    if decoded_r[i] != decoded_p[i]:
+        diff_decoded += 1
+        if args.debug:
+            print(f"Original: {text[i]}")
+            print(f"Rust: {decoded_r[i]}")
+            print(f"Python: {decoded_p[i]}")
+            print("")
+print(f"Decoding differences: {diff_decoded}")
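The rewritten comparison relies on each element of encode_batch's result being an Encoding object: .ids yields the token ids, and iterating yields tokens with a .value field, exactly as the debug branch uses them. A small sketch of that access pattern, assuming tok_r is configured as above:

encoded_r = tok_r.encode_batch(["Hello world!", "Namespaces are one honking great idea"])
for enc in encoded_r:
    print(enc.ids)                                # token ids, as compared against encoded_p
    print("".join(token.value for token in enc))  # token strings, as printed under --debug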
@@ -1,8 +1,9 @@
 extern crate tokenizers as tk;
 
 use pyo3::prelude::*;
+use pyo3::PyObjectProtocol;
 
-#[pyclass]
+#[pyclass(dict)]
 #[repr(transparent)]
 pub struct Encoding {
     encoding: tk::tokenizer::Encoding,
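Adding dict to the #[pyclass] attribute gives Python-side Encoding instances a __dict__, so callers can attach arbitrary attributes. A sketch of what this enables (assuming encode returns an Encoding, like encode_batch's elements):

encoding = tok_r.encode("Hello world!")
encoding.note = "anything"   # works because #[pyclass(dict)] adds a __dict__
print(encoding.note)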
@@ -14,6 +15,16 @@ impl Encoding {
     }
 }
 
+#[pyproto]
+impl PyObjectProtocol for Encoding {
+    fn __repr__(&self) -> PyResult<String> {
+        Ok(format!(
+            "Encoding {{ original: '{}', ... }}",
+            self.encoding.get_original()
+        ))
+    }
+}
+
 #[pymethods]
 impl Encoding {
     #[getter]
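With the #[pyproto] block in place, echoing an Encoding at the REPL (or calling repr) prints a readable summary instead of the default <Encoding object at 0x...>. Expected behavior, per the format string above:

encoding = tok_r.encode("Hello world!")
print(repr(encoding))
# Encoding { original: 'Hello world!', ... }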
@@ -15,7 +15,7 @@ pub struct Model {
 #[pymethods]
 impl Model {
     #[new]
-    fn new(obj: &PyRawObject) -> PyResult<()> {
+    fn new(_obj: &PyRawObject) -> PyResult<()> {
         Err(exceptions::Exception::py_err(
             "Cannot create a Model directly. Use a concrete subclass",
         ))
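Renaming obj to _obj only silences Rust's unused-variable warning; the behavior is unchanged: Model remains an abstract base whose constructor always raises. From Python, assuming Model is exported in the models module alongside WordPiece, that looks like:

from tokenizers import models

try:
    models.Model()   # the #[new] impl above always errors
except Exception as e:
    print(e)         # Cannot create a Model directly. Use a concrete subclass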
@@ -8,7 +8,6 @@ use super::decoders::Decoder;
 use super::encoding::Encoding;
 use super::models::Model;
 use super::pre_tokenizers::PreTokenizer;
-use super::token::Token;
 use super::trainers::Trainer;
 
 #[pyclass(dict)]