mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-06 12:48:18 +00:00
Revert to not fusing unk tokens by default for BPE, but add a `fuse_unk` flag
to enable it.
This commit is contained in:
committed by
Anthony MOI
parent
940f8bd8fa
commit
9d3a93db5b
@@ -45,6 +45,9 @@ class BPE(Model):
|
||||
|
||||
end_of_word_suffix: (`optional`) str:
|
||||
The suffix to attach to subword units that represent an end of word.
|
||||
|
||||
fuse_unk: (`optional`) bool:
|
||||
Whether multiple unk tokens get fused into a single one
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
@@ -57,6 +60,7 @@ class BPE(Model):
|
||||
unk_token: Optional[str],
|
||||
continuing_subword_prefix: Optional[str],
|
||||
end_of_word_suffix: Optional[str],
|
||||
fuse_unk: Optional[bool],
|
||||
):
|
||||
pass
|
||||
|
||||
|
||||
@@ -106,6 +106,7 @@ class SpmConverter(Converter):
|
||||
out_vocab_filename,
|
||||
out_merge_filename,
|
||||
unk_token=proto.trainer_spec.unk_piece,
|
||||
fuse_unk=True,
|
||||
)
|
||||
)
|
||||
finally:
|
||||
|
||||
@@ -158,6 +158,7 @@ impl PyBPE {
|
||||
builder = builder.continuing_subword_prefix(value.extract()?)
|
||||
}
|
||||
"end_of_word_suffix" => builder = builder.end_of_word_suffix(value.extract()?),
|
||||
"fuse_unk" => builder = builder.fuse_unk(value.extract()?),
|
||||
_ => println!("Ignored unknown kwarg option {}", key),
|
||||
};
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user