Revert to not fusing unk tokens (`fuse_unk`) by default for BPE, but add a flag to

enable it.
This commit is contained in:
Nicolas Patry
2020-09-22 11:00:08 +02:00
committed by Anthony MOI
parent 940f8bd8fa
commit 9d3a93db5b
6 changed files with 95 additions and 7 deletions

View File

@@ -45,6 +45,9 @@ class BPE(Model):
end_of_word_suffix: (`optional`) str:
The suffix to attach to subword units that represent an end of word.
fuse_unk: (`optional`) bool:
Multiple unk tokens get fused into only 1
"""
@staticmethod
@@ -57,6 +60,7 @@ class BPE(Model):
unk_token: Optional[str],
continuing_subword_prefix: Optional[str],
end_of_word_suffix: Optional[str],
fuse_unk: Optional[bool],
):
pass

View File

@@ -106,6 +106,7 @@ class SpmConverter(Converter):
out_vocab_filename,
out_merge_filename,
unk_token=proto.trainer_spec.unk_piece,
fuse_unk=True,
)
)
finally:

View File

@@ -158,6 +158,7 @@ impl PyBPE {
builder = builder.continuing_subword_prefix(value.extract()?)
}
"end_of_word_suffix" => builder = builder.end_of_word_suffix(value.extract()?),
"fuse_unk" => builder = builder.fuse_unk(value.extract()?),
_ => println!("Ignored unknown kwarg option {}", key),
};
}