Adding Replace to decoder (to undo the Replace Normalizer for Metaspace split). (#1195)
This commit is contained in:
Nicolas Patry
2023-03-23 23:43:47 +01:00
committed by GitHub
parent 178e294a6a
commit 250d46c676
10 changed files with 135 additions and 1 deletions

View File

@ -12,6 +12,13 @@ interface Decoder {
*/
export function byteLevelDecoder(): Decoder;
/**
 * Instantiate a new Replace Decoder, which substitutes `content` for
 * `pattern` when decoding tokens back into a string.
 * Both arguments are required.
 * @param pattern The pattern to replace
 * @param content The replacement.
 */
export function replaceDecoder(pattern: string, content: string): Decoder;
/**
* Instantiate a new WordPiece Decoder
* @param [prefix='##'] The prefix to use for subwords that are not a beginning-of-word

View File

@ -2,6 +2,7 @@ const native = require("./native");
module.exports = {
byteLevelDecoder: native.decoders_ByteLevel,
replaceDecoder: native.decoders_Replace,
wordPieceDecoder: native.decoders_WordPiece,
byteFallbackDecoder: native.decoders_ByteFallback,
metaspaceDecoder: native.decoders_Metaspace,

View File

@ -3,6 +3,7 @@ import {
byteFallbackDecoder,
ctcDecoder,
metaspaceDecoder,
replaceDecoder,
sequenceDecoder,
wordPieceDecoder,
} from "./decoders";
@ -44,6 +45,12 @@ describe("byteFallbackDecoder", () => {
});
});
// Checks that the Replace decoder swaps the pattern for the replacement
// while joining the tokens into a single string.
describe("replaceDecoder", () => {
  it("can decode arrays of strings", () => {
    const decoder = replaceDecoder("_", " ");
    const decoded = decoder.decode(["Hello", "_Hello"]);

    expect(decoded).toEqual("Hello Hello");
  });
});
describe("metaspaceDecoder", () => {
it("accepts `undefined` as first parameter", () => {
expect(metaspaceDecoder(undefined)).toBeDefined();

View File

@ -57,6 +57,20 @@ fn byte_level(mut cx: FunctionContext) -> JsResult<JsDecoder> {
Ok(decoder)
}
/// replace()
fn replace(mut cx: FunctionContext) -> JsResult<JsDecoder> {
let pattern: String = cx.extract::<String>(0)?;
let content: String = cx.extract::<String>(1)?;
let mut decoder = JsDecoder::new::<_, JsDecoder, _>(&mut cx, vec![])?;
let guard = cx.lock();
decoder.borrow_mut(&guard).decoder = Some(Arc::new(
tk::normalizers::replace::Replace::new(pattern, content)
.map_err(|e| Error(e.to_string()))?
.into(),
));
Ok(decoder)
}
/// wordpiece(prefix: String = "##", cleanup: bool)
fn wordpiece(mut cx: FunctionContext) -> JsResult<JsDecoder> {
let prefix = cx
@ -156,6 +170,7 @@ fn sequence(mut cx: FunctionContext) -> JsResult<JsDecoder> {
/// Register everything here
pub fn register(m: &mut ModuleContext, prefix: &str) -> NeonResult<()> {
m.export_function(&format!("{}_ByteLevel", prefix), byte_level)?;
m.export_function(&format!("{}_Replace", prefix), replace)?;
m.export_function(&format!("{}_WordPiece", prefix), wordpiece)?;
m.export_function(&format!("{}_ByteFallback", prefix), byte_fallback)?;
m.export_function(&format!("{}_Metaspace", prefix), metaspace)?;