mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
Adding Replace to decoder (to undo the Replace Normalizer for Metaspace split). (#1195)
This commit is contained in:
7
bindings/node/lib/bindings/decoders.d.ts
vendored
7
bindings/node/lib/bindings/decoders.d.ts
vendored
@ -12,6 +12,13 @@ interface Decoder {
|
||||
*/
|
||||
export function byteLevelDecoder(): Decoder;
|
||||
|
||||
/**
 * Instantiate a new Replace Decoder
 * @param pattern The pattern to replace
 * @param content The replacement.
 */
|
||||
export function replaceDecoder(pattern: string, content: string): Decoder;
|
||||
|
||||
/**
|
||||
* Instantiate a new WordPiece Decoder
|
||||
* @param [prefix='##'] The prefix to use for subwords that are not a beginning-of-word
|
||||
|
@ -2,6 +2,7 @@ const native = require("./native");
|
||||
|
||||
module.exports = {
|
||||
byteLevelDecoder: native.decoders_ByteLevel,
|
||||
replaceDecoder: native.decoders_Replace,
|
||||
wordPieceDecoder: native.decoders_WordPiece,
|
||||
byteFallbackDecoder: native.decoders_ByteFallback,
|
||||
metaspaceDecoder: native.decoders_Metaspace,
|
||||
|
@ -3,6 +3,7 @@ import {
|
||||
byteFallbackDecoder,
|
||||
ctcDecoder,
|
||||
metaspaceDecoder,
|
||||
replaceDecoder,
|
||||
sequenceDecoder,
|
||||
wordPieceDecoder,
|
||||
} from "./decoders";
|
||||
@ -44,6 +45,12 @@ describe("byteFallbackDecoder", () => {
|
||||
});
|
||||
});
|
||||
|
||||
// The Replace decoder substitutes the pattern ("_") with the content (" ")
// and returns the decoded tokens as a single string.
describe("replaceDecoder", () => {
  it("can decode arrays of strings", () => {
    const decoder = replaceDecoder("_", " ");
    const decoded = decoder.decode(["Hello", "_Hello"]);

    expect(decoded).toEqual("Hello Hello");
  });
});
|
||||
|
||||
describe("metaspaceDecoder", () => {
|
||||
it("accepts `undefined` as first parameter", () => {
|
||||
expect(metaspaceDecoder(undefined)).toBeDefined();
|
||||
|
@ -57,6 +57,20 @@ fn byte_level(mut cx: FunctionContext) -> JsResult<JsDecoder> {
|
||||
Ok(decoder)
|
||||
}
|
||||
|
||||
/// replace()
|
||||
fn replace(mut cx: FunctionContext) -> JsResult<JsDecoder> {
|
||||
let pattern: String = cx.extract::<String>(0)?;
|
||||
let content: String = cx.extract::<String>(1)?;
|
||||
let mut decoder = JsDecoder::new::<_, JsDecoder, _>(&mut cx, vec![])?;
|
||||
let guard = cx.lock();
|
||||
decoder.borrow_mut(&guard).decoder = Some(Arc::new(
|
||||
tk::normalizers::replace::Replace::new(pattern, content)
|
||||
.map_err(|e| Error(e.to_string()))?
|
||||
.into(),
|
||||
));
|
||||
Ok(decoder)
|
||||
}
|
||||
|
||||
/// wordpiece(prefix: String = "##", cleanup: bool)
|
||||
fn wordpiece(mut cx: FunctionContext) -> JsResult<JsDecoder> {
|
||||
let prefix = cx
|
||||
@ -156,6 +170,7 @@ fn sequence(mut cx: FunctionContext) -> JsResult<JsDecoder> {
|
||||
/// Register everything here
|
||||
pub fn register(m: &mut ModuleContext, prefix: &str) -> NeonResult<()> {
|
||||
m.export_function(&format!("{}_ByteLevel", prefix), byte_level)?;
|
||||
m.export_function(&format!("{}_Replace", prefix), replace)?;
|
||||
m.export_function(&format!("{}_WordPiece", prefix), wordpiece)?;
|
||||
m.export_function(&format!("{}_ByteFallback", prefix), byte_fallback)?;
|
||||
m.export_function(&format!("{}_Metaspace", prefix), metaspace)?;
|
||||
|
Reference in New Issue
Block a user