From 8a6a8dc9d571f251c4db2c2272aeddfb8a183de9 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Fri, 24 Mar 2023 01:57:39 +0100 Subject: [PATCH] Fixing decoder strip because of char boundaries. (#1197) --- tokenizers/src/decoders/strip.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tokenizers/src/decoders/strip.rs b/tokenizers/src/decoders/strip.rs index 1691c707..0f6e0426 100644 --- a/tokenizers/src/decoders/strip.rs +++ b/tokenizers/src/decoders/strip.rs @@ -23,7 +23,13 @@ impl Decoder for Strip { fn decode_chain(&self, tokens: Vec<String>) -> Result<Vec<String>> { Ok(tokens .into_iter() - .map(|token| token[self.left..token.len() - self.right].to_string()) + .map(|token| { + token + .chars() + .skip(self.left) + .take(token.len() - self.left - self.right) + .collect() + }) .collect()) } }