diff --git a/bindings/python/CHANGELOG.md b/bindings/python/CHANGELOG.md
index 80b7cd10..2e235daf 100644
--- a/bindings/python/CHANGELOG.md
+++ b/bindings/python/CHANGELOG.md
@@ -4,6 +4,16 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.8.1-rc1]
+
+### Fixed
+- [#333]: Fix deserialization of `AddedToken`, where the content was not restored properly
+
+### Changed
+- [#329]: Improved warning and behavior when we detect a fork
+- [#330]: BertNormalizer now keeps the same behavior as the original implementation when
+`strip_accents` is not specified.
+
 ## [0.8.0]
 
 ### Highlights of this release
@@ -209,6 +219,9 @@ delimiter (Works like `.split(delimiter)`)
 - Fix a bug with the IDs associated with added tokens.
 - Fix a bug that was causing crashes in Python 3.5
 
+[#333]: https://github.com/huggingface/tokenizers/pull/333
+[#330]: https://github.com/huggingface/tokenizers/pull/330
+[#329]: https://github.com/huggingface/tokenizers/pull/329
 [#311]: https://github.com/huggingface/tokenizers/pull/311
 [#309]: https://github.com/huggingface/tokenizers/pull/309
 [#289]: https://github.com/huggingface/tokenizers/pull/289
diff --git a/bindings/python/Cargo.lock b/bindings/python/Cargo.lock
index d06c2a15..e06c44e5 100644
--- a/bindings/python/Cargo.lock
+++ b/bindings/python/Cargo.lock
@@ -641,7 +641,7 @@ dependencies = [
 
 [[package]]
 name = "tokenizers-python"
-version = "0.8.0"
+version = "0.8.1-rc1"
 dependencies = [
  "libc 0.2.68 (registry+https://github.com/rust-lang/crates.io-index)",
  "pyo3 0.9.2 (registry+https://github.com/rust-lang/crates.io-index)",
diff --git a/bindings/python/Cargo.toml b/bindings/python/Cargo.toml
index d5d4cd32..93667444 100644
--- a/bindings/python/Cargo.toml
+++ b/bindings/python/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tokenizers-python"
-version = "0.8.0"
+version = "0.8.1-rc1"
 authors = ["Anthony MOI <m.anthony.moi@gmail.com>"]
 edition = "2018"
 
diff --git a/bindings/python/setup.py b/bindings/python/setup.py
index 8109ff70..35eb3764 100644
--- a/bindings/python/setup.py
+++ b/bindings/python/setup.py
@@ -6,7 +6,7 @@ extras["testing"] = ["pytest"]
 
 setup(
     name="tokenizers",
-    version="0.8.0",
+    version="0.8.1.rc1",
     description="Fast and Customizable Tokenizers",
     long_description=open("README.md", "r", encoding="utf-8").read(),
     long_description_content_type="text/markdown",
diff --git a/bindings/python/tokenizers/__init__.py b/bindings/python/tokenizers/__init__.py
index 658e8c12..f1261aa1 100644
--- a/bindings/python/tokenizers/__init__.py
+++ b/bindings/python/tokenizers/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.8.0"
+__version__ = "0.8.1.rc1"
 
 from typing import Tuple, Union, Tuple, List
 
diff --git a/tokenizers/CHANGELOG.md b/tokenizers/CHANGELOG.md
index 5d8e2de9..a1c847c1 100644
--- a/tokenizers/CHANGELOG.md
+++ b/tokenizers/CHANGELOG.md
@@ -34,6 +34,8 @@ implementation from GPT-2
 - [#309]: Improved the management of the additional vocabulary. This introduces an option
 `normalized`, controlling whether a token should be extracted from the normalized version of the
 input text.
+- [#330]: BertNormalizer now keeps the same behavior as the original implementation when
+`strip_accents` is not specified.
 
 ### Added
 - [#236]: RobertaProcessing is now also taking care of trimming offsets, and works just as ByteLevel
@@ -119,6 +121,7 @@ advised, but that's not the question)
 split up in multiple bytes
 - [#174]: The `LongestFirst` truncation strategy had a bug
 
+[#330]: https://github.com/huggingface/tokenizers/pull/330
 [#311]: https://github.com/huggingface/tokenizers/pull/311
 [#309]: https://github.com/huggingface/tokenizers/pull/309
 [#298]: https://github.com/huggingface/tokenizers/pull/298
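
Note on [#330]: a minimal sketch of the behavior change described in the changelog entries above, using the `BertNormalizer` keyword arguments from the public `tokenizers` Python bindings. The `normalize_str` helper used for inspection is assumed from later releases of the library, so treat this as an illustration of the intended semantics rather than 0.8.1-specific code.

```python
from tokenizers.normalizers import BertNormalizer

# With `strip_accents` left unspecified, the normalizer now follows the
# original BERT implementation: accents are stripped only when lowercasing.
lowercased = BertNormalizer(lowercase=True)    # strip_accents not specified
cased = BertNormalizer(lowercase=False)        # strip_accents not specified

print(lowercased.normalize_str("Héllo"))  # expected: "hello" (accents stripped)
print(cased.normalize_str("Héllo"))       # expected: "Héllo" (accents kept)

# An explicit `strip_accents` value still overrides the coupling to `lowercase`:
cased_stripped = BertNormalizer(lowercase=False, strip_accents=True)
print(cased_stripped.normalize_str("Héllo"))  # expected: "Hello"
```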