Update CHANGELOGs and bump version for python release
bindings/python/CHANGELOG.md

@@ -4,6 +4,16 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.8.1-rc1]
+
+### Fixed
+- [#333]: Fix deserialization of `AddedToken`, where the content was not restored properly
+
+### Changed
+- [#329]: Improved warning and behavior when we detect a fork
+- [#330]: BertNormalizer now keeps the same behavior as the original implementation when
+`strip_accents` is not specified.
+
 ## [0.8.0]
 
 ### Highlights of this release
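The [#333] fix is easiest to see as a serialization round-trip. Below is a minimal sketch, assuming the `to_str`/`from_str` API from the 0.8.0 serialization work and that `BPE()` builds an empty model (both hold in recent releases); before the fix, the `content` of a deserialized `AddedToken` was not restored.

```python
from tokenizers import Tokenizer, AddedToken
from tokenizers.models import BPE

# An empty model is enough to exercise the added-vocabulary round-trip.
tokenizer = Tokenizer(BPE())
tokenizer.add_tokens([AddedToken("[SPECIAL]", lstrip=True)])

# Serialize and deserialize the whole tokenizer.
reloaded = Tokenizer.from_str(tokenizer.to_str())

# Before #333, the reloaded AddedToken lost its content, so this lookup failed.
assert reloaded.token_to_id("[SPECIAL]") == tokenizer.token_to_id("[SPECIAL]")
```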
@@ -209,6 +219,9 @@ delimiter (Works like `.split(delimiter)`)
 - Fix a bug with the IDs associated with added tokens.
 - Fix a bug that was causing crashes in Python 3.5
 
+[#333]: https://github.com/huggingface/tokenizers/pull/333
+[#330]: https://github.com/huggingface/tokenizers/pull/330
+[#329]: https://github.com/huggingface/tokenizers/pull/329
 [#311]: https://github.com/huggingface/tokenizers/pull/311
 [#309]: https://github.com/huggingface/tokenizers/pull/309
 [#289]: https://github.com/huggingface/tokenizers/pull/289
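For context on the [#329] entry: the tokenizer keeps a Rust thread pool, and forking a process after that pool has been used can hang the child, so the library detects the fork and warns. A minimal sketch of the usual mitigation, using the `TOKENIZERS_PARALLELISM` environment variable that the warning itself points to:

```python
import os

# Disable the internal thread pool before the tokenizer is first used (or
# before forking, e.g. in multiprocessing workers). This avoids the deadlock
# scenario and silences the fork-detection warning improved by #329.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
```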
bindings/python/Cargo.lock (generated)
@@ -641,7 +641,7 @@ dependencies = [
 
 [[package]]
 name = "tokenizers-python"
-version = "0.8.0"
+version = "0.8.1-rc1"
 dependencies = [
  "libc 0.2.68 (registry+https://github.com/rust-lang/crates.io-index)",
  "pyo3 0.9.2 (registry+https://github.com/rust-lang/crates.io-index)",
bindings/python/Cargo.toml

@@ -1,6 +1,6 @@
 [package]
 name = "tokenizers-python"
-version = "0.8.0"
+version = "0.8.1-rc1"
 authors = ["Anthony MOI <m.anthony.moi@gmail.com>"]
 edition = "2018"
bindings/python/setup.py

@@ -6,7 +6,7 @@ extras["testing"] = ["pytest"]
 
 setup(
     name="tokenizers",
-    version="0.8.0",
+    version="0.8.1.rc1",
     description="Fast and Customizable Tokenizers",
     long_description=open("README.md", "r", encoding="utf-8").read(),
     long_description_content_type="text/markdown",
bindings/python/tokenizers/__init__.py

@@ -1,4 +1,4 @@
-__version__ = "0.8.0"
+__version__ = "0.8.1.rc1"
 
 from typing import Tuple, Union, List
 
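Note the two spellings of the pre-release: the Cargo files use SemVer's `0.8.1-rc1`, while the Python artifacts use `0.8.1.rc1`, which PEP 440 normalizes to `0.8.1rc1`. A quick check of that equivalence, assuming the third-party `packaging` library is installed:

```python
from packaging.version import Version

v = Version("0.8.1.rc1")
print(v)                         # 0.8.1rc1 (PEP 440 canonical form)
print(v.is_prerelease)           # True
print(v == Version("0.8.1rc1"))  # True: the dotted separator normalizes away
```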
tokenizers/CHANGELOG.md

@@ -34,6 +34,8 @@ implementation from GPT-2
 - [#309]: Improved the management of the additional vocabulary. This introduces an option
 `normalized`, controlling whether a token should be extracted from the normalized version of the
 input text.
+- [#330]: BertNormalizer now keeps the same behavior as the original implementation when
+`strip_accents` is not specified.
 
 ### Added
 - [#236]: RobertaProcessing is now also taking care of trimming offsets, and works just as ByteLevel
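The [#330] behavior above is easiest to see on an accented input: when `strip_accents` is not specified, it follows the `lowercase` option, as the original BERT implementation does. A small sketch, assuming the `normalize_str` helper from the Python bindings (present in recent releases; it may postdate 0.8.1):

```python
from tokenizers.normalizers import BertNormalizer

# strip_accents left unspecified: it follows `lowercase`.
print(BertNormalizer(lowercase=True).normalize_str("Héllo"))   # "hello"
print(BertNormalizer(lowercase=False).normalize_str("Héllo"))  # "Héllo"

# An explicit value still overrides the lowercase-derived default.
print(BertNormalizer(lowercase=False, strip_accents=True).normalize_str("Héllo"))  # "Hello"
```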
@@ -119,6 +121,7 @@ advised, but that's not the question)
 split up in multiple bytes
 - [#174]: The `LongestFirst` truncation strategy had a bug
 
+[#330]: https://github.com/huggingface/tokenizers/pull/330
 [#311]: https://github.com/huggingface/tokenizers/pull/311
 [#309]: https://github.com/huggingface/tokenizers/pull/309
 [#298]: https://github.com/huggingface/tokenizers/pull/298