mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
Add a visualization utility to render tokens and annotations in a notebook (#508)
* Draft functionality of visualization * Added comments to make code more intelligible * polish the styles * Ensure colors are stable and comment the css * Code clean up * Made visualizer importable and added some docs * Fix styling * implement comments from PR * Fixed the regex for UNK tokens and examples in notebook * Converted docs to Google format * Added a notebook showing multiple languages and tokenizers * Added visual indication of chars that are tokenized with >1 token * Reorganize things a bit and fix import * Update docs Co-authored-by: Anthony MOI <m.anthony.moi@gmail.com>
This commit is contained in:
3
.gitignore
vendored
3
.gitignore
vendored
@ -4,7 +4,7 @@
|
||||
.vim
|
||||
.env
|
||||
target
|
||||
|
||||
.idea
|
||||
Cargo.lock
|
||||
|
||||
/data
|
||||
@ -17,6 +17,7 @@ __pycache__
|
||||
pip-wheel-metadata
|
||||
*.egg-info
|
||||
*.so
|
||||
/bindings/python/examples/.ipynb_checkpoints
|
||||
/bindings/python/build
|
||||
/bindings/python/dist
|
||||
|
||||
|
1053
bindings/python/examples/using_the_visualizer.ipynb
Normal file
1053
bindings/python/examples/using_the_visualizer.ipynb
Normal file
File diff suppressed because one or more lines are too long
@ -91,6 +91,7 @@ from .tokenizers import normalizers
|
||||
from .tokenizers import pre_tokenizers
|
||||
from .tokenizers import processors
|
||||
from .tokenizers import trainers
|
||||
|
||||
from .implementations import (
|
||||
ByteLevelBPETokenizer,
|
||||
CharBPETokenizer,
|
||||
|
1
bindings/python/py_src/tokenizers/tools/__init__.py
Normal file
1
bindings/python/py_src/tokenizers/tools/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
from .visualizer import EncodingVisualizer, Annotation
|
170
bindings/python/py_src/tokenizers/tools/visualizer-styles.css
Normal file
170
bindings/python/py_src/tokenizers/tools/visualizer-styles.css
Normal file
@ -0,0 +1,170 @@
|
||||
.tokenized-text {
    width:100%;
    padding:2rem;
    max-height: 400px;
    overflow-y: auto;
    box-sizing:border-box;
    line-height:4rem; /* Lots of space between lines */
    font-family: "Roboto Light", "Ubuntu Light", "Ubuntu", monospace;
    box-shadow: 2px 2px 2px rgba(0,0,0,0.2);
    background-color: rgba(0,0,0,0.01);
    letter-spacing:2px; /* Give some extra separation between chars */
}
.non-token{
    /* White space and other things the tokenizer ignores*/
    white-space: pre;
    letter-spacing:4px;
    border-top:1px solid #A0A0A0; /* A gentle border on top and bottom makes tabs more obvious*/
    border-bottom:1px solid #A0A0A0;
    line-height: 1rem;
    height: calc(100% - 2px);
}

.token {
    white-space: pre;
    position:relative;
    color:black;
    letter-spacing:2px;
}

.annotation{
    white-space:nowrap; /* Important - ensures that annotations appears even if the annotated text wraps a line */
    border-radius:4px;
    position:relative;
    width:fit-content;
}
.annotation:before {
    /*The before holds the text and the after holds the background*/
    z-index:1000; /* Make sure this is above the background */
    content:attr(data-label); /* The annotation's label is on a data attribute */
    color:white;
    position:absolute;
    font-size:1rem;
    text-align:center;
    font-weight:bold;

    top:1.75rem;
    line-height:0;
    left:0;
    width:100%;
    padding:0.5rem 0;
    /* These make it so an annotation doesn't stretch beyond the annotated text if the label is longer*/
    overflow: hidden;
    white-space: nowrap;
    text-overflow:ellipsis;
}

.annotation:after {
    content:attr(data-label); /* The content defines the width of the annotation*/
    position:absolute;
    font-size:0.75rem;
    text-align:center;
    font-weight:bold;
    text-overflow:ellipsis;
    top:1.75rem;
    line-height:0;
    overflow: hidden;
    white-space: nowrap;

    left:0;
    width:100%; /* 100% of the parent, which is the annotation whose width is the tokens inside it*/

    padding:0.5rem 0;
    /* Nasty hack below:
    We set the annotation's color in code because we don't know the colors at css time.
    But you can't pass a color as a data attribute to get it into the pseudo element (this thing)
    So to get around that, annotations have the color set on them with a style attribute and then we
    can get the color with currentColor.
    Annotations wrap tokens and tokens set the color back to black
     */
    background-color: currentColor;
}
.annotation:hover::after, .annotation:hover::before{
    /* When the user hovers over an annotation expand the label to display in full
     */
    min-width: fit-content;
}

.annotation:hover{
    /* Emphasize the annotation start/end with a border on hover*/
    border-color: currentColor;
    border: 2px solid;
}
.special-token:not(:empty){
    /*
    A non-empty special token is like UNK (as opposed to CLS which has no representation in the text )
     */
    position:relative;
}
.special-token:empty::before{
    /* Special tokens that don't have text are displayed as pseudo elements so we don't select them with the mouse*/
    content:attr(data-stok);
    background:#202020;
    font-size:0.75rem;
    color:white;
    margin: 0 0.25rem;
    padding: 0.25rem;
    border-radius:4px
}

.special-token:not(:empty):before {
    /* Special tokens that have text (UNK) are displayed above the actual text*/
    content:attr(data-stok);
    position:absolute;
    bottom:1.75rem;
    min-width:100%;
    width:100%;
    height:1rem;
    line-height:1rem;
    font-size:1rem;
    text-align:center;
    color:white;
    font-weight:bold;
    background:#202020;
    border-radius:10%;
}
/*
We want to alternate the color of tokens, but we can't use nth child because tokens might be broken up by annotations
instead we apply even and odd class at generation time and color them that way
 */
.even-token{
    background:#DCDCDC ;
    border: 1px solid #DCDCDC;
}
.odd-token{
    background:#A0A0A0;
    border: 1px solid #A0A0A0;
}
.even-token.multi-token,.odd-token.multi-token{
    background: repeating-linear-gradient(
        45deg,
        transparent,
        transparent 1px,
        #ccc 1px,
        #ccc 1px
    ),
    /* on "bottom" */
    linear-gradient(
        to bottom,
        #FFB6C1,
        #999
    );
}

.multi-token:hover::after {
    content:"This char has more than 1 token"; /* The content defines the width of the annotation*/
    color:white;
    background-color: black;
    position:absolute;
    font-size:0.75rem;
    text-align:center;
    font-weight:bold;
    text-overflow:ellipsis;
    top:1.75rem;
    line-height:0;
    overflow: hidden;
    white-space: nowrap;
    left:0;
    width:fit-content; /* 100% of the parent, which is the annotation whose width is the tokens inside it*/
    padding:0.5rem 0;
}
|
412
bindings/python/py_src/tokenizers/tools/visualizer.py
Normal file
412
bindings/python/py_src/tokenizers/tools/visualizer.py
Normal file
@ -0,0 +1,412 @@
|
||||
import os
import itertools
import re
from typing import List, Optional, Tuple, Dict, Callable, Any, NamedTuple
from string import Template

from tokenizers import Tokenizer, Encoding

# Load the stylesheet that ships next to this module once, at import time.
# The CSS is injected into every rendered HTML document (see HTMLBody).
dirname = os.path.dirname(__file__)
css_filename = os.path.join(dirname, "visualizer-styles.css")
# Read with an explicit encoding so the CSS parses the same way regardless of
# the platform's default locale encoding.
with open(css_filename, encoding="utf-8") as f:
    css = f.read()
|
||||
|
||||
|
||||
class Annotation:
    """A labeled span of text.

    Args:
        start (:obj:`int`): Index of the first character covered by the annotation.
        end (:obj:`int`): Index one past the last character covered.
        label (:obj:`str`): The label to display for this span.
    """

    start: int
    end: int
    # Fixed: this was annotated as `int`, but the constructor (and every use
    # in this module, e.g. color-palette keys) treats the label as a string.
    label: str

    def __init__(self, start: int, end: int, label: str):
        self.start = start
        self.end = end
        self.label = label
|
||||
|
||||
|
||||
# Convenience alias: the annotations accompanying a piece of text.
AnnotationList = List[Annotation]
# One entry per character; None where no index applies (e.g. unannotated chars).
PartialIntList = List[Optional[int]]
|
||||
|
||||
|
||||
class CharStateKey(NamedTuple):
    # Grouping key for a character: consecutive chars that share the same
    # token index AND the same annotation index are rendered in one HTML span.
    token_ix: Optional[int]  # index of the (first) token covering the char, None if untokenized
    anno_ix: Optional[int]  # index of the annotation covering the char, None if unannotated
|
||||
|
||||
|
||||
class CharState:
    """Mutable per-character rendering state.

    Records which token(s) of the encoding and which annotation (if any)
    cover one character of the original text.
    """

    char_ix: Optional[int]

    def __init__(self, char_ix):
        self.char_ix = char_ix

        # Both fields are filled in after construction while aligning the
        # encoding and the annotations to the text.
        self.anno_ix: Optional[int] = None
        self.tokens: List[int] = []

    @property
    def token_ix(self):
        """Index of the first token covering this char, or ``None`` if none does."""
        if not self.tokens:
            return None
        return self.tokens[0]

    @property
    def is_multitoken(self):
        """
        BPE tokenizers can output more than one token for a char
        """
        return len(self.tokens) > 1

    def partition_key(self) -> CharStateKey:
        """Key used to group consecutive chars into a single rendered span."""
        return CharStateKey(token_ix=self.token_ix, anno_ix=self.anno_ix)
|
||||
|
||||
|
||||
class Aligned:
    # NOTE(review): empty placeholder, not referenced anywhere in this module —
    # presumably left over from an earlier draft; consider removing.
    pass
|
||||
|
||||
|
||||
class EncodingVisualizer:
    """
    Build an EncodingVisualizer

    Args:

        tokenizer (:class:`~tokenizers.Tokenizer`):
            A tokenizer instance

        default_to_notebook (:obj:`bool`):
            Whether to render html output in a notebook by default

        annotation_converter (:obj:`Callable`, `optional`):
            An optional (lambda) function that takes an annotation in any format and returns
            an Annotation object
    """

    # Matches tokens that look like UNK/OOV markers (e.g. "[UNK]", "<oov>").
    # Fixed: this must be a raw string — in a plain literal "\b" is the
    # backspace character (\x08), not the regex word boundary, so the original
    # pattern could never match the intended bracketed forms.
    unk_token_regex = re.compile(r"(.{1}\b)?(unk|oov)(\b.{1})?", flags=re.IGNORECASE)

    def __init__(
        self,
        tokenizer: Tokenizer,
        default_to_notebook: bool = True,
        annotation_converter: Optional[Callable[[Any], Annotation]] = None,
    ):
        if default_to_notebook:
            # Fail fast at construction time if IPython is unavailable, rather
            # than on the first render.
            try:
                from IPython.core.display import display, HTML
            except ImportError:
                raise Exception(
                    """We couldn't import IPython utils for html display.
                    Are you running in a notebook?
                    You can also pass `default_to_notebook=False` to get back raw HTML
                    """
                )

        self.tokenizer = tokenizer
        self.default_to_notebook = default_to_notebook
        # Fixed typo: the attribute was named `annotation_coverter`.
        self.annotation_converter = annotation_converter

    def __call__(
        self,
        text: str,
        annotations: AnnotationList = [],
        default_to_notebook: Optional[bool] = None,
    ) -> Optional[str]:
        """
        Build a visualization of the given text

        Args:
            text (:obj:`str`):
                The text to tokenize

            annotations (:obj:`List[Annotation]`, `optional`):
                An optional list of annotations of the text. They can either be an annotation class
                or anything else if you instantiated the visualizer with a converter function.
                (The mutable default is safe here: the list is only read, never mutated.)

            default_to_notebook (:obj:`bool`, `optional`, defaults to `False`):
                If True, will render the html in a notebook. Otherwise returns an html string.

        Returns:
            The HTML string if default_to_notebook is False, otherwise (default) returns None and
            renders the HTML in the notebook

        """
        # A per-call override takes precedence over the instance default.
        final_default_to_notebook = self.default_to_notebook
        if default_to_notebook is not None:
            final_default_to_notebook = default_to_notebook
        if final_default_to_notebook:
            try:
                from IPython.core.display import display, HTML
            except ImportError:
                raise Exception(
                    """We couldn't import IPython utils for html display.
                    Are you running in a notebook?"""
                )
        if self.annotation_converter is not None:
            annotations = list(map(self.annotation_converter, annotations))
        encoding = self.tokenizer.encode(text)
        html = EncodingVisualizer.__make_html(text, encoding, annotations)
        if final_default_to_notebook:
            display(HTML(html))
        else:
            return html

    @staticmethod
    def calculate_label_colors(annotations: AnnotationList) -> Dict[str, str]:
        """
        Generates a color palette for all the labels in a given set of annotations

        Args:
            annotations (:obj:`Annotation`):
                A list of annotations

        Returns:
            :obj:`dict`: A dictionary mapping labels to colors in HSL format
        """
        if len(annotations) == 0:
            return {}
        labels = set(map(lambda x: x.label, annotations))
        num_labels = len(labels)
        # Spread hues across the wheel, but keep at least 20 degrees apart so
        # adjacent labels remain distinguishable.
        h_step = int(255 / num_labels)
        if h_step < 20:
            h_step = 20
        s = 32  # saturation (%)
        l = 64  # lightness (%)
        h = 10  # starting hue
        colors = {}

        # sort so we always get the same colors for a given set of labels
        for label in sorted(labels):
            # Fixed: the original f-string was missing the closing ")", so the
            # emitted value was not valid CSS and labels rendered uncolored.
            colors[label] = f"hsl({h},{s}%,{l}%)"
            h += h_step
        return colors

    @staticmethod
    def consecutive_chars_to_html(
        consecutive_chars_list: List[CharState],
        text: str,
        encoding: Encoding,
    ):
        """
        Converts a list of "consecutive chars" into a single HTML element.
        Chars are consecutive if they fall under the same word, token and annotation.
        The CharState class is a named tuple with a "partition_key" method that makes it easy to
        compare if two chars are consecutive.

        Args:
            consecutive_chars_list (:obj:`List[CharState]`):
                A list of CharStates that have been grouped together

            text (:obj:`str`):
                The original text being processed

            encoding (:class:`~tokenizers.Encoding`):
                The encoding returned from the tokenizer

        Returns:
            :obj:`str`: The HTML span for a set of consecutive chars
        """
        first = consecutive_chars_list[0]
        if first.char_ix is None:
            # It's a special token with no representation in the text (CLS, SEP...)
            stoken = encoding.tokens[first.token_ix]
            # Special tokens are represented as empty spans. We use the data attribute and css
            # magic to display it.
            # Fixed: the stylesheet reads `attr(data-stok)` but this span used
            # `data-stoken`, so pure special tokens never rendered; the value is
            # also quoted now so tokens containing spaces stay one attribute.
            return f'<span class="special-token" data-stok="{stoken}"></span>'
        # We're not in a special token so this group has a start and end.
        last = consecutive_chars_list[-1]
        start = first.char_ix
        end = last.char_ix + 1
        span_text = text[start:end]
        css_classes = []  # What css classes will we apply on the resulting span
        data_items = {}  # What data attributes will we apply on the result span
        if first.token_ix is not None:
            # We can either be in a token or not (e.g. in white space)
            css_classes.append("token")
            if first.is_multitoken:
                css_classes.append("multi-token")
            if first.token_ix % 2:
                # We use this to color alternating tokens.
                # A token might be split by an annotation that ends in the middle of it, so this
                # lets us visually indicate a consecutive token despite its possible splitting in
                # the html markup
                css_classes.append("odd-token")
            else:
                # Like above, but a different color so we can see the tokens alternate
                css_classes.append("even-token")
            if EncodingVisualizer.unk_token_regex.search(encoding.tokens[first.token_ix]) is not None:
                # This is a special token that is in the text. probably UNK
                css_classes.append("special-token")
                # TODO is this the right name for the data attribute ?
                data_items["stok"] = encoding.tokens[first.token_ix]
        else:
            # In this case we are looking at a group/single char that is not tokenized.
            # e.g. white space
            css_classes.append("non-token")
        # Renamed from `css` to avoid shadowing the module-level stylesheet string.
        css_attr = f'''class="{' '.join(css_classes)}"'''
        data = ""
        for key, val in data_items.items():
            data += f' data-{key}="{val}"'
        return f"<span {css_attr} {data} >{span_text}</span>"

    @staticmethod
    def __make_html(text: str, encoding: Encoding, annotations: AnnotationList) -> str:
        """Render the full visualization body: tokens wrapped in annotation spans."""
        char_states = EncodingVisualizer.__make_char_states(text, encoding, annotations)
        current_consecutive_chars = [char_states[0]]
        prev_anno_ix = char_states[0].anno_ix
        spans = []
        label_colors_dict = EncodingVisualizer.calculate_label_colors(annotations)
        cur_anno_ix = char_states[0].anno_ix
        if cur_anno_ix is not None:
            # If we started in an annotation make a span for it
            anno = annotations[cur_anno_ix]
            label = anno.label
            color = label_colors_dict[label]
            spans.append(f'<span class="annotation" style="color:{color}" data-label="{label}">')

        for cs in char_states[1:]:
            cur_anno_ix = cs.anno_ix
            if cur_anno_ix != prev_anno_ix:
                # If we've transitioned in or out of an annotation
                spans.append(
                    # Create a span from the current consecutive characters
                    EncodingVisualizer.consecutive_chars_to_html(
                        current_consecutive_chars,
                        text=text,
                        encoding=encoding,
                    )
                )
                current_consecutive_chars = [cs]

                if prev_anno_ix is not None:
                    # if we transitioned out of an annotation close its span
                    spans.append("</span>")
                if cur_anno_ix is not None:
                    # If we entered a new annotation make a span for it
                    anno = annotations[cur_anno_ix]
                    label = anno.label
                    color = label_colors_dict[label]
                    spans.append(
                        f'<span class="annotation" style="color:{color}" data-label="{label}">'
                    )
                prev_anno_ix = cur_anno_ix

            if cs.partition_key() == current_consecutive_chars[0].partition_key():
                # If the current character is in the same "group" as the previous one
                current_consecutive_chars.append(cs)
            else:
                # Otherwise we make a span for the previous group
                spans.append(
                    EncodingVisualizer.consecutive_chars_to_html(
                        current_consecutive_chars,
                        text=text,
                        encoding=encoding,
                    )
                )
                # And reset the consecutive_char_list to form a new group
                current_consecutive_chars = [cs]
        # All that's left is to fill out the final span
        # TODO I think there is an edge case here where an annotation's span might not close
        spans.append(
            EncodingVisualizer.consecutive_chars_to_html(
                current_consecutive_chars,
                text=text,
                encoding=encoding,
            )
        )
        res = HTMLBody(spans)  # Send the list of spans to the body of our html
        return res

    @staticmethod
    def __make_anno_map(text: str, annotations: AnnotationList) -> PartialIntList:
        """
        Args:
            text (:obj:`str`):
                The raw text we want to align to

            annotations (:obj:`AnnotationList`):
                A (possibly empty) list of annotations

        Returns:
            A list of length len(text) whose entry at index i is None if there is no annotation on
            character i or k, the index of the annotation that covers index i where k is with
            respect to the list of annotations
        """
        annotation_map = [None] * len(text)
        for anno_ix, a in enumerate(annotations):
            for i in range(a.start, a.end):
                annotation_map[i] = anno_ix
        return annotation_map

    @staticmethod
    def __make_char_states(
        text: str, encoding: Encoding, annotations: AnnotationList
    ) -> List[CharState]:
        """
        For each character in the original text, we emit a tuple representing its "state":

        * which token_ix it corresponds to
        * which word_ix it corresponds to
        * which annotation_ix it corresponds to

        Args:
            text (:obj:`str`):
                The raw text we want to align to

            annotations (:obj:`List[Annotation]`):
                A (possibly empty) list of annotations

            encoding: (:class:`~tokenizers.Encoding`):
                The encoding returned from the tokenizer

        Returns:
            :obj:`List[CharState]`: A list of CharStates, indicating for each char in the text what
            its state is
        """
        annotation_map = EncodingVisualizer.__make_anno_map(text, annotations)
        # Todo make this a dataclass or named tuple
        char_states: List[CharState] = [CharState(char_ix) for char_ix in range(len(text))]
        for token_ix, token in enumerate(encoding.tokens):
            offsets = encoding.token_to_chars(token_ix)
            if offsets is not None:
                start, end = offsets
                for i in range(start, end):
                    char_states[i].tokens.append(token_ix)
        for char_ix, anno_ix in enumerate(annotation_map):
            char_states[char_ix].anno_ix = anno_ix

        return char_states
|
||||
|
||||
|
||||
def HTMLBody(children: List[str], css_styles=css) -> str:
    """
    Generates the full html with css from a list of html spans

    Args:
        children (:obj:`List[str]`):
            A list of strings, assumed to be html elements

        css_styles (:obj:`str`, `optional`):
            Optional alternative implementation of the css

    Returns:
        :obj:`str`: An HTML string with style markup
    """
    # The children already carry their own markup; just concatenate them.
    children_text = "".join(children)
    # NOTE(review): `dir=auto` lets the browser choose LTR/RTL per the content,
    # which matters for the multilingual examples in the companion notebook.
    return f"""
    <html>
        <head>
            <style>
                {css_styles}
            </style>
        </head>
        <body>
            <div class="tokenized-text" dir=auto>
                {children_text}
            </div>
        </body>
    </html>
    """
|
@ -41,6 +41,7 @@ setup(
|
||||
"tokenizers.processors",
|
||||
"tokenizers.trainers",
|
||||
"tokenizers.implementations",
|
||||
"tokenizers.tools",
|
||||
],
|
||||
package_data={
|
||||
"tokenizers": ["py.typed", "__init__.pyi"],
|
||||
@ -51,6 +52,7 @@ setup(
|
||||
"tokenizers.processors": ["py.typed", "__init__.pyi"],
|
||||
"tokenizers.trainers": ["py.typed", "__init__.pyi"],
|
||||
"tokenizers.implementations": ["py.typed"],
|
||||
"tokenizers.tools": ["py.typed", "visualizer-styles.css"],
|
||||
},
|
||||
zip_safe=False,
|
||||
)
|
||||
|
@ -78,3 +78,13 @@ Trainers
|
||||
|
||||
.. automodule:: tokenizers.trainers
|
||||
:members:
|
||||
|
||||
|
||||
Visualizer
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: tokenizers.tools.Annotation
|
||||
:members:
|
||||
|
||||
.. autoclass:: tokenizers.tools.EncodingVisualizer
|
||||
:members: __call__
|
||||
|
Reference in New Issue
Block a user