diff --git a/.gitignore b/.gitignore
index 6141202f..f965b105 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,7 +4,7 @@
.vim
.env
target
-
+.idea
Cargo.lock
/data
@@ -17,6 +17,7 @@ __pycache__
pip-wheel-metadata
*.egg-info
*.so
+/bindings/python/examples/.ipynb_checkpoints
/bindings/python/build
/bindings/python/dist
diff --git a/bindings/python/examples/using_the_visualizer.ipynb b/bindings/python/examples/using_the_visualizer.ipynb
new file mode 100644
index 00000000..2840d2e0
--- /dev/null
+++ b/bindings/python/examples/using_the_visualizer.ipynb
@@ -0,0 +1,1053 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "--2020-12-04 09:25:00-- https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt\n",
+ "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n",
+ "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.104.253\n",
+ "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.104.253|:443... connected.\n",
+ "HTTP request sent, awaiting response... 200 OK\n",
+ "Length: 231508 (226K) [text/plain]\n",
+ "Saving to: ‘/tmp/bert-base-uncased-vocab.txt’\n",
+ "\n",
+ "/tmp/bert-base-unca 100%[===================>] 226.08K --.-KB/s in 0.06s \n",
+ "\n",
+ "2020-12-04 09:25:00 (3.87 MB/s) - ‘/tmp/bert-base-uncased-vocab.txt’ saved [231508/231508]\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "!wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt -O /tmp/bert-base-uncased-vocab.txt"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from tokenizers import BertWordPieceTokenizer\n",
+ "from tokenizers.tools import EncodingVisualizer\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "EncodingVisualizer.unk_token_regex.search(\"aaa[udsnk]aaa\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "text = \"\"\"Mathias Bynens 'Z͑ͫ̓ͪ̂ͫ̽͏̴̙̤̞͉͚̯̞̠͍A̴̵̜̰͔ͫ͗͢L̠ͨͧͩ͘G̴̻͈͍͔̹̑͗̎̅͛́Ǫ̵̹̻̝̳͂̌̌͘!͖̬̰̙̗̿̋ͥͥ̂ͣ̐́́͜͞': Whenever you’re working on a piece of JavaScript code that deals with strings or regular expressions in some way, just add a unit test that contains a pile of poo (💩) in a string, 💩💩💩💩💩💩💩💩💩💩💩💩 and see if anything breaks. It’s a quick, fun, and easy way to see if your code supports astral symbols. Once you’ve found a Unicode-related bug in your code, all you need to do is apply the techniques discussed in this post to fix it.\"\"\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tokenizer = BertWordPieceTokenizer(\"/tmp/bert-base-uncased-vocab.txt\", lowercase=True)\n",
+ "visualizer = EncodingVisualizer(tokenizer=tokenizer)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Visualizing Tokens With No Annotations"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " Mathias Bynens 'Z͑ͫ̓ͪ̂ͫ̽͏̴̙̤̞͉͚̯̞̠͍A̴̵̜̰͔ͫ͗͢L̠ͨͧͩ͘G̴̻͈͍͔̹̑͗̎̅͛́Ǫ̵̹̻̝̳͂̌̌͘!͖̬̰̙̗̿̋ͥͥ̂ͣ̐́́͜͞': Whenever you’re working on a piece of JavaScript code that deals with strings or regular expressions in some way, just add a unit test that contains a pile of poo (💩) in a string, 💩💩💩💩💩💩💩💩💩💩💩💩 and see if anything breaks. It’s a quick, fun, and easy way to see if your code supports astral symbols. Once you’ve found a Unicode-related bug in your code, all you need to do is apply the techniques discussed in this post to fix it.\n",
+ "
\n",
+ " \n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "visualizer(text)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Visualizing Tokens With Aligned Annotations\n",
+ "First we make some annotations with the Annotation class"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from tokenizers.tools import Annotation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "anno1 = Annotation(start=0, end=2, label=\"foo\")\n",
+ "anno2 = Annotation(start=2, end=4, label=\"bar\")\n",
+ "anno3 = Annotation(start=6, end=8, label=\"poo\")\n",
+ "anno4 = Annotation(start=9, end=12, label=\"shoe\")\n",
+ "annotations=[\n",
+ " anno1,\n",
+ " anno2,\n",
+ " anno3,\n",
+ " anno4,\n",
+ " Annotation(start=23, end=30, label=\"random tandem bandem sandem landem fandom\"),\n",
+ " Annotation(start=63, end=70, label=\"foo\"),\n",
+ " Annotation(start=80, end=95, label=\"bar\"),\n",
+ " Annotation(start=120, end=128, label=\"bar\"),\n",
+ " Annotation(start=152, end=155, label=\"poo\"),\n",
+ "]\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " Mathias Bynens 'Z͑ͫ̓ͪ̂ͫ̽͏̴̙̤̞͉͚̯̞̠͍A̴̵̜̰͔ͫ͗͢L̠ͨͧͩ͘G̴̻͈͍͔̹̑͗̎̅͛́Ǫ̵̹̻̝̳͂̌̌͘!͖̬̰̙̗̿̋ͥͥ̂ͣ̐́́͜͞': Whenever you’re working on a piece of JavaScript code that deals with strings or regular expressions in some way, just add a unit test that contains a pile of poo (💩) in a string, 💩💩💩💩💩💩💩💩💩💩💩💩 and see if anything breaks. It’s a quick, fun, and easy way to see if your code supports astral symbols. Once you’ve found a Unicode-related bug in your code, all you need to do is apply the techniques discussed in this post to fix it.\n",
+ "
\n",
+ " \n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "visualizer(text,annotations=annotations)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Using A Custom Annotation Format\n",
+ "Every system has its own representation of annotations. That's why we can instantiate the EncodingVisualizer with a convertion function."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[{'startPlace': 0, 'endPlace': 3, 'theTag': '0'},\n",
+ " {'startPlace': 4, 'endPlace': 7, 'theTag': '4'},\n",
+ " {'startPlace': 8, 'endPlace': 11, 'theTag': '8'},\n",
+ " {'startPlace': 12, 'endPlace': 15, 'theTag': '12'},\n",
+ " {'startPlace': 16, 'endPlace': 19, 'theTag': '16'}]"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "funnyAnnotations = [dict(startPlace=i,endPlace=i+3,theTag=str(i)) for i in range(0,20,4)]\n",
+ "funnyAnnotations"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "converter = lambda funny: Annotation(start=funny['startPlace'], end=funny['endPlace'], label=funny['theTag'])\n",
+ "visualizer = EncodingVisualizer(tokenizer=tokenizer, default_to_notebook=True, annotation_converter=converter)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " Mathias Bynens 'Z͑ͫ̓ͪ̂ͫ̽͏̴̙̤̞͉͚̯̞̠͍A̴̵̜̰͔ͫ͗͢L̠ͨͧͩ͘G̴̻͈͍͔̹̑͗̎̅͛́Ǫ̵̹̻̝̳͂̌̌͘!͖̬̰̙̗̿̋ͥͥ̂ͣ̐́́͜͞': Whenever you’re working on a piece of JavaScript code that deals with strings or regular expressions in some way, just add a unit test that contains a pile of poo (💩) in a string, 💩💩💩💩💩💩💩💩💩💩💩💩 and see if anything breaks. It’s a quick, fun, and easy way to see if your code supports astral symbols. Once you’ve found a Unicode-related bug in your code, all you need to do is apply the techniques discussed in this post to fix it.\n",
+ "
\n",
+ " \n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "visualizer(text, annotations=funnyAnnotations)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Trying with Roberta\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "--2020-12-04 09:25:00-- https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json\n",
+ "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n",
+ "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.226.19\n",
+ "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.226.19|:443... connected.\n",
+ "HTTP request sent, awaiting response... 200 OK\n",
+ "Length: 898823 (878K) [application/json]\n",
+ "Saving to: ‘/tmp/roberta-base-vocab.json’\n",
+ "\n",
+ "/tmp/roberta-base-v 100%[===================>] 877.76K 4.35MB/s in 0.2s \n",
+ "\n",
+ "2020-12-04 09:25:00 (4.35 MB/s) - ‘/tmp/roberta-base-vocab.json’ saved [898823/898823]\n",
+ "\n",
+ "--2020-12-04 09:25:00-- https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt\n",
+ "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n",
+ "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.104.253\n",
+ "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.104.253|:443... connected.\n",
+ "HTTP request sent, awaiting response... 200 OK\n",
+ "Length: 456318 (446K) [text/plain]\n",
+ "Saving to: ‘/tmp/roberta-base-merges.txt’\n",
+ "\n",
+ "/tmp/roberta-base-m 100%[===================>] 445.62K --.-KB/s in 0.1s \n",
+ "\n",
+ "2020-12-04 09:25:01 (4.04 MB/s) - ‘/tmp/roberta-base-merges.txt’ saved [456318/456318]\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "!wget \"https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json\" -O /tmp/roberta-base-vocab.json\n",
+ "!wget \"https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt\" -O /tmp/roberta-base-merges.txt\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " Mathias Bynens 'Z͑ͫ̓ͪ̂ͫ̽͏̴̙̤̞͉͚̯̞̠͍A̴̵̜̰͔ͫ͗͢L̠ͨͧͩ͘G̴̻͈͍͔̹̑͗̎̅͛́Ǫ̵̹̻̝̳͂̌̌͘!͖̬̰̙̗̿̋ͥͥ̂ͣ̐́́͜͞': Whenever you’re working on a piece of JavaScript code that deals with strings or regular expressions in some way, just add a unit test that contains a pile of poo (💩) in a string, 💩💩💩💩💩💩💩💩💩💩💩💩 and see if anything breaks. It’s a quick, fun, and easy way to see if your code supports astral symbols. Once you’ve found a Unicode-related bug in your code, all you need to do is apply the techniques discussed in this post to fix it.\n",
+ "
\n",
+ " \n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "from tokenizers import ByteLevelBPETokenizer\n",
+ "roberta_tokenizer = ByteLevelBPETokenizer.from_file('/tmp/roberta-base-vocab.json', '/tmp/roberta-base-merges.txt')\n",
+ "roberta_visualizer = EncodingVisualizer(tokenizer=roberta_tokenizer, default_to_notebook=True)\n",
+ "roberta_visualizer(text, annotations=annotations)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/bindings/python/py_src/tokenizers/__init__.py b/bindings/python/py_src/tokenizers/__init__.py
index 6447bc40..b595f085 100644
--- a/bindings/python/py_src/tokenizers/__init__.py
+++ b/bindings/python/py_src/tokenizers/__init__.py
@@ -91,6 +91,7 @@ from .tokenizers import normalizers
from .tokenizers import pre_tokenizers
from .tokenizers import processors
from .tokenizers import trainers
+
from .implementations import (
ByteLevelBPETokenizer,
CharBPETokenizer,
diff --git a/bindings/python/py_src/tokenizers/tools/__init__.py b/bindings/python/py_src/tokenizers/tools/__init__.py
new file mode 100644
index 00000000..7b5511c0
--- /dev/null
+++ b/bindings/python/py_src/tokenizers/tools/__init__.py
@@ -0,0 +1 @@
+from .visualizer import EncodingVisualizer, Annotation
diff --git a/bindings/python/py_src/tokenizers/tools/visualizer-styles.css b/bindings/python/py_src/tokenizers/tools/visualizer-styles.css
new file mode 100644
index 00000000..f54fde45
--- /dev/null
+++ b/bindings/python/py_src/tokenizers/tools/visualizer-styles.css
@@ -0,0 +1,170 @@
+.tokenized-text {
+ width:100%;
+ padding:2rem;
+ max-height: 400px;
+ overflow-y: auto;
+ box-sizing:border-box;
+ line-height:4rem; /* Lots of space between lines */
+ font-family: "Roboto Light", "Ubuntu Light", "Ubuntu", monospace;
+ box-shadow: 2px 2px 2px rgba(0,0,0,0.2);
+ background-color: rgba(0,0,0,0.01);
+ letter-spacing:2px; /* Give some extra separation between chars */
+}
+.non-token{
+ /* White space and other things the tokenizer ignores*/
+ white-space: pre;
+ letter-spacing:4px;
+    border-top:1px solid #A0A0A0; /* A gentle border on top and bottom makes tabs more obvious */
+ border-bottom:1px solid #A0A0A0;
+ line-height: 1rem;
+ height: calc(100% - 2px);
+}
+
+.token {
+ white-space: pre;
+ position:relative;
+ color:black;
+ letter-spacing:2px;
+}
+
+.annotation{
+    white-space:nowrap; /* Important - ensures that annotations appear even if the annotated text wraps a line */
+ border-radius:4px;
+ position:relative;
+ width:fit-content;
+}
+.annotation:before {
+ /*The before holds the text and the after holds the background*/
+ z-index:1000; /* Make sure this is above the background */
+    content:attr(data-label); /* The annotation's label is on a data attribute */
+ color:white;
+ position:absolute;
+ font-size:1rem;
+ text-align:center;
+ font-weight:bold;
+
+ top:1.75rem;
+ line-height:0;
+ left:0;
+ width:100%;
+ padding:0.5rem 0;
+ /* These make it so an annotation doesn't stretch beyond the annotated text if the label is longer*/
+ overflow: hidden;
+ white-space: nowrap;
+ text-overflow:ellipsis;
+}
+
+.annotation:after {
+ content:attr(data-label); /* The content defines the width of the annotation*/
+ position:absolute;
+ font-size:0.75rem;
+ text-align:center;
+ font-weight:bold;
+ text-overflow:ellipsis;
+ top:1.75rem;
+ line-height:0;
+ overflow: hidden;
+ white-space: nowrap;
+
+ left:0;
+ width:100%; /* 100% of the parent, which is the annotation whose width is the tokens inside it*/
+
+ padding:0.5rem 0;
+    /* Nasty hack below:
+    We set the annotation's color in code because we don't know the colors at css time.
+    But you can't pass a color through a data attribute into the pseudo element (this thing),
+    so to get around that, annotations have their color set on them with a style attribute,
+    and then we can get at the color here with currentColor.
+    Annotations wrap tokens, and tokens set the color back to black.
+    */
+ background-color: currentColor;
+}
+.annotation:hover::after, .annotation:hover::before{
+ /* When the user hovers over an annotation expand the label to display in full
+ */
+ min-width: fit-content;
+}
+
+.annotation:hover{
+    /* Emphasize the annotation's start and end with a border on hover */
+ border-color: currentColor;
+ border: 2px solid;
+}
+.special-token:not(:empty){
+ /*
+    A non-empty special token is one like UNK (as opposed to CLS, which has no representation in the text)
+ */
+ position:relative;
+}
+.special-token:empty::before{
+    /* Special tokens that don't have text are displayed as pseudo elements so we don't select them with the mouse */
+ content:attr(data-stok);
+ background:#202020;
+ font-size:0.75rem;
+ color:white;
+ margin: 0 0.25rem;
+ padding: 0.25rem;
+ border-radius:4px
+}
+
+.special-token:not(:empty):before {
+ /* Special tokens that have text (UNK) are displayed above the actual text*/
+ content:attr(data-stok);
+ position:absolute;
+ bottom:1.75rem;
+ min-width:100%;
+ width:100%;
+ height:1rem;
+ line-height:1rem;
+ font-size:1rem;
+ text-align:center;
+ color:white;
+ font-weight:bold;
+ background:#202020;
+ border-radius:10%;
+}
+/*
+We want to alternate the color of tokens, but we can't use nth child because tokens might be broken up by annotations
+instead we apply even and odd class at generation time and color them that way
+ */
+.even-token{
+ background:#DCDCDC ;
+ border: 1px solid #DCDCDC;
+}
+.odd-token{
+ background:#A0A0A0;
+ border: 1px solid #A0A0A0;
+}
+.even-token.multi-token,.odd-token.multi-token{
+ background: repeating-linear-gradient(
+ 45deg,
+ transparent,
+ transparent 1px,
+ #ccc 1px,
+ #ccc 1px
+ ),
+ /* on "bottom" */
+ linear-gradient(
+ to bottom,
+ #FFB6C1,
+ #999
+ );
+}
+
+.multi-token:hover::after {
+ content:"This char has more than 1 token"; /* The content defines the width of the annotation*/
+ color:white;
+ background-color: black;
+ position:absolute;
+ font-size:0.75rem;
+ text-align:center;
+ font-weight:bold;
+ text-overflow:ellipsis;
+ top:1.75rem;
+ line-height:0;
+ overflow: hidden;
+ white-space: nowrap;
+ left:0;
+    width:fit-content; /* Size the tooltip to its text */
+ padding:0.5rem 0;
+}
diff --git a/bindings/python/py_src/tokenizers/tools/visualizer.py b/bindings/python/py_src/tokenizers/tools/visualizer.py
new file mode 100644
index 00000000..4e5ef719
--- /dev/null
+++ b/bindings/python/py_src/tokenizers/tools/visualizer.py
@@ -0,0 +1,412 @@
+import os
+import re
+from typing import Any, Callable, Dict, List, NamedTuple, Optional
+
+from tokenizers import Tokenizer, Encoding
+
+dirname = os.path.dirname(__file__)
+css_filename = os.path.join(dirname, "visualizer-styles.css")
+with open(css_filename) as f:
+ css = f.read()
+
+
+class Annotation:
+ start: int
+ end: int
+    label: str
+
+ def __init__(self, start: int, end: int, label: str):
+ self.start = start
+ self.end = end
+ self.label = label
+
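+# Example (mirrors the example notebook): Annotation(start=0, end=2, label="foo")
+# labels the character span [0, 2) as "foo"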
+
+AnnotationList = List[Annotation]
+PartialIntList = List[Optional[int]]
+
+
+class CharStateKey(NamedTuple):
+ token_ix: Optional[int]
+ anno_ix: Optional[int]
+
+
+class CharState:
+ char_ix: Optional[int]
+
+ def __init__(self, char_ix):
+ self.char_ix = char_ix
+
+ self.anno_ix: Optional[int] = None
+ self.tokens: List[int] = []
+
+ @property
+ def token_ix(self):
+ return self.tokens[0] if len(self.tokens) > 0 else None
+
+ @property
+ def is_multitoken(self):
+ """
+ BPE tokenizers can output more than one token for a char
+ """
+ return len(self.tokens) > 1
+
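+    # Chars with equal partition keys (same leading token and same annotation) are merged
+    # into a single span when rendering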
+ def partition_key(self) -> CharStateKey:
+ return CharStateKey(
+ token_ix=self.token_ix,
+ anno_ix=self.anno_ix,
+ )
+
+
+class Aligned:
+ pass
+
+
+class EncodingVisualizer:
+ """
+ Build an EncodingVisualizer
+
+ Args:
+
+ tokenizer (:class:`~tokenizers.Tokenizer`):
+ A tokenizer instance
+
+ default_to_notebook (:obj:`bool`):
+ Whether to render html output in a notebook by default
+
+ annotation_converter (:obj:`Callable`, `optional`):
+ An optional (lambda) function that takes an annotation in any format and returns
+ an Annotation object
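+
+    Example (a minimal sketch, mirroring the example notebook above; assumes the vocab
+    file has already been downloaded)::
+
+        from tokenizers import BertWordPieceTokenizer
+
+        tokenizer = BertWordPieceTokenizer("/tmp/bert-base-uncased-vocab.txt", lowercase=True)
+        visualizer = EncodingVisualizer(tokenizer=tokenizer)
+        visualizer("Some text to visualize")  # renders HTML in the notebook by default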
+ """
+
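+    # Heuristic for spotting unknown-token markers: any "unk"/"oov" (case-insensitive),
+    # optionally capturing one boundary-adjacent char on each side (e.g. "[UNK]", "<oov>")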
+    unk_token_regex = re.compile(r"(.{1}\b)?(unk|oov)(\b.{1})?", flags=re.IGNORECASE)
+
+ def __init__(
+ self,
+ tokenizer: Tokenizer,
+ default_to_notebook: bool = True,
+ annotation_converter: Optional[Callable[[Any], Annotation]] = None,
+ ):
+ if default_to_notebook:
+ try:
+ from IPython.core.display import display, HTML
+ except ImportError as e:
+ raise Exception(
+ """We couldn't import IPython utils for html display.
+ Are you running in a notebook?
+ You can also pass `default_to_notebook=False` to get back raw HTML
+ """
+ )
+
+ self.tokenizer = tokenizer
+ self.default_to_notebook = default_to_notebook
+        self.annotation_converter = annotation_converter
+
+ def __call__(
+ self,
+ text: str,
+ annotations: AnnotationList = [],
+ default_to_notebook: Optional[bool] = None,
+ ) -> Optional[str]:
+ """
+ Build a visualization of the given text
+
+ Args:
+ text (:obj:`str`):
+ The text to tokenize
+
+ annotations (:obj:`List[Annotation]`, `optional`):
+                An optional list of annotations of the text. They can either be Annotation
+                objects, or anything else if you instantiated the visualizer with a converter function
+
+            default_to_notebook (:obj:`bool`, `optional`):
+                If provided, overrides the value set at instantiation. If True, renders the html
+                in a notebook. Otherwise returns an html string.
+
+ Returns:
+ The HTML string if default_to_notebook is False, otherwise (default) returns None and
+ renders the HTML in the notebook
+
+ """
+ final_default_to_notebook = self.default_to_notebook
+ if default_to_notebook is not None:
+ final_default_to_notebook = default_to_notebook
+ if final_default_to_notebook:
+ try:
+ from IPython.core.display import display, HTML
+ except ImportError as e:
+ raise Exception(
+ """We couldn't import IPython utils for html display.
+ Are you running in a notebook?"""
+ )
+        if self.annotation_converter is not None:
+            annotations = list(map(self.annotation_converter, annotations))
+ encoding = self.tokenizer.encode(text)
+ html = EncodingVisualizer.__make_html(text, encoding, annotations)
+ if final_default_to_notebook:
+ display(HTML(html))
+ else:
+ return html
+
+ @staticmethod
+ def calculate_label_colors(annotations: AnnotationList) -> Dict[str, str]:
+ """
+ Generates a color palette for all the labels in a given set of annotations
+
+ Args:
+            annotations (:obj:`List[Annotation]`):
+ A list of annotations
+
+ Returns:
+ :obj:`dict`: A dictionary mapping labels to colors in HSL format
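+
+        Example (illustrative; values follow from the constants below)::
+
+            # For labels {"bar", "foo"}: h_step = int(255 / 2) = 127, giving
+            # {"bar": "hsl(10,32%,64%)", "foo": "hsl(137,32%,64%)"}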
+ """
+ if len(annotations) == 0:
+ return {}
+ labels = set(map(lambda x: x.label, annotations))
+ num_labels = len(labels)
+ h_step = int(255 / num_labels)
+ if h_step < 20:
+ h_step = 20
+ s = 32
+ l = 64
+ h = 10
+ colors = {}
+
+        # Sort so we always get the same colors for a given set of labels
+        for label in sorted(labels):
+            colors[label] = f"hsl({h},{s}%,{l}%)"
+ h += h_step
+ return colors
+
+ @staticmethod
+ def consecutive_chars_to_html(
+ consecutive_chars_list: List[CharState],
+ text: str,
+ encoding: Encoding,
+ ):
+ """
+ Converts a list of "consecutive chars" into a single HTML element.
+ Chars are consecutive if they fall under the same word, token and annotation.
+        The CharState class has a "partition_key" method (returning a CharStateKey named tuple)
+        that makes it easy to compare whether two chars are consecutive.
+
+ Args:
+ consecutive_chars_list (:obj:`List[CharState]`):
+ A list of CharStates that have been grouped together
+
+ text (:obj:`str`):
+ The original text being processed
+
+ encoding (:class:`~tokenizers.Encoding`):
+ The encoding returned from the tokenizer
+
+ Returns:
+ :obj:`str`: The HTML span for a set of consecutive chars
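+
+        Example (illustrative)::
+
+            # A group of chars forming a single, even-indexed token renders roughly as
+            # <span class="token even-token" >Mathias</span>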
+ """
+ first = consecutive_chars_list[0]
+        if first.char_ix is None:
+            # it's a special token
+            stoken = encoding.tokens[first.token_ix]
+            # special tokens are represented as empty spans. We use the data attribute and css
+            # magic to display it
+            return f'<span class="special-token" data-stok="{stoken}"></span>'
+ # We're not in a special token so this group has a start and end.
+ last = consecutive_chars_list[-1]
+ start = first.char_ix
+ end = last.char_ix + 1
+ span_text = text[start:end]
+ css_classes = [] # What css classes will we apply on the resulting span
+ data_items = {} # What data attributes will we apply on the result span
+ if first.token_ix is not None:
+ # We can either be in a token or not (e.g. in white space)
+ css_classes.append("token")
+ if first.is_multitoken:
+ css_classes.append("multi-token")
+ if first.token_ix % 2:
+ # We use this to color alternating tokens.
+ # A token might be split by an annotation that ends in the middle of it, so this
+ # lets us visually indicate a consecutive token despite its possible splitting in
+ # the html markup
+ css_classes.append("odd-token")
+ else:
+ # Like above, but a different color so we can see the tokens alternate
+ css_classes.append("even-token")
+ if (
+ EncodingVisualizer.unk_token_regex.search(encoding.tokens[first.token_ix])
+ is not None
+ ):
+                # This is a special token that is in the text, probably UNK
+ css_classes.append("special-token")
+ # TODO is this the right name for the data attribute ?
+ data_items["stok"] = encoding.tokens[first.token_ix]
+ else:
+ # In this case we are looking at a group/single char that is not tokenized.
+ # e.g. white space
+ css_classes.append("non-token")
+ css = f'''class="{' '.join(css_classes)}"'''
+ data = ""
+ for key, val in data_items.items():
+ data += f' data-{key}="{val}"'
+ return f"{span_text}"
+
+ @staticmethod
+ def __make_html(text: str, encoding: Encoding, annotations: AnnotationList) -> str:
+ char_states = EncodingVisualizer.__make_char_states(text, encoding, annotations)
+ current_consecutive_chars = [char_states[0]]
+ prev_anno_ix = char_states[0].anno_ix
+ spans = []
+ label_colors_dict = EncodingVisualizer.calculate_label_colors(annotations)
+ cur_anno_ix = char_states[0].anno_ix
+ if cur_anno_ix is not None:
+ # If we started in an annotation make a span for it
+ anno = annotations[cur_anno_ix]
+ label = anno.label
+ color = label_colors_dict[label]
+            spans.append(f'<span class="annotation" style="color:{color}" data-label="{label}">')
+
+ for cs in char_states[1:]:
+ cur_anno_ix = cs.anno_ix
+ if cur_anno_ix != prev_anno_ix:
+ # If we've transitioned in or out of an annotation
+ spans.append(
+ # Create a span from the current consecutive characters
+ EncodingVisualizer.consecutive_chars_to_html(
+ current_consecutive_chars,
+ text=text,
+ encoding=encoding,
+ )
+ )
+ current_consecutive_chars = [cs]
+
+                if prev_anno_ix is not None:
+                    # if we transitioned out of an annotation, close its span
+                    spans.append("</span>")
+ if cur_anno_ix is not None:
+ # If we entered a new annotation make a span for it
+ anno = annotations[cur_anno_ix]
+ label = anno.label
+ color = label_colors_dict[label]
+                    spans.append(
+                        f'<span class="annotation" style="color:{color}" data-label="{label}">'
+                    )
+ prev_anno_ix = cur_anno_ix
+
+ if cs.partition_key() == current_consecutive_chars[0].partition_key():
+                # If the current character is in the same "group" as the previous one
+ current_consecutive_chars.append(cs)
+ else:
+ # Otherwise we make a span for the previous group
+ spans.append(
+ EncodingVisualizer.consecutive_chars_to_html(
+ current_consecutive_chars,
+ text=text,
+ encoding=encoding,
+ )
+ )
+                # And reset the consecutive_char_list to form a new group
+ current_consecutive_chars = [cs]
+ # All that's left is to fill out the final span
+ # TODO I think there is an edge case here where an annotation's span might not close
+ spans.append(
+ EncodingVisualizer.consecutive_chars_to_html(
+ current_consecutive_chars,
+ text=text,
+ encoding=encoding,
+ )
+ )
+ res = HTMLBody(spans) # Send the list of spans to the body of our html
+ return res
+
+ @staticmethod
+ def __make_anno_map(text: str, annotations: AnnotationList) -> PartialIntList:
+ """
+ Args:
+ text (:obj:`str`):
+ The raw text we want to align to
+
+ annotations (:obj:`AnnotationList`):
+ A (possibly empty) list of annotations
+
+ Returns:
+            A list of length len(text) whose entry at index i is None if there is no annotation
+            on character i, or k, where k is the index (within the annotations list) of the
+            annotation that covers character i
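+
+        Example (illustrative)::
+
+            # text = "abcde" with annotations = [Annotation(start=1, end=3, label="x")] gives
+            # [None, 0, 0, None, None]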
+ """
+ annotation_map = [None] * len(text)
+ for anno_ix, a in enumerate(annotations):
+ for i in range(a.start, a.end):
+ annotation_map[i] = anno_ix
+ return annotation_map
+
+ @staticmethod
+ def __make_char_states(
+ text: str, encoding: Encoding, annotations: AnnotationList
+ ) -> List[CharState]:
+ """
+        For each character in the original text, we emit a CharState representing its "state":
+
+        * which token_ix (or token_ixs, when one char yields several tokens) it corresponds to
+        * which anno_ix, if any, it corresponds to
+
+ Args:
+ text (:obj:`str`):
+ The raw text we want to align to
+
+ annotations (:obj:`List[Annotation]`):
+ A (possibly empty) list of annotations
+
+ encoding: (:class:`~tokenizers.Encoding`):
+ The encoding returned from the tokenizer
+
+ Returns:
+ :obj:`List[CharState]`: A list of CharStates, indicating for each char in the text what
+            its state is
+ """
+ annotation_map = EncodingVisualizer.__make_anno_map(text, annotations)
+ # Todo make this a dataclass or named tuple
+ char_states: List[CharState] = [CharState(char_ix) for char_ix in range(len(text))]
+ for token_ix, token in enumerate(encoding.tokens):
+ offsets = encoding.token_to_chars(token_ix)
+ if offsets is not None:
+ start, end = offsets
+ for i in range(start, end):
+ char_states[i].tokens.append(token_ix)
+ for char_ix, anno_ix in enumerate(annotation_map):
+ char_states[char_ix].anno_ix = anno_ix
+
+ return char_states
+
+
+def HTMLBody(children: List[str], css_styles=css) -> str:
+ """
+ Generates the full html with css from a list of html spans
+
+ Args:
+ children (:obj:`List[str]`):
+ A list of strings, assumed to be html elements
+
+ css_styles (:obj:`str`, `optional`):
+ Optional alternative implementation of the css
+
+ Returns:
+ :obj:`str`: An HTML string with style markup
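+
+    Example (a minimal sketch)::
+
+        html = HTMLBody(['<span class="token even-token" >hello</span>'])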
+ """
+ children_text = "".join(children)
+ return f"""
+
+
+
+
+
+
+ {children_text}
+
+
+
+ """
diff --git a/bindings/python/setup.py b/bindings/python/setup.py
index 5c81afab..28154d62 100644
--- a/bindings/python/setup.py
+++ b/bindings/python/setup.py
@@ -41,6 +41,7 @@ setup(
"tokenizers.processors",
"tokenizers.trainers",
"tokenizers.implementations",
+ "tokenizers.tools",
],
package_data={
"tokenizers": ["py.typed", "__init__.pyi"],
@@ -51,6 +52,7 @@ setup(
"tokenizers.processors": ["py.typed", "__init__.pyi"],
"tokenizers.trainers": ["py.typed", "__init__.pyi"],
"tokenizers.implementations": ["py.typed"],
+ "tokenizers.tools": ["py.typed", "visualizer-styles.css"],
},
zip_safe=False,
)
diff --git a/docs/source/api/python.inc b/docs/source/api/python.inc
index ee4ed8f1..7a90d78b 100644
--- a/docs/source/api/python.inc
+++ b/docs/source/api/python.inc
@@ -78,3 +78,13 @@ Trainers
.. automodule:: tokenizers.trainers
:members:
+
+
+Visualizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: tokenizers.tools.Annotation
+ :members:
+
+.. autoclass:: tokenizers.tools.EncodingVisualizer
+ :members: __call__