From 8916b6bb2712b10df18047d4b315fa229a1e2da9 Mon Sep 17 00:00:00 2001
From: Tal Perry
Date: Fri, 4 Dec 2020 16:25:56 +0100
Subject: [PATCH] Add a visualization utility to render tokens and annotations in a notebook (#508)

* Draft functionality of visualization

* Added comments to make code more intelligible

* polish the styles

* Ensure colors are stable and comment the css

* Code clean up

* Made visualizer importable and added some docs

* Fix styling

* implement comments from PR

* Fixed the regex for UNK tokens and examples in notebook

* Converted docs to google format

* Added a notebook showing multiple languages and tokenizers

* Added visual indication of chars that are tokenized with >1 token

* Reorganize things a bit and fix import

* Update docs

Co-authored-by: Anthony MOI
---
 .gitignore                                    |    3 +-
 .../examples/using_the_visualizer.ipynb       | 1053 +++++++++++++++++
 bindings/python/py_src/tokenizers/__init__.py |    1 +
 .../py_src/tokenizers/tools/__init__.py       |    1 +
 .../tokenizers/tools/visualizer-styles.css    |  170 +++
 .../py_src/tokenizers/tools/visualizer.py     |  412 +++++++
 bindings/python/setup.py                      |    2 +
 docs/source/api/python.inc                    |   10 +
 8 files changed, 1651 insertions(+), 1 deletion(-)
 create mode 100644 bindings/python/examples/using_the_visualizer.ipynb
 create mode 100644 bindings/python/py_src/tokenizers/tools/__init__.py
 create mode 100644 bindings/python/py_src/tokenizers/tools/visualizer-styles.css
 create mode 100644 bindings/python/py_src/tokenizers/tools/visualizer.py

diff --git a/.gitignore b/.gitignore
index 6141202f..f965b105 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,7 +4,7 @@
 .vim
 .env
 target
-
+.idea
 Cargo.lock
 /data
@@ -17,6 +17,7 @@ __pycache__
 pip-wheel-metadata
 *.egg-info
 *.so
+/bindings/python/examples/.ipynb_checkpoints
 /bindings/python/build
 /bindings/python/dist
diff --git a/bindings/python/examples/using_the_visualizer.ipynb b/bindings/python/examples/using_the_visualizer.ipynb
new file mode 100644
index 00000000..2840d2e0
--- /dev/null
+++ b/bindings/python/examples/using_the_visualizer.ipynb
@@ -0,0 +1,1053 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--2020-12-04 09:25:00--  https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt\n",
+      "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n",
+      "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.104.253\n",
+      "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.104.253|:443... connected.\n",
+      "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 231508 (226K) [text/plain]\n", + "Saving to: ‘/tmp/bert-base-uncased-vocab.txt’\n", + "\n", + "/tmp/bert-base-unca 100%[===================>] 226.08K --.-KB/s in 0.06s \n", + "\n", + "2020-12-04 09:25:00 (3.87 MB/s) - ‘/tmp/bert-base-uncased-vocab.txt’ saved [231508/231508]\n", + "\n" + ] + } + ], + "source": [ + "!wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt -O /tmp/bert-base-uncased-vocab.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from tokenizers import BertWordPieceTokenizer\n", + "from tokenizers.tools import EncodingVisualizer\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "EncodingVisualizer.unk_token_regex.search(\"aaa[udsnk]aaa\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "text = \"\"\"Mathias Bynens 'Z͑ͫ̓ͪ̂ͫ̽͏̴̙̤̞͉͚̯̞̠͍A̴̵̜̰͔ͫ͗͢L̠ͨͧͩ͘G̴̻͈͍͔̹̑͗̎̅͛́Ǫ̵̹̻̝̳͂̌̌͘!͖̬̰̙̗̿̋ͥͥ̂ͣ̐́́͜͞': Whenever you’re working on a piece of JavaScript code that deals with strings or regular expressions in some way, just add a unit test that contains a pile of poo (💩) in a string, 💩💩💩💩💩💩💩💩💩💩💩💩 and see if anything breaks. It’s a quick, fun, and easy way to see if your code supports astral symbols. Once you’ve found a Unicode-related bug in your code, all you need to do is apply the techniques discussed in this post to fix it.\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "tokenizer = BertWordPieceTokenizer(\"/tmp/bert-base-uncased-vocab.txt\", lowercase=True)\n", + "visualizer = EncodingVisualizer(tokenizer=tokenizer)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Visualizing Tokens With No Annotations" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " Mathias Bynens 'Z͑ͫ̓ͪ̂ͫ̽͏̴̙̤̞͉͚̯̞̠͍A̴̵̜̰͔ͫ͗͢L̠ͨͧͩ͘G̴̻͈͍͔̹̑͗̎̅͛́Ǫ̵̹̻̝̳͂̌̌͘!͖̬̰̙̗̿̋ͥͥ̂ͣ̐́́͜͞': Whenever youre working on a piece of JavaScript code that deals with strings or regular expressions in some way, just add a unit test that contains a pile of poo (💩) in a string, 💩💩💩💩💩💩💩💩💩💩💩💩 and see if anything breaks. Its a quick, fun, and easy way to see if your code supports astral symbols. Once youve found a Unicode-related bug in your code, all you need to do is apply the techniques discussed in this post to fix it.\n", + "
\n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "visualizer(text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Visualizing Tokens With Aligned Annotations\n", + "First we make some annotations with the Annotation class" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "from tokenizers.tools import Annotation" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "anno1 = Annotation(start=0, end=2, label=\"foo\")\n", + "anno2 = Annotation(start=2, end=4, label=\"bar\")\n", + "anno3 = Annotation(start=6, end=8, label=\"poo\")\n", + "anno4 = Annotation(start=9, end=12, label=\"shoe\")\n", + "annotations=[\n", + " anno1,\n", + " anno2,\n", + " anno3,\n", + " anno4,\n", + " Annotation(start=23, end=30, label=\"random tandem bandem sandem landem fandom\"),\n", + " Annotation(start=63, end=70, label=\"foo\"),\n", + " Annotation(start=80, end=95, label=\"bar\"),\n", + " Annotation(start=120, end=128, label=\"bar\"),\n", + " Annotation(start=152, end=155, label=\"poo\"),\n", + "]\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " Mathias Bynens 'Z͑ͫ̓ͪ̂ͫ̽͏̴̙̤̞͉͚̯̞̠͍A̴̵̜̰͔ͫ͗͢L̠ͨͧͩ͘G̴̻͈͍͔̹̑͗̎̅͛́Ǫ̵̹̻̝̳͂̌̌͘!͖̬̰̙̗̿̋ͥͥ̂ͣ̐́́͜͞': Whenever youre working on a piece of JavaScript code that deals with strings or regular expressions in some way, just add a unit test that contains a pile of poo (💩) in a string, 💩💩💩💩💩💩💩💩💩💩💩💩 and see if anything breaks. Its a quick, fun, and easy way to see if your code supports astral symbols. Once youve found a Unicode-related bug in your code, all you need to do is apply the techniques discussed in this post to fix it.\n", + "
\n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "visualizer(text,annotations=annotations)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using A Custom Annotation Format\n", + "Every system has its own representation of annotations. That's why we can instantiate the EncodingVisualizer with a convertion function." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'startPlace': 0, 'endPlace': 3, 'theTag': '0'},\n", + " {'startPlace': 4, 'endPlace': 7, 'theTag': '4'},\n", + " {'startPlace': 8, 'endPlace': 11, 'theTag': '8'},\n", + " {'startPlace': 12, 'endPlace': 15, 'theTag': '12'},\n", + " {'startPlace': 16, 'endPlace': 19, 'theTag': '16'}]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "funnyAnnotations = [dict(startPlace=i,endPlace=i+3,theTag=str(i)) for i in range(0,20,4)]\n", + "funnyAnnotations" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "converter = lambda funny: Annotation(start=funny['startPlace'], end=funny['endPlace'], label=funny['theTag'])\n", + "visualizer = EncodingVisualizer(tokenizer=tokenizer, default_to_notebook=True, annotation_converter=converter)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " Mathias Bynens 'Z͑ͫ̓ͪ̂ͫ̽͏̴̙̤̞͉͚̯̞̠͍A̴̵̜̰͔ͫ͗͢L̠ͨͧͩ͘G̴̻͈͍͔̹̑͗̎̅͛́Ǫ̵̹̻̝̳͂̌̌͘!͖̬̰̙̗̿̋ͥͥ̂ͣ̐́́͜͞': Whenever youre working on a piece of JavaScript code that deals with strings or regular expressions in some way, just add a unit test that contains a pile of poo (💩) in a string, 💩💩💩💩💩💩💩💩💩💩💩💩 and see if anything breaks. Its a quick, fun, and easy way to see if your code supports astral symbols. Once youve found a Unicode-related bug in your code, all you need to do is apply the techniques discussed in this post to fix it.\n", + "
\n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "visualizer(text, annotations=funnyAnnotations)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Trying with Roberta\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2020-12-04 09:25:00-- https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json\n", + "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", + "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.226.19\n", + "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.226.19|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 898823 (878K) [application/json]\n", + "Saving to: ‘/tmp/roberta-base-vocab.json’\n", + "\n", + "/tmp/roberta-base-v 100%[===================>] 877.76K 4.35MB/s in 0.2s \n", + "\n", + "2020-12-04 09:25:00 (4.35 MB/s) - ‘/tmp/roberta-base-vocab.json’ saved [898823/898823]\n", + "\n", + "--2020-12-04 09:25:00-- https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt\n", + "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", + "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.104.253\n", + "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.104.253|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 456318 (446K) [text/plain]\n", + "Saving to: ‘/tmp/roberta-base-merges.txt’\n", + "\n", + "/tmp/roberta-base-m 100%[===================>] 445.62K --.-KB/s in 0.1s \n", + "\n", + "2020-12-04 09:25:01 (4.04 MB/s) - ‘/tmp/roberta-base-merges.txt’ saved [456318/456318]\n", + "\n" + ] + } + ], + "source": [ + "!wget \"https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json\" -O /tmp/roberta-base-vocab.json\n", + "!wget \"https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt\" -O /tmp/roberta-base-merges.txt\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " Mathias Bynens 'Z͑ͫ̓ͪ̂ͫ̽͏̴̙̤̞͉͚̯̞̠͍A̴̵̜̰͔ͫ͗͢L̠ͨͧͩ͘G̴̻͈͍͔̹̑͗̎̅͛́Ǫ̵̹̻̝̳͂̌̌͘!͖̬̰̙̗̿̋ͥͥ̂ͣ̐́́͜͞': Whenever youre working on a piece of JavaScript code that deals with strings or regular expressions in some way, just add a unit test that contains a pile of poo (💩) in a string, 💩💩💩💩💩💩💩💩💩💩💩💩 and see if anything breaks. Its a quick, fun, and easy way to see if your code supports astral symbols. Once youve found a Unicode-related bug in your code, all you need to do is apply the techniques discussed in this post to fix it.\n", + "
\n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from tokenizers import ByteLevelBPETokenizer\n", + "roberta_tokenizer = ByteLevelBPETokenizer.from_file('/tmp/roberta-base-vocab.json', '/tmp/roberta-base-merges.txt')\n", + "roberta_visualizer = EncodingVisualizer(tokenizer=roberta_tokenizer, default_to_notebook=True)\n", + "roberta_visualizer(text, annotations=annotations)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/bindings/python/py_src/tokenizers/__init__.py b/bindings/python/py_src/tokenizers/__init__.py index 6447bc40..b595f085 100644 --- a/bindings/python/py_src/tokenizers/__init__.py +++ b/bindings/python/py_src/tokenizers/__init__.py @@ -91,6 +91,7 @@ from .tokenizers import normalizers from .tokenizers import pre_tokenizers from .tokenizers import processors from .tokenizers import trainers + from .implementations import ( ByteLevelBPETokenizer, CharBPETokenizer, diff --git a/bindings/python/py_src/tokenizers/tools/__init__.py b/bindings/python/py_src/tokenizers/tools/__init__.py new file mode 100644 index 00000000..7b5511c0 --- /dev/null +++ b/bindings/python/py_src/tokenizers/tools/__init__.py @@ -0,0 +1 @@ +from .visualizer import EncodingVisualizer, Annotation diff --git a/bindings/python/py_src/tokenizers/tools/visualizer-styles.css b/bindings/python/py_src/tokenizers/tools/visualizer-styles.css new file mode 100644 index 00000000..f54fde45 --- /dev/null +++ b/bindings/python/py_src/tokenizers/tools/visualizer-styles.css @@ -0,0 +1,170 @@ +.tokenized-text { + width:100%; + padding:2rem; + max-height: 400px; + overflow-y: auto; + box-sizing:border-box; + line-height:4rem; /* Lots of space between lines */ + font-family: "Roboto Light", "Ubuntu Light", "Ubuntu", monospace; + box-shadow: 2px 2px 2px rgba(0,0,0,0.2); + background-color: rgba(0,0,0,0.01); + letter-spacing:2px; /* Give some extra separation between chars */ +} +.non-token{ + /* White space and other things the tokenizer ignores*/ + white-space: pre; + letter-spacing:4px; + border-top:1px solid #A0A0A0; /* A gentle border on top and bottom makes tabs more ovious*/ + border-bottom:1px solid #A0A0A0; + line-height: 1rem; + height: calc(100% - 2px); +} + +.token { + white-space: pre; + position:relative; + color:black; + letter-spacing:2px; +} + +.annotation{ + white-space:nowrap; /* Important - ensures that annotations appears even if the annotated text wraps a line */ + border-radius:4px; + position:relative; + width:fit-content; +} +.annotation:before { + /*The before holds the text and the after holds the background*/ + z-index:1000; /* Make sure this is above the background */ + content:attr(data-label); /* The annotations label is on a data attribute */ + color:white; + position:absolute; + font-size:1rem; + text-align:center; + font-weight:bold; + + top:1.75rem; + line-height:0; + left:0; + width:100%; + padding:0.5rem 0; + /* These make it so an annotation doesn't stretch beyond the annotated text if the label is longer*/ + overflow: hidden; + white-space: nowrap; + text-overflow:ellipsis; +} + +.annotation:after { + 
content:attr(data-label); /* The content defines the width of the annotation*/ + position:absolute; + font-size:0.75rem; + text-align:center; + font-weight:bold; + text-overflow:ellipsis; + top:1.75rem; + line-height:0; + overflow: hidden; + white-space: nowrap; + + left:0; + width:100%; /* 100% of the parent, which is the annotation whose width is the tokens inside it*/ + + padding:0.5rem 0; + /* Nast hack below: + We set the annotations color in code because we don't know the colors at css time. + But you can't pass a color as a data attribute to get it into the pseudo element (this thing) + So to get around that, annotations have the color set on them with a style attribute and then we + can get the color with currentColor. + Annotations wrap tokens and tokens set the color back to black + */ + background-color: currentColor; +} +.annotation:hover::after, .annotation:hover::before{ + /* When the user hovers over an annotation expand the label to display in full + */ + min-width: fit-content; +} + +.annotation:hover{ + /* Emphasize the annotation start end with a border on hover*/ + border-color: currentColor; + border: 2px solid; +} +.special-token:not(:empty){ + /* + A none empty special token is like UNK (as opposed to CLS which has no representation in the text ) + */ + position:relative; +} +.special-token:empty::before{ + /* Special tokens that don't have text are displayed as pseudo elements so we dont select them with the mouse*/ + content:attr(data-stok); + background:#202020; + font-size:0.75rem; + color:white; + margin: 0 0.25rem; + padding: 0.25rem; + border-radius:4px +} + +.special-token:not(:empty):before { + /* Special tokens that have text (UNK) are displayed above the actual text*/ + content:attr(data-stok); + position:absolute; + bottom:1.75rem; + min-width:100%; + width:100%; + height:1rem; + line-height:1rem; + font-size:1rem; + text-align:center; + color:white; + font-weight:bold; + background:#202020; + border-radius:10%; +} +/* +We want to alternate the color of tokens, but we can't use nth child because tokens might be broken up by annotations +instead we apply even and odd class at generation time and color them that way + */ +.even-token{ + background:#DCDCDC ; + border: 1px solid #DCDCDC; +} +.odd-token{ + background:#A0A0A0; + border: 1px solid #A0A0A0; +} +.even-token.multi-token,.odd-token.multi-token{ + background: repeating-linear-gradient( + 45deg, + transparent, + transparent 1px, + #ccc 1px, + #ccc 1px + ), + /* on "bottom" */ + linear-gradient( + to bottom, + #FFB6C1, + #999 + ); +} + +.multi-token:hover::after { + content:"This char has more than 1 token"; /* The content defines the width of the annotation*/ + color:white; + background-color: black; + position:absolute; + font-size:0.75rem; + text-align:center; + font-weight:bold; + text-overflow:ellipsis; + top:1.75rem; + line-height:0; + overflow: hidden; + white-space: nowrap; + left:0; + width:fit-content; /* 100% of the parent, which is the annotation whose width is the tokens inside it*/ + padding:0.5rem 0; +} diff --git a/bindings/python/py_src/tokenizers/tools/visualizer.py b/bindings/python/py_src/tokenizers/tools/visualizer.py new file mode 100644 index 00000000..4e5ef719 --- /dev/null +++ b/bindings/python/py_src/tokenizers/tools/visualizer.py @@ -0,0 +1,412 @@ +import os +import itertools +import re +from typing import List, Optional, Tuple, Dict, Callable, Any, NamedTuple +from string import Template +from typing import List + +from tokenizers import Tokenizer, Encoding + +dirname = 
os.path.dirname(__file__) +css_filename = os.path.join(dirname, "visualizer-styles.css") +with open(css_filename) as f: + css = f.read() + + +class Annotation: + start: int + end: int + label: int + + def __init__(self, start: int, end: int, label: str): + self.start = start + self.end = end + self.label = label + + +AnnotationList = List[Annotation] +PartialIntList = List[Optional[int]] + + +class CharStateKey(NamedTuple): + token_ix: Optional[int] + anno_ix: Optional[int] + + +class CharState: + char_ix: Optional[int] + + def __init__(self, char_ix): + self.char_ix = char_ix + + self.anno_ix: Optional[int] = None + self.tokens: List[int] = [] + + @property + def token_ix(self): + return self.tokens[0] if len(self.tokens) > 0 else None + + @property + def is_multitoken(self): + """ + BPE tokenizers can output more than one token for a char + """ + return len(self.tokens) > 1 + + def partition_key(self) -> CharStateKey: + return CharStateKey( + token_ix=self.token_ix, + anno_ix=self.anno_ix, + ) + + +class Aligned: + pass + + +class EncodingVisualizer: + """ + Build an EncodingVisualizer + + Args: + + tokenizer (:class:`~tokenizers.Tokenizer`): + A tokenizer instance + + default_to_notebook (:obj:`bool`): + Whether to render html output in a notebook by default + + annotation_converter (:obj:`Callable`, `optional`): + An optional (lambda) function that takes an annotation in any format and returns + an Annotation object + """ + + unk_token_regex = re.compile("(.{1}\b)?(unk|oov)(\b.{1})?", flags=re.IGNORECASE) + + def __init__( + self, + tokenizer: Tokenizer, + default_to_notebook: bool = True, + annotation_converter: Optional[Callable[[Any], Annotation]] = None, + ): + if default_to_notebook: + try: + from IPython.core.display import display, HTML + except ImportError as e: + raise Exception( + """We couldn't import IPython utils for html display. + Are you running in a notebook? + You can also pass `default_to_notebook=False` to get back raw HTML + """ + ) + + self.tokenizer = tokenizer + self.default_to_notebook = default_to_notebook + self.annotation_coverter = annotation_converter + pass + + def __call__( + self, + text: str, + annotations: AnnotationList = [], + default_to_notebook: Optional[bool] = None, + ) -> Optional[str]: + """ + Build a visualization of the given text + + Args: + text (:obj:`str`): + The text to tokenize + + annotations (:obj:`List[Annotation]`, `optional`): + An optional list of annotations of the text. The can either be an annotation class + or anything else if you instantiated the visualizer with a converter function + + default_to_notebook (:obj:`bool`, `optional`, defaults to `False`): + If True, will render the html in a notebook. Otherwise returns an html string. + + Returns: + The HTML string if default_to_notebook is False, otherwise (default) returns None and + renders the HTML in the notebook + + """ + final_default_to_notebook = self.default_to_notebook + if default_to_notebook is not None: + final_default_to_notebook = default_to_notebook + if final_default_to_notebook: + try: + from IPython.core.display import display, HTML + except ImportError as e: + raise Exception( + """We couldn't import IPython utils for html display. 
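+
+    # Illustrative usage (a sketch mirroring the example notebook shipped with this
+    # PR; the vocab path is a placeholder):
+    #
+    #   from tokenizers import BertWordPieceTokenizer
+    #   from tokenizers.tools import EncodingVisualizer
+    #
+    #   tokenizer = BertWordPieceTokenizer("/tmp/bert-base-uncased-vocab.txt", lowercase=True)
+    #   visualizer = EncodingVisualizer(tokenizer=tokenizer)
+    #   visualizer("some text to tokenize")  # renders the HTML inline in a notebook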
+
+    def __call__(
+        self,
+        text: str,
+        annotations: AnnotationList = [],
+        default_to_notebook: Optional[bool] = None,
+    ) -> Optional[str]:
+        """
+        Build a visualization of the given text
+
+        Args:
+            text (:obj:`str`):
+                The text to tokenize
+
+            annotations (:obj:`List[Annotation]`, `optional`):
+                An optional list of annotations of the text. They can either be Annotation
+                instances, or anything else if you instantiated the visualizer with a
+                converter function
+
+            default_to_notebook (:obj:`bool`, `optional`):
+                If provided, overrides the value set at construction time. If True, renders
+                the html in a notebook. Otherwise returns an html string.
+
+        Returns:
+            The HTML string if default_to_notebook is False, otherwise (default) returns None and
+            renders the HTML in the notebook
+
+        """
+        final_default_to_notebook = self.default_to_notebook
+        if default_to_notebook is not None:
+            final_default_to_notebook = default_to_notebook
+        if final_default_to_notebook:
+            try:
+                from IPython.core.display import display, HTML
+            except ImportError as e:
+                raise Exception(
+                    """We couldn't import IPython utils for html display.
+                        Are you running in a notebook?"""
+                )
+        if self.annotation_converter is not None:
+            annotations = list(map(self.annotation_converter, annotations))
+        encoding = self.tokenizer.encode(text)
+        html = EncodingVisualizer.__make_html(text, encoding, annotations)
+        if final_default_to_notebook:
+            display(HTML(html))
+        else:
+            return html
+
+    @staticmethod
+    def calculate_label_colors(annotations: AnnotationList) -> Dict[str, str]:
+        """
+        Generates a color palette for all the labels in a given set of annotations
+
+        Args:
+            annotations (:obj:`List[Annotation]`):
+                A list of annotations
+
+        Returns:
+            :obj:`dict`: A dictionary mapping labels to colors in HSL format
+        """
+        if len(annotations) == 0:
+            return {}
+        labels = set(map(lambda x: x.label, annotations))
+        num_labels = len(labels)
+        h_step = int(255 / num_labels)
+        if h_step < 20:
+            h_step = 20
+        s = 32
+        l = 64
+        h = 10
+        colors = {}
+
+        for label in sorted(
+            labels
+        ):  # sort so we always get the same colors for a given set of labels
+            colors[label] = f"hsl({h},{s}%,{l}%)"
+            h += h_step
+        return colors
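+
+    # Worked example (illustrative): with two labels the hues are spread evenly and
+    # deterministically, e.g.
+    #   calculate_label_colors([Annotation(0, 2, "a"), Annotation(2, 4, "b")])
+    #   == {"a": "hsl(10,32%,64%)", "b": "hsl(137,32%,64%)"}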
+
+    @staticmethod
+    def consecutive_chars_to_html(
+        consecutive_chars_list: List[CharState],
+        text: str,
+        encoding: Encoding,
+    ):
+        """
+        Converts a list of "consecutive chars" into a single HTML element.
+        Chars are consecutive if they fall under the same word, token and annotation.
+        Each CharState has a "partition_key" method that makes it easy to
+        compare if two chars are consecutive.
+
+        Args:
+            consecutive_chars_list (:obj:`List[CharState]`):
+                A list of CharStates that have been grouped together
+
+            text (:obj:`str`):
+                The original text being processed
+
+            encoding (:class:`~tokenizers.Encoding`):
+                The encoding returned from the tokenizer
+
+        Returns:
+            :obj:`str`: The HTML span for a set of consecutive chars
+        """
+        first = consecutive_chars_list[0]
+        if first.char_ix is None:
+            # it's a special token
+            stoken = encoding.tokens[first.token_ix]
+            # special tokens are represented as empty spans. We use the data attribute and css
+            # magic to display it
+            return f'<span class="special-token" data-stok="{stoken}"></span>'
+        # We're not in a special token, so this group has a start and end.
+        last = consecutive_chars_list[-1]
+        start = first.char_ix
+        end = last.char_ix + 1
+        span_text = text[start:end]
+        css_classes = []  # What css classes we will apply on the resulting span
+        data_items = {}  # What data attributes we will apply on the resulting span
+        if first.token_ix is not None:
+            # We can either be in a token or not (e.g. in white space)
+            css_classes.append("token")
+            if first.is_multitoken:
+                css_classes.append("multi-token")
+            if first.token_ix % 2:
+                # We use this to color alternating tokens.
+                # A token might be split by an annotation that ends in the middle of it, so this
+                # lets us visually indicate a consecutive token despite its possible splitting in
+                # the html markup
+                css_classes.append("odd-token")
+            else:
+                # Like above, but a different color so we can see the tokens alternate
+                css_classes.append("even-token")
+            if (
+                EncodingVisualizer.unk_token_regex.search(encoding.tokens[first.token_ix])
+                is not None
+            ):
+                # This is a special token that is in the text, probably UNK
+                css_classes.append("special-token")
+                # TODO: is this the right name for the data attribute?
+                data_items["stok"] = encoding.tokens[first.token_ix]
+        else:
+            # In this case we are looking at a group/single char that is not tokenized,
+            # e.g. white space
+            css_classes.append("non-token")
+        css = f'''class="{' '.join(css_classes)}"'''
+        data = ""
+        for key, val in data_items.items():
+            data += f' data-{key}="{val}"'
+        return f"<span {css}{data}>{span_text}</span>"
+
+    @staticmethod
+    def __make_html(text: str, encoding: Encoding, annotations: AnnotationList) -> str:
+        char_states = EncodingVisualizer.__make_char_states(text, encoding, annotations)
+        current_consecutive_chars = [char_states[0]]
+        prev_anno_ix = char_states[0].anno_ix
+        spans = []
+        label_colors_dict = EncodingVisualizer.calculate_label_colors(annotations)
+        cur_anno_ix = char_states[0].anno_ix
+        if cur_anno_ix is not None:
+            # If we started in an annotation make a span for it
+            anno = annotations[cur_anno_ix]
+            label = anno.label
+            color = label_colors_dict[label]
+            spans.append(f'<span class="annotation" style="color:{color}" data-label="{label}">')
+
+        for cs in char_states[1:]:
+            cur_anno_ix = cs.anno_ix
+            if cur_anno_ix != prev_anno_ix:
+                # If we've transitioned in or out of an annotation
+                spans.append(
+                    # Create a span from the current consecutive characters
+                    EncodingVisualizer.consecutive_chars_to_html(
+                        current_consecutive_chars,
+                        text=text,
+                        encoding=encoding,
+                    )
+                )
+                current_consecutive_chars = [cs]
+
+                if prev_anno_ix is not None:
+                    # if we transitioned out of an annotation, close its span
+                    spans.append("</span>")
+                if cur_anno_ix is not None:
+                    # If we entered a new annotation make a span for it
+                    anno = annotations[cur_anno_ix]
+                    label = anno.label
+                    color = label_colors_dict[label]
+                    spans.append(
+                        f'<span class="annotation" style="color:{color}" data-label="{label}">'
+                    )
+            prev_anno_ix = cur_anno_ix
+
+            if cs.partition_key() == current_consecutive_chars[0].partition_key():
+                # If the current character is in the same "group" as the previous one
+                current_consecutive_chars.append(cs)
+            else:
+                # Otherwise we make a span for the previous group
+                spans.append(
+                    EncodingVisualizer.consecutive_chars_to_html(
+                        current_consecutive_chars,
+                        text=text,
+                        encoding=encoding,
+                    )
+                )
+                # And reset the consecutive_char_list to form a new group
+                current_consecutive_chars = [cs]
+        # All that's left is to fill out the final span
+        # TODO I think there is an edge case here where an annotation's span might not close
+        spans.append(
+            EncodingVisualizer.consecutive_chars_to_html(
+                current_consecutive_chars,
+                text=text,
+                encoding=encoding,
+            )
+        )
+        res = HTMLBody(spans)  # Send the list of spans to the body of our html
+        return res
+
+    @staticmethod
+    def __make_anno_map(text: str, annotations: AnnotationList) -> PartialIntList:
+        """
+        Args:
+            text (:obj:`str`):
+                The raw text we want to align to
+
+            annotations (:obj:`AnnotationList`):
+                A (possibly empty) list of annotations
+
+        Returns:
+            A list of length len(text) whose entry at index i is None if there is no annotation
+            on character i, or k, the index (into the list of annotations) of the annotation
+            that covers character i
+        """
+        annotation_map = [None] * len(text)
+        for anno_ix, a in enumerate(annotations):
+            for i in range(a.start, a.end):
+                annotation_map[i] = anno_ix
+        return annotation_map
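+
+    # Worked example (illustrative): for text "hello" and a single annotation
+    # covering chars 1-2, e.g. [Annotation(start=1, end=3, label="x")], the map is
+    #   [None, 0, 0, None, None]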
+
+    @staticmethod
+    def __make_char_states(
+        text: str, encoding: Encoding, annotations: AnnotationList
+    ) -> List[CharState]:
+        """
+        For each character in the original text, we emit a CharState representing its "state":
+
+        * which token_ix it corresponds to
+        * which annotation_ix it corresponds to
+
+        Args:
+            text (:obj:`str`):
+                The raw text we want to align to
+
+            encoding (:class:`~tokenizers.Encoding`):
+                The encoding returned from the tokenizer
+
+            annotations (:obj:`List[Annotation]`):
+                A (possibly empty) list of annotations
+
+        Returns:
+            :obj:`List[CharState]`: A list of CharStates, indicating for each char in the text
+            what its state is
+        """
+        annotation_map = EncodingVisualizer.__make_anno_map(text, annotations)
+        # TODO: make this a dataclass or named tuple
+        char_states: List[CharState] = [CharState(char_ix) for char_ix in range(len(text))]
+        for token_ix, token in enumerate(encoding.tokens):
+            offsets = encoding.token_to_chars(token_ix)
+            if offsets is not None:
+                start, end = offsets
+                for i in range(start, end):
+                    char_states[i].tokens.append(token_ix)
+        for char_ix, anno_ix in enumerate(annotation_map):
+            char_states[char_ix].anno_ix = anno_ix
+
+        return char_states
+
+
+def HTMLBody(children: List[str], css_styles=css) -> str:
+    """
+    Generates the full html with css from a list of html spans
+
+    Args:
+        children (:obj:`List[str]`):
+            A list of strings, assumed to be html elements
+
+        css_styles (:obj:`str`, `optional`):
+            Optional alternative implementation of the css
+
+    Returns:
+        :obj:`str`: An HTML string with style markup
+    """
+    children_text = "".join(children)
+    return f"""
+    <html>
+        <head>
+            <style>
+                {css_styles}
+            </style>
+        </head>
+        <body>
+            <div class="tokenized-text" dir=auto>
+            {children_text}
+            </div>
+        </body>
+    </html>
+    """
diff --git a/bindings/python/setup.py b/bindings/python/setup.py
index 5c81afab..28154d62 100644
--- a/bindings/python/setup.py
+++ b/bindings/python/setup.py
@@ -41,6 +41,7 @@ setup(
         "tokenizers.processors",
         "tokenizers.trainers",
         "tokenizers.implementations",
+        "tokenizers.tools",
     ],
     package_data={
         "tokenizers": ["py.typed", "__init__.pyi"],
@@ -51,6 +52,7 @@ setup(
         "tokenizers.processors": ["py.typed", "__init__.pyi"],
         "tokenizers.trainers": ["py.typed", "__init__.pyi"],
         "tokenizers.implementations": ["py.typed"],
+        "tokenizers.tools": ["py.typed", "visualizer-styles.css"],
     },
     zip_safe=False,
 )
diff --git a/docs/source/api/python.inc b/docs/source/api/python.inc
index ee4ed8f1..7a90d78b 100644
--- a/docs/source/api/python.inc
+++ b/docs/source/api/python.inc
@@ -78,3 +78,13 @@ Trainers
 
 .. automodule:: tokenizers.trainers
     :members:
+
+
+Visualizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: tokenizers.tools.Annotation
+    :members:
+
+.. autoclass:: tokenizers.tools.EncodingVisualizer
+    :members: __call__
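+
+For example, a minimal usage sketch (the vocab file path is a placeholder):
+
+.. code-block:: python
+
+    from tokenizers import BertWordPieceTokenizer
+    from tokenizers.tools import EncodingVisualizer, Annotation
+
+    tokenizer = BertWordPieceTokenizer("bert-base-uncased-vocab.txt", lowercase=True)
+    visualizer = EncodingVisualizer(tokenizer=tokenizer)
+    annotations = [Annotation(start=0, end=5, label="greeting")]
+    visualizer("Hello, world!", annotations=annotations)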