From 8916b6bb2712b10df18047d4b315fa229a1e2da9 Mon Sep 17 00:00:00 2001
From: Tal Perry
Date: Fri, 4 Dec 2020 16:25:56 +0100
Subject: [PATCH] Add a visualization utility to render tokens and annotations in a notebook (#508)

* Draft functionality of visualization

* Added comments to make code more intelligible

* polish the styles

* Ensure colors are stable and comment the css

* Code clean up

* Made visualizer importable and added some docs

* Fix styling

* implement comments from PR

* Fixed the regex for UNK tokens and examples in notebook

* Converted docs to google format

* Added a notebook showing multiple languages and tokenizers

* Added visual indication of chars that are tokenized with >1 token

* Reorganize things a bit and fix import

* Update docs

Co-authored-by: Anthony MOI
---
 .gitignore                                    |    3 +-
 .../examples/using_the_visualizer.ipynb       | 1053 +++++++++++++++++
 bindings/python/py_src/tokenizers/__init__.py |    1 +
 .../py_src/tokenizers/tools/__init__.py       |    1 +
 .../tokenizers/tools/visualizer-styles.css    |  170 +++
 .../py_src/tokenizers/tools/visualizer.py     |  412 +++++++
 bindings/python/setup.py                      |    2 +
 docs/source/api/python.inc                    |   10 +
 8 files changed, 1651 insertions(+), 1 deletion(-)
 create mode 100644 bindings/python/examples/using_the_visualizer.ipynb
 create mode 100644 bindings/python/py_src/tokenizers/tools/__init__.py
 create mode 100644 bindings/python/py_src/tokenizers/tools/visualizer-styles.css
 create mode 100644 bindings/python/py_src/tokenizers/tools/visualizer.py

diff --git a/.gitignore b/.gitignore
index 6141202f..f965b105 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,7 +4,7 @@
 .vim
 .env
 target
-
+.idea
 Cargo.lock
 /data
@@ -17,6 +17,7 @@ __pycache__
 pip-wheel-metadata
 *.egg-info
 *.so
+/bindings/python/examples/.ipynb_checkpoints
 /bindings/python/build
 /bindings/python/dist
diff --git a/bindings/python/examples/using_the_visualizer.ipynb b/bindings/python/examples/using_the_visualizer.ipynb
new file mode 100644
index 00000000..2840d2e0
--- /dev/null
+++ b/bindings/python/examples/using_the_visualizer.ipynb
@@ -0,0 +1,1053 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--2020-12-04 09:25:00--  https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt\n",
+      "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n",
+      "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.104.253\n",
+      "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.104.253|:443... connected.\n",
+      "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 231508 (226K) [text/plain]\n", + "Saving to: ‘/tmp/bert-base-uncased-vocab.txt’\n", + "\n", + "/tmp/bert-base-unca 100%[===================>] 226.08K --.-KB/s in 0.06s \n", + "\n", + "2020-12-04 09:25:00 (3.87 MB/s) - ‘/tmp/bert-base-uncased-vocab.txt’ saved [231508/231508]\n", + "\n" + ] + } + ], + "source": [ + "!wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt -O /tmp/bert-base-uncased-vocab.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from tokenizers import BertWordPieceTokenizer\n", + "from tokenizers.tools import EncodingVisualizer\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "EncodingVisualizer.unk_token_regex.search(\"aaa[udsnk]aaa\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "text = \"\"\"Mathias Bynens 'Z͑ͫ̓ͪ̂ͫ̽͏̴̙̤̞͉͚̯̞̠͍A̴̵̜̰͔ͫ͗͢L̠ͨͧͩ͘G̴̻͈͍͔̹̑͗̎̅͛́Ǫ̵̹̻̝̳͂̌̌͘!͖̬̰̙̗̿̋ͥͥ̂ͣ̐́́͜͞': Whenever you’re working on a piece of JavaScript code that deals with strings or regular expressions in some way, just add a unit test that contains a pile of poo (💩) in a string, 💩💩💩💩💩💩💩💩💩💩💩💩 and see if anything breaks. It’s a quick, fun, and easy way to see if your code supports astral symbols. Once you’ve found a Unicode-related bug in your code, all you need to do is apply the techniques discussed in this post to fix it.\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "tokenizer = BertWordPieceTokenizer(\"/tmp/bert-base-uncased-vocab.txt\", lowercase=True)\n", + "visualizer = EncodingVisualizer(tokenizer=tokenizer)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Visualizing Tokens With No Annotations" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " Mathias Bynens 'Z͑ͫ̓ͪ̂ͫ̽͏̴̙̤̞͉͚̯̞̠͍A̴̵̜̰͔ͫ͗͢L̠ͨͧͩ͘G̴̻͈͍͔̹̑͗̎̅͛́Ǫ̵̹̻̝̳͂̌̌͘!͖̬̰̙̗̿̋ͥͥ̂ͣ̐́́͜͞': Whenever youre working on a piece of JavaScript code that deals with strings or regular expressions in some way, just add a unit test that contains a pile of poo (💩) in a string, 💩💩💩💩💩💩💩💩💩💩💩💩 and see if anything breaks. Its a quick, fun, and easy way to see if your code supports astral symbols. Once youve found a Unicode-related bug in your code, all you need to do is apply the techniques discussed in this post to fix it.\n", + "
\n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "visualizer(text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Visualizing Tokens With Aligned Annotations\n", + "First we make some annotations with the Annotation class" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "from tokenizers.tools import Annotation" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "anno1 = Annotation(start=0, end=2, label=\"foo\")\n", + "anno2 = Annotation(start=2, end=4, label=\"bar\")\n", + "anno3 = Annotation(start=6, end=8, label=\"poo\")\n", + "anno4 = Annotation(start=9, end=12, label=\"shoe\")\n", + "annotations=[\n", + " anno1,\n", + " anno2,\n", + " anno3,\n", + " anno4,\n", + " Annotation(start=23, end=30, label=\"random tandem bandem sandem landem fandom\"),\n", + " Annotation(start=63, end=70, label=\"foo\"),\n", + " Annotation(start=80, end=95, label=\"bar\"),\n", + " Annotation(start=120, end=128, label=\"bar\"),\n", + " Annotation(start=152, end=155, label=\"poo\"),\n", + "]\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " Mathias Bynens 'Z͑ͫ̓ͪ̂ͫ̽͏̴̙̤̞͉͚̯̞̠͍A̴̵̜̰͔ͫ͗͢L̠ͨͧͩ͘G̴̻͈͍͔̹̑͗̎̅͛́Ǫ̵̹̻̝̳͂̌̌͘!͖̬̰̙̗̿̋ͥͥ̂ͣ̐́́͜͞': Whenever youre working on a piece of JavaScript code that deals with strings or regular expressions in some way, just add a unit test that contains a pile of poo (💩) in a string, 💩💩💩💩💩💩💩💩💩💩💩💩 and see if anything breaks. Its a quick, fun, and easy way to see if your code supports astral symbols. Once youve found a Unicode-related bug in your code, all you need to do is apply the techniques discussed in this post to fix it.\n", + "
\n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "visualizer(text,annotations=annotations)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using A Custom Annotation Format\n", + "Every system has its own representation of annotations. That's why we can instantiate the EncodingVisualizer with a convertion function." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'startPlace': 0, 'endPlace': 3, 'theTag': '0'},\n", + " {'startPlace': 4, 'endPlace': 7, 'theTag': '4'},\n", + " {'startPlace': 8, 'endPlace': 11, 'theTag': '8'},\n", + " {'startPlace': 12, 'endPlace': 15, 'theTag': '12'},\n", + " {'startPlace': 16, 'endPlace': 19, 'theTag': '16'}]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "funnyAnnotations = [dict(startPlace=i,endPlace=i+3,theTag=str(i)) for i in range(0,20,4)]\n", + "funnyAnnotations" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "converter = lambda funny: Annotation(start=funny['startPlace'], end=funny['endPlace'], label=funny['theTag'])\n", + "visualizer = EncodingVisualizer(tokenizer=tokenizer, default_to_notebook=True, annotation_converter=converter)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " Mathias Bynens 'Z͑ͫ̓ͪ̂ͫ̽͏̴̙̤̞͉͚̯̞̠͍A̴̵̜̰͔ͫ͗͢L̠ͨͧͩ͘G̴̻͈͍͔̹̑͗̎̅͛́Ǫ̵̹̻̝̳͂̌̌͘!͖̬̰̙̗̿̋ͥͥ̂ͣ̐́́͜͞': Whenever youre working on a piece of JavaScript code that deals with strings or regular expressions in some way, just add a unit test that contains a pile of poo (💩) in a string, 💩💩💩💩💩💩💩💩💩💩💩💩 and see if anything breaks. Its a quick, fun, and easy way to see if your code supports astral symbols. Once youve found a Unicode-related bug in your code, all you need to do is apply the techniques discussed in this post to fix it.\n", + "
\n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "visualizer(text, annotations=funnyAnnotations)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Trying with Roberta\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2020-12-04 09:25:00-- https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json\n", + "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", + "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.226.19\n", + "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.226.19|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 898823 (878K) [application/json]\n", + "Saving to: ‘/tmp/roberta-base-vocab.json’\n", + "\n", + "/tmp/roberta-base-v 100%[===================>] 877.76K 4.35MB/s in 0.2s \n", + "\n", + "2020-12-04 09:25:00 (4.35 MB/s) - ‘/tmp/roberta-base-vocab.json’ saved [898823/898823]\n", + "\n", + "--2020-12-04 09:25:00-- https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt\n", + "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", + "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.104.253\n", + "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.104.253|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 456318 (446K) [text/plain]\n", + "Saving to: ‘/tmp/roberta-base-merges.txt’\n", + "\n", + "/tmp/roberta-base-m 100%[===================>] 445.62K --.-KB/s in 0.1s \n", + "\n", + "2020-12-04 09:25:01 (4.04 MB/s) - ‘/tmp/roberta-base-merges.txt’ saved [456318/456318]\n", + "\n" + ] + } + ], + "source": [ + "!wget \"https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json\" -O /tmp/roberta-base-vocab.json\n", + "!wget \"https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt\" -O /tmp/roberta-base-merges.txt\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " Mathias Bynens 'Z͑ͫ̓ͪ̂ͫ̽͏̴̙̤̞͉͚̯̞̠͍A̴̵̜̰͔ͫ͗͢L̠ͨͧͩ͘G̴̻͈͍͔̹̑͗̎̅͛́Ǫ̵̹̻̝̳͂̌̌͘!͖̬̰̙̗̿̋ͥͥ̂ͣ̐́́͜͞': Whenever youre working on a piece of JavaScript code that deals with strings or regular expressions in some way, just add a unit test that contains a pile of poo (💩) in a string, 💩💩💩💩💩💩💩💩💩💩💩💩 and see if anything breaks. Its a quick, fun, and easy way to see if your code supports astral symbols. Once youve found a Unicode-related bug in your code, all you need to do is apply the techniques discussed in this post to fix it.\n", + "
\n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from tokenizers import ByteLevelBPETokenizer\n", + "roberta_tokenizer = ByteLevelBPETokenizer.from_file('/tmp/roberta-base-vocab.json', '/tmp/roberta-base-merges.txt')\n", + "roberta_visualizer = EncodingVisualizer(tokenizer=roberta_tokenizer, default_to_notebook=True)\n", + "roberta_visualizer(text, annotations=annotations)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/bindings/python/py_src/tokenizers/__init__.py b/bindings/python/py_src/tokenizers/__init__.py index 6447bc40..b595f085 100644 --- a/bindings/python/py_src/tokenizers/__init__.py +++ b/bindings/python/py_src/tokenizers/__init__.py @@ -91,6 +91,7 @@ from .tokenizers import normalizers from .tokenizers import pre_tokenizers from .tokenizers import processors from .tokenizers import trainers + from .implementations import ( ByteLevelBPETokenizer, CharBPETokenizer, diff --git a/bindings/python/py_src/tokenizers/tools/__init__.py b/bindings/python/py_src/tokenizers/tools/__init__.py new file mode 100644 index 00000000..7b5511c0 --- /dev/null +++ b/bindings/python/py_src/tokenizers/tools/__init__.py @@ -0,0 +1 @@ +from .visualizer import EncodingVisualizer, Annotation diff --git a/bindings/python/py_src/tokenizers/tools/visualizer-styles.css b/bindings/python/py_src/tokenizers/tools/visualizer-styles.css new file mode 100644 index 00000000..f54fde45 --- /dev/null +++ b/bindings/python/py_src/tokenizers/tools/visualizer-styles.css @@ -0,0 +1,170 @@ +.tokenized-text { + width:100%; + padding:2rem; + max-height: 400px; + overflow-y: auto; + box-sizing:border-box; + line-height:4rem; /* Lots of space between lines */ + font-family: "Roboto Light", "Ubuntu Light", "Ubuntu", monospace; + box-shadow: 2px 2px 2px rgba(0,0,0,0.2); + background-color: rgba(0,0,0,0.01); + letter-spacing:2px; /* Give some extra separation between chars */ +} +.non-token{ + /* White space and other things the tokenizer ignores*/ + white-space: pre; + letter-spacing:4px; + border-top:1px solid #A0A0A0; /* A gentle border on top and bottom makes tabs more ovious*/ + border-bottom:1px solid #A0A0A0; + line-height: 1rem; + height: calc(100% - 2px); +} + +.token { + white-space: pre; + position:relative; + color:black; + letter-spacing:2px; +} + +.annotation{ + white-space:nowrap; /* Important - ensures that annotations appears even if the annotated text wraps a line */ + border-radius:4px; + position:relative; + width:fit-content; +} +.annotation:before { + /*The before holds the text and the after holds the background*/ + z-index:1000; /* Make sure this is above the background */ + content:attr(data-label); /* The annotations label is on a data attribute */ + color:white; + position:absolute; + font-size:1rem; + text-align:center; + font-weight:bold; + + top:1.75rem; + line-height:0; + left:0; + width:100%; + padding:0.5rem 0; + /* These make it so an annotation doesn't stretch beyond the annotated text if the label is longer*/ + overflow: hidden; + white-space: nowrap; + text-overflow:ellipsis; +} + +.annotation:after { + 
content:attr(data-label); /* The content defines the width of the annotation*/ + position:absolute; + font-size:0.75rem; + text-align:center; + font-weight:bold; + text-overflow:ellipsis; + top:1.75rem; + line-height:0; + overflow: hidden; + white-space: nowrap; + + left:0; + width:100%; /* 100% of the parent, which is the annotation whose width is the tokens inside it*/ + + padding:0.5rem 0; + /* Nast hack below: + We set the annotations color in code because we don't know the colors at css time. + But you can't pass a color as a data attribute to get it into the pseudo element (this thing) + So to get around that, annotations have the color set on them with a style attribute and then we + can get the color with currentColor. + Annotations wrap tokens and tokens set the color back to black + */ + background-color: currentColor; +} +.annotation:hover::after, .annotation:hover::before{ + /* When the user hovers over an annotation expand the label to display in full + */ + min-width: fit-content; +} + +.annotation:hover{ + /* Emphasize the annotation start end with a border on hover*/ + border-color: currentColor; + border: 2px solid; +} +.special-token:not(:empty){ + /* + A none empty special token is like UNK (as opposed to CLS which has no representation in the text ) + */ + position:relative; +} +.special-token:empty::before{ + /* Special tokens that don't have text are displayed as pseudo elements so we dont select them with the mouse*/ + content:attr(data-stok); + background:#202020; + font-size:0.75rem; + color:white; + margin: 0 0.25rem; + padding: 0.25rem; + border-radius:4px +} + +.special-token:not(:empty):before { + /* Special tokens that have text (UNK) are displayed above the actual text*/ + content:attr(data-stok); + position:absolute; + bottom:1.75rem; + min-width:100%; + width:100%; + height:1rem; + line-height:1rem; + font-size:1rem; + text-align:center; + color:white; + font-weight:bold; + background:#202020; + border-radius:10%; +} +/* +We want to alternate the color of tokens, but we can't use nth child because tokens might be broken up by annotations +instead we apply even and odd class at generation time and color them that way + */ +.even-token{ + background:#DCDCDC ; + border: 1px solid #DCDCDC; +} +.odd-token{ + background:#A0A0A0; + border: 1px solid #A0A0A0; +} +.even-token.multi-token,.odd-token.multi-token{ + background: repeating-linear-gradient( + 45deg, + transparent, + transparent 1px, + #ccc 1px, + #ccc 1px + ), + /* on "bottom" */ + linear-gradient( + to bottom, + #FFB6C1, + #999 + ); +} + +.multi-token:hover::after { + content:"This char has more than 1 token"; /* The content defines the width of the annotation*/ + color:white; + background-color: black; + position:absolute; + font-size:0.75rem; + text-align:center; + font-weight:bold; + text-overflow:ellipsis; + top:1.75rem; + line-height:0; + overflow: hidden; + white-space: nowrap; + left:0; + width:fit-content; /* 100% of the parent, which is the annotation whose width is the tokens inside it*/ + padding:0.5rem 0; +} diff --git a/bindings/python/py_src/tokenizers/tools/visualizer.py b/bindings/python/py_src/tokenizers/tools/visualizer.py new file mode 100644 index 00000000..4e5ef719 --- /dev/null +++ b/bindings/python/py_src/tokenizers/tools/visualizer.py @@ -0,0 +1,412 @@ +import os +import itertools +import re +from typing import List, Optional, Tuple, Dict, Callable, Any, NamedTuple +from string import Template +from typing import List + +from tokenizers import Tokenizer, Encoding + +dirname = 
os.path.dirname(__file__) +css_filename = os.path.join(dirname, "visualizer-styles.css") +with open(css_filename) as f: + css = f.read() + + +class Annotation: + start: int + end: int + label: int + + def __init__(self, start: int, end: int, label: str): + self.start = start + self.end = end + self.label = label + + +AnnotationList = List[Annotation] +PartialIntList = List[Optional[int]] + + +class CharStateKey(NamedTuple): + token_ix: Optional[int] + anno_ix: Optional[int] + + +class CharState: + char_ix: Optional[int] + + def __init__(self, char_ix): + self.char_ix = char_ix + + self.anno_ix: Optional[int] = None + self.tokens: List[int] = [] + + @property + def token_ix(self): + return self.tokens[0] if len(self.tokens) > 0 else None + + @property + def is_multitoken(self): + """ + BPE tokenizers can output more than one token for a char + """ + return len(self.tokens) > 1 + + def partition_key(self) -> CharStateKey: + return CharStateKey( + token_ix=self.token_ix, + anno_ix=self.anno_ix, + ) + + +class Aligned: + pass + + +class EncodingVisualizer: + """ + Build an EncodingVisualizer + + Args: + + tokenizer (:class:`~tokenizers.Tokenizer`): + A tokenizer instance + + default_to_notebook (:obj:`bool`): + Whether to render html output in a notebook by default + + annotation_converter (:obj:`Callable`, `optional`): + An optional (lambda) function that takes an annotation in any format and returns + an Annotation object + """ + + unk_token_regex = re.compile("(.{1}\b)?(unk|oov)(\b.{1})?", flags=re.IGNORECASE) + + def __init__( + self, + tokenizer: Tokenizer, + default_to_notebook: bool = True, + annotation_converter: Optional[Callable[[Any], Annotation]] = None, + ): + if default_to_notebook: + try: + from IPython.core.display import display, HTML + except ImportError as e: + raise Exception( + """We couldn't import IPython utils for html display. + Are you running in a notebook? + You can also pass `default_to_notebook=False` to get back raw HTML + """ + ) + + self.tokenizer = tokenizer + self.default_to_notebook = default_to_notebook + self.annotation_coverter = annotation_converter + pass + + def __call__( + self, + text: str, + annotations: AnnotationList = [], + default_to_notebook: Optional[bool] = None, + ) -> Optional[str]: + """ + Build a visualization of the given text + + Args: + text (:obj:`str`): + The text to tokenize + + annotations (:obj:`List[Annotation]`, `optional`): + An optional list of annotations of the text. The can either be an annotation class + or anything else if you instantiated the visualizer with a converter function + + default_to_notebook (:obj:`bool`, `optional`, defaults to `False`): + If True, will render the html in a notebook. Otherwise returns an html string. + + Returns: + The HTML string if default_to_notebook is False, otherwise (default) returns None and + renders the HTML in the notebook + + """ + final_default_to_notebook = self.default_to_notebook + if default_to_notebook is not None: + final_default_to_notebook = default_to_notebook + if final_default_to_notebook: + try: + from IPython.core.display import display, HTML + except ImportError as e: + raise Exception( + """We couldn't import IPython utils for html display. 
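+
+    # Illustrative usage (a sketch mirroring the example notebook shipped with this
+    # PR; the vocab path is a placeholder):
+    #
+    #   from tokenizers import BertWordPieceTokenizer
+    #   from tokenizers.tools import EncodingVisualizer
+    #
+    #   tokenizer = BertWordPieceTokenizer("/tmp/bert-base-uncased-vocab.txt", lowercase=True)
+    #   visualizer = EncodingVisualizer(tokenizer=tokenizer)
+    #   visualizer("some text to tokenize")  # renders the HTML inline in a notebook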
+
+    def __call__(
+        self,
+        text: str,
+        annotations: AnnotationList = [],
+        default_to_notebook: Optional[bool] = None,
+    ) -> Optional[str]:
+        """
+        Build a visualization of the given text
+
+        Args:
+            text (:obj:`str`):
+                The text to tokenize
+
+            annotations (:obj:`List[Annotation]`, `optional`):
+                An optional list of annotations of the text. They can either be Annotation
+                instances, or anything else if you instantiated the visualizer with a
+                converter function
+
+            default_to_notebook (:obj:`bool`, `optional`):
+                If provided, overrides the value set at construction time. If True, renders
+                the html in a notebook. Otherwise returns an html string.
+
+        Returns:
+            The HTML string if default_to_notebook is False, otherwise (default) returns None and
+            renders the HTML in the notebook
+
+        """
+        final_default_to_notebook = self.default_to_notebook
+        if default_to_notebook is not None:
+            final_default_to_notebook = default_to_notebook
+        if final_default_to_notebook:
+            try:
+                from IPython.core.display import display, HTML
+            except ImportError as e:
+                raise Exception(
+                    """We couldn't import IPython utils for html display.
+                        Are you running in a notebook?"""
+                )
+        if self.annotation_converter is not None:
+            annotations = list(map(self.annotation_converter, annotations))
+        encoding = self.tokenizer.encode(text)
+        html = EncodingVisualizer.__make_html(text, encoding, annotations)
+        if final_default_to_notebook:
+            display(HTML(html))
+        else:
+            return html
+
+    @staticmethod
+    def calculate_label_colors(annotations: AnnotationList) -> Dict[str, str]:
+        """
+        Generates a color palette for all the labels in a given set of annotations
+
+        Args:
+            annotations (:obj:`List[Annotation]`):
+                A list of annotations
+
+        Returns:
+            :obj:`dict`: A dictionary mapping labels to colors in HSL format
+        """
+        if len(annotations) == 0:
+            return {}
+        labels = set(map(lambda x: x.label, annotations))
+        num_labels = len(labels)
+        h_step = int(255 / num_labels)
+        if h_step < 20:
+            h_step = 20
+        s = 32
+        l = 64
+        h = 10
+        colors = {}
+
+        for label in sorted(
+            labels
+        ):  # sort so we always get the same colors for a given set of labels
+            colors[label] = f"hsl({h},{s}%,{l}%)"
+            h += h_step
+        return colors
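+
+    # Worked example (illustrative): with two labels the hues are spread evenly and
+    # deterministically, e.g.
+    #   calculate_label_colors([Annotation(0, 2, "a"), Annotation(2, 4, "b")])
+    #   == {"a": "hsl(10,32%,64%)", "b": "hsl(137,32%,64%)"}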
+
+    @staticmethod
+    def consecutive_chars_to_html(
+        consecutive_chars_list: List[CharState],
+        text: str,
+        encoding: Encoding,
+    ):
+        """
+        Converts a list of "consecutive chars" into a single HTML element.
+        Chars are consecutive if they fall under the same word, token and annotation.
+        Each CharState has a "partition_key" method that makes it easy to
+        compare if two chars are consecutive.
+
+        Args:
+            consecutive_chars_list (:obj:`List[CharState]`):
+                A list of CharStates that have been grouped together
+
+            text (:obj:`str`):
+                The original text being processed
+
+            encoding (:class:`~tokenizers.Encoding`):
+                The encoding returned from the tokenizer
+
+        Returns:
+            :obj:`str`: The HTML span for a set of consecutive chars
+        """
+        first = consecutive_chars_list[0]
+        if first.char_ix is None:
+            # it's a special token
+            stoken = encoding.tokens[first.token_ix]
+            # special tokens are represented as empty spans. We use the data attribute and css
+            # magic to display it
+            return f'<span class="special-token" data-stok="{stoken}"></span>'
+        # We're not in a special token, so this group has a start and end.
+        last = consecutive_chars_list[-1]
+        start = first.char_ix
+        end = last.char_ix + 1
+        span_text = text[start:end]
+        css_classes = []  # What css classes we will apply on the resulting span
+        data_items = {}  # What data attributes we will apply on the resulting span
+        if first.token_ix is not None:
+            # We can either be in a token or not (e.g. in white space)
+            css_classes.append("token")
+            if first.is_multitoken:
+                css_classes.append("multi-token")
+            if first.token_ix % 2:
+                # We use this to color alternating tokens.
+                # A token might be split by an annotation that ends in the middle of it, so this
+                # lets us visually indicate a consecutive token despite its possible splitting in
+                # the html markup
+                css_classes.append("odd-token")
+            else:
+                # Like above, but a different color so we can see the tokens alternate
+                css_classes.append("even-token")
+            if (
+                EncodingVisualizer.unk_token_regex.search(encoding.tokens[first.token_ix])
+                is not None
+            ):
+                # This is a special token that is in the text, probably UNK
+                css_classes.append("special-token")
+                # TODO: is this the right name for the data attribute?
+                data_items["stok"] = encoding.tokens[first.token_ix]
+        else:
+            # In this case we are looking at a group/single char that is not tokenized,
+            # e.g. white space
+            css_classes.append("non-token")
+        css = f'''class="{' '.join(css_classes)}"'''
+        data = ""
+        for key, val in data_items.items():
+            data += f' data-{key}="{val}"'
+        return f"<span {css}{data}>{span_text}</span>"
+
+    @staticmethod
+    def __make_html(text: str, encoding: Encoding, annotations: AnnotationList) -> str:
+        char_states = EncodingVisualizer.__make_char_states(text, encoding, annotations)
+        current_consecutive_chars = [char_states[0]]
+        prev_anno_ix = char_states[0].anno_ix
+        spans = []
+        label_colors_dict = EncodingVisualizer.calculate_label_colors(annotations)
+        cur_anno_ix = char_states[0].anno_ix
+        if cur_anno_ix is not None:
+            # If we started in an annotation make a span for it
+            anno = annotations[cur_anno_ix]
+            label = anno.label
+            color = label_colors_dict[label]
+            spans.append(f'<span class="annotation" style="color:{color}" data-label="{label}">')
+
+        for cs in char_states[1:]:
+            cur_anno_ix = cs.anno_ix
+            if cur_anno_ix != prev_anno_ix:
+                # If we've transitioned in or out of an annotation
+                spans.append(
+                    # Create a span from the current consecutive characters
+                    EncodingVisualizer.consecutive_chars_to_html(
+                        current_consecutive_chars,
+                        text=text,
+                        encoding=encoding,
+                    )
+                )
+                current_consecutive_chars = [cs]
+
+                if prev_anno_ix is not None:
+                    # if we transitioned out of an annotation, close its span
+                    spans.append("</span>")
+                if cur_anno_ix is not None:
+                    # If we entered a new annotation make a span for it
+                    anno = annotations[cur_anno_ix]
+                    label = anno.label
+                    color = label_colors_dict[label]
+                    spans.append(
+                        f'<span class="annotation" style="color:{color}" data-label="{label}">'
+                    )
+            prev_anno_ix = cur_anno_ix
+
+            if cs.partition_key() == current_consecutive_chars[0].partition_key():
+                # If the current character is in the same "group" as the previous one
+                current_consecutive_chars.append(cs)
+            else:
+                # Otherwise we make a span for the previous group
+                spans.append(
+                    EncodingVisualizer.consecutive_chars_to_html(
+                        current_consecutive_chars,
+                        text=text,
+                        encoding=encoding,
+                    )
+                )
+                # And reset the consecutive_char_list to form a new group
+                current_consecutive_chars = [cs]
+        # All that's left is to fill out the final span
+        # TODO I think there is an edge case here where an annotation's span might not close
+        spans.append(
+            EncodingVisualizer.consecutive_chars_to_html(
+                current_consecutive_chars,
+                text=text,
+                encoding=encoding,
+            )
+        )
+        res = HTMLBody(spans)  # Send the list of spans to the body of our html
+        return res
+
+    @staticmethod
+    def __make_anno_map(text: str, annotations: AnnotationList) -> PartialIntList:
+        """
+        Args:
+            text (:obj:`str`):
+                The raw text we want to align to
+
+            annotations (:obj:`AnnotationList`):
+                A (possibly empty) list of annotations
+
+        Returns:
+            A list of length len(text) whose entry at index i is None if there is no annotation
+            on character i, or k, the index (into the list of annotations) of the annotation
+            that covers character i
+        """
+        annotation_map = [None] * len(text)
+        for anno_ix, a in enumerate(annotations):
+            for i in range(a.start, a.end):
+                annotation_map[i] = anno_ix
+        return annotation_map
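+
+    # Worked example (illustrative): for text "hello" and a single annotation
+    # covering chars 1-2, e.g. [Annotation(start=1, end=3, label="x")], the map is
+    #   [None, 0, 0, None, None]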
+
+    @staticmethod
+    def __make_char_states(
+        text: str, encoding: Encoding, annotations: AnnotationList
+    ) -> List[CharState]:
+        """
+        For each character in the original text, we emit a CharState representing its "state":
+
+        * which token_ix it corresponds to
+        * which annotation_ix it corresponds to
+
+        Args:
+            text (:obj:`str`):
+                The raw text we want to align to
+
+            encoding (:class:`~tokenizers.Encoding`):
+                The encoding returned from the tokenizer
+
+            annotations (:obj:`List[Annotation]`):
+                A (possibly empty) list of annotations
+
+        Returns:
+            :obj:`List[CharState]`: A list of CharStates, indicating for each char in the text
+            what its state is
+        """
+        annotation_map = EncodingVisualizer.__make_anno_map(text, annotations)
+        # TODO: make this a dataclass or named tuple
+        char_states: List[CharState] = [CharState(char_ix) for char_ix in range(len(text))]
+        for token_ix, token in enumerate(encoding.tokens):
+            offsets = encoding.token_to_chars(token_ix)
+            if offsets is not None:
+                start, end = offsets
+                for i in range(start, end):
+                    char_states[i].tokens.append(token_ix)
+        for char_ix, anno_ix in enumerate(annotation_map):
+            char_states[char_ix].anno_ix = anno_ix
+
+        return char_states
+
+
+def HTMLBody(children: List[str], css_styles=css) -> str:
+    """
+    Generates the full html with css from a list of html spans
+
+    Args:
+        children (:obj:`List[str]`):
+            A list of strings, assumed to be html elements
+
+        css_styles (:obj:`str`, `optional`):
+            Optional alternative implementation of the css
+
+    Returns:
+        :obj:`str`: An HTML string with style markup
+    """
+    children_text = "".join(children)
+    return f"""
+    <html>
+        <head>
+            <style>
+                {css_styles}
+            </style>
+        </head>
+        <body>
+            <div class="tokenized-text" dir=auto>
+            {children_text}
+            </div>
+        </body>
+    </html>
+    """
diff --git a/bindings/python/setup.py b/bindings/python/setup.py
index 5c81afab..28154d62 100644
--- a/bindings/python/setup.py
+++ b/bindings/python/setup.py
@@ -41,6 +41,7 @@ setup(
         "tokenizers.processors",
         "tokenizers.trainers",
         "tokenizers.implementations",
+        "tokenizers.tools",
     ],
     package_data={
         "tokenizers": ["py.typed", "__init__.pyi"],
@@ -51,6 +52,7 @@ setup(
         "tokenizers.processors": ["py.typed", "__init__.pyi"],
         "tokenizers.trainers": ["py.typed", "__init__.pyi"],
         "tokenizers.implementations": ["py.typed"],
+        "tokenizers.tools": ["py.typed", "visualizer-styles.css"],
     },
     zip_safe=False,
 )
diff --git a/docs/source/api/python.inc b/docs/source/api/python.inc
index ee4ed8f1..7a90d78b 100644
--- a/docs/source/api/python.inc
+++ b/docs/source/api/python.inc
@@ -78,3 +78,13 @@ Trainers
 
 .. automodule:: tokenizers.trainers
     :members:
+
+
+Visualizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: tokenizers.tools.Annotation
+    :members:
+
+.. autoclass:: tokenizers.tools.EncodingVisualizer
+    :members: __call__
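+
+For example, a minimal usage sketch (the vocab file path is a placeholder):
+
+.. code-block:: python
+
+    from tokenizers import BertWordPieceTokenizer
+    from tokenizers.tools import EncodingVisualizer, Annotation
+
+    tokenizer = BertWordPieceTokenizer("bert-base-uncased-vocab.txt", lowercase=True)
+    visualizer = EncodingVisualizer(tokenizer=tokenizer)
+    annotations = [Annotation(start=0, end=5, label="greeting")]
+    visualizer("Hello, world!", annotations=annotations)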