mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-23 16:49:27 +00:00
1057 lines
74 KiB
Plaintext
1057 lines
74 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"--2020-12-04 09:25:00-- https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt\n",
|
||
"Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n",
|
||
"Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.104.253\n",
|
||
"Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.104.253|:443... connected.\n",
|
||
"HTTP request sent, awaiting response... 200 OK\n",
|
||
"Length: 231508 (226K) [text/plain]\n",
|
||
"Saving to: ‘/tmp/bert-base-uncased-vocab.txt’\n",
|
||
"\n",
|
||
"/tmp/bert-base-unca 100%[===================>] 226.08K --.-KB/s in 0.06s \n",
|
||
"\n",
|
||
"2020-12-04 09:25:00 (3.87 MB/s) - ‘/tmp/bert-base-uncased-vocab.txt’ saved [231508/231508]\n",
|
||
"\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"!wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt -O /tmp/bert-base-uncased-vocab.txt"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"from tokenizers import BertWordPieceTokenizer\n",
|
||
"from tokenizers.tools import EncodingVisualizer"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"EncodingVisualizer.unk_token_regex.search(\"aaa[udsnk]aaa\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"text = \"\"\"Mathias Bynens 'Z͑ͫ̓ͪ̂ͫ̽͏̴̙̤̞͉͚̯̞̠͍A̴̵̜̰͔ͫ͗͢L̠ͨͧͩ͘G̴̻͈͍͔̹̑͗̎̅͛́Ǫ̵̹̻̝̳͂̌̌͘!͖̬̰̙̗̿̋ͥͥ̂ͣ̐́́͜͞': Whenever you’re working on a piece of JavaScript code that deals with strings or regular expressions in some way, just add a unit test that contains a pile of poo (💩) in a string, 💩💩💩💩💩💩💩💩💩💩💩💩 and see if anything breaks. It’s a quick, fun, and easy way to see if your code supports astral symbols. Once you’ve found a Unicode-related bug in your code, all you need to do is apply the techniques discussed in this post to fix it.\"\"\""
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"tokenizer = BertWordPieceTokenizer(\"/tmp/bert-base-uncased-vocab.txt\", lowercase=True)\n",
|
||
"visualizer = EncodingVisualizer(tokenizer=tokenizer)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Visualizing Tokens With No Annotations"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"\n",
|
||
" <html>\n",
|
||
" <head>\n",
|
||
" <style>\n",
|
||
" .tokenized-text {\n",
|
||
" width:100%;\n",
|
||
" padding:2rem;\n",
|
||
" max-height: 400px;\n",
|
||
" overflow-y: auto;\n",
|
||
" box-sizing:border-box;\n",
|
||
" line-height:4rem; /* Lots of space between lines */\n",
|
||
" font-family: \"Roboto Light\", \"Ubuntu Light\", \"Ubuntu\", monospace;\n",
|
||
" box-shadow: 2px 2px 2px rgba(0,0,0,0.2);\n",
|
||
" background-color: rgba(0,0,0,0.01);\n",
|
||
" letter-spacing:2px; /* Give some extra separation between chars */\n",
|
||
"}\n",
|
||
".non-token{\n",
|
||
" /* White space and other things the tokenizer ignores*/\n",
|
||
" white-space: pre;\n",
|
||
" letter-spacing:4px;\n",
|
||
" border-top:1px solid #A0A0A0; /* A gentle border on top and bottom makes tabs more ovious*/\n",
|
||
" border-bottom:1px solid #A0A0A0;\n",
|
||
" line-height: 1rem;\n",
|
||
" height: calc(100% - 2px);\n",
|
||
"}\n",
|
||
"\n",
|
||
".token {\n",
|
||
" white-space: pre;\n",
|
||
" position:relative;\n",
|
||
" color:black;\n",
|
||
" letter-spacing:2px;\n",
|
||
"}\n",
|
||
"\n",
|
||
".annotation{\n",
|
||
" white-space:nowrap; /* Important - ensures that annotations appears even if the annotated text wraps a line */\n",
|
||
" border-radius:4px;\n",
|
||
" position:relative;\n",
|
||
" width:fit-content;\n",
|
||
"}\n",
|
||
".annotation:before {\n",
|
||
" /*The before holds the text and the after holds the background*/\n",
|
||
" z-index:1000; /* Make sure this is above the background */\n",
|
||
" content:attr(data-label); /* The annotations label is on a data attribute */\n",
|
||
" color:white;\n",
|
||
" position:absolute;\n",
|
||
" font-size:1rem;\n",
|
||
" text-align:center;\n",
|
||
" font-weight:bold;\n",
|
||
"\n",
|
||
" top:1.75rem;\n",
|
||
" line-height:0;\n",
|
||
" left:0;\n",
|
||
" width:100%;\n",
|
||
" padding:0.5rem 0;\n",
|
||
" /* These make it so an annotation doesn't stretch beyond the annotated text if the label is longer*/\n",
|
||
" overflow: hidden;\n",
|
||
" white-space: nowrap;\n",
|
||
" text-overflow:ellipsis;\n",
|
||
"}\n",
|
||
"\n",
|
||
".annotation:after {\n",
|
||
" content:attr(data-label); /* The content defines the width of the annotation*/\n",
|
||
" position:absolute;\n",
|
||
" font-size:0.75rem;\n",
|
||
" text-align:center;\n",
|
||
" font-weight:bold;\n",
|
||
" text-overflow:ellipsis;\n",
|
||
" top:1.75rem;\n",
|
||
" line-height:0;\n",
|
||
" overflow: hidden;\n",
|
||
" white-space: nowrap;\n",
|
||
"\n",
|
||
" left:0;\n",
|
||
" width:100%; /* 100% of the parent, which is the annotation whose width is the tokens inside it*/\n",
|
||
"\n",
|
||
" padding:0.5rem 0;\n",
|
||
" /* Nast hack below:\n",
|
||
" We set the annotations color in code because we don't know the colors at css time.\n",
|
||
" But you can't pass a color as a data attribute to get it into the pseudo element (this thing)\n",
|
||
" So to get around that, annotations have the color set on them with a style attribute and then we\n",
|
||
" can get the color with currentColor.\n",
|
||
" Annotations wrap tokens and tokens set the color back to black\n",
|
||
" */\n",
|
||
" background-color: currentColor;\n",
|
||
"}\n",
|
||
".annotation:hover::after, .annotation:hover::before{\n",
|
||
" /* When the user hovers over an annotation expand the label to display in full\n",
|
||
" */\n",
|
||
" min-width: fit-content;\n",
|
||
"}\n",
|
||
"\n",
|
||
".annotation:hover{\n",
|
||
" /* Emphasize the annotation start end with a border on hover*/\n",
|
||
" border-color: currentColor;\n",
|
||
" border: 2px solid;\n",
|
||
"}\n",
|
||
".special-token:not(:empty){\n",
|
||
" /*\n",
|
||
" A none empty special token is like UNK (as opposed to CLS which has no representation in the text )\n",
|
||
" */\n",
|
||
" position:relative;\n",
|
||
"}\n",
|
||
".special-token:empty::before{\n",
|
||
" /* Special tokens that don't have text are displayed as pseudo elements so we dont select them with the mouse*/\n",
|
||
" content:attr(data-stok);\n",
|
||
" background:#202020;\n",
|
||
" font-size:0.75rem;\n",
|
||
" color:white;\n",
|
||
" margin: 0 0.25rem;\n",
|
||
" padding: 0.25rem;\n",
|
||
" border-radius:4px\n",
|
||
"}\n",
|
||
"\n",
|
||
".special-token:not(:empty):before {\n",
|
||
" /* Special tokens that have text (UNK) are displayed above the actual text*/\n",
|
||
" content:attr(data-stok);\n",
|
||
" position:absolute;\n",
|
||
" bottom:1.75rem;\n",
|
||
" min-width:100%;\n",
|
||
" width:100%;\n",
|
||
" height:1rem;\n",
|
||
" line-height:1rem;\n",
|
||
" font-size:1rem;\n",
|
||
" text-align:center;\n",
|
||
" color:white;\n",
|
||
" font-weight:bold;\n",
|
||
" background:#202020;\n",
|
||
" border-radius:10%;\n",
|
||
"}\n",
|
||
"/*\n",
|
||
"We want to alternate the color of tokens, but we can't use nth child because tokens might be broken up by annotations\n",
|
||
"instead we apply even and odd class at generation time and color them that way\n",
|
||
" */\n",
|
||
".even-token{\n",
|
||
" background:#DCDCDC\t;\n",
|
||
" border: 1px solid #DCDCDC;\n",
|
||
"}\n",
|
||
".odd-token{\n",
|
||
" background:#A0A0A0;\n",
|
||
" border: 1px solid #A0A0A0;\n",
|
||
"}\n",
|
||
".even-token.multi-token,.odd-token.multi-token{\n",
|
||
" background: repeating-linear-gradient(\n",
|
||
" 45deg,\n",
|
||
" transparent,\n",
|
||
" transparent 1px,\n",
|
||
" #ccc 1px,\n",
|
||
" #ccc 1px\n",
|
||
" ),\n",
|
||
" /* on \"bottom\" */\n",
|
||
" linear-gradient(\n",
|
||
" to bottom,\n",
|
||
" #FFB6C1,\n",
|
||
" #999\n",
|
||
" );\n",
|
||
"}\n",
|
||
"\n",
|
||
".multi-token:hover::after {\n",
|
||
" content:\"This char has more than 1 token\"; /* The content defines the width of the annotation*/\n",
|
||
" color:white;\n",
|
||
" background-color: black;\n",
|
||
" position:absolute;\n",
|
||
" font-size:0.75rem;\n",
|
||
" text-align:center;\n",
|
||
" font-weight:bold;\n",
|
||
" text-overflow:ellipsis;\n",
|
||
" top:1.75rem;\n",
|
||
" line-height:0;\n",
|
||
" overflow: hidden;\n",
|
||
" white-space: nowrap;\n",
|
||
" left:0;\n",
|
||
" width:fit-content; /* 100% of the parent, which is the annotation whose width is the tokens inside it*/\n",
|
||
" padding:0.5rem 0;\n",
|
||
"}\n",
|
||
"\n",
|
||
" </style>\n",
|
||
" </head>\n",
|
||
" <body>\n",
|
||
" <div class=\"tokenized-text\" dir=auto>\n",
|
||
" <span class=\"token odd-token\" >Mathias</span><span class=\"non-token\" > </span><span class=\"token even-token\" >By</span><span class=\"token odd-token\" >nen</span><span class=\"token even-token\" >s</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >'</span><span class=\"token even-token\" >Z͑ͫ̓ͪ̂ͫ̽͏̴̙̤̞͉͚̯̞̠͍A</span><span class=\"non-token\" >̴̵̜̰͔ͫ͗͢</span><span class=\"token odd-token\" >L</span><span class=\"non-token\" >̠ͨͧͩ͘</span><span class=\"token even-token\" >G̴̻͈͍͔̹̑͗̎̅͛́O</span><span class=\"non-token\" >̵̨̹̻̝̳͂̌̌͘</span><span class=\"token odd-token\" >!</span><span class=\"non-token\" >͖̬̰̙̗̿̋ͥͥ̂ͣ̐́́͜͞</span><span class=\"token even-token\" >'</span><span class=\"token odd-token\" >:</span><span class=\"non-token\" > </span><span class=\"token even-token\" >Whenever</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >you</span><span class=\"token even-token\" >’</span><span class=\"token odd-token\" >re</span><span class=\"non-token\" > </span><span class=\"token even-token\" >working</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >on</span><span class=\"non-token\" > </span><span class=\"token even-token\" >a</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >piece</span><span class=\"non-token\" > </span><span class=\"token even-token\" >of</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >Java</span><span class=\"token even-token\" >Script</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >code</span><span class=\"non-token\" > </span><span class=\"token even-token\" >that</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >deals</span><span class=\"non-token\" > </span><span class=\"token even-token\" >with</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >strings</span><span class=\"non-token\" > </span><span class=\"token even-token\" >or</span><span 
class=\"non-token\" > </span><span class=\"token odd-token\" >regular</span><span class=\"non-token\" > </span><span class=\"token even-token\" >expressions</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >in</span><span class=\"non-token\" > </span><span class=\"token even-token\" >some</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >way</span><span class=\"token even-token\" >,</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >just</span><span class=\"non-token\" > </span><span class=\"token even-token\" >add</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >a</span><span class=\"non-token\" > </span><span class=\"token even-token\" >unit</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >test</span><span class=\"non-token\" > </span><span class=\"token even-token\" >that</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >contains</span><span class=\"non-token\" > </span><span class=\"token even-token\" >a</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >pile</span><span class=\"non-token\" > </span><span class=\"token even-token\" >of</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >po</span><span class=\"token even-token\" >o</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >(</span><span class=\"token even-token special-token\" data-stok=\"[UNK]\" >💩</span><span class=\"token odd-token\" >)</span><span class=\"non-token\" > </span><span class=\"token even-token\" >in</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >a</span><span class=\"non-token\" > </span><span class=\"token even-token\" >string</span><span class=\"token odd-token\" >,</span><span class=\"non-token\" > </span><span class=\"token even-token special-token\" data-stok=\"[UNK]\" >💩💩💩💩💩💩💩💩💩💩💩💩</span><span class=\"non-token\" > </span><span class=\"token 
odd-token\" >and</span><span class=\"non-token\" > </span><span class=\"token even-token\" >see</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >if</span><span class=\"non-token\" > </span><span class=\"token even-token\" >anything</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >breaks</span><span class=\"token even-token\" >.</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >It</span><span class=\"token even-token\" >’</span><span class=\"token odd-token\" >s</span><span class=\"non-token\" > </span><span class=\"token even-token\" >a</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >quick</span><span class=\"token even-token\" >,</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >fun</span><span class=\"token even-token\" >,</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >and</span><span class=\"non-token\" > </span><span class=\"token even-token\" >easy</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >way</span><span class=\"non-token\" > </span><span class=\"token even-token\" >to</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >see</span><span class=\"non-token\" > </span><span class=\"token even-token\" >if</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >your</span><span class=\"non-token\" > </span><span class=\"token even-token\" >code</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >supports</span><span class=\"non-token\" > </span><span class=\"token even-token\" >as</span><span class=\"token odd-token\" >tral</span><span class=\"non-token\" > </span><span class=\"token even-token\" >symbols</span><span class=\"token odd-token\" >.</span><span class=\"non-token\" > </span><span class=\"token even-token\" >Once</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >you</span><span 
class=\"token even-token\" >’</span><span class=\"token odd-token\" >ve</span><span class=\"non-token\" > </span><span class=\"token even-token\" >found</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >a</span><span class=\"non-token\" > </span><span class=\"token even-token\" >Unicode</span><span class=\"token odd-token\" >-</span><span class=\"token even-token\" >related</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >bug</span><span class=\"non-token\" > </span><span class=\"token even-token\" >in</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >your</span><span class=\"non-token\" > </span><span class=\"token even-token\" >code</span><span class=\"token odd-token\" >,</span><span class=\"non-token\" > </span><span class=\"token even-token\" >all</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >you</span><span class=\"non-token\" > </span><span class=\"token even-token\" >need</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >to</span><span class=\"non-token\" > </span><span class=\"token even-token\" >do</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >is</span><span class=\"non-token\" > </span><span class=\"token even-token\" >apply</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >the</span><span class=\"non-token\" > </span><span class=\"token even-token\" >techniques</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >discussed</span><span class=\"non-token\" > </span><span class=\"token even-token\" >in</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >this</span><span class=\"non-token\" > </span><span class=\"token even-token\" >post</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >to</span><span class=\"non-token\" > </span><span class=\"token even-token\" >fix</span><span class=\"non-token\" > </span><span 
class=\"token odd-token\" >it</span><span class=\"token even-token\" >.</span>\n",
|
||
" </div>\n",
|
||
" </body>\n",
|
||
" </html>\n",
|
||
" "
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.HTML object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"visualizer(text)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Visualizing Tokens With Aligned Annotations\n",
|
||
"First, we create some annotations with the `Annotation` class."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"from tokenizers.tools import Annotation"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"anno1 = Annotation(start=0, end=2, label=\"foo\")\n",
|
||
"anno2 = Annotation(start=2, end=4, label=\"bar\")\n",
|
||
"anno3 = Annotation(start=6, end=8, label=\"poo\")\n",
|
||
"anno4 = Annotation(start=9, end=12, label=\"shoe\")\n",
|
||
"annotations = [\n",
|
||
" anno1,\n",
|
||
" anno2,\n",
|
||
" anno3,\n",
|
||
" anno4,\n",
|
||
" Annotation(start=23, end=30, label=\"random tandem bandem sandem landem fandom\"),\n",
|
||
" Annotation(start=63, end=70, label=\"foo\"),\n",
|
||
" Annotation(start=80, end=95, label=\"bar\"),\n",
|
||
" Annotation(start=120, end=128, label=\"bar\"),\n",
|
||
" Annotation(start=152, end=155, label=\"poo\"),\n",
|
||
"]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"\n",
|
||
" <html>\n",
|
||
" <head>\n",
|
||
" <style>\n",
|
||
" .tokenized-text {\n",
|
||
" width:100%;\n",
|
||
" padding:2rem;\n",
|
||
" max-height: 400px;\n",
|
||
" overflow-y: auto;\n",
|
||
" box-sizing:border-box;\n",
|
||
" line-height:4rem; /* Lots of space between lines */\n",
|
||
" font-family: \"Roboto Light\", \"Ubuntu Light\", \"Ubuntu\", monospace;\n",
|
||
" box-shadow: 2px 2px 2px rgba(0,0,0,0.2);\n",
|
||
" background-color: rgba(0,0,0,0.01);\n",
|
||
" letter-spacing:2px; /* Give some extra separation between chars */\n",
|
||
"}\n",
|
||
".non-token{\n",
|
||
" /* White space and other things the tokenizer ignores*/\n",
|
||
" white-space: pre;\n",
|
||
" letter-spacing:4px;\n",
|
||
" border-top:1px solid #A0A0A0; /* A gentle border on top and bottom makes tabs more ovious*/\n",
|
||
" border-bottom:1px solid #A0A0A0;\n",
|
||
" line-height: 1rem;\n",
|
||
" height: calc(100% - 2px);\n",
|
||
"}\n",
|
||
"\n",
|
||
".token {\n",
|
||
" white-space: pre;\n",
|
||
" position:relative;\n",
|
||
" color:black;\n",
|
||
" letter-spacing:2px;\n",
|
||
"}\n",
|
||
"\n",
|
||
".annotation{\n",
|
||
" white-space:nowrap; /* Important - ensures that annotations appears even if the annotated text wraps a line */\n",
|
||
" border-radius:4px;\n",
|
||
" position:relative;\n",
|
||
" width:fit-content;\n",
|
||
"}\n",
|
||
".annotation:before {\n",
|
||
" /*The before holds the text and the after holds the background*/\n",
|
||
" z-index:1000; /* Make sure this is above the background */\n",
|
||
" content:attr(data-label); /* The annotations label is on a data attribute */\n",
|
||
" color:white;\n",
|
||
" position:absolute;\n",
|
||
" font-size:1rem;\n",
|
||
" text-align:center;\n",
|
||
" font-weight:bold;\n",
|
||
"\n",
|
||
" top:1.75rem;\n",
|
||
" line-height:0;\n",
|
||
" left:0;\n",
|
||
" width:100%;\n",
|
||
" padding:0.5rem 0;\n",
|
||
" /* These make it so an annotation doesn't stretch beyond the annotated text if the label is longer*/\n",
|
||
" overflow: hidden;\n",
|
||
" white-space: nowrap;\n",
|
||
" text-overflow:ellipsis;\n",
|
||
"}\n",
|
||
"\n",
|
||
".annotation:after {\n",
|
||
" content:attr(data-label); /* The content defines the width of the annotation*/\n",
|
||
" position:absolute;\n",
|
||
" font-size:0.75rem;\n",
|
||
" text-align:center;\n",
|
||
" font-weight:bold;\n",
|
||
" text-overflow:ellipsis;\n",
|
||
" top:1.75rem;\n",
|
||
" line-height:0;\n",
|
||
" overflow: hidden;\n",
|
||
" white-space: nowrap;\n",
|
||
"\n",
|
||
" left:0;\n",
|
||
" width:100%; /* 100% of the parent, which is the annotation whose width is the tokens inside it*/\n",
|
||
"\n",
|
||
" padding:0.5rem 0;\n",
|
||
" /* Nast hack below:\n",
|
||
" We set the annotations color in code because we don't know the colors at css time.\n",
|
||
" But you can't pass a color as a data attribute to get it into the pseudo element (this thing)\n",
|
||
" So to get around that, annotations have the color set on them with a style attribute and then we\n",
|
||
" can get the color with currentColor.\n",
|
||
" Annotations wrap tokens and tokens set the color back to black\n",
|
||
" */\n",
|
||
" background-color: currentColor;\n",
|
||
"}\n",
|
||
".annotation:hover::after, .annotation:hover::before{\n",
|
||
" /* When the user hovers over an annotation expand the label to display in full\n",
|
||
" */\n",
|
||
" min-width: fit-content;\n",
|
||
"}\n",
|
||
"\n",
|
||
".annotation:hover{\n",
|
||
" /* Emphasize the annotation start end with a border on hover*/\n",
|
||
" border-color: currentColor;\n",
|
||
" border: 2px solid;\n",
|
||
"}\n",
|
||
".special-token:not(:empty){\n",
|
||
" /*\n",
|
||
" A none empty special token is like UNK (as opposed to CLS which has no representation in the text )\n",
|
||
" */\n",
|
||
" position:relative;\n",
|
||
"}\n",
|
||
".special-token:empty::before{\n",
|
||
" /* Special tokens that don't have text are displayed as pseudo elements so we dont select them with the mouse*/\n",
|
||
" content:attr(data-stok);\n",
|
||
" background:#202020;\n",
|
||
" font-size:0.75rem;\n",
|
||
" color:white;\n",
|
||
" margin: 0 0.25rem;\n",
|
||
" padding: 0.25rem;\n",
|
||
" border-radius:4px\n",
|
||
"}\n",
|
||
"\n",
|
||
".special-token:not(:empty):before {\n",
|
||
" /* Special tokens that have text (UNK) are displayed above the actual text*/\n",
|
||
" content:attr(data-stok);\n",
|
||
" position:absolute;\n",
|
||
" bottom:1.75rem;\n",
|
||
" min-width:100%;\n",
|
||
" width:100%;\n",
|
||
" height:1rem;\n",
|
||
" line-height:1rem;\n",
|
||
" font-size:1rem;\n",
|
||
" text-align:center;\n",
|
||
" color:white;\n",
|
||
" font-weight:bold;\n",
|
||
" background:#202020;\n",
|
||
" border-radius:10%;\n",
|
||
"}\n",
|
||
"/*\n",
|
||
"We want to alternate the color of tokens, but we can't use nth child because tokens might be broken up by annotations\n",
|
||
"instead we apply even and odd class at generation time and color them that way\n",
|
||
" */\n",
|
||
".even-token{\n",
|
||
" background:#DCDCDC\t;\n",
|
||
" border: 1px solid #DCDCDC;\n",
|
||
"}\n",
|
||
".odd-token{\n",
|
||
" background:#A0A0A0;\n",
|
||
" border: 1px solid #A0A0A0;\n",
|
||
"}\n",
|
||
".even-token.multi-token,.odd-token.multi-token{\n",
|
||
" background: repeating-linear-gradient(\n",
|
||
" 45deg,\n",
|
||
" transparent,\n",
|
||
" transparent 1px,\n",
|
||
" #ccc 1px,\n",
|
||
" #ccc 1px\n",
|
||
" ),\n",
|
||
" /* on \"bottom\" */\n",
|
||
" linear-gradient(\n",
|
||
" to bottom,\n",
|
||
" #FFB6C1,\n",
|
||
" #999\n",
|
||
" );\n",
|
||
"}\n",
|
||
"\n",
|
||
".multi-token:hover::after {\n",
|
||
" content:\"This char has more than 1 token\"; /* The content defines the width of the annotation*/\n",
|
||
" color:white;\n",
|
||
" background-color: black;\n",
|
||
" position:absolute;\n",
|
||
" font-size:0.75rem;\n",
|
||
" text-align:center;\n",
|
||
" font-weight:bold;\n",
|
||
" text-overflow:ellipsis;\n",
|
||
" top:1.75rem;\n",
|
||
" line-height:0;\n",
|
||
" overflow: hidden;\n",
|
||
" white-space: nowrap;\n",
|
||
" left:0;\n",
|
||
" width:fit-content; /* 100% of the parent, which is the annotation whose width is the tokens inside it*/\n",
|
||
" padding:0.5rem 0;\n",
|
||
"}\n",
|
||
"\n",
|
||
" </style>\n",
|
||
" </head>\n",
|
||
" <body>\n",
|
||
" <div class=\"tokenized-text\" dir=auto>\n",
|
||
" <span class=\"annotation\" style=\"color:hsl(61,32%,64%\" data-label=\"foo\"><span class=\"token odd-token\" >Ma</span></span><span class=\"annotation\" style=\"color:hsl(10,32%,64%\" data-label=\"bar\"><span class=\"token odd-token\" >th</span></span><span class=\"token odd-token\" >ia</span><span class=\"annotation\" style=\"color:hsl(112,32%,64%\" data-label=\"poo\"><span class=\"token odd-token\" >s</span><span class=\"non-token\" > </span></span><span class=\"token even-token\" >B</span><span class=\"annotation\" style=\"color:hsl(214,32%,64%\" data-label=\"shoe\"><span class=\"token even-token\" >y</span><span class=\"token odd-token\" >ne</span></span><span class=\"token odd-token\" >n</span><span class=\"token even-token\" >s</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >'</span><span class=\"token even-token\" >Z͑ͫ̓ͪ̂ͫ</span><span class=\"annotation\" style=\"color:hsl(163,32%,64%\" data-label=\"random tandem bandem sandem landem fandom\"><span class=\"token even-token\" >̽͏̴̙̤̞͉</span></span><span class=\"token even-token\" >͚̯̞̠͍A</span><span class=\"non-token\" >̴̵̜̰͔ͫ͗͢</span><span class=\"token odd-token\" >L</span><span class=\"non-token\" >̠ͨͧͩ͘</span><span class=\"token even-token\" >G̴̻͈͍͔̹̑͗̎̅͛́</span><span class=\"annotation\" style=\"color:hsl(61,32%,64%\" data-label=\"foo\"><span class=\"token even-token\" >O</span><span class=\"non-token\" >̵̨͂̌̌͘</span></span><span class=\"non-token\" >̹̻̝̳</span><span class=\"token odd-token\" >!</span><span class=\"non-token\" >̿̋ͥͥ̂</span><span class=\"annotation\" style=\"color:hsl(10,32%,64%\" data-label=\"bar\"><span class=\"non-token\" >͖̬̰̙̗ͣ̐́́͜͞</span><span class=\"token even-token\" >'</span><span class=\"token odd-token\" >:</span><span class=\"non-token\" > </span><span class=\"token even-token\" >W</span></span><span class=\"token even-token\" >henever</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >you</span><span class=\"token 
even-token\" >’</span><span class=\"token odd-token\" >re</span><span class=\"non-token\" > </span><span class=\"token even-token\" >working</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >on</span><span class=\"annotation\" style=\"color:hsl(10,32%,64%\" data-label=\"bar\"><span class=\"non-token\" > </span><span class=\"token even-token\" >a</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >piece</span></span><span class=\"non-token\" > </span><span class=\"token even-token\" >of</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >Java</span><span class=\"token even-token\" >Script</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >code</span><span class=\"non-token\" > </span><span class=\"token even-token\" >that</span><span class=\"annotation\" style=\"color:hsl(112,32%,64%\" data-label=\"poo\"><span class=\"non-token\" > </span><span class=\"token odd-token\" >de</span></span><span class=\"token odd-token\" >als</span><span class=\"non-token\" > </span><span class=\"token even-token\" >with</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >strings</span><span class=\"non-token\" > </span><span class=\"token even-token\" >or</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >regular</span><span class=\"non-token\" > </span><span class=\"token even-token\" >expressions</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >in</span><span class=\"non-token\" > </span><span class=\"token even-token\" >some</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >way</span><span class=\"token even-token\" >,</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >just</span><span class=\"non-token\" > </span><span class=\"token even-token\" >add</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >a</span><span class=\"non-token\" > </span><span 
class=\"token even-token\" >unit</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >test</span><span class=\"non-token\" > </span><span class=\"token even-token\" >that</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >contains</span><span class=\"non-token\" > </span><span class=\"token even-token\" >a</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >pile</span><span class=\"non-token\" > </span><span class=\"token even-token\" >of</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >po</span><span class=\"token even-token\" >o</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >(</span><span class=\"token even-token special-token\" data-stok=\"[UNK]\" >💩</span><span class=\"token odd-token\" >)</span><span class=\"non-token\" > </span><span class=\"token even-token\" >in</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >a</span><span class=\"non-token\" > </span><span class=\"token even-token\" >string</span><span class=\"token odd-token\" >,</span><span class=\"non-token\" > </span><span class=\"token even-token special-token\" data-stok=\"[UNK]\" >💩💩💩💩💩💩💩💩💩💩💩💩</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >and</span><span class=\"non-token\" > </span><span class=\"token even-token\" >see</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >if</span><span class=\"non-token\" > </span><span class=\"token even-token\" >anything</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >breaks</span><span class=\"token even-token\" >.</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >It</span><span class=\"token even-token\" >’</span><span class=\"token odd-token\" >s</span><span class=\"non-token\" > </span><span class=\"token even-token\" >a</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >quick</span><span 
class=\"token even-token\" >,</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >fun</span><span class=\"token even-token\" >,</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >and</span><span class=\"non-token\" > </span><span class=\"token even-token\" >easy</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >way</span><span class=\"non-token\" > </span><span class=\"token even-token\" >to</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >see</span><span class=\"non-token\" > </span><span class=\"token even-token\" >if</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >your</span><span class=\"non-token\" > </span><span class=\"token even-token\" >code</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >supports</span><span class=\"non-token\" > </span><span class=\"token even-token\" >as</span><span class=\"token odd-token\" >tral</span><span class=\"non-token\" > </span><span class=\"token even-token\" >symbols</span><span class=\"token odd-token\" >.</span><span class=\"non-token\" > </span><span class=\"token even-token\" >Once</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >you</span><span class=\"token even-token\" >’</span><span class=\"token odd-token\" >ve</span><span class=\"non-token\" > </span><span class=\"token even-token\" >found</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >a</span><span class=\"non-token\" > </span><span class=\"token even-token\" >Unicode</span><span class=\"token odd-token\" >-</span><span class=\"token even-token\" >related</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >bug</span><span class=\"non-token\" > </span><span class=\"token even-token\" >in</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >your</span><span class=\"non-token\" > </span><span class=\"token even-token\" 
>code</span><span class=\"token odd-token\" >,</span><span class=\"non-token\" > </span><span class=\"token even-token\" >all</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >you</span><span class=\"non-token\" > </span><span class=\"token even-token\" >need</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >to</span><span class=\"non-token\" > </span><span class=\"token even-token\" >do</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >is</span><span class=\"non-token\" > </span><span class=\"token even-token\" >apply</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >the</span><span class=\"non-token\" > </span><span class=\"token even-token\" >techniques</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >discussed</span><span class=\"non-token\" > </span><span class=\"token even-token\" >in</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >this</span><span class=\"non-token\" > </span><span class=\"token even-token\" >post</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >to</span><span class=\"non-token\" > </span><span class=\"token even-token\" >fix</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >it</span><span class=\"token even-token\" >.</span>\n",
|
||
" </div>\n",
|
||
" </body>\n",
|
||
" </html>\n",
|
||
" "
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.HTML object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"visualizer(text, annotations=annotations)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Using A Custom Annotation Format\n",
|
||
"Every system has its own representation of annotations. That's why we can instantiate the `EncodingVisualizer` with a conversion function that turns each custom annotation into an `Annotation`."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"[{'startPlace': 0, 'endPlace': 3, 'theTag': '0'},\n",
|
||
" {'startPlace': 4, 'endPlace': 7, 'theTag': '4'},\n",
|
||
" {'startPlace': 8, 'endPlace': 11, 'theTag': '8'},\n",
|
||
" {'startPlace': 12, 'endPlace': 15, 'theTag': '12'},\n",
|
||
" {'startPlace': 16, 'endPlace': 19, 'theTag': '16'}]"
|
||
]
|
||
},
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"funnyAnnotations = [dict(startPlace=i, endPlace=i + 3, theTag=str(i)) for i in range(0, 20, 4)]\n",
|
||
"funnyAnnotations"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def converter(funny):\n",
|
||
" return Annotation(start=funny[\"startPlace\"], end=funny[\"endPlace\"], label=funny[\"theTag\"])\n",
|
||
"\n",
|
||
"\n",
|
||
"visualizer = EncodingVisualizer(tokenizer=tokenizer, default_to_notebook=True, annotation_converter=converter)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"\n",
|
||
" <html>\n",
|
||
" <head>\n",
|
||
" <style>\n",
|
||
" .tokenized-text {\n",
|
||
" width:100%;\n",
|
||
" padding:2rem;\n",
|
||
" max-height: 400px;\n",
|
||
" overflow-y: auto;\n",
|
||
" box-sizing:border-box;\n",
|
||
" line-height:4rem; /* Lots of space between lines */\n",
|
||
" font-family: \"Roboto Light\", \"Ubuntu Light\", \"Ubuntu\", monospace;\n",
|
||
" box-shadow: 2px 2px 2px rgba(0,0,0,0.2);\n",
|
||
" background-color: rgba(0,0,0,0.01);\n",
|
||
" letter-spacing:2px; /* Give some extra separation between chars */\n",
|
||
"}\n",
|
||
".non-token{\n",
|
||
" /* White space and other things the tokenizer ignores*/\n",
|
||
" white-space: pre;\n",
|
||
" letter-spacing:4px;\n",
|
||
" border-top:1px solid #A0A0A0; /* A gentle border on top and bottom makes tabs more ovious*/\n",
|
||
" border-bottom:1px solid #A0A0A0;\n",
|
||
" line-height: 1rem;\n",
|
||
" height: calc(100% - 2px);\n",
|
||
"}\n",
|
||
"\n",
|
||
".token {\n",
|
||
" white-space: pre;\n",
|
||
" position:relative;\n",
|
||
" color:black;\n",
|
||
" letter-spacing:2px;\n",
|
||
"}\n",
|
||
"\n",
|
||
".annotation{\n",
|
||
" white-space:nowrap; /* Important - ensures that annotations appears even if the annotated text wraps a line */\n",
|
||
" border-radius:4px;\n",
|
||
" position:relative;\n",
|
||
" width:fit-content;\n",
|
||
"}\n",
|
||
".annotation:before {\n",
|
||
" /*The before holds the text and the after holds the background*/\n",
|
||
" z-index:1000; /* Make sure this is above the background */\n",
|
||
" content:attr(data-label); /* The annotations label is on a data attribute */\n",
|
||
" color:white;\n",
|
||
" position:absolute;\n",
|
||
" font-size:1rem;\n",
|
||
" text-align:center;\n",
|
||
" font-weight:bold;\n",
|
||
"\n",
|
||
" top:1.75rem;\n",
|
||
" line-height:0;\n",
|
||
" left:0;\n",
|
||
" width:100%;\n",
|
||
" padding:0.5rem 0;\n",
|
||
" /* These make it so an annotation doesn't stretch beyond the annotated text if the label is longer*/\n",
|
||
" overflow: hidden;\n",
|
||
" white-space: nowrap;\n",
|
||
" text-overflow:ellipsis;\n",
|
||
"}\n",
|
||
"\n",
|
||
".annotation:after {\n",
|
||
" content:attr(data-label); /* The content defines the width of the annotation*/\n",
|
||
" position:absolute;\n",
|
||
" font-size:0.75rem;\n",
|
||
" text-align:center;\n",
|
||
" font-weight:bold;\n",
|
||
" text-overflow:ellipsis;\n",
|
||
" top:1.75rem;\n",
|
||
" line-height:0;\n",
|
||
" overflow: hidden;\n",
|
||
" white-space: nowrap;\n",
|
||
"\n",
|
||
" left:0;\n",
|
||
" width:100%; /* 100% of the parent, which is the annotation whose width is the tokens inside it*/\n",
|
||
"\n",
|
||
" padding:0.5rem 0;\n",
|
||
" /* Nast hack below:\n",
|
||
" We set the annotations color in code because we don't know the colors at css time.\n",
|
||
" But you can't pass a color as a data attribute to get it into the pseudo element (this thing)\n",
|
||
" So to get around that, annotations have the color set on them with a style attribute and then we\n",
|
||
" can get the color with currentColor.\n",
|
||
" Annotations wrap tokens and tokens set the color back to black\n",
|
||
" */\n",
|
||
" background-color: currentColor;\n",
|
||
"}\n",
|
||
".annotation:hover::after, .annotation:hover::before{\n",
|
||
" /* When the user hovers over an annotation expand the label to display in full\n",
|
||
" */\n",
|
||
" min-width: fit-content;\n",
|
||
"}\n",
|
||
"\n",
|
||
".annotation:hover{\n",
|
||
" /* Emphasize the annotation start end with a border on hover*/\n",
|
||
" border-color: currentColor;\n",
|
||
" border: 2px solid;\n",
|
||
"}\n",
|
||
".special-token:not(:empty){\n",
|
||
" /*\n",
|
||
" A none empty special token is like UNK (as opposed to CLS which has no representation in the text )\n",
|
||
" */\n",
|
||
" position:relative;\n",
|
||
"}\n",
|
||
".special-token:empty::before{\n",
|
||
" /* Special tokens that don't have text are displayed as pseudo elements so we dont select them with the mouse*/\n",
|
||
" content:attr(data-stok);\n",
|
||
" background:#202020;\n",
|
||
" font-size:0.75rem;\n",
|
||
" color:white;\n",
|
||
" margin: 0 0.25rem;\n",
|
||
" padding: 0.25rem;\n",
|
||
" border-radius:4px\n",
|
||
"}\n",
|
||
"\n",
|
||
".special-token:not(:empty):before {\n",
|
||
" /* Special tokens that have text (UNK) are displayed above the actual text*/\n",
|
||
" content:attr(data-stok);\n",
|
||
" position:absolute;\n",
|
||
" bottom:1.75rem;\n",
|
||
" min-width:100%;\n",
|
||
" width:100%;\n",
|
||
" height:1rem;\n",
|
||
" line-height:1rem;\n",
|
||
" font-size:1rem;\n",
|
||
" text-align:center;\n",
|
||
" color:white;\n",
|
||
" font-weight:bold;\n",
|
||
" background:#202020;\n",
|
||
" border-radius:10%;\n",
|
||
"}\n",
|
||
"/*\n",
|
||
"We want to alternate the color of tokens, but we can't use nth child because tokens might be broken up by annotations\n",
|
||
"instead we apply even and odd class at generation time and color them that way\n",
|
||
" */\n",
|
||
".even-token{\n",
|
||
" background:#DCDCDC\t;\n",
|
||
" border: 1px solid #DCDCDC;\n",
|
||
"}\n",
|
||
".odd-token{\n",
|
||
" background:#A0A0A0;\n",
|
||
" border: 1px solid #A0A0A0;\n",
|
||
"}\n",
|
||
".even-token.multi-token,.odd-token.multi-token{\n",
|
||
" background: repeating-linear-gradient(\n",
|
||
" 45deg,\n",
|
||
" transparent,\n",
|
||
" transparent 1px,\n",
|
||
" #ccc 1px,\n",
|
||
" #ccc 1px\n",
|
||
" ),\n",
|
||
" /* on \"bottom\" */\n",
|
||
" linear-gradient(\n",
|
||
" to bottom,\n",
|
||
" #FFB6C1,\n",
|
||
" #999\n",
|
||
" );\n",
|
||
"}\n",
|
||
"\n",
|
||
".multi-token:hover::after {\n",
|
||
" content:\"This char has more than 1 token\"; /* The content defines the width of the annotation*/\n",
|
||
" color:white;\n",
|
||
" background-color: black;\n",
|
||
" position:absolute;\n",
|
||
" font-size:0.75rem;\n",
|
||
" text-align:center;\n",
|
||
" font-weight:bold;\n",
|
||
" text-overflow:ellipsis;\n",
|
||
" top:1.75rem;\n",
|
||
" line-height:0;\n",
|
||
" overflow: hidden;\n",
|
||
" white-space: nowrap;\n",
|
||
" left:0;\n",
|
||
" width:fit-content; /* 100% of the parent, which is the annotation whose width is the tokens inside it*/\n",
|
||
" padding:0.5rem 0;\n",
|
||
"}\n",
|
||
"\n",
|
||
" </style>\n",
|
||
" </head>\n",
|
||
" <body>\n",
|
||
" <div class=\"tokenized-text\" dir=auto>\n",
|
||
" <span class=\"annotation\" style=\"color:hsl(10,32%,64%\" data-label=\"0\"><span class=\"token odd-token\" >Mat</span></span><span class=\"token odd-token\" >h</span><span class=\"annotation\" style=\"color:hsl(163,32%,64%\" data-label=\"4\"><span class=\"token odd-token\" >ias</span></span><span class=\"non-token\" > </span><span class=\"annotation\" style=\"color:hsl(214,32%,64%\" data-label=\"8\"><span class=\"token even-token\" >By</span><span class=\"token odd-token\" >n</span></span><span class=\"token odd-token\" >e</span><span class=\"annotation\" style=\"color:hsl(61,32%,64%\" data-label=\"12\"><span class=\"token odd-token\" >n</span><span class=\"token even-token\" >s</span><span class=\"non-token\" > </span></span><span class=\"token odd-token\" >'</span><span class=\"annotation\" style=\"color:hsl(112,32%,64%\" data-label=\"16\"><span class=\"token even-token\" >Z͑ͫ</span></span><span class=\"token even-token\" >̓ͪ̂ͫ̽͏̴̙̤̞͉͚̯̞̠͍A</span><span class=\"non-token\" >̴̵̜̰͔ͫ͗͢</span><span class=\"token odd-token\" >L</span><span class=\"non-token\" >̠ͨͧͩ͘</span><span class=\"token even-token\" >G̴̻͈͍͔̹̑͗̎̅͛́O</span><span class=\"non-token\" >̵̨̹̻̝̳͂̌̌͘</span><span class=\"token odd-token\" >!</span><span class=\"non-token\" >͖̬̰̙̗̿̋ͥͥ̂ͣ̐́́͜͞</span><span class=\"token even-token\" >'</span><span class=\"token odd-token\" >:</span><span class=\"non-token\" > </span><span class=\"token even-token\" >Whenever</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >you</span><span class=\"token even-token\" >’</span><span class=\"token odd-token\" >re</span><span class=\"non-token\" > </span><span class=\"token even-token\" >working</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >on</span><span class=\"non-token\" > </span><span class=\"token even-token\" >a</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >piece</span><span class=\"non-token\" > </span><span class=\"token even-token\" 
>of</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >Java</span><span class=\"token even-token\" >Script</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >code</span><span class=\"non-token\" > </span><span class=\"token even-token\" >that</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >deals</span><span class=\"non-token\" > </span><span class=\"token even-token\" >with</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >strings</span><span class=\"non-token\" > </span><span class=\"token even-token\" >or</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >regular</span><span class=\"non-token\" > </span><span class=\"token even-token\" >expressions</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >in</span><span class=\"non-token\" > </span><span class=\"token even-token\" >some</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >way</span><span class=\"token even-token\" >,</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >just</span><span class=\"non-token\" > </span><span class=\"token even-token\" >add</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >a</span><span class=\"non-token\" > </span><span class=\"token even-token\" >unit</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >test</span><span class=\"non-token\" > </span><span class=\"token even-token\" >that</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >contains</span><span class=\"non-token\" > </span><span class=\"token even-token\" >a</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >pile</span><span class=\"non-token\" > </span><span class=\"token even-token\" >of</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >po</span><span class=\"token even-token\" >o</span><span class=\"non-token\" > 
</span><span class=\"token odd-token\" >(</span><span class=\"token even-token special-token\" data-stok=\"[UNK]\" >💩</span><span class=\"token odd-token\" >)</span><span class=\"non-token\" > </span><span class=\"token even-token\" >in</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >a</span><span class=\"non-token\" > </span><span class=\"token even-token\" >string</span><span class=\"token odd-token\" >,</span><span class=\"non-token\" > </span><span class=\"token even-token special-token\" data-stok=\"[UNK]\" >💩💩💩💩💩💩💩💩💩💩💩💩</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >and</span><span class=\"non-token\" > </span><span class=\"token even-token\" >see</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >if</span><span class=\"non-token\" > </span><span class=\"token even-token\" >anything</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >breaks</span><span class=\"token even-token\" >.</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >It</span><span class=\"token even-token\" >’</span><span class=\"token odd-token\" >s</span><span class=\"non-token\" > </span><span class=\"token even-token\" >a</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >quick</span><span class=\"token even-token\" >,</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >fun</span><span class=\"token even-token\" >,</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >and</span><span class=\"non-token\" > </span><span class=\"token even-token\" >easy</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >way</span><span class=\"non-token\" > </span><span class=\"token even-token\" >to</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >see</span><span class=\"non-token\" > </span><span class=\"token even-token\" >if</span><span class=\"non-token\" > </span><span 
class=\"token odd-token\" >your</span><span class=\"non-token\" > </span><span class=\"token even-token\" >code</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >supports</span><span class=\"non-token\" > </span><span class=\"token even-token\" >as</span><span class=\"token odd-token\" >tral</span><span class=\"non-token\" > </span><span class=\"token even-token\" >symbols</span><span class=\"token odd-token\" >.</span><span class=\"non-token\" > </span><span class=\"token even-token\" >Once</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >you</span><span class=\"token even-token\" >’</span><span class=\"token odd-token\" >ve</span><span class=\"non-token\" > </span><span class=\"token even-token\" >found</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >a</span><span class=\"non-token\" > </span><span class=\"token even-token\" >Unicode</span><span class=\"token odd-token\" >-</span><span class=\"token even-token\" >related</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >bug</span><span class=\"non-token\" > </span><span class=\"token even-token\" >in</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >your</span><span class=\"non-token\" > </span><span class=\"token even-token\" >code</span><span class=\"token odd-token\" >,</span><span class=\"non-token\" > </span><span class=\"token even-token\" >all</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >you</span><span class=\"non-token\" > </span><span class=\"token even-token\" >need</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >to</span><span class=\"non-token\" > </span><span class=\"token even-token\" >do</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >is</span><span class=\"non-token\" > </span><span class=\"token even-token\" >apply</span><span class=\"non-token\" > </span><span class=\"token odd-token\" 
>the</span><span class=\"non-token\" > </span><span class=\"token even-token\" >techniques</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >discussed</span><span class=\"non-token\" > </span><span class=\"token even-token\" >in</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >this</span><span class=\"non-token\" > </span><span class=\"token even-token\" >post</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >to</span><span class=\"non-token\" > </span><span class=\"token even-token\" >fix</span><span class=\"non-token\" > </span><span class=\"token odd-token\" >it</span><span class=\"token even-token\" >.</span>\n",
|
||
" </div>\n",
|
||
" </body>\n",
|
||
" </html>\n",
|
||
" "
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.HTML object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"visualizer(text, annotations=funnyAnnotations)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Trying with RoBERTa\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"--2020-12-04 09:25:00-- https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json\n",
|
||
"Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n",
|
||
"Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.226.19\n",
|
||
"Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.226.19|:443... connected.\n",
|
||
"HTTP request sent, awaiting response... 200 OK\n",
|
||
"Length: 898823 (878K) [application/json]\n",
|
||
"Saving to: ‘/tmp/roberta-base-vocab.json’\n",
|
||
"\n",
|
||
"/tmp/roberta-base-v 100%[===================>] 877.76K 4.35MB/s in 0.2s \n",
|
||
"\n",
|
||
"2020-12-04 09:25:00 (4.35 MB/s) - ‘/tmp/roberta-base-vocab.json’ saved [898823/898823]\n",
|
||
"\n",
|
||
"--2020-12-04 09:25:00-- https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt\n",
|
||
"Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n",
|
||
"Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.104.253\n",
|
||
"Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.104.253|:443... connected.\n",
|
||
"HTTP request sent, awaiting response... 200 OK\n",
|
||
"Length: 456318 (446K) [text/plain]\n",
|
||
"Saving to: ‘/tmp/roberta-base-merges.txt’\n",
|
||
"\n",
|
||
"/tmp/roberta-base-m 100%[===================>] 445.62K --.-KB/s in 0.1s \n",
|
||
"\n",
|
||
"2020-12-04 09:25:01 (4.04 MB/s) - ‘/tmp/roberta-base-merges.txt’ saved [456318/456318]\n",
|
||
"\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"!wget \"https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json\" -O /tmp/roberta-base-vocab.json\n",
|
||
"!wget \"https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt\" -O /tmp/roberta-base-merges.txt"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 14,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"\n",
|
||
" <html>\n",
|
||
" <head>\n",
|
||
" <style>\n",
|
||
" .tokenized-text {\n",
|
||
" width:100%;\n",
|
||
" padding:2rem;\n",
|
||
" max-height: 400px;\n",
|
||
" overflow-y: auto;\n",
|
||
" box-sizing:border-box;\n",
|
||
" line-height:4rem; /* Lots of space between lines */\n",
|
||
" font-family: \"Roboto Light\", \"Ubuntu Light\", \"Ubuntu\", monospace;\n",
|
||
" box-shadow: 2px 2px 2px rgba(0,0,0,0.2);\n",
|
||
" background-color: rgba(0,0,0,0.01);\n",
|
||
" letter-spacing:2px; /* Give some extra separation between chars */\n",
|
||
"}\n",
|
||
".non-token{\n",
|
||
" /* White space and other things the tokenizer ignores*/\n",
|
||
" white-space: pre;\n",
|
||
" letter-spacing:4px;\n",
|
||
" border-top:1px solid #A0A0A0; /* A gentle border on top and bottom makes tabs more ovious*/\n",
|
||
" border-bottom:1px solid #A0A0A0;\n",
|
||
" line-height: 1rem;\n",
|
||
" height: calc(100% - 2px);\n",
|
||
"}\n",
|
||
"\n",
|
||
".token {\n",
|
||
" white-space: pre;\n",
|
||
" position:relative;\n",
|
||
" color:black;\n",
|
||
" letter-spacing:2px;\n",
|
||
"}\n",
|
||
"\n",
|
||
".annotation{\n",
|
||
" white-space:nowrap; /* Important - ensures that annotations appears even if the annotated text wraps a line */\n",
|
||
" border-radius:4px;\n",
|
||
" position:relative;\n",
|
||
" width:fit-content;\n",
|
||
"}\n",
|
||
".annotation:before {\n",
|
||
" /*The before holds the text and the after holds the background*/\n",
|
||
" z-index:1000; /* Make sure this is above the background */\n",
|
||
" content:attr(data-label); /* The annotations label is on a data attribute */\n",
|
||
" color:white;\n",
|
||
" position:absolute;\n",
|
||
" font-size:1rem;\n",
|
||
" text-align:center;\n",
|
||
" font-weight:bold;\n",
|
||
"\n",
|
||
" top:1.75rem;\n",
|
||
" line-height:0;\n",
|
||
" left:0;\n",
|
||
" width:100%;\n",
|
||
" padding:0.5rem 0;\n",
|
||
" /* These make it so an annotation doesn't stretch beyond the annotated text if the label is longer*/\n",
|
||
" overflow: hidden;\n",
|
||
" white-space: nowrap;\n",
|
||
" text-overflow:ellipsis;\n",
|
||
"}\n",
|
||
"\n",
|
||
".annotation:after {\n",
|
||
" content:attr(data-label); /* The content defines the width of the annotation*/\n",
|
||
" position:absolute;\n",
|
||
" font-size:0.75rem;\n",
|
||
" text-align:center;\n",
|
||
" font-weight:bold;\n",
|
||
" text-overflow:ellipsis;\n",
|
||
" top:1.75rem;\n",
|
||
" line-height:0;\n",
|
||
" overflow: hidden;\n",
|
||
" white-space: nowrap;\n",
|
||
"\n",
|
||
" left:0;\n",
|
||
" width:100%; /* 100% of the parent, which is the annotation whose width is the tokens inside it*/\n",
|
||
"\n",
|
||
" padding:0.5rem 0;\n",
|
||
" /* Nast hack below:\n",
|
||
" We set the annotations color in code because we don't know the colors at css time.\n",
|
||
" But you can't pass a color as a data attribute to get it into the pseudo element (this thing)\n",
|
||
" So to get around that, annotations have the color set on them with a style attribute and then we\n",
|
||
" can get the color with currentColor.\n",
|
||
" Annotations wrap tokens and tokens set the color back to black\n",
|
||
" */\n",
|
||
" background-color: currentColor;\n",
|
||
"}\n",
|
||
".annotation:hover::after, .annotation:hover::before{\n",
|
||
" /* When the user hovers over an annotation expand the label to display in full\n",
|
||
" */\n",
|
||
" min-width: fit-content;\n",
|
||
"}\n",
|
||
"\n",
|
||
".annotation:hover{\n",
|
||
" /* Emphasize the annotation start end with a border on hover*/\n",
|
||
" border-color: currentColor;\n",
|
||
" border: 2px solid;\n",
|
||
"}\n",
|
||
".special-token:not(:empty){\n",
|
||
" /*\n",
|
||
" A none empty special token is like UNK (as opposed to CLS which has no representation in the text )\n",
|
||
" */\n",
|
||
" position:relative;\n",
|
||
"}\n",
|
||
".special-token:empty::before{\n",
|
||
" /* Special tokens that don't have text are displayed as pseudo elements so we dont select them with the mouse*/\n",
|
||
" content:attr(data-stok);\n",
|
||
" background:#202020;\n",
|
||
" font-size:0.75rem;\n",
|
||
" color:white;\n",
|
||
" margin: 0 0.25rem;\n",
|
||
" padding: 0.25rem;\n",
|
||
" border-radius:4px\n",
|
||
"}\n",
|
||
"\n",
|
||
".special-token:not(:empty):before {\n",
|
||
" /* Special tokens that have text (UNK) are displayed above the actual text*/\n",
|
||
" content:attr(data-stok);\n",
|
||
" position:absolute;\n",
|
||
" bottom:1.75rem;\n",
|
||
" min-width:100%;\n",
|
||
" width:100%;\n",
|
||
" height:1rem;\n",
|
||
" line-height:1rem;\n",
|
||
" font-size:1rem;\n",
|
||
" text-align:center;\n",
|
||
" color:white;\n",
|
||
" font-weight:bold;\n",
|
||
" background:#202020;\n",
|
||
" border-radius:10%;\n",
|
||
"}\n",
|
||
"/*\n",
|
||
"We want to alternate the color of tokens, but we can't use nth child because tokens might be broken up by annotations\n",
|
||
"instead we apply even and odd class at generation time and color them that way\n",
|
||
" */\n",
|
||
".even-token{\n",
|
||
" background:#DCDCDC\t;\n",
|
||
" border: 1px solid #DCDCDC;\n",
|
||
"}\n",
|
||
".odd-token{\n",
|
||
" background:#A0A0A0;\n",
|
||
" border: 1px solid #A0A0A0;\n",
|
||
"}\n",
|
||
".even-token.multi-token,.odd-token.multi-token{\n",
|
||
" background: repeating-linear-gradient(\n",
|
||
" 45deg,\n",
|
||
" transparent,\n",
|
||
" transparent 1px,\n",
|
||
" #ccc 1px,\n",
|
||
" #ccc 1px\n",
|
||
" ),\n",
|
||
" /* on \"bottom\" */\n",
|
||
" linear-gradient(\n",
|
||
" to bottom,\n",
|
||
" #FFB6C1,\n",
|
||
" #999\n",
|
||
" );\n",
|
||
"}\n",
|
||
"\n",
|
||
".multi-token:hover::after {\n",
|
||
" content:\"This char has more than 1 token\"; /* The content defines the width of the annotation*/\n",
|
||
" color:white;\n",
|
||
" background-color: black;\n",
|
||
" position:absolute;\n",
|
||
" font-size:0.75rem;\n",
|
||
" text-align:center;\n",
|
||
" font-weight:bold;\n",
|
||
" text-overflow:ellipsis;\n",
|
||
" top:1.75rem;\n",
|
||
" line-height:0;\n",
|
||
" overflow: hidden;\n",
|
||
" white-space: nowrap;\n",
|
||
" left:0;\n",
|
||
" width:fit-content; /* 100% of the parent, which is the annotation whose width is the tokens inside it*/\n",
|
||
" padding:0.5rem 0;\n",
|
||
"}\n",
|
||
"\n",
|
||
" </style>\n",
|
||
" </head>\n",
|
||
" <body>\n",
|
||
" <div class=\"tokenized-text\" dir=auto>\n",
|
||
" <span class=\"annotation\" style=\"color:hsl(61,32%,64%\" data-label=\"foo\"><span class=\"token even-token\" >Ma</span></span><span class=\"annotation\" style=\"color:hsl(10,32%,64%\" data-label=\"bar\"><span class=\"token even-token\" >th</span></span><span class=\"token odd-token\" >ia</span><span class=\"annotation\" style=\"color:hsl(112,32%,64%\" data-label=\"poo\"><span class=\"token odd-token\" >s</span><span class=\"token even-token\" > </span></span><span class=\"token even-token\" >B</span><span class=\"annotation\" style=\"color:hsl(214,32%,64%\" data-label=\"shoe\"><span class=\"token odd-token\" >yn</span><span class=\"token even-token\" >e</span></span><span class=\"token even-token\" >ns</span><span class=\"token odd-token\" > '</span><span class=\"token even-token\" >Z</span><span class=\"token multi-token odd-token\" >͑</span><span class=\"token multi-token odd-token\" >ͫ</span><span class=\"token multi-token odd-token\" >̓</span><span class=\"token multi-token odd-token\" >ͪ</span><span class=\"token multi-token odd-token\" >̂</span><span class=\"token multi-token odd-token\" >ͫ</span><span class=\"annotation\" style=\"color:hsl(163,32%,64%\" data-label=\"random tandem bandem sandem landem fandom\"><span class=\"token multi-token odd-token\" >̽</span><span class=\"token multi-token odd-token\" >͏</span><span class=\"token multi-token odd-token\" >̴</span><span class=\"token multi-token odd-token\" >̙</span><span class=\"token multi-token odd-token\" >̤</span><span class=\"token multi-token odd-token\" >̞</span><span class=\"token multi-token odd-token\" >͉</span></span><span class=\"token multi-token odd-token\" >͚</span><span class=\"token multi-token odd-token\" >̯</span><span class=\"token multi-token odd-token\" >̞</span><span class=\"token multi-token odd-token\" >̠</span><span class=\"token multi-token odd-token\" >͍</span><span class=\"token odd-token\" >A</span><span class=\"token multi-token even-token\" >ͫ</span><span class=\"token 
multi-token even-token\" >͗</span><span class=\"token multi-token even-token\" >̴</span><span class=\"token multi-token even-token\" >͢</span><span class=\"token multi-token even-token\" >̵</span><span class=\"token multi-token even-token\" >̜</span><span class=\"token multi-token even-token\" >̰</span><span class=\"token multi-token even-token\" >͔</span><span class=\"token even-token\" >L</span><span class=\"token multi-token odd-token\" >ͨ</span><span class=\"token multi-token odd-token\" >ͧ</span><span class=\"token multi-token odd-token\" >ͩ</span><span class=\"token multi-token odd-token\" >͘</span><span class=\"token multi-token odd-token\" >̠</span><span class=\"token odd-token\" >G</span><span class=\"token multi-token even-token\" >̑</span><span class=\"token multi-token even-token\" >͗</span><span class=\"token multi-token even-token\" >̎</span><span class=\"token multi-token even-token\" >̅</span><span class=\"token multi-token even-token\" >͛</span><span class=\"token multi-token even-token\" >́</span><span class=\"token multi-token even-token\" >̴</span><span class=\"token multi-token even-token\" >̻</span><span class=\"token multi-token even-token\" >͈</span><span class=\"token multi-token even-token\" >͍</span><span class=\"token multi-token even-token\" >͔</span><span class=\"token multi-token even-token\" >̹</span><span class=\"annotation\" style=\"color:hsl(61,32%,64%\" data-label=\"foo\"><span class=\"token even-token\" >O</span><span class=\"token multi-token odd-token\" >͂</span><span class=\"token multi-token odd-token\" >̌</span><span class=\"token multi-token odd-token\" >̌</span><span class=\"token multi-token odd-token\" >͘</span><span class=\"token multi-token odd-token\" >̨</span><span class=\"token multi-token odd-token\" >̵</span></span><span class=\"token multi-token odd-token\" >̹</span><span class=\"token multi-token odd-token\" >̻</span><span class=\"token multi-token odd-token\" >̝</span><span class=\"token multi-token 
odd-token\" >̳</span><span class=\"token odd-token\" >!</span><span class=\"token multi-token even-token\" >̿</span><span class=\"token multi-token even-token\" >̋</span><span class=\"token multi-token even-token\" >ͥ</span><span class=\"token multi-token even-token\" >ͥ</span><span class=\"token multi-token even-token\" >̂</span><span class=\"annotation\" style=\"color:hsl(10,32%,64%\" data-label=\"bar\"><span class=\"token multi-token even-token\" >ͣ</span><span class=\"token multi-token even-token\" >̐</span><span class=\"token multi-token even-token\" >́</span><span class=\"token multi-token even-token\" >́</span><span class=\"token multi-token even-token\" >͞</span><span class=\"token multi-token even-token\" >͜</span><span class=\"token multi-token even-token\" >͖</span><span class=\"token multi-token even-token\" >̬</span><span class=\"token multi-token even-token\" >̰</span><span class=\"token multi-token even-token\" >̙</span><span class=\"token multi-token even-token\" >̗</span><span class=\"token even-token\" >':</span><span class=\"token odd-token\" > W</span></span><span class=\"token odd-token\" >henever</span><span class=\"token even-token\" > you</span><span class=\"token multi-token odd-token\" >’</span><span class=\"token odd-token\" >re</span><span class=\"token even-token\" > working</span><span class=\"token odd-token\" > on</span><span class=\"annotation\" style=\"color:hsl(10,32%,64%\" data-label=\"bar\"><span class=\"token even-token\" > a</span><span class=\"token odd-token\" > piece</span></span><span class=\"token even-token\" > of</span><span class=\"token odd-token\" > JavaScript</span><span class=\"token even-token\" > code</span><span class=\"token odd-token\" > that</span><span class=\"annotation\" style=\"color:hsl(112,32%,64%\" data-label=\"poo\"><span class=\"token even-token\" > de</span></span><span class=\"token even-token\" >als</span><span class=\"token odd-token\" > with</span><span class=\"token even-token\" > 
strings</span><span class=\"token odd-token\" > or</span><span class=\"token even-token\" > regular</span><span class=\"token odd-token\" > expressions</span><span class=\"token even-token\" > in</span><span class=\"token odd-token\" > some</span><span class=\"token even-token\" > way</span><span class=\"token odd-token\" >,</span><span class=\"token even-token\" > just</span><span class=\"token odd-token\" > add</span><span class=\"token even-token\" > a</span><span class=\"token odd-token\" > unit</span><span class=\"token even-token\" > test</span><span class=\"token odd-token\" > that</span><span class=\"token even-token\" > contains</span><span class=\"token odd-token\" > a</span><span class=\"token even-token\" > pile</span><span class=\"token odd-token\" > of</span><span class=\"token even-token\" > po</span><span class=\"token odd-token\" >o</span><span class=\"token even-token\" > (</span><span class=\"token multi-token odd-token\" >💩</span><span class=\"token even-token\" >)</span><span class=\"token odd-token\" > in</span><span class=\"token even-token\" > a</span><span class=\"token odd-token\" > string</span><span class=\"token even-token\" >,</span><span class=\"token odd-token\" > 💩</span><span class=\"token multi-token even-token\" >💩</span><span class=\"token multi-token odd-token\" >💩</span><span class=\"token multi-token even-token\" >💩</span><span class=\"token multi-token odd-token\" >💩</span><span class=\"token multi-token even-token\" >💩</span><span class=\"token multi-token odd-token\" >💩</span><span class=\"token multi-token even-token\" >💩</span><span class=\"token multi-token odd-token\" >💩</span><span class=\"token multi-token even-token\" >💩</span><span class=\"token multi-token odd-token\" >💩</span><span class=\"token multi-token even-token\" >💩</span><span class=\"token odd-token\" > and</span><span class=\"token even-token\" > see</span><span class=\"token odd-token\" > if</span><span class=\"token even-token\" > anything</span><span 
class=\"token odd-token\" > breaks</span><span class=\"token even-token\" >.</span><span class=\"token odd-token\" > It</span><span class=\"token multi-token even-token\" >’</span><span class=\"token even-token\" >s</span><span class=\"token odd-token\" > a</span><span class=\"token even-token\" > quick</span><span class=\"token odd-token\" >,</span><span class=\"token even-token\" > fun</span><span class=\"token odd-token\" >,</span><span class=\"token even-token\" > and</span><span class=\"token odd-token\" > easy</span><span class=\"token even-token\" > way</span><span class=\"token odd-token\" > to</span><span class=\"token even-token\" > see</span><span class=\"token odd-token\" > if</span><span class=\"token even-token\" > your</span><span class=\"token odd-token\" > code</span><span class=\"token even-token\" > supports</span><span class=\"token odd-token\" > ast</span><span class=\"token even-token\" >ral</span><span class=\"token odd-token\" > symbols</span><span class=\"token even-token\" >.</span><span class=\"token odd-token\" > Once</span><span class=\"token even-token\" > you</span><span class=\"token multi-token odd-token\" >’</span><span class=\"token odd-token\" >ve</span><span class=\"token even-token\" > found</span><span class=\"token odd-token\" > a</span><span class=\"token even-token\" > Unicode</span><span class=\"token odd-token\" >-</span><span class=\"token even-token\" >related</span><span class=\"token odd-token\" > bug</span><span class=\"token even-token\" > in</span><span class=\"token odd-token\" > your</span><span class=\"token even-token\" > code</span><span class=\"token odd-token\" >,</span><span class=\"token even-token\" > all</span><span class=\"token odd-token\" > you</span><span class=\"token even-token\" > need</span><span class=\"token odd-token\" > to</span><span class=\"token even-token\" > do</span><span class=\"token odd-token\" > is</span><span class=\"token even-token\" > apply</span><span class=\"token odd-token\" 
> the</span><span class=\"token even-token\" > techniques</span><span class=\"token odd-token\" > discussed</span><span class=\"token even-token\" > in</span><span class=\"token odd-token\" > this</span><span class=\"token even-token\" > post</span><span class=\"token odd-token\" > to</span><span class=\"token even-token\" > fix</span><span class=\"token odd-token\" > it</span><span class=\"token even-token\" >.</span>\n",
|
||
" </div>\n",
|
||
" </body>\n",
|
||
" </html>\n",
|
||
" "
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.HTML object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"from tokenizers import ByteLevelBPETokenizer\n",
|
||
"\n",
|
||
"roberta_tokenizer = ByteLevelBPETokenizer.from_file(\"/tmp/roberta-base-vocab.json\", \"/tmp/roberta-base-merges.txt\")\n",
|
||
"roberta_visualizer = EncodingVisualizer(tokenizer=roberta_tokenizer, default_to_notebook=True)\n",
|
||
"roberta_visualizer(text, annotations=annotations)"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.7.3"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|