.github/workflows/docs-check.yml
@@ -35,7 +35,7 @@ jobs:
         run: make clean && make html_all O="-W --keep-going"

       - name: Upload built doc
-        uses: actions/upload-artifact@v2
+        uses: actions/upload-artifact@v4
         with:
           name: documentation
           path: ./docs/build/*
(second changed file: a Jupyter notebook demonstrating EncodingVisualizer; its path is not shown in this view)

@@ -35,7 +35,7 @@
    "outputs": [],
    "source": [
     "from tokenizers import BertWordPieceTokenizer\n",
-    "from tokenizers.tools import EncodingVisualizer\n"
+    "from tokenizers.tools import EncodingVisualizer"
    ]
   },
   {
@@ -305,7 +305,7 @@
     "anno2 = Annotation(start=2, end=4, label=\"bar\")\n",
     "anno3 = Annotation(start=6, end=8, label=\"poo\")\n",
     "anno4 = Annotation(start=9, end=12, label=\"shoe\")\n",
-    "annotations=[\n",
+    "annotations = [\n",
     "    anno1,\n",
     "    anno2,\n",
     "    anno3,\n",
@@ -315,8 +315,7 @@
     "    Annotation(start=80, end=95, label=\"bar\"),\n",
     "    Annotation(start=120, end=128, label=\"bar\"),\n",
     "    Annotation(start=152, end=155, label=\"poo\"),\n",
-    "]\n",
-    "\n"
+    "]"
    ]
   },
   {
@@ -521,7 +520,7 @@
     }
    ],
    "source": [
-    "visualizer(text,annotations=annotations)"
+    "visualizer(text, annotations=annotations)"
    ]
   },
   {
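Read together, the notebook hunks above reduce to a simple pattern: build a tokenizer, wrap it in an EncodingVisualizer, and call it with text plus a list of Annotation objects. A minimal sketch of that pattern; the vocab path and sample text are placeholder assumptions, while the Annotation offsets and labels echo the cells in the diff:

    from tokenizers import BertWordPieceTokenizer
    from tokenizers.tools import Annotation, EncodingVisualizer

    # Placeholder vocab path -- any WordPiece vocab file works here.
    tokenizer = BertWordPieceTokenizer.from_file("/tmp/bert-base-uncased-vocab.txt")

    text = "a sample sentence long enough to cover the annotation offsets below"
    annotations = [
        Annotation(start=2, end=4, label="bar"),
        Annotation(start=6, end=8, label="poo"),
        Annotation(start=9, end=12, label="shoe"),
    ]

    visualizer = EncodingVisualizer(tokenizer=tokenizer, default_to_notebook=True)
    visualizer(text, annotations=annotations)  # renders the annotated tokens inline in the notebook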
@@ -553,7 +552,7 @@
     }
    ],
    "source": [
-    "funnyAnnotations = [dict(startPlace=i,endPlace=i+3,theTag=str(i)) for i in range(0,20,4)]\n",
+    "funnyAnnotations = [dict(startPlace=i, endPlace=i + 3, theTag=str(i)) for i in range(0, 20, 4)]\n",
     "funnyAnnotations"
    ]
   },
@@ -563,7 +562,10 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "converter = lambda funny: Annotation(start=funny['startPlace'], end=funny['endPlace'], label=funny['theTag'])\n",
+    "def converter(funny):\n",
+    "    return Annotation(start=funny[\"startPlace\"], end=funny[\"endPlace\"], label=funny[\"theTag\"])\n",
+    "\n",
+    "\n",
     "visualizer = EncodingVisualizer(tokenizer=tokenizer, default_to_notebook=True, annotation_converter=converter)"
    ]
   },
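The hunk above swaps a lambda for an equivalent named function; either way, annotation_converter is just a callable mapping one entry of a custom format onto an Annotation. A small self-contained sketch of that mapping, using only names from the cells above:

    from tokenizers.tools import Annotation

    def converter(funny):
        # Map one custom dict onto the Annotation type the visualizer expects.
        return Annotation(start=funny["startPlace"], end=funny["endPlace"], label=funny["theTag"])

    funnyAnnotations = [dict(startPlace=i, endPlace=i + 3, theTag=str(i)) for i in range(0, 20, 4)]
    # Applying the converter by hand shows the transformation that passing
    # annotation_converter=converter lets the visualizer perform for you.
    converted = [converter(f) for f in funnyAnnotations]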
@@ -817,7 +819,7 @@
    ],
    "source": [
     "!wget \"https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json\" -O /tmp/roberta-base-vocab.json\n",
-    "!wget \"https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt\" -O /tmp/roberta-base-merges.txt\n"
+    "!wget \"https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt\" -O /tmp/roberta-base-merges.txt"
    ]
   },
   {
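If wget is unavailable, a stdlib-only equivalent of the two download cells (same URLs and /tmp destinations as in the diff):

    import urllib.request

    # Fetch the RoBERTa vocab and merges files to the paths the notebook uses.
    for name in ("roberta-base-vocab.json", "roberta-base-merges.txt"):
        url = "https://s3.amazonaws.com/models.huggingface.co/bert/" + name
        urllib.request.urlretrieve(url, "/tmp/" + name)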
@@ -1023,7 +1025,8 @@
    ],
    "source": [
     "from tokenizers import ByteLevelBPETokenizer\n",
-    "roberta_tokenizer = ByteLevelBPETokenizer.from_file('/tmp/roberta-base-vocab.json', '/tmp/roberta-base-merges.txt')\n",
+    "\n",
+    "roberta_tokenizer = ByteLevelBPETokenizer.from_file(\"/tmp/roberta-base-vocab.json\", \"/tmp/roberta-base-merges.txt\")\n",
     "roberta_visualizer = EncodingVisualizer(tokenizer=roberta_tokenizer, default_to_notebook=True)\n",
     "roberta_visualizer(text, annotations=annotations)"
    ]
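Consolidated, the RoBERTa cells run as the sequence below; a sketch assuming the two files downloaded above and a text/annotations pair defined as in the earlier cells:

    from tokenizers import ByteLevelBPETokenizer
    from tokenizers.tools import EncodingVisualizer

    # Load a pretrained byte-level BPE from the downloaded vocab/merges files.
    roberta_tokenizer = ByteLevelBPETokenizer.from_file(
        "/tmp/roberta-base-vocab.json", "/tmp/roberta-base-merges.txt"
    )
    roberta_visualizer = EncodingVisualizer(tokenizer=roberta_tokenizer, default_to_notebook=True)
    roberta_visualizer(text, annotations=annotations)  # text, annotations as defined in earlier cells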