Mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-22 16:25:30 +00:00)
.github/workflows/docs-check.yml (vendored)
@@ -35,7 +35,7 @@ jobs:
         run: make clean && make html_all O="-W --keep-going"

       - name: Upload built doc
-        uses: actions/upload-artifact@v2
+        uses: actions/upload-artifact@v4
         with:
           name: documentation
           path: ./docs/build/*
@@ -35,7 +35,7 @@
    "outputs": [],
    "source": [
     "from tokenizers import BertWordPieceTokenizer\n",
-    "from tokenizers.tools import EncodingVisualizer\n"
+    "from tokenizers.tools import EncodingVisualizer"
    ]
   },
   {
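Taken together, these two imports are everything the notebook needs to stand up a visualizer. A minimal sketch of how they combine, assuming a BERT WordPiece vocab file already sits at the hypothetical path /tmp/bert-base-uncased-vocab.txt and the code runs inside Jupyter (default_to_notebook=True renders HTML inline):

from tokenizers import BertWordPieceTokenizer
from tokenizers.tools import EncodingVisualizer

# Hypothetical vocab path; the notebook obtains its vocab file in another cell.
tokenizer = BertWordPieceTokenizer.from_file("/tmp/bert-base-uncased-vocab.txt")

# default_to_notebook=True displays results inline, so this assumes a Jupyter session.
visualizer = EncodingVisualizer(tokenizer=tokenizer, default_to_notebook=True)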
@@ -305,7 +305,7 @@
     "anno2 = Annotation(start=2, end=4, label=\"bar\")\n",
     "anno3 = Annotation(start=6, end=8, label=\"poo\")\n",
     "anno4 = Annotation(start=9, end=12, label=\"shoe\")\n",
-    "annotations=[\n",
+    "annotations = [\n",
     "    anno1,\n",
     "    anno2,\n",
     "    anno3,\n",
@@ -315,8 +315,7 @@
     "    Annotation(start=80, end=95, label=\"bar\"),\n",
     "    Annotation(start=120, end=128, label=\"bar\"),\n",
     "    Annotation(start=152, end=155, label=\"poo\"),\n",
-    "]\n",
-    "\n"
+    "]"
    ]
   },
   {
@@ -521,7 +520,7 @@
     }
    ],
    "source": [
-    "visualizer(text,annotations=annotations)"
+    "visualizer(text, annotations=annotations)"
    ]
   },
   {
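For reference, a self-contained sketch of the call being reformatted here, with an illustrative text and annotation spans standing in for the notebook's own, longer ones:

from tokenizers.tools import Annotation

# Stand-in text and spans; the notebook defines its own `text` and `annotations`.
text = "Hello world, this is a tokenizer visualizer demo."
annotations = [
    Annotation(start=0, end=5, label="foo"),
    Annotation(start=6, end=11, label="bar"),
]

# Displays the tokenized text with the annotated character spans highlighted.
visualizer(text, annotations=annotations)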
@@ -553,7 +552,7 @@
     }
    ],
    "source": [
-    "funnyAnnotations = [dict(startPlace=i,endPlace=i+3,theTag=str(i)) for i in range(0,20,4)]\n",
+    "funnyAnnotations = [dict(startPlace=i, endPlace=i + 3, theTag=str(i)) for i in range(0, 20, 4)]\n",
     "funnyAnnotations"
    ]
   },
@@ -563,7 +562,10 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "converter = lambda funny: Annotation(start=funny['startPlace'], end=funny['endPlace'], label=funny['theTag'])\n",
+    "def converter(funny):\n",
+    "    return Annotation(start=funny[\"startPlace\"], end=funny[\"endPlace\"], label=funny[\"theTag\"])\n",
+    "\n",
+    "\n",
     "visualizer = EncodingVisualizer(tokenizer=tokenizer, default_to_notebook=True, annotation_converter=converter)"
    ]
   },
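Replacing the lambda assignment with a def follows PEP 8 (E731) and leaves behavior unchanged: the converter still maps each custom dict onto an Annotation. A sketch of it in action with the funnyAnnotations from the previous cell (the final call is illustrative):

def converter(funny):
    # Translate the custom keys into the fields Annotation expects.
    return Annotation(start=funny["startPlace"], end=funny["endPlace"], label=funny["theTag"])

visualizer = EncodingVisualizer(tokenizer=tokenizer, default_to_notebook=True, annotation_converter=converter)

# The converter is applied to each dict before rendering.
visualizer(text, annotations=funnyAnnotations)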
@@ -817,7 +819,7 @@
    ],
    "source": [
     "!wget \"https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json\" -O /tmp/roberta-base-vocab.json\n",
-    "!wget \"https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt\" -O /tmp/roberta-base-merges.txt\n"
+    "!wget \"https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt\" -O /tmp/roberta-base-merges.txt"
    ]
   },
   {
@@ -1023,7 +1025,8 @@
    ],
    "source": [
     "from tokenizers import ByteLevelBPETokenizer\n",
-    "roberta_tokenizer = ByteLevelBPETokenizer.from_file('/tmp/roberta-base-vocab.json', '/tmp/roberta-base-merges.txt')\n",
+    "\n",
+    "roberta_tokenizer = ByteLevelBPETokenizer.from_file(\"/tmp/roberta-base-vocab.json\", \"/tmp/roberta-base-merges.txt\")\n",
     "roberta_visualizer = EncodingVisualizer(tokenizer=roberta_tokenizer, default_to_notebook=True)\n",
     "roberta_visualizer(text, annotations=annotations)"
    ]
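Putting the last two hunks together, the end-to-end RoBERTa flow looks roughly like this, assuming the two wget downloads above have completed:

from tokenizers import ByteLevelBPETokenizer

# Paths match the wget targets from the earlier cell.
roberta_tokenizer = ByteLevelBPETokenizer.from_file(
    "/tmp/roberta-base-vocab.json", "/tmp/roberta-base-merges.txt"
)
roberta_visualizer = EncodingVisualizer(tokenizer=roberta_tokenizer, default_to_notebook=True)
roberta_visualizer(text, annotations=annotations)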