diff --git a/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py b/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py
index ffb6c2fd..f5011f42 100644
--- a/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py
+++ b/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py
@@ -49,13 +49,29 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
         vocab_size: int = 8000,
         show_progress: bool = True,
         special_tokens: List[Union[str, AddedToken]] = [],
+        unk_token: Optional[str] = None,
     ):
-        """ Train the model using the given files """
+        """
+        Train the model using the given files
+
+        Args:
+            files (:obj:`List[str]`):
+                A list of paths to the files that we should use for training
+            vocab_size (:obj:`int`):
+                The size of the final vocabulary, including all tokens and alphabet.
+            show_progress (:obj:`bool`):
+                Whether to show progress bars while training.
+            special_tokens (:obj:`List[Union[str, AddedToken]]`):
+                A list of special tokens the model should know of.
+            unk_token (:obj:`str`, `optional`):
+                The unknown token to be used by the model.
+        """
         trainer = trainers.UnigramTrainer(
             vocab_size=vocab_size,
             special_tokens=special_tokens,
             show_progress=show_progress,
+            unk_token=unk_token,
         )
 
         if isinstance(files, str):
@@ -68,13 +84,29 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
         vocab_size: int = 8000,
         show_progress: bool = True,
         special_tokens: List[Union[str, AddedToken]] = [],
+        unk_token: Optional[str] = None,
     ):
-        """ Train the model using the given iterator """
+        """
+        Train the model using the given iterator
+
+        Args:
+            iterator (:obj:`Union[Iterator[str], Iterator[Iterator[str]]]`):
+                Any iterator over strings or lists of strings
+            vocab_size (:obj:`int`):
+                The size of the final vocabulary, including all tokens and alphabet.
+            show_progress (:obj:`bool`):
+                Whether to show progress bars while training.
+            special_tokens (:obj:`List[Union[str, AddedToken]]`):
+                A list of special tokens the model should know of.
+            unk_token (:obj:`str`, `optional`):
+                The unknown token to be used by the model.
+ """ trainer = trainers.UnigramTrainer( vocab_size=vocab_size, special_tokens=special_tokens, show_progress=show_progress, + unk_token=unk_token, ) self._tokenizer.train_from_iterator(iterator, trainer=trainer) diff --git a/bindings/python/setup.py b/bindings/python/setup.py index 9e20c7ad..4b1b66ae 100644 --- a/bindings/python/setup.py +++ b/bindings/python/setup.py @@ -2,7 +2,7 @@ from setuptools import setup from setuptools_rust import Binding, RustExtension extras = {} -extras["testing"] = ["pytest"] +extras["testing"] = ["pytest", "requests", "numpy", "datasets"] setup( name="tokenizers", diff --git a/bindings/python/tests/implementations/test_sentencepiece.py b/bindings/python/tests/implementations/test_sentencepiece.py index 166b2d14..32495cc4 100644 --- a/bindings/python/tests/implementations/test_sentencepiece.py +++ b/bindings/python/tests/implementations/test_sentencepiece.py @@ -1,5 +1,7 @@ +import os import pytest + from tokenizers import SentencePieceBPETokenizer, SentencePieceUnigramTokenizer @@ -14,6 +16,32 @@ class TestSentencePieceBPE: class TestSentencePieceUnigram: + def test_train(self, tmpdir): + p = tmpdir.mkdir("tmpdir").join("file.txt") + p.write("A first sentence\nAnother sentence\nAnd a last one") + + tokenizer = SentencePieceUnigramTokenizer() + tokenizer.train(files=str(p), show_progress=False) + + output = tokenizer.encode("A sentence") + assert output.tokens == ["▁A", "▁", "s", "en", "t", "en", "c", "e"] + + with pytest.raises(Exception) as excinfo: + _ = tokenizer.encode("A sentence 🤗") + assert str(excinfo.value) == "Encountered an unknown token but `unk_id` is missing" + + def test_train_with_unk_token(self, tmpdir): + p = tmpdir.mkdir("tmpdir").join("file.txt") + p.write("A first sentence\nAnother sentence\nAnd a last one") + + tokenizer = SentencePieceUnigramTokenizer() + tokenizer.train( + files=str(p), show_progress=False, special_tokens=[""], unk_token="" + ) + output = tokenizer.encode("A sentence 🤗") + assert output.ids[-1] == 0 + assert output.tokens == ["▁A", "▁", "s", "en", "t", "en", "c", "e", "▁", "🤗"] + def test_train_from_iterator(self): text = ["A first sentence", "Another sentence", "And a last one"] tokenizer = SentencePieceUnigramTokenizer() @@ -21,3 +49,17 @@ class TestSentencePieceUnigram: output = tokenizer.encode("A sentence") assert output.tokens == ["▁A", "▁", "s", "en", "t", "en", "c", "e"] + + with pytest.raises(Exception) as excinfo: + _ = tokenizer.encode("A sentence 🤗") + assert str(excinfo.value) == "Encountered an unknown token but `unk_id` is missing" + + def test_train_from_iterator_with_unk_token(self): + text = ["A first sentence", "Another sentence", "And a last one"] + tokenizer = SentencePieceUnigramTokenizer() + tokenizer.train_from_iterator( + text, vocab_size=100, show_progress=False, special_tokens=[""], unk_token="" + ) + output = tokenizer.encode("A sentence 🤗") + assert output.ids[-1] == 0 + assert output.tokens == ["▁A", "▁", "s", "en", "t", "en", "c", "e", "▁", "🤗"]