mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-23 08:45:38 +00:00
Fixing Trainer with u8 instead of chars. (#452)
* Fixing Trainer with u8 instead of chars. Now check both optimized and unoptimized encoding schemes for Unigram. * Small fixes. * Fixing makefile.
This commit is contained in:
@ -61,8 +61,17 @@ def openai_files(data_dir):
|
||||
|
||||
@pytest.fixture(scope="session")
def train_files(data_dir):
    """Session-scoped fixture providing training-file paths for tokenizer tests.

    Downloads Norvig's big.txt once and derives a small (~100 line) sample
    file from its head.  Returns a dict mapping:
        "big"   -> path of the downloaded big.txt
        "small" -> path of the generated small.txt

    NOTE(review): ``data_dir`` is not read in this body — presumably it is a
    fixture dependency kept for its side effect (ensuring the data directory
    exists); confirm before removing it.
    """
    big = download("https://norvig.com/big.txt")
    small = os.path.join(DATA_PATH, "small.txt")
    # Copy the first ~100 lines of the big file into the small one.
    with open(small, "w") as dst, open(big, "r") as src:
        for i, line in enumerate(src):
            dst.write(line)
            if i > 100:
                break
    return {
        # Reuse the path already downloaded above.  The original dict had a
        # duplicate "big" key: the first entry ("big": download(...)) was dead
        # (overridden by the later key) yet still issued a redundant download.
        "big": big,
        "small": small,
    }
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user