Add an Encoding.sequences to allow masking

2025-12-12 05:18:39 +00:00 · 2020-11-05 13:12:15 -05:00
parent 385d25720a
commit 57d162b269
8 changed files with 74 additions and 1 deletions
--- a/bindings/node/lib/bindings/raw-encoding.d.ts
+++ b/bindings/node/lib/bindings/raw-encoding.d.ts
@@ -123,6 +123,11 @@ export interface RawEncoding {
   */
  getWords(): (number | undefined)[];
  /**
   * The sequence index of each token, i.e. the index of the input sequence
   * the token is associated with. An entry may be `undefined` when the token
   * is not related to any input sequence.
   */
  getSequences(): (number | undefined)[];
  /**
   * Pad the current Encoding at the given length
   *
--- a/bindings/node/lib/bindings/raw-encoding.test.ts
+++ b/bindings/node/lib/bindings/raw-encoding.test.ts
@@ -112,6 +112,13 @@ describe("RawEncoding", () => {
    });
  });
  describe("getSequences", () => {
    it("returns the correct list of indexes", () => {
      // A single input only ever refers to sequence 0.
      const expectedSingle = [0, 0, 0, 0, 0];
      // A pair input refers to sequence 0 for its first part, then sequence 1.
      const expectedDual = [0, 0, 0, 0, 0, 1, 1, 1, 1];

      const singleSequences = encoding.getSequences();
      expect(singleSequences).toEqual(expectedSingle);

      const dualSequences = encodingDual.getSequences();
      expect(dualSequences).toEqual(expectedDual);
    });
  });
  describe("wordToTokens", () => {
    it("returns the correct indexes", () => {
      const indexes = encoding.wordToTokens(3);
--- a/bindings/node/lib/implementations/encoding.ts
+++ b/bindings/node/lib/implementations/encoding.ts
@@ -11,6 +11,7 @@ export class Encoding {
  private _tokens?: string[];
  private _typeIds?: number[];
  private _wordIndexes?: (number | undefined)[];
  private _sequenceIndexes?: (number | undefined)[];
  constructor(private _rawEncoding: RawEncoding) {}
@@ -151,6 +152,14 @@ export class Encoding {
    return (this._wordIndexes = this._rawEncoding.getWords());
  }
  /**
   * The sequence index of each token, as reported by the raw encoding.
   * The value is fetched from the raw encoding on first access and cached
   * in `_sequenceIndexes` for subsequent reads.
   */
  get sequenceIndexes(): (number | undefined)[] {
    if (!this._sequenceIndexes) {
      // Nothing cached yet: ask the native encoding once and remember the answer.
      this._sequenceIndexes = this._rawEncoding.getSequences();
    }

    return this._sequenceIndexes;
  }
  /**
   * Get the encoded tokens corresponding to the word at the given index in one of the input
   * sequences, with the form [startToken, endToken+1]
--- a/bindings/node/native/src/encoding.rs
+++ b/bindings/node/native/src/encoding.rs
@@ -115,7 +115,7 @@ declare_types! {
        }
        method getWords(mut cx) {
-            // getWords(): number[]
+            // getWords(): (number | undefined)[]
            let this = cx.this();
            let guard = cx.lock();
@@ -127,6 +127,18 @@ declare_types! {
            Ok(neon_serde::to_value(&mut cx, &ids)?)
        }
        method getSequences(mut cx) {
            // getSequences(): (number | undefined)[]
            //
            // Calls `get_sequences()` on the wrapped encoding and hands the result
            // back to JavaScript through `neon_serde::to_value`.
            let this = cx.this();
            // The guard is required to borrow the native data stored behind `this`.
            let guard = cx.lock();
            // The borrow is a temporary of this statement only; it is no longer
            // needed once `get_sequences()` has returned its value.
            // Panics with "Uninitialized Encoding" if `encoding` holds no value.
            let ids = this.borrow(&guard)
                .encoding.as_ref().expect("Uninitialized Encoding")
                .get_sequences();
            // Convert the Rust value into a JS value; conversion errors propagate via `?`.
            Ok(neon_serde::to_value(&mut cx, &ids)?)
        }
        method getOffsets(mut cx) {
            // getOffsets(): [number, number][]
--- a/bindings/python/py_src/tokenizers/init.pyi
+++ b/bindings/python/py_src/tokenizers/init.pyi
@@ -327,6 +327,17 @@ class Encoding:
        """
        pass
    @property
    def sequences(self) -> List[Optional[int]]:
        """The generated sequence indices.
        They represent the index of the input sequence associated with each token.
        The sequence id can be None if the token is not related to any input sequence,
        as is the case for special tokens, for example.
        Returns:
            A :obj:`List` of :obj:`Optional[int]`: A list of optional sequence indices.
        """
    @property
    def type_ids(self) -> List[int]:
        """The generated type IDs
--- a/bindings/python/src/encoding.rs
+++ b/bindings/python/src/encoding.rs
@@ -152,6 +152,19 @@ impl PyEncoding {
        self.encoding.get_words().to_vec()
    }
    /// The generated sequence indices.
    ///
    /// They represent the index of the input sequence associated with each token.
    /// The sequence id can be None if the token is not related to any input sequence,
    /// as is the case for special tokens, for example.
    ///
    /// Returns:
    ///     A :obj:`List` of :obj:`Optional[int]`: A list of optional sequence indices.
    #[getter]
    fn get_sequences(&self) -> Vec<Option<usize>> {
        self.encoding.get_sequences()
    }
    /// The generated type IDs
    ///
    /// Generally used for tasks like sequence classification or question answering,
--- a/bindings/python/tests/bindings/test_encoding.py
+++ b/bindings/python/tests/bindings/test_encoding.py
@@ -12,6 +12,12 @@ class TestEncoding:
        pair_encoding = tokenizer.encode("I love HuggingFace", "Do you?")
        return single_encoding, pair_encoding
    def test_sequences(self, encodings):
        single, pair = encodings
        assert single.sequences == [None, 0, 0, 0, 0, None]
        assert pair.sequences == [None, 0, 0, 0, 0, None, 1, 1, 1, None]
    def test_n_sequences(self, encodings):
        single, pair = encodings
        assert single.n_sequences == 1
--- a/tokenizers/src/tokenizer/encoding.rs
+++ b/tokenizers/src/tokenizer/encoding.rs
@@ -129,6 +129,16 @@ impl Encoding {
        &self.words
    }
    /// Returns, for every position of this encoding, the id of the sequence
    /// whose range covers it.
    ///
    /// The result has `self.len()` entries. Positions that fall inside no
    /// sequence range are left as `None`.
    pub fn get_sequences(&self) -> Vec<Option<usize>> {
        // Start with every position unassigned.
        let mut sequence_ids: Vec<Option<usize>> = vec![None; self.len()];
        for sequence_id in 0..self.n_sequences() {
            // Stamp this sequence's id over the positions it spans.
            for slot in &mut sequence_ids[self.sequence_range(sequence_id)] {
                *slot = Some(sequence_id);
            }
        }
        sequence_ids
    }
    /// Returns a mutable slice over this encoding's `words` field.
    ///
    /// Entries can be rewritten in place through the returned slice, but the
    /// number of entries cannot be changed through it.
    pub fn get_words_mut(&mut self) -> &mut [Option<u32>] {
        &mut self.words
    }