// Copyright 2010-2021, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "dictionary/system/system_dictionary.h"
#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <iterator>
#include <limits>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>
#include "base/file_util.h"
#include "config/config_handler.h"
#include "data_manager/testing/mock_data_manager.h"
#include "dictionary/dictionary_test_util.h"
#include "dictionary/dictionary_token.h"
#include "dictionary/pos_matcher.h"
#include "dictionary/system/system_dictionary_builder.h"
#include "dictionary/text_dictionary_loader.h"
#include "protocol/commands.pb.h"
#include "protocol/config.pb.h"
#include "request/conversion_request.h"
#include "testing/googletest.h"
#include "testing/gunit.h"
#include "testing/mozctest.h"
#include "absl/container/btree_set.h"
#include "absl/flags/declare.h"
#include "absl/flags/flag.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/strings/string_view.h"
ABSL_FLAG(int32_t, dictionary_test_size, 100000,
"Dictionary size for this test.");
ABSL_FLAG(int32_t, dictionary_reverse_lookup_test_size, 1000,
"Number of tokens to run reverse lookup test.");
ABSL_DECLARE_FLAG(int32_t, min_key_length_to_use_small_cost_encoding);
namespace mozc {
namespace dictionary {
namespace {
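// Test fixture for SystemDictionary. It loads source tokens from the OSS
// dictionary source (dictionary00.txt, limited by --dictionary_test_size),
// builds a temporary dictionary image ("mozc.dic" under the test tmpdir),
// and disables small cost encoding in SetUp() so that looked-up tokens keep
// their exact costs.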
class SystemDictionaryTest : public ::testing::Test {
protected:
SystemDictionaryTest()
: pos_matcher_(mock_data_manager_.GetPosMatcherData()),
text_dict_(pos_matcher_),
dic_fn_(
FileUtil::JoinPath(absl::GetFlag(FLAGS_test_tmpdir), "mozc.dic")) {
const std::string dic_path = mozc::testing::GetSourceFileOrDie(
{MOZC_DICT_DIR_COMPONENTS, "dictionary_oss", "dictionary00.txt"});
text_dict_.LoadWithLineLimit(dic_path, "",
absl::GetFlag(FLAGS_dictionary_test_size));
convreq_.set_request(&request_);
convreq_.set_config(&config_);
}
void SetUp() override {
// Don't use small cost encoding by default.
original_flags_min_key_length_to_use_small_cost_encoding_ =
absl::GetFlag(FLAGS_min_key_length_to_use_small_cost_encoding);
absl::SetFlag(&FLAGS_min_key_length_to_use_small_cost_encoding,
std::numeric_limits<int32_t>::max());
request_.Clear();
config::ConfigHandler::GetDefaultConfig(&config_);
}
void TearDown() override {
absl::SetFlag(&FLAGS_min_key_length_to_use_small_cost_encoding,
original_flags_min_key_length_to_use_small_cost_encoding_);
// This config initialization will be removed once ConversionRequest can
// take config as an injected argument.
config::Config config;
config::ConfigHandler::GetDefaultConfig(&config);
config::ConfigHandler::SetConfig(config);
}
void BuildAndWriteSystemDictionary(const std::vector<Token *> &source,
size_t num_tokens,
const std::string &filename);
std::unique_ptr<SystemDictionary> BuildSystemDictionary(
const std::vector<Token *> &source,
size_t num_tokens = std::numeric_limits<size_t>::max());
bool CompareTokensForLookup(const Token &a, const Token &b,
bool reverse) const;
const testing::ScopedTempUserProfileDirectory scoped_profile_dir_;
const testing::MockDataManager mock_data_manager_;
dictionary::PosMatcher pos_matcher_;
TextDictionaryLoader text_dict_;
ConversionRequest convreq_;
config::Config config_;
commands::Request request_;
const std::string dic_fn_;
int original_flags_min_key_length_to_use_small_cost_encoding_;
};
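// Helpers to obtain a raw Token pointer from either a Token value or a
// std::unique_ptr<Token>; used by MakeTokenPointers() below.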
Token *GetTokenPointer(Token &token) { return &token; }
Token *GetTokenPointer(const std::unique_ptr<Token> &token) {
return token.get();
}
// Get pointers to the Tokens contained in `token_container`. Since the returned
// vector contains mutable pointers to the elements of `token_container`, it
// cannot be passed by const reference.
template <typename C>
std::vector<Token *> MakeTokenPointers(C *token_container) {
std::vector<Token *> ptrs;
std::transform(std::begin(*token_container), std::end(*token_container),
std::back_inserter(ptrs),
[](auto &token) { return GetTokenPointer(token); });
return ptrs;
}
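// Builds a system dictionary from the first |num_tokens| tokens of |source|
// and writes the image to |filename|.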
void SystemDictionaryTest::BuildAndWriteSystemDictionary(
const std::vector<Token *> &source, size_t num_tokens,
const std::string &filename) {
SystemDictionaryBuilder builder;
std::vector<Token *> tokens;
tokens.reserve(std::min(source.size(), num_tokens));
// Picks up the first |num_tokens| tokens.
for (auto it = source.begin();
tokens.size() < num_tokens && it != source.end(); ++it) {
tokens.push_back(*it);
}
builder.BuildFromTokens(tokens);
builder.WriteToFile(filename);
}
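// Builds the dictionary image into |dic_fn_| and reloads it as a
// SystemDictionary instance.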
std::unique_ptr<SystemDictionary> SystemDictionaryTest::BuildSystemDictionary(
const std::vector<Token *> &source, size_t num_tokens) {
BuildAndWriteSystemDictionary(source, num_tokens, dic_fn_);
return SystemDictionary::Builder(dic_fn_).Build().value();
}
// Returns true if the two tokens seem to be the same.
bool SystemDictionaryTest::CompareTokensForLookup(const Token &a,
const Token &b,
bool reverse) const {
const bool key_value_check = reverse ? (a.key == b.value && a.value == b.key)
: (a.key == b.key && a.value == b.value);
if (!key_value_check) {
return false;
}
const bool comp_cost = a.cost == b.cost;
if (!comp_cost) {
return false;
}
const bool spelling_match = (a.attributes & Token::SPELLING_CORRECTION) ==
(b.attributes & Token::SPELLING_CORRECTION);
if (!spelling_match) {
return false;
}
const bool id_match = (a.lid == b.lid) && (a.rid == b.rid);
if (!id_match) {
return false;
}
return true;
}
TEST_F(SystemDictionaryTest, HasValue) {
std::vector<Token> tokens;
for (int i = 0; i < 4; ++i) {
tokens.emplace_back(absl::StrFormat("きー%d", i),
absl::StrFormat("バリュー%d", i));
}
const std::string kFull = "ｆｕｌｌ";
const std::string kHiragana = "ひらがな";
const std::string kKatakanaKey = "かたかな";
const std::string kKatakanaValue = "カタカナ";
tokens.emplace_back("Mozc", "Mozc"); // Alphabet
tokens.emplace_back("upper", "UPPER"); // Alphabet upper case
tokens.emplace_back("full", kFull); // Alphabet full width
tokens.emplace_back(kHiragana, kHiragana); // Hiragana
tokens.emplace_back(kKatakanaKey, kKatakanaValue); // Katakana
std::unique_ptr<SystemDictionary> system_dic =
BuildSystemDictionary(MakeTokenPointers(&tokens));
ASSERT_TRUE(system_dic.get() != nullptr);
EXPECT_TRUE(system_dic->HasValue("バリュー0"));
EXPECT_TRUE(system_dic->HasValue("バリュー1"));
EXPECT_TRUE(system_dic->HasValue("バリュー2"));
EXPECT_TRUE(system_dic->HasValue("バリュー3"));
EXPECT_FALSE(system_dic->HasValue("バリュー4"));
EXPECT_FALSE(system_dic->HasValue("バリュー5"));
EXPECT_FALSE(system_dic->HasValue("バリュー6"));
EXPECT_TRUE(system_dic->HasValue("Mozc"));
EXPECT_FALSE(system_dic->HasValue("mozc"));
EXPECT_TRUE(system_dic->HasValue("UPPER"));
EXPECT_FALSE(system_dic->HasValue("upper"));
EXPECT_TRUE(system_dic->HasValue(kFull));
EXPECT_FALSE(system_dic->HasValue("full"));
EXPECT_TRUE(system_dic->HasValue(kHiragana));
EXPECT_FALSE(system_dic->HasValue("ヒラガナ\n"));
EXPECT_TRUE(system_dic->HasValue(kKatakanaValue));
EXPECT_FALSE(system_dic->HasValue(kKatakanaKey));
}
TEST_F(SystemDictionaryTest, NormalWord) {
Token token = {"", "", 100, 50, 70, Token::NONE};
std::unique_ptr<SystemDictionary> system_dic = BuildSystemDictionary(
{&token}, absl::GetFlag(FLAGS_dictionary_test_size));
ASSERT_TRUE(system_dic);
CollectTokenCallback callback;
// Look up by exact key.
system_dic->LookupPrefix(token.key, convreq_, &callback);
ASSERT_EQ(1, callback.tokens().size());
EXPECT_TOKEN_EQ(token, callback.tokens().front());
// Look up by prefix.
callback.Clear();
system_dic->LookupPrefix("あいう", convreq_, &callback);
ASSERT_EQ(1, callback.tokens().size());
EXPECT_TOKEN_EQ(token, callback.tokens().front());
// Nothing should be looked up.
callback.Clear();
system_dic->LookupPrefix("かきく", convreq_, &callback);
EXPECT_TRUE(callback.tokens().empty());
}
TEST_F(SystemDictionaryTest, SameWord) {
std::vector<Token> tokens = {
{"", "", 100, 50, 70, Token::NONE},
{"", "", 150, 100, 200, Token::NONE},
{"", "", 100, 1000, 2000, Token::NONE},
{"", "", 1000, 2000, 3000, Token::NONE},
};
std::vector<Token *> source_tokens = MakeTokenPointers(&tokens);
std::unique_ptr<SystemDictionary> system_dic = BuildSystemDictionary(
source_tokens, absl::GetFlag(FLAGS_dictionary_test_size));
ASSERT_TRUE(system_dic);
// All the tokens should be looked up.
CollectTokenCallback callback;
system_dic->LookupPrefix("", convreq_, &callback);
EXPECT_TOKENS_EQ_UNORDERED(source_tokens, callback.tokens());
}
TEST_F(SystemDictionaryTest, LookupAllWords) {
const std::vector<std::unique_ptr<Token>> &source_tokens =
text_dict_.tokens();
std::unique_ptr<SystemDictionary> system_dic =
BuildSystemDictionary(MakeTokenPointers(&source_tokens),
absl::GetFlag(FLAGS_dictionary_test_size));
ASSERT_TRUE(system_dic);
// All the tokens should be looked up.
for (size_t i = 0; i < source_tokens.size(); ++i) {
CheckTokenExistenceCallback callback(source_tokens[i].get());
system_dic->LookupPrefix(source_tokens[i]->key, convreq_, &callback);
EXPECT_TRUE(callback.found())
<< "Token was not found: " << PrintToken(*source_tokens[i]);
}
}
TEST_F(SystemDictionaryTest, SimpleLookupPrefix) {
const std::string k0 = "";
const std::string k1 = "はひふへほ";
Token t0 = {k0, "aa", 0, 0, 0, Token::NONE};
Token t1 = {k1, "bb", 0, 0, 0, Token::NONE};
std::vector<Token *> source_tokens = {&t0, &t1};
text_dict_.CollectTokens(&source_tokens);
std::unique_ptr<SystemDictionary> system_dic =
BuildSystemDictionary(source_tokens, 100);
ASSERT_TRUE(system_dic);
// |t0| should be looked up from |k1|.
CheckTokenExistenceCallback callback(&t0);
system_dic->LookupPrefix(k1, convreq_, &callback);
EXPECT_TRUE(callback.found());
}
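// Callback that exercises the traversal-control return values of
// SystemDictionary::Callback::OnKey(): it culls the subtree under "かき"
// (TRAVERSE_CULL), skips the tokens of "さ" (TRAVERSE_NEXT_KEY), stops the
// whole traversal at "た" (TRAVERSE_DONE), and records every (key, value)
// pair passed to OnToken().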
class LookupPrefixTestCallback : public SystemDictionary::Callback {
public:
ResultType OnKey(absl::string_view key) override {
if (key == "かき") {
return TRAVERSE_CULL;
} else if (key == "さ") {
return TRAVERSE_NEXT_KEY;
} else if (key == "た") {
return TRAVERSE_DONE;
}
return TRAVERSE_CONTINUE;
}
ResultType OnToken(absl::string_view key, absl::string_view actual_key,
const Token &token) override {
result_.insert(std::make_pair(token.key, token.value));
return TRAVERSE_CONTINUE;
}
const std::set<std::pair<std::string, std::string>> &result() const {
return result_;
}
private:
std::set<std::pair<std::string, std::string>> result_;
};
TEST_F(SystemDictionaryTest, LookupPrefix) {
// Set up a test dictionary.
struct {
const char *key;
const char *value;
} kKeyValues[] = {
{"", ""}, {"", ""}, {"", ""},
{"あい", ""}, {"あい", ""}, {"あいう", "藍雨"},
{"", ""}, {"かき", "牡蠣"}, {"かき", "夏季"},
{"かきく", "柿久"}, {"", ""}, {"", ""},
{"さし", ""}, {"", ""}, {"", ""},
{"たち", "多値"}, {"たちつ", "タチツ"}, {"", ""},
{"", ""}, {"はひ", "ハヒ"}, {"", ""},
{"はび", "波美"}, {"ばび", "馬尾"}, {"ばびぶ", "バビブ"},
};
constexpr size_t kKeyValuesSize = std::size(kKeyValues);
std::vector<Token> tokens;
tokens.reserve(kKeyValuesSize);
for (const auto &kv : kKeyValues) {
tokens.emplace_back(kv.key, kv.value);
}
std::unique_ptr<SystemDictionary> system_dic =
BuildSystemDictionary(MakeTokenPointers(&tokens), kKeyValuesSize);
ASSERT_TRUE(system_dic);
// Test for normal prefix lookup without key expansion.
{
LookupPrefixTestCallback callback;
system_dic->LookupPrefix("あい", // "あい"
convreq_, &callback);
const std::set<std::pair<std::string, std::string>> &result =
callback.result();
// "あ" -- "あい" should be found.
for (size_t i = 0; i < 5; ++i) {
const std::pair<std::string, std::string> entry(kKeyValues[i].key,
kKeyValues[i].value);
EXPECT_TRUE(result.end() != result.find(entry));
}
// The others should not be found.
for (size_t i = 5; i < std::size(kKeyValues); ++i) {
const std::pair<std::string, std::string> entry(kKeyValues[i].key,
kKeyValues[i].value);
EXPECT_TRUE(result.end() == result.find(entry));
}
}
// Test for normal prefix lookup without key expansion, but with culling
// feature.
{
LookupPrefixTestCallback callback;
system_dic->LookupPrefix("かきく", convreq_, &callback);
const std::set<std::pair<std::string, std::string>> &result =
callback.result();
// Only "か" should be found as the callback doesn't traverse the subtree of
// "かき" due to culling request from LookupPrefixTestCallback::OnKey().
for (size_t i = 0; i < kKeyValuesSize; ++i) {
const std::pair<std::string, std::string> entry(kKeyValues[i].key,
kKeyValues[i].value);
EXPECT_EQ(entry.first == "か", result.find(entry) != result.end());
}
}
// Test for TRAVERSE_NEXT_KEY.
{
LookupPrefixTestCallback callback;
system_dic->LookupPrefix("さしす", convreq_, &callback);
const std::set<std::pair<std::string, std::string>> &result =
callback.result();
// Only "さし" should be found as tokens for "さ" is skipped (see
// LookupPrefixTestCallback::OnKey()).
for (size_t i = 0; i < kKeyValuesSize; ++i) {
const std::pair<std::string, std::string> entry(kKeyValues[i].key,
kKeyValues[i].value);
EXPECT_EQ(entry.first == "さし", result.find(entry) != result.end());
}
}
// Test for TRAVERSE_DONE.
{
LookupPrefixTestCallback callback;
system_dic->LookupPrefix("たちつ", convreq_, &callback);
const std::set<std::pair<std::string, std::string>> &result =
callback.result();
// Nothing should be found as the traversal is immediately done after seeing
// "た"; see LookupPrefixTestCallback::OnKey().
EXPECT_TRUE(result.empty());
}
// Test for prefix lookup with key expansion.
{
LookupPrefixTestCallback callback;
// Use kana modifier insensitive lookup
request_.set_kana_modifier_insensitive_conversion(true);
config_.set_use_kana_modifier_insensitive_conversion(true);
system_dic->LookupPrefix("はひ", convreq_, &callback);
const std::set<std::pair<std::string, std::string>> &result =
callback.result();
const char *kExpectedKeys[] = {
"", "", "はひ", "ばひ", "はび", "ばび",
};
const absl::btree_set<std::string> expected(
kExpectedKeys, kExpectedKeys + std::size(kExpectedKeys));
for (size_t i = 0; i < kKeyValuesSize; ++i) {
const bool to_be_found =
expected.find(kKeyValues[i].key) != expected.end();
const std::pair<std::string, std::string> entry(kKeyValues[i].key,
kKeyValues[i].value);
EXPECT_EQ(result.find(entry) != result.end(), to_be_found);
}
}
}
TEST_F(SystemDictionaryTest, LookupPredictive) {
Token tokens[] = {
{"まみむめもや", "value0", 0, 0, 0, Token::NONE},
{"まみむめもやゆよ", "value1", 0, 0, 0, Token::NONE},
};
// Build a dictionary with the above two tokens plus those from test data.
std::vector<Token *> source_tokens = MakeTokenPointers(&tokens);
text_dict_.CollectTokens(&source_tokens); // Load test data.
std::unique_ptr<SystemDictionary> system_dic =
BuildSystemDictionary(source_tokens, 10000);
ASSERT_TRUE(system_dic);
// All the tokens in |tokens| should be looked up by "まみむめも".
constexpr char kMamimumemo[] = "まみむめも";
CheckMultiTokensExistenceCallback callback({&tokens[0], &tokens[1]});
system_dic->LookupPredictive(kMamimumemo, convreq_, &callback);
EXPECT_TRUE(callback.AreAllFound());
}
TEST_F(SystemDictionaryTest, LookupPredictiveKanaModifierInsensitiveLookup) {
Token tokens[] = {
{"がっこう", "学校", 0, 0, 0, Token::NONE},
{"かっこう", "格好", 0, 0, 0, Token::NONE},
};
const std::vector<Token *> source_tokens = {&tokens[0], &tokens[1]};
std::unique_ptr<SystemDictionary> system_dic =
BuildSystemDictionary(source_tokens, 100);
ASSERT_TRUE(system_dic);
const std::string kKey = "かつこう";
// Without Kana modifier insensitive lookup flag, nothing is looked up.
CollectTokenCallback callback;
request_.set_kana_modifier_insensitive_conversion(false);
config_.set_use_kana_modifier_insensitive_conversion(false);
system_dic->LookupPredictive(kKey, convreq_, &callback);
EXPECT_TRUE(callback.tokens().empty());
// With Kana modifier insensitive lookup flag, every token is looked up.
callback.Clear();
request_.set_kana_modifier_insensitive_conversion(true);
config_.set_use_kana_modifier_insensitive_conversion(true);
system_dic->LookupPredictive(kKey, convreq_, &callback);
EXPECT_TOKENS_EQ_UNORDERED(source_tokens, callback.tokens());
}
TEST_F(SystemDictionaryTest, LookupPredictiveCutOffEmulatingBFS) {
Token tokens[] = {
{"あい", "ai", 0, 0, 0, Token::NONE},
{"あいうえお", "aiueo", 0, 0, 0, Token::NONE},
};
// Build a dictionary with the above two tokens plus those from test data.
std::vector<Token *> source_tokens = MakeTokenPointers(&tokens);
text_dict_.CollectTokens(&source_tokens); // Load test data.
std::unique_ptr<SystemDictionary> system_dic =
BuildSystemDictionary(source_tokens, 10000);
ASSERT_TRUE(system_dic);
// Since there are many entries starting with "あ" in the test dictionary, it
// is expected that "あいうえお" is not looked up because of the longer-key
// cut-off mechanism. However, "あい" is looked up as it is short.
CheckMultiTokensExistenceCallback callback({&tokens[0], &tokens[1]});
system_dic->LookupPredictive("あ", convreq_, &callback);
EXPECT_TRUE(callback.IsFound(&tokens[0]));
EXPECT_FALSE(callback.IsFound(&tokens[1]));
}
TEST_F(SystemDictionaryTest, LookupExact) {
const std::string k0 = "";
const std::string k1 = "はひふへほ";
Token t0 = {k0, "aa", 0, 0, 0, Token::NONE};
Token t1 = {k1, "bb", 0, 0, 0, Token::NONE};
std::vector<Token *> source_tokens = {&t0, &t1};
text_dict_.CollectTokens(&source_tokens);
std::unique_ptr<SystemDictionary> system_dic =
BuildSystemDictionary(source_tokens, 100);
ASSERT_TRUE(system_dic);
// |t0| should not be looked up from |k1|.
CheckTokenExistenceCallback callback0(&t0);
system_dic->LookupExact(k1, convreq_, &callback0);
EXPECT_FALSE(callback0.found());
// But |t1| should be found.
CheckTokenExistenceCallback callback1(&t1);
system_dic->LookupExact(k1, convreq_, &callback1);
EXPECT_TRUE(callback1.found());
// Nothing should be found from "hoge".
CollectTokenCallback callback_hoge;
system_dic->LookupExact("hoge", convreq_, &callback_hoge);
EXPECT_TRUE(callback_hoge.tokens().empty());
}
TEST_F(SystemDictionaryTest, LookupReverse) {
Token tokens[] = {
{"", "", 1, 2, 3, Token::NONE},
{"どらえもん", "ドラえもん", 1, 2, 3, Token::NONE},
{"といざらす®", "トイザらス®", 1, 2, 3, Token::NONE},
// Both token[3] and token[4] will be encoded into 3 bytes.
{"ああああああ", "ああああああ", 32000, 1, 1, Token::NONE},
{"ああああああ", "ああああああ", 32000, 1, 2, Token::NONE},
// token[5] will be encoded into 3 bytes.
{"いいいいいい", "いいいいいい", 32000, 1, 1, Token::NONE},
{"どらえもん", "ドラえもん", 1, 2, 3, Token::SPELLING_CORRECTION},
{"こんさーと", "コンサート", 1, 1, 1, Token::NONE},
{"ばーじょん", "バージョン", 1, 1, 1, Token::NONE},
};
std::vector<Token *> source_tokens = MakeTokenPointers(&tokens);
text_dict_.CollectTokens(&source_tokens);
std::unique_ptr<SystemDictionary> system_dic =
BuildSystemDictionary(source_tokens, source_tokens.size());
ASSERT_TRUE(system_dic);
const size_t test_size =
std::min<size_t>(absl::GetFlag(FLAGS_dictionary_reverse_lookup_test_size),
source_tokens.size());
for (size_t source_index = 0; source_index < test_size; ++source_index) {
const Token &source_token = *source_tokens[source_index];
CollectTokenCallback callback;
system_dic->LookupReverse(source_token.value, convreq_, &callback);
bool found = false;
for (const Token &token : callback.tokens()) {
// Make sure that none of the keys in the lookup results exceeds the length
// of the original lookup key. (This once happened when "バージョン" was
// looked up and "ヴァージョン" was returned.)
EXPECT_LE(token.key.size(), source_token.value.size())
<< token.key << ":" << token.value << "\t" << source_token.value;
if (CompareTokensForLookup(source_token, token, true)) {
found = true;
}
}
if ((source_token.attributes & Token::SPELLING_CORRECTION) ==
Token::SPELLING_CORRECTION) {
EXPECT_FALSE(found) << "Spelling correction token was retrieved:"
<< PrintToken(source_token);
if (found) {
return;
}
} else {
EXPECT_TRUE(found) << "Failed to find " << source_token.key << ":"
<< source_token.value;
if (!found) {
return;
}
}
}
{
// Test for a non-exact transliterated index string: append "が".
const std::string key = absl::StrCat(tokens[7].value, "が");
CollectTokenCallback callback;
system_dic->LookupReverse(key, convreq_, &callback);
bool found = false;
for (const Token &token : callback.tokens()) {
if (CompareTokensForLookup(tokens[7], token, true)) {
found = true;
}
}
EXPECT_TRUE(found) << "Missed token for non exact transliterated index "
<< key;
}
}
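// LookupReverse() should return the same tokens regardless of whether the
// dictionary is opened with ENABLE_REVERSE_LOOKUP_INDEX.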
TEST_F(SystemDictionaryTest, LookupReverseIndex) {
const std::vector<std::unique_ptr<Token>> &source_tokens =
text_dict_.tokens();
BuildAndWriteSystemDictionary(MakeTokenPointers(&source_tokens),
absl::GetFlag(FLAGS_dictionary_test_size),
dic_fn_);
std::unique_ptr<SystemDictionary> system_dic_without_index =
SystemDictionary::Builder(dic_fn_)
.SetOptions(SystemDictionary::NONE)
.Build()
.value();
ASSERT_TRUE(system_dic_without_index)
<< "Failed to open dictionary source:" << dic_fn_;
std::unique_ptr<SystemDictionary> system_dic_with_index =
SystemDictionary::Builder(dic_fn_)
.SetOptions(SystemDictionary::ENABLE_REVERSE_LOOKUP_INDEX)
.Build()
.value();
ASSERT_TRUE(system_dic_with_index)
<< "Failed to open dictionary source:" << dic_fn_;
int size = absl::GetFlag(FLAGS_dictionary_reverse_lookup_test_size);
for (auto it = source_tokens.begin(); size > 0 && it != source_tokens.end();
++it, --size) {
const Token &t = **it;
CollectTokenCallback callback1, callback2;
system_dic_without_index->LookupReverse(t.value, convreq_, &callback1);
system_dic_with_index->LookupReverse(t.value, convreq_, &callback2);
const std::vector<Token> &tokens1 = callback1.tokens();
const std::vector<Token> &tokens2 = callback2.tokens();
ASSERT_EQ(tokens1.size(), tokens2.size());
for (size_t i = 0; i < tokens1.size(); ++i) {
EXPECT_TOKEN_EQ(tokens1[i], tokens2[i]);
}
}
}
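// LookupReverse() should find the token even when the reverse lookup cache
// has been populated for the value beforehand.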
TEST_F(SystemDictionaryTest, LookupReverseWithCache) {
const std::string kDoraemon = "ドラえもん";
Token source_token;
source_token.key = "どらえもん";
source_token.value = kDoraemon;
source_token.cost = 1;
source_token.lid = 2;
source_token.rid = 3;
std::vector<Token *> source_tokens = {&source_token};
text_dict_.CollectTokens(&source_tokens);
std::unique_ptr<SystemDictionary> system_dic =
BuildSystemDictionary(source_tokens, source_tokens.size());
ASSERT_TRUE(system_dic);
system_dic->PopulateReverseLookupCache(kDoraemon);
Token target_token = source_token;
target_token.key.swap(target_token.value);
CheckTokenExistenceCallback callback(&target_token);
system_dic->LookupReverse(kDoraemon, convreq_, &callback);
EXPECT_TRUE(callback.found())
<< "Could not find " << PrintToken(source_token);
system_dic->ClearReverseLookupCache();
}
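// Tokens carrying the SPELLING_CORRECTION attribute should be stored and
// looked up like regular tokens.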
TEST_F(SystemDictionaryTest, SpellingCorrectionTokens) {
std::vector<Token> tokens = {
{"あぼがど", "アボカド", 1, 0, 2, Token::SPELLING_CORRECTION},
{"しゅみれーしょん", "シミュレーション", 1, 100, 3,
Token::SPELLING_CORRECTION},
{"あきはばら", "秋葉原", 1000, 1, 2, Token::NONE},
};
std::vector<Token *> source_tokens = MakeTokenPointers(&tokens);
std::unique_ptr<SystemDictionary> system_dic =
BuildSystemDictionary(source_tokens, source_tokens.size());
ASSERT_TRUE(system_dic);
for (size_t i = 0; i < source_tokens.size(); ++i) {
CheckTokenExistenceCallback callback(source_tokens[i]);
system_dic->LookupPrefix(source_tokens[i]->key, convreq_, &callback);
EXPECT_TRUE(callback.found())
<< "Token " << i << " was not found: " << PrintToken(*source_tokens[i]);
}
}
TEST_F(SystemDictionaryTest, EnableNoModifierTargetWithLoudsTrie) {
const std::string k0 = "かつ";
const std::string k1 = "かっこ";
const std::string k2 = "かつこう";
const std::string k3 = "かっこう";
const std::string k4 = "がっこう";
Token tokens[5] = {
{k0, "aa", 0, 0, 0, Token::NONE}, {k1, "bb", 0, 0, 0, Token::NONE},
{k2, "cc", 0, 0, 0, Token::NONE}, {k3, "dd", 0, 0, 0, Token::NONE},
{k4, "ee", 0, 0, 0, Token::NONE},
};
std::vector<Token *> source_tokens = MakeTokenPointers(&tokens);
text_dict_.CollectTokens(&source_tokens);
std::unique_ptr<SystemDictionary> system_dic =
BuildSystemDictionary(source_tokens, 100);
ASSERT_TRUE(system_dic);
request_.set_kana_modifier_insensitive_conversion(true);
config_.set_use_kana_modifier_insensitive_conversion(true);
// Prefix search
for (size_t i = 0; i < std::size(tokens); ++i) {
CheckTokenExistenceCallback callback(&tokens[i]);
// "かつこう" -> "かつ", "かっこ", "かつこう", "かっこう" and "がっこう"
system_dic->LookupPrefix(k2, convreq_, &callback);
EXPECT_TRUE(callback.found())
<< "Token " << i << " was not found: " << PrintToken(tokens[i]);
}
// Predictive searches
{
// "かつ" -> "かつ", "かっこ", "かつこう", "かっこう" and "がっこう"
std::vector<Token *> expected = MakeTokenPointers(&tokens);
CheckMultiTokensExistenceCallback callback(expected);
system_dic->LookupPredictive(k0, convreq_, &callback);
EXPECT_TRUE(callback.AreAllFound());
}
{
// "かっこ" -> "かっこ", "かっこう" and "がっこう"
std::vector<Token *> expected = {&tokens[1], &tokens[3], &tokens[4]};
CheckMultiTokensExistenceCallback callback(expected);
system_dic->LookupPredictive(k1, convreq_, &callback);
EXPECT_TRUE(callback.AreAllFound());
}
}
TEST_F(SystemDictionaryTest, NoModifierForKanaEntries) {
Token t0 = {"ていすてぃんぐ", "テイスティング", 0, 0, 0, Token::NONE};
Token t1 = {"てすとです", "てすとです", 0, 0, 0, Token::NONE};
std::vector<Token *> source_tokens = {&t0, &t1};
text_dict_.CollectTokens(&source_tokens);
std::unique_ptr<SystemDictionary> system_dic =
BuildSystemDictionary(source_tokens, 100);
ASSERT_TRUE(system_dic);
// Look up |t0| from "ていすていんぐ".
const std::string k = "ていすていんぐ";
request_.set_kana_modifier_insensitive_conversion(true);
config_.set_use_kana_modifier_insensitive_conversion(true);
CheckTokenExistenceCallback callback(&t0);
system_dic->LookupPrefix(k, convreq_, &callback);
EXPECT_TRUE(callback.found()) << "Not found: " << PrintToken(t0);
}
TEST_F(SystemDictionaryTest, DoNotReturnNoModifierTargetWithLoudsTrie) {
const std::string k0 = "かつ";
const std::string k1 = "かっこ";
const std::string k2 = "かつこう";
const std::string k3 = "かっこう";
const std::string k4 = "がっこう";
Token tokens[5] = {
{k0, "aa", 0, 0, 0, Token::NONE}, {k1, "bb", 0, 0, 0, Token::NONE},
{k2, "cc", 0, 0, 0, Token::NONE}, {k3, "dd", 0, 0, 0, Token::NONE},
{k4, "ee", 0, 0, 0, Token::NONE},
};
std::vector<Token *> source_tokens = MakeTokenPointers(&tokens);
text_dict_.CollectTokens(&source_tokens);
std::unique_ptr<SystemDictionary> system_dic =
BuildSystemDictionary(source_tokens, 100);
ASSERT_TRUE(system_dic);
request_.set_kana_modifier_insensitive_conversion(false);
config_.set_use_kana_modifier_insensitive_conversion(false);
// Prefix search
{
// "かっこう" (k3) -> "かっこ" (k1) and "かっこう" (k3)
// Make sure "がっこう" is not in the results when searched by "かっこう"
std::vector<Token *> to_be_looked_up = {&tokens[1], &tokens[3]};
std::vector<Token *> not_to_be_looked_up = {&tokens[0], &tokens[2],
&tokens[4]};
for (size_t i = 0; i < to_be_looked_up.size(); ++i) {
CheckTokenExistenceCallback callback(to_be_looked_up[i]);
system_dic->LookupPrefix(k3, convreq_, &callback);
EXPECT_TRUE(callback.found())
<< "Token is not found: " << PrintToken(*to_be_looked_up[i]);
}
for (size_t i = 0; i < not_to_be_looked_up.size(); ++i) {
CheckTokenExistenceCallback callback(not_to_be_looked_up[i]);
system_dic->LookupPrefix(k3, convreq_, &callback);
EXPECT_FALSE(callback.found()) << "Token should not be found: "
<< PrintToken(*not_to_be_looked_up[i]);
}
}
// Predictive search
{
// "かっこ" -> "かっこ" and "かっこう"
// Make sure "がっこう" is not in the results when searched by "かっこ"
std::vector<Token *> to_be_looked_up = {&tokens[1], &tokens[3]};
std::vector<Token *> not_to_be_looked_up = {&tokens[0], &tokens[2],
&tokens[4]};
for (size_t i = 0; i < to_be_looked_up.size(); ++i) {
CheckTokenExistenceCallback callback(to_be_looked_up[i]);
system_dic->LookupPredictive(k1, convreq_, &callback);
EXPECT_TRUE(callback.found())
<< "Token is not found: " << PrintToken(*to_be_looked_up[i]);
}
for (size_t i = 0; i < not_to_be_looked_up.size(); ++i) {
CheckTokenExistenceCallback callback(not_to_be_looked_up[i]);
system_dic->LookupPredictive(k3, convreq_, &callback);
EXPECT_FALSE(callback.found()) << "Token should not be found: "
<< PrintToken(*not_to_be_looked_up[i]);
}
}
}
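// Re-enables small cost encoding (SetUp() disables it) and verifies that the
// builder does not apply it to heteronyms sharing the same key, which would
// make the original tokens unrecoverable.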
TEST_F(SystemDictionaryTest, ShouldNotUseSmallCostEncodingForHeteronyms) {
absl::SetFlag(&FLAGS_min_key_length_to_use_small_cost_encoding,
original_flags_min_key_length_to_use_small_cost_encoding_);
std::vector<Token> tokens = {
{"しょうろんぽう", "ショウロンポウ", 5948, 100, 100, Token::NONE},
{"しょうろんぽう", "小籠包", 7692, 100, 100, Token::NONE},
{"しょーろんぽう", "ショーロンポウ", 6092, 200, 200, Token::NONE},
{"しょーろんぽう", "小籠包", 9000, 100, 100, Token::NONE},
};
std::vector<Token *> source_tokens = MakeTokenPointers(&tokens);
std::unique_ptr<SystemDictionary> system_dic =
BuildSystemDictionary(source_tokens, source_tokens.size());
ASSERT_TRUE(system_dic);
for (size_t i = 0; i < source_tokens.size(); ++i) {
CheckTokenExistenceCallback callback(source_tokens[i]);
system_dic->LookupPrefix(source_tokens[i]->key, convreq_, &callback);
// The original token will not be found if it has been encoded with small
// cost encoding.
EXPECT_TRUE(callback.found())
<< "Token " << i << " was not found: " << PrintToken(*source_tokens[i]);
}
}
} // namespace
} // namespace dictionary
} // namespace mozc