mirror of
https://github.com/mii443/mozc.git
synced 2025-08-22 16:15:46 +00:00
Split util module for zip code
BUG= TEST= REF_BUG= REF_CL=132211827 REF_TIME=2016-09-05T14:10:24+09:00 REF_TIME_RAW=1473052224 +0900
This commit is contained in:
@ -30,7 +30,7 @@
|
|||||||
|
|
||||||
MAJOR=2
|
MAJOR=2
|
||||||
MINOR=18
|
MINOR=18
|
||||||
BUILD=2608
|
BUILD=2609
|
||||||
REVISION=102
|
REVISION=102
|
||||||
# This version represents the version of Mozc IME engine (converter, predictor,
|
# This version represents the version of Mozc IME engine (converter, predictor,
|
||||||
# etc.). This version info is included both in the Mozc server and in the Mozc
|
# etc.). This version info is included both in the Mozc server and in the Mozc
|
||||||
|
@ -51,12 +51,13 @@
|
|||||||
|
|
||||||
__author__ = "toshiyuki"
|
__author__ = "toshiyuki"
|
||||||
|
|
||||||
import codecs
|
|
||||||
import optparse
|
import optparse
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
import unicodedata
|
import unicodedata
|
||||||
|
|
||||||
|
from dictionary import zip_code_util
|
||||||
|
|
||||||
|
|
||||||
ZIP_CODE_LABEL = 'ZIP_CODE'
|
ZIP_CODE_LABEL = 'ZIP_CODE'
|
||||||
ZIP_CODE_COST = 7000
|
ZIP_CODE_COST = 7000
|
||||||
@ -85,8 +86,8 @@ class ZipEntry(object):
|
|||||||
|
|
||||||
def ProcessZipCodeCSV(file_name):
|
def ProcessZipCodeCSV(file_name):
|
||||||
"""Process zip code csv."""
|
"""Process zip code csv."""
|
||||||
csv_lines = ReadCSV(file_name)
|
csv_lines = zip_code_util.ReadCSV(file_name)
|
||||||
merged_csv_lines = MergeCSV(csv_lines)
|
merged_csv_lines = zip_code_util.MergeCSV(csv_lines)
|
||||||
for tokens in merged_csv_lines:
|
for tokens in merged_csv_lines:
|
||||||
for entry in ReadZipCodeEntries(tokens[2], tokens[6], tokens[7], tokens[8]):
|
for entry in ReadZipCodeEntries(tokens[2], tokens[6], tokens[7], tokens[8]):
|
||||||
entry.Output()
|
entry.Output()
|
||||||
@ -94,95 +95,12 @@ def ProcessZipCodeCSV(file_name):
|
|||||||
|
|
||||||
def ProcessJigyosyoCSV(file_name):
|
def ProcessJigyosyoCSV(file_name):
|
||||||
"""Process jigyosyo csv."""
|
"""Process jigyosyo csv."""
|
||||||
for tokens in ReadCSV(file_name):
|
for tokens in zip_code_util.ReadCSV(file_name):
|
||||||
entry = ReadJigyosyoEntry(tokens[7], tokens[3], tokens[4],
|
entry = ReadJigyosyoEntry(tokens[7], tokens[3], tokens[4],
|
||||||
tokens[5], tokens[2])
|
tokens[5], tokens[2])
|
||||||
entry.Output()
|
entry.Output()
|
||||||
|
|
||||||
|
|
||||||
def ReadCSV(file_name):
|
|
||||||
"""Read CSV file."""
|
|
||||||
# Do not use csv reader module because it does not support unicode
|
|
||||||
return [GetCells(line) for line in codecs.open(file_name,
|
|
||||||
'r',
|
|
||||||
'shift_jis',
|
|
||||||
errors='replace')]
|
|
||||||
|
|
||||||
|
|
||||||
def GetCells(line):
|
|
||||||
"""Get cells."""
|
|
||||||
# [A, B, C, ..] from "A","B ",C,..
|
|
||||||
return [column.strip('"').strip() for column in line.strip().split(',')]
|
|
||||||
|
|
||||||
|
|
||||||
def MergeCSV(csv_lines):
|
|
||||||
"""Merge CSV."""
|
|
||||||
# When the flag says a zip code have no multiple entry while we can see
|
|
||||||
# multiple line for that zip code, we have to merge them.
|
|
||||||
zip_count = {}
|
|
||||||
ret = []
|
|
||||||
for entry in csv_lines:
|
|
||||||
zip_code = entry[2]
|
|
||||||
zip_count[zip_code] = zip_count.get(zip_code, 0) + 1
|
|
||||||
if not ShouldMerge(zip_count, entry):
|
|
||||||
ret.append(entry)
|
|
||||||
else:
|
|
||||||
last_entry = ret[-1]
|
|
||||||
last_entry[8] += entry[8] # '町域'
|
|
||||||
return ret
|
|
||||||
|
|
||||||
|
|
||||||
def ShouldMerge(zip_count, entry):
|
|
||||||
"""Return true if this entry should be merged to the previous entry."""
|
|
||||||
zip_code = entry[2]
|
|
||||||
flag_multi = (entry[12] == '1')
|
|
||||||
should_merge = (zip_count[zip_code] > 1 and not flag_multi)
|
|
||||||
should_merge_special = ShouldMergeSpecial(entry)
|
|
||||||
return should_merge or should_merge_special
|
|
||||||
|
|
||||||
|
|
||||||
class SpecialMergeZip(object):
|
|
||||||
"""Container class for special zip code entry to be merged."""
|
|
||||||
|
|
||||||
def __init__(self, zip_code, pref, city, towns):
|
|
||||||
self.zip_code = zip_code
|
|
||||||
self.pref = pref
|
|
||||||
self.city = city
|
|
||||||
self.towns = towns
|
|
||||||
|
|
||||||
|
|
||||||
_SPECIAL_CASES = [
|
|
||||||
SpecialMergeZip(u'5900111', u'大阪府', u'堺市中区', [u'三原台']),
|
|
||||||
SpecialMergeZip(u'8710046', u'大分県', u'中津市',
|
|
||||||
[u'金谷', u'西堀端', u'東堀端', u'古金谷']),
|
|
||||||
SpecialMergeZip(u'9218046', u'石川県', u'金沢市',
|
|
||||||
[u'大桑町', u'三小牛町']),
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
def ShouldMergeSpecial(entry):
|
|
||||||
"""Return true for special cases to be merged."""
|
|
||||||
zip_code = entry[2]
|
|
||||||
level1 = entry[6]
|
|
||||||
level2 = entry[7]
|
|
||||||
level3 = entry[8]
|
|
||||||
for special_case in _SPECIAL_CASES:
|
|
||||||
if (zip_code == special_case.zip_code and
|
|
||||||
level1 == special_case.pref and
|
|
||||||
level2 == special_case.city and
|
|
||||||
ContinuedLine(level3, special_case.towns)):
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def ContinuedLine(level3, towns):
|
|
||||||
"""Return true if this seems continued line."""
|
|
||||||
for town in towns:
|
|
||||||
if level3.startswith(town):
|
|
||||||
return False
|
|
||||||
return True
|
|
||||||
|
|
||||||
|
|
||||||
def ReadZipCodeEntries(zip_code, level1, level2, level3):
|
def ReadZipCodeEntries(zip_code, level1, level2, level3):
|
||||||
"""Read zip code entries."""
|
"""Read zip code entries."""
|
||||||
return [ZipEntry(zip_code, u''.join([level1, level2, town]))
|
return [ZipEntry(zip_code, u''.join([level1, level2, town]))
|
||||||
|
116
src/dictionary/zip_code_util.py
Normal file
116
src/dictionary/zip_code_util.py
Normal file
@ -0,0 +1,116 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# Copyright 2010-2016, Google Inc.
|
||||||
|
# All rights reserved.
|
||||||
|
#
|
||||||
|
# Redistribution and use in source and binary forms, with or without
|
||||||
|
# modification, are permitted provided that the following conditions are
|
||||||
|
# met:
|
||||||
|
#
|
||||||
|
# * Redistributions of source code must retain the above copyright
|
||||||
|
# notice, this list of conditions and the following disclaimer.
|
||||||
|
# * Redistributions in binary form must reproduce the above
|
||||||
|
# copyright notice, this list of conditions and the following disclaimer
|
||||||
|
# in the documentation and/or other materials provided with the
|
||||||
|
# distribution.
|
||||||
|
# * Neither the name of Google Inc. nor the names of its
|
||||||
|
# contributors may be used to endorse or promote products derived from
|
||||||
|
# this software without specific prior written permission.
|
||||||
|
#
|
||||||
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
"""Util module for Japanese postal(ZIP) code."""
|
||||||
|
|
||||||
|
import codecs
|
||||||
|
|
||||||
|
|
||||||
|
def ReadCSV(file_name):
  """Reads a Shift_JIS encoded CSV file into a list of rows.

  Args:
    file_name: Path of the CSV file to read.

  Returns:
    A list with one element per line of the file; each element is the list
    of cells returned by GetCells() for that line.
  """
  # Do not use csv reader module because it does not support unicode
  # Bytes that are not valid Shift_JIS are replaced (errors='replace')
  # instead of raising. The with-statement closes the file as soon as all
  # lines are read rather than leaving the handle to the garbage collector.
  with codecs.open(file_name, 'r', 'shift_jis',
                   errors='replace') as csv_file:
    return [GetCells(line) for line in csv_file]
|
||||||
|
|
||||||
|
|
||||||
|
def GetCells(line):
  """Splits one CSV line into a list of cleaned-up cells.

  The line is split on every comma (commas inside quotes are not treated
  specially). Each cell then has its surrounding whitespace and double
  quotes removed.

  Args:
    line: One line of CSV text, with or without its trailing newline.

  Returns:
    A list of cell strings, e.g. [A, B, C] from "A","B ",C
  """
  # Strip whitespace first so that a quote next to a blank (as in
  # '"A", "B"') is still removed; strip once more for blanks that were
  # inside the quotes (as in '"B "').
  return [column.strip().strip('"').strip()
          for column in line.strip().split(',')]
|
||||||
|
|
||||||
|
|
||||||
|
def MergeCSV(csv_lines):
  """Merges rows that continue the previous row into that row.

  Args:
    csv_lines: Iterable of rows (sequences of cells). Index 2 is read as the
      zip code and index 8 is the text that gets concatenated; ShouldMerge()
      reads further columns of each row.

  Returns:
    A new list of rows in input order, where every row for which
    ShouldMerge() is true has had its index-8 cell appended to the index-8
    cell of the row before it. The rows are copies; csv_lines is not
    modified.
  """
  # When the flag says a zip code has no multiple entries while we can see
  # multiple lines for that zip code, we have to merge them.
  zip_count = {}
  ret = []
  for entry in csv_lines:
    zip_code = entry[2]
    zip_count[zip_code] = zip_count.get(zip_code, 0) + 1
    # A continuation needs a previous row to be appended to. If the very
    # first row looks like a continuation, keep it as a row of its own
    # instead of failing on ret[-1].
    if ret and ShouldMerge(zip_count, entry):
      ret[-1][8] += entry[8]  # town-area column
    else:
      # Copy the row so that later merges never alter the caller's data.
      ret.append(list(entry))
  return ret
|
||||||
|
|
||||||
|
|
||||||
|
def ShouldMerge(zip_count, entry):
  """Tells whether a row has to be merged into the row before it.

  Args:
    zip_count: Dict mapping a zip code to the number of rows counted for it.
    entry: One row; index 2 is the zip code and index 12 is the flag that is
      compared against '1'.

  Returns:
    True if the zip code was counted more than once while the flag is not
    '1', or if ShouldMergeSpecial() accepts the row; False otherwise.
  """
  zip_code = entry[2]
  multi_flag_set = entry[12] == '1'
  repeated_without_flag = not (zip_count[zip_code] <= 1 or multi_flag_set)
  special = ShouldMergeSpecial(entry)
  if repeated_without_flag:
    return True
  return special
|
||||||
|
|
||||||
|
|
||||||
|
class SpecialMergeZip(object):
  """Container class for special zip code entry to be merged.

  Attributes:
    zip_code: Zip code string of the special case.
    pref: Prefecture name of the special case.
    city: City name of the special case.
    towns: List of town names belonging to the special case.
  """

  def __init__(self, zip_code, pref, city, towns):
    self.zip_code = zip_code
    self.pref = pref
    self.city = city
    self.towns = towns

  def __repr__(self):
    # Readable form for logs and debugger output.
    return '%s(%r, %r, %r, %r)' % (type(self).__name__, self.zip_code,
                                   self.pref, self.city, self.towns)
|
||||||
|
|
||||||
|
|
||||||
|
# Zip codes checked by ShouldMergeSpecial(): a row matching the zip code,
# prefecture and city of an item here is merged into the previous row unless
# its town text starts with one of the listed towns.
_SPECIAL_CASES = [
    SpecialMergeZip(
        zip_code=u'5900111',
        pref=u'大阪府',
        city=u'堺市中区',
        towns=[u'三原台']),
    SpecialMergeZip(
        zip_code=u'8710046',
        pref=u'大分県',
        city=u'中津市',
        towns=[u'金谷', u'西堀端', u'東堀端', u'古金谷']),
    SpecialMergeZip(
        zip_code=u'9218046',
        pref=u'石川県',
        city=u'金沢市',
        towns=[u'大桑町', u'三小牛町']),
]
|
||||||
|
|
||||||
|
|
||||||
|
def ShouldMergeSpecial(entry):
  """Tells whether a row is a continuation under one of the special cases.

  Args:
    entry: One row; indexes 2, 6, 7 and 8 are compared with the zip code,
      prefecture, city and towns of every item in _SPECIAL_CASES.

  Returns:
    True if some special case has the same zip code, prefecture and city as
    the row and ContinuedLine() is true for the row's index-8 text; False
    otherwise.
  """
  zip_code, level1, level2, level3 = entry[2], entry[6], entry[7], entry[8]
  return any(
      zip_code == case.zip_code and
      level1 == case.pref and
      level2 == case.city and
      ContinuedLine(level3, case.towns)
      for case in _SPECIAL_CASES)
|
||||||
|
|
||||||
|
|
||||||
|
def ContinuedLine(level3, towns):
  """Tells whether the text looks like the continuation of a previous line.

  Args:
    level3: Text to examine.
    towns: Iterable of town names.

  Returns:
    False if level3 starts with any of the given town names, True otherwise
    (including when towns is empty).
  """
  # Text that begins with a known town name opens a new entry; anything else
  # is taken to continue the line before it.
  return not any(level3.startswith(town) for town in towns)
|
Reference in New Issue
Block a user