260 lines
10 KiB
Python
260 lines
10 KiB
Python
import re
|
|
|
|
import six
|
|
|
|
from . import messages
|
|
from .unicode_block import (
|
|
unicode_block,
|
|
UNICODE_BASIC_LATIN,
|
|
UNICODE_LATIN_1_SUPPLEMENT,
|
|
UNICODE_LATIN_EXTENDED_B,
|
|
UNICODE_GENERAL_PUNCTUATION,
|
|
UNICODE_ARABIC,
|
|
UNICODE_LATIN_EXTENDED_ADDITIONAL,
|
|
UNICODE_HIRAGANA,
|
|
UNICODE_KATAKANA,
|
|
UNICODE_BOPOMOFO,
|
|
UNICODE_BOPOMOFO_EXTENDED,
|
|
UNICODE_CJK_UNIFIED_IDEOGRAPHS,
|
|
UNICODE_HANGUL_SYLLABLES,
|
|
)
|
|
|
|
|
|
class NGram(object):
    '''Sliding buffer of normalized characters used to read off n-grams.

    Characters are fed in one at a time through add_char(); each one is
    passed through normalize() first.  The buffer holds at most N_GRAM
    characters and is restarted whenever the previously stored character
    is a space.  get() reads the trailing n characters back out.

    The lookup tables below are loaded through messages.get_string().
    CJK_MAP starts out empty and is filled by _init_cjk_map().
    '''

    LATIN1_EXCLUDED = messages.get_string('NGram.LATIN1_EXCLUDE')
    N_GRAM = 3

    # Vietnamese normalization tables.  NORMALIZED_VI_CHARS holds one row
    # per diacritical mark (same order as DMARK_CLASS); each row is indexed
    # by the position of the base letter inside TO_NORMALIZE_VI_CHARS.
    NORMALIZED_VI_CHARS = [
        messages.get_string('NORMALIZED_VI_CHARS_0300'),
        messages.get_string('NORMALIZED_VI_CHARS_0301'),
        messages.get_string('NORMALIZED_VI_CHARS_0303'),
        messages.get_string('NORMALIZED_VI_CHARS_0309'),
        messages.get_string('NORMALIZED_VI_CHARS_0323'),
    ]
    TO_NORMALIZE_VI_CHARS = messages.get_string('TO_NORMALIZE_VI_CHARS')
    DMARK_CLASS = messages.get_string('DMARK_CLASS')
    # Group 1 is a base letter, group 2 the diacritical mark that follows it.
    ALPHABET_WITH_DMARK = re.compile(
        '([' + TO_NORMALIZE_VI_CHARS + '])([' + DMARK_CLASS + '])',
        re.UNICODE)

    # CJK Kanji normalization groups.  Every entry is a string of
    # characters; all of them are mapped onto the first character of that
    # string (see _init_cjk_map).
    CJK_CLASS = [
        messages.get_string('NGram.KANJI_1_0'),
        messages.get_string('NGram.KANJI_1_2'),
        messages.get_string('NGram.KANJI_1_4'),
        messages.get_string('NGram.KANJI_1_8'),
        messages.get_string('NGram.KANJI_1_11'),
        messages.get_string('NGram.KANJI_1_12'),
        messages.get_string('NGram.KANJI_1_13'),
        messages.get_string('NGram.KANJI_1_14'),
        messages.get_string('NGram.KANJI_1_16'),
        messages.get_string('NGram.KANJI_1_18'),
        messages.get_string('NGram.KANJI_1_22'),
        messages.get_string('NGram.KANJI_1_27'),
        messages.get_string('NGram.KANJI_1_29'),
        messages.get_string('NGram.KANJI_1_31'),
        messages.get_string('NGram.KANJI_1_35'),
        messages.get_string('NGram.KANJI_2_0'),
        messages.get_string('NGram.KANJI_2_1'),
        messages.get_string('NGram.KANJI_2_4'),
        messages.get_string('NGram.KANJI_2_9'),
        messages.get_string('NGram.KANJI_2_10'),
        messages.get_string('NGram.KANJI_2_11'),
        messages.get_string('NGram.KANJI_2_12'),
        messages.get_string('NGram.KANJI_2_13'),
        messages.get_string('NGram.KANJI_2_15'),
        messages.get_string('NGram.KANJI_2_16'),
        messages.get_string('NGram.KANJI_2_18'),
        messages.get_string('NGram.KANJI_2_21'),
        messages.get_string('NGram.KANJI_2_22'),
        messages.get_string('NGram.KANJI_2_23'),
        messages.get_string('NGram.KANJI_2_28'),
        messages.get_string('NGram.KANJI_2_29'),
        messages.get_string('NGram.KANJI_2_30'),
        messages.get_string('NGram.KANJI_2_31'),
        messages.get_string('NGram.KANJI_2_32'),
        messages.get_string('NGram.KANJI_2_35'),
        messages.get_string('NGram.KANJI_2_36'),
        messages.get_string('NGram.KANJI_2_37'),
        messages.get_string('NGram.KANJI_2_38'),
        messages.get_string('NGram.KANJI_3_1'),
        messages.get_string('NGram.KANJI_3_2'),
        messages.get_string('NGram.KANJI_3_3'),
        messages.get_string('NGram.KANJI_3_4'),
        messages.get_string('NGram.KANJI_3_5'),
        messages.get_string('NGram.KANJI_3_8'),
        messages.get_string('NGram.KANJI_3_9'),
        messages.get_string('NGram.KANJI_3_11'),
        messages.get_string('NGram.KANJI_3_12'),
        messages.get_string('NGram.KANJI_3_13'),
        messages.get_string('NGram.KANJI_3_15'),
        messages.get_string('NGram.KANJI_3_16'),
        messages.get_string('NGram.KANJI_3_18'),
        messages.get_string('NGram.KANJI_3_19'),
        messages.get_string('NGram.KANJI_3_22'),
        messages.get_string('NGram.KANJI_3_23'),
        messages.get_string('NGram.KANJI_3_27'),
        messages.get_string('NGram.KANJI_3_29'),
        messages.get_string('NGram.KANJI_3_30'),
        messages.get_string('NGram.KANJI_3_31'),
        messages.get_string('NGram.KANJI_3_32'),
        messages.get_string('NGram.KANJI_3_35'),
        messages.get_string('NGram.KANJI_3_36'),
        messages.get_string('NGram.KANJI_3_37'),
        messages.get_string('NGram.KANJI_3_38'),
        messages.get_string('NGram.KANJI_4_0'),
        messages.get_string('NGram.KANJI_4_9'),
        messages.get_string('NGram.KANJI_4_10'),
        messages.get_string('NGram.KANJI_4_16'),
        messages.get_string('NGram.KANJI_4_17'),
        messages.get_string('NGram.KANJI_4_18'),
        messages.get_string('NGram.KANJI_4_22'),
        messages.get_string('NGram.KANJI_4_24'),
        messages.get_string('NGram.KANJI_4_28'),
        messages.get_string('NGram.KANJI_4_34'),
        messages.get_string('NGram.KANJI_4_39'),
        messages.get_string('NGram.KANJI_5_10'),
        messages.get_string('NGram.KANJI_5_11'),
        messages.get_string('NGram.KANJI_5_12'),
        messages.get_string('NGram.KANJI_5_13'),
        messages.get_string('NGram.KANJI_5_14'),
        messages.get_string('NGram.KANJI_5_18'),
        messages.get_string('NGram.KANJI_5_26'),
        messages.get_string('NGram.KANJI_5_29'),
        messages.get_string('NGram.KANJI_5_34'),
        messages.get_string('NGram.KANJI_5_39'),
        messages.get_string('NGram.KANJI_6_0'),
        messages.get_string('NGram.KANJI_6_3'),
        messages.get_string('NGram.KANJI_6_9'),
        messages.get_string('NGram.KANJI_6_10'),
        messages.get_string('NGram.KANJI_6_11'),
        messages.get_string('NGram.KANJI_6_12'),
        messages.get_string('NGram.KANJI_6_16'),
        messages.get_string('NGram.KANJI_6_18'),
        messages.get_string('NGram.KANJI_6_20'),
        messages.get_string('NGram.KANJI_6_21'),
        messages.get_string('NGram.KANJI_6_22'),
        messages.get_string('NGram.KANJI_6_23'),
        messages.get_string('NGram.KANJI_6_25'),
        messages.get_string('NGram.KANJI_6_28'),
        messages.get_string('NGram.KANJI_6_29'),
        messages.get_string('NGram.KANJI_6_30'),
        messages.get_string('NGram.KANJI_6_32'),
        messages.get_string('NGram.KANJI_6_34'),
        messages.get_string('NGram.KANJI_6_35'),
        messages.get_string('NGram.KANJI_6_37'),
        messages.get_string('NGram.KANJI_6_39'),
        messages.get_string('NGram.KANJI_7_0'),
        messages.get_string('NGram.KANJI_7_3'),
        messages.get_string('NGram.KANJI_7_6'),
        messages.get_string('NGram.KANJI_7_7'),
        messages.get_string('NGram.KANJI_7_9'),
        messages.get_string('NGram.KANJI_7_11'),
        messages.get_string('NGram.KANJI_7_12'),
        messages.get_string('NGram.KANJI_7_13'),
        messages.get_string('NGram.KANJI_7_16'),
        messages.get_string('NGram.KANJI_7_18'),
        messages.get_string('NGram.KANJI_7_19'),
        messages.get_string('NGram.KANJI_7_20'),
        messages.get_string('NGram.KANJI_7_21'),
        messages.get_string('NGram.KANJI_7_23'),
        messages.get_string('NGram.KANJI_7_25'),
        messages.get_string('NGram.KANJI_7_28'),
        messages.get_string('NGram.KANJI_7_29'),
        messages.get_string('NGram.KANJI_7_32'),
        messages.get_string('NGram.KANJI_7_33'),
        messages.get_string('NGram.KANJI_7_35'),
        messages.get_string('NGram.KANJI_7_37'),
    ]

    # character -> representative character of its CJK_CLASS group
    CJK_MAP = {}

    def __init__(self):
        # No capitalized run has been seen yet; the buffer starts as a
        # single space, which add_char() treats as a word boundary.
        self.capitalword = False
        self.grams = ' '

    def add_char(self, ch):
        '''Normalize ch and push it onto the n-gram buffer.

        When the previously stored character is a space the buffer is
        restarted (and a normalized space is dropped instead of stored).
        Otherwise the oldest character is discarded once the buffer
        already holds N_GRAM characters.  The capitalword flag is raised
        when an uppercase character follows an uppercase one, and cleared
        by any character that is not uppercase.
        '''
        normalized = self.normalize(ch)
        previous = self.grams[-1]

        if previous == ' ':
            # Word boundary: forget everything gathered so far.
            self.capitalword = False
            if normalized == ' ':
                self.grams = ' '
                return
            self.grams = ' ' + normalized
        else:
            window = self.grams
            if len(window) >= self.N_GRAM:
                window = window[1:]
            self.grams = window + normalized

        if not normalized.isupper():
            self.capitalword = False
        elif previous.isupper():
            self.capitalword = True

    def get(self, n):
        '''Return the trailing n characters of the buffer, or None.

        None is returned while the capitalword flag is set, when n lies
        outside 1..N_GRAM, when fewer than n characters are buffered, or
        when n is 1 and the last character is a space.
        '''
        if self.capitalword:
            return None
        if n < 1 or self.N_GRAM < n or len(self.grams) < n:
            return None
        if n > 1:
            return self.grams[-n:]
        newest = self.grams[-1]
        return None if newest == ' ' else newest

    @classmethod
    def normalize(cls, ch):
        '''Map ch to its normalized form according to its Unicode block.

        Characters in blocks that are not handled below are returned
        unchanged.
        '''
        block = unicode_block(ch)

        if block == UNICODE_BASIC_LATIN:
            # Only A-Z and a-z survive; everything else becomes a space.
            is_ascii_letter = 'A' <= ch <= 'Z' or 'a' <= ch <= 'z'
            return ch if is_ascii_letter else ' '

        if block == UNICODE_LATIN_1_SUPPLEMENT:
            return ' ' if ch in cls.LATIN1_EXCLUDED else ch

        if block == UNICODE_LATIN_EXTENDED_B:
            # Romanian: comma-below letters are folded onto cedilla forms.
            if ch == six.u('\u0219'):  # small s with comma below
                return six.u('\u015f')
            if ch == six.u('\u021b'):  # small t with comma below
                return six.u('\u0163')
            return ch

        if block == UNICODE_GENERAL_PUNCTUATION:
            return ' '

        if block == UNICODE_ARABIC:
            # Farsi yeh is replaced by Arabic yeh.
            if ch == six.u('\u06cc'):
                return six.u('\u064a')
            return ch

        if block == UNICODE_LATIN_EXTENDED_ADDITIONAL:
            # Everything from U+1EA0 upwards collapses to one character.
            if ch >= six.u('\u1ea0'):
                return six.u('\u1ec3')
            return ch

        # The following blocks are each collapsed to a single character.
        if block == UNICODE_HIRAGANA:
            return six.u('\u3042')
        if block == UNICODE_KATAKANA:
            return six.u('\u30a2')
        if block in (UNICODE_BOPOMOFO, UNICODE_BOPOMOFO_EXTENDED):
            return six.u('\u3105')

        if block == UNICODE_CJK_UNIFIED_IDEOGRAPHS:
            # Characters missing from CJK_MAP are kept as they are.
            return cls.CJK_MAP.get(ch, ch)

        if block == UNICODE_HANGUL_SYLLABLES:
            return six.u('\uac00')

        return ch

    @classmethod
    def normalize_vi(cls, text):
        '''Normalizer for Vietnamese.

        Replace every "alphabet + diacritical mark (U+03xx)" pair matched
        by ALPHABET_WITH_DMARK with the corresponding U+1Exx character
        taken from NORMALIZED_VI_CHARS.
        '''
        def compose(match):
            base, mark = match.groups()
            # The mark selects the table row, the base letter the column.
            row = cls.NORMALIZED_VI_CHARS[cls.DMARK_CLASS.find(mark)]
            return row[cls.TO_NORMALIZE_VI_CHARS.find(base)]

        return cls.ALPHABET_WITH_DMARK.sub(compose, text)

    @classmethod
    def _init_cjk_map(cls):
        '''Fill CJK_MAP so each character maps to its group's first one.'''
        for group in cls.CJK_CLASS:
            cls.CJK_MAP.update(dict.fromkeys(group, group[0]))
|
|
|
|
# Populate NGram.CJK_MAP from NGram.CJK_CLASS once, when this module is loaded.
NGram._init_cjk_map()
|