70 lines
2.1 KiB
Python
70 lines
2.1 KiB
Python
from collections import defaultdict
|
|
import re
|
|
|
|
import six
|
|
from six.moves import xrange
|
|
|
|
from .ngram import NGram
|
|
|
|
|
|
class LangProfile(object):
    '''N-gram frequency profile for a single language.

    Attributes:
        name: Profile name. While it is None, add() and omit_less_freq()
            return without doing anything.
        freq: defaultdict(int) mapping each n-gram to its occurrence count.
        n_words: List in which entry ``k - 1`` holds the running total of
            counted n-grams of length ``k``.
    '''

    # Floor for the pruning threshold used by omit_less_freq().
    MINIMUM_FREQ = 2
    # The 1-gram total is integer-divided by this to get the pruning threshold.
    LESS_FREQ_RATIO = 100000

    # Matches a gram that is a single ASCII Latin letter.
    ROMAN_CHAR_RE = re.compile(r'^[A-Za-z]$')
    # Matches text that contains an ASCII Latin letter.
    ROMAN_SUBSTR_RE = re.compile(r'.*[A-Za-z].*')

    def __init__(self, name=None, freq=None, n_words=None):
        '''Create a profile, optionally seeded with existing counts.

        Args:
            name: Profile name, or None for an unnamed (inert) profile.
            freq: Optional mapping of n-gram -> count; its items are copied
                into a fresh defaultdict(int).
            n_words: Optional sequence of per-length totals. When omitted,
                a list of NGram.N_GRAM zeros is used.
        '''
        self.freq = defaultdict(int)
        if freq is not None:
            self.freq.update(freq)

        if n_words is None:
            n_words = [0] * NGram.N_GRAM
        else:
            # Copy, like freq above: add() and omit_less_freq() update the
            # totals in place and must not modify the caller's sequence.
            n_words = list(n_words)

        self.name = name
        self.n_words = n_words

    def add(self, gram):
        '''Add n-gram to profile.

        Does nothing when the profile has no name, when gram is None, or
        when the gram length is outside 1..NGram.N_GRAM.
        '''
        if self.name is None or gram is None:  # Illegal
            return
        length = len(gram)
        if length < 1 or length > NGram.N_GRAM:  # Illegal
            return
        self.n_words[length - 1] += 1
        self.freq[gram] += 1

    def omit_less_freq(self):
        '''Drop low-frequency n-grams and, where they are noise, Latin n-grams.

        Pass 1 removes every n-gram whose count is at or below
        ``max(n_words[0] // LESS_FREQ_RATIO, MINIMUM_FREQ)`` and sums the
        counts of the surviving single-Latin-letter grams.

        Pass 2 runs only when that sum is less than a third of the remaining
        1-gram total: every n-gram containing an ASCII Latin letter is then
        removed as well.

        ``n_words`` is decreased by the count of each removed n-gram.
        Does nothing when the profile has no name.
        '''
        if self.name is None:  # Illegal
            return
        threshold = max(self.n_words[0] // self.LESS_FREQ_RATIO, self.MINIMUM_FREQ)

        roman = 0
        # Iterate over a snapshot because entries are deleted inside the loop.
        for key, count in list(self.freq.items()):
            if count <= threshold:
                self.n_words[len(key) - 1] -= count
                del self.freq[key]
            elif self.ROMAN_CHAR_RE.match(key):
                roman += count

        # roman check
        if roman < self.n_words[0] // 3:
            for key, count in list(self.freq.items()):
                # search(), not match(): '.' does not match a newline, so an
                # anchored match would miss a Latin letter that follows one.
                if self.ROMAN_SUBSTR_RE.search(key):
                    self.n_words[len(key) - 1] -= count
                    del self.freq[key]

    def update(self, text):
        '''Update the language profile with (fragmented) text.

        Extract n-grams from text and add their frequency into the profile.
        Does nothing when text is None.
        '''
        if text is None:
            return
        text = NGram.normalize_vi(text)
        gram = NGram()
        for ch in text:
            gram.add_char(ch)
            # After each character, record the current gram of every length.
            for n in xrange(1, NGram.N_GRAM + 1):
                self.add(gram.get(n))
|