250 lines
8.5 KiB
Python
250 lines
8.5 KiB
Python
|
import random
|
||
|
import re
|
||
|
|
||
|
import six
|
||
|
from six.moves import zip, xrange
|
||
|
|
||
|
from .lang_detect_exception import ErrorCode, LangDetectException
|
||
|
from .language import Language
|
||
|
from .utils.ngram import NGram
|
||
|
from .utils.unicode_block import unicode_block
|
||
|
|
||
|
|
||
|
class Detector(object):
    '''
    Detect the language of a piece of text.

    Instances are normally constructed via the factory class DetectorFactory,
    which supplies the n-gram probability table, the language list and the
    random seed read by __init__.

    After appending a target text to the Detector instance with .append(string),
    the detector provides the language detection results for the target text
    via .detect() or .get_probabilities().

    .detect() returns the single language name which has the highest probability.
    .get_probabilities() returns a list of languages and their probabilities.

    The detector has some parameters for language detection.
    See .set_alpha(float), .set_max_text_length(int) and .set_prior_map(dict).

    Example:

        from langdetect.detector_factory import DetectorFactory
        factory = DetectorFactory()
        factory.load_profile('/path/to/profile/directory')

        def detect(text):
            detector = factory.create()
            detector.append(text)
            return detector.detect()

        def detect_langs(text):
            detector = factory.create()
            detector.append(text)
            return detector.get_probabilities()
    '''

    # Initial smoothing parameter, and the scale of the gaussian noise that is
    # added to it on every trial.
    ALPHA_DEFAULT = 0.5
    ALPHA_WIDTH = 0.05

    # Upper bound on the number of n-gram updates performed in one trial.
    ITERATION_LIMIT = 1000
    # Languages whose probability is not above this value are not reported.
    PROB_THRESHOLD = 0.1
    # A trial stops as soon as the largest probability exceeds this value.
    CONV_THRESHOLD = 0.99999
    # Divisor turning alpha into the additive smoothing weight.
    BASE_FREQ = 10000
    # Returned by detect() when no language passes PROB_THRESHOLD.
    UNKNOWN_LANG = 'unknown'

    # URLs and e-mail addresses are replaced by a single space in append().
    URL_RE = re.compile(r'https?://[-_.?&~;+=/#0-9A-Za-z]{1,2076}')
    MAIL_RE = re.compile(r'[-_.0-9A-Za-z]{1,64}@[-_0-9A-Za-z]{1,255}[-_.0-9A-Za-z]{1,255}')

    def __init__(self, factory):
        '''Create a detector from the state held by `factory`.

        Reads `word_lang_prob_map`, `langlist` and `seed` from the factory and
        initialises every tunable parameter to its default value.
        '''
        self.word_lang_prob_map = factory.word_lang_prob_map
        self.langlist = factory.langlist
        self.seed = factory.seed
        self.random = random.Random()
        self.text = ''
        # Cached result of the detection; None until it has been computed.
        self.langprob = None

        self.alpha = self.ALPHA_DEFAULT
        self.n_trial = 7
        self.max_text_length = 10000
        self.prior_map = None
        self.verbose = False

    def set_verbose(self):
        '''Enable printing of intermediate probabilities during detection.'''
        self.verbose = True

    def set_alpha(self, alpha):
        '''Set the smoothing parameter (defaults to ALPHA_DEFAULT).'''
        self.alpha = alpha

    def set_prior_map(self, prior_map):
        '''Set prior information about language probabilities.

        `prior_map` maps language names to non-negative weights. Languages of
        `self.langlist` that are missing from it get a prior of 0.0, and the
        weights are normalised so that they sum to 1.

        Raises LangDetectException (InitParamError) if a weight is negative or
        if the weights of the known languages do not sum to a positive value.
        In that case the previously configured prior is left untouched.
        '''
        # Work on a local list so that a rejected map never leaves a
        # half-initialised (and unnormalised) prior on the instance.
        priors = [0.0] * len(self.langlist)
        total = 0.0
        for index, lang in enumerate(self.langlist):
            if lang in prior_map:
                p = prior_map[lang]
                if p < 0:
                    raise LangDetectException(ErrorCode.InitParamError, 'Prior probability must be non-negative.')
                priors[index] = p
                total += p
        if total <= 0.0:
            raise LangDetectException(ErrorCode.InitParamError, 'More one of prior probability must be non-zero.')
        self.prior_map = [p / total for p in priors]

    def set_max_text_length(self, max_text_length):
        '''Specify the max size of target text to use for language detection.

        The default value is 10000 characters.
        '''
        self.max_text_length = max_text_length

    def append(self, text):
        '''Append the target text for language detection.

        URLs and e-mail addresses are replaced by a space, the text is passed
        through NGram.normalize_vi, and runs of spaces are collapsed into a
        single space.

        Only the first `max_text_length` characters of the normalised text
        (see set_max_text_length) are considered; the rest is cut off. The
        limit is applied to each call separately.
        '''
        text = self.URL_RE.sub(' ', text)
        text = self.MAIL_RE.sub(' ', text)
        text = NGram.normalize_vi(text)

        # A non-positive limit keeps nothing, exactly like an empty range.
        limit = max(self.max_text_length, 0)
        kept = []
        previous = None
        for ch in text[:limit]:
            # Drop a space only when it directly follows another space.
            if ch != ' ' or previous != ' ':
                kept.append(ch)
            previous = ch
        self.text += ''.join(kept)

    def cleaning_text(self):
        '''Clean the text to detect.

        Characters in the range 'A'..'z' are counted as Latin. Characters at or
        above U+0300 whose Unicode block is not 'Latin Extended Additional' are
        counted as non-Latin. When the non-Latin count is more than twice the
        Latin count, every character in the range 'A'..'z' is removed from the
        text.
        '''
        latin_count, non_latin_count = 0, 0
        for ch in self.text:
            if 'A' <= ch <= 'z':
                latin_count += 1
            elif ord(ch) >= 0x0300 and unicode_block(ch) != 'Latin Extended Additional':
                non_latin_count += 1

        if latin_count * 2 < non_latin_count:
            self.text = ''.join(ch for ch in self.text if ch < 'A' or 'z' < ch)

    def detect(self):
        '''Detect language of the target text and return the language name
        which has the highest probability.

        Returns UNKNOWN_LANG when no language is reported by
        get_probabilities().
        '''
        probabilities = self.get_probabilities()
        if probabilities:
            return probabilities[0].lang
        return self.UNKNOWN_LANG

    def get_probabilities(self):
        '''Return the candidate languages sorted by decreasing probability.

        The probabilities are computed on the first call and kept in
        `self.langprob`; later calls reuse that stored result.
        '''
        if self.langprob is None:
            self._detect_block()
        return self._sort_probability(self.langprob)

    def _detect_block(self):
        '''Run the detection trials and store the averaged result in
        `self.langprob`.

        Raises LangDetectException (CantDetectError) when the cleaned text
        contains no known n-gram.
        '''
        self.cleaning_text()
        ngrams = self._extract_ngrams()
        if not ngrams:
            raise LangDetectException(ErrorCode.CantDetectError, 'No features in text.')

        self.langprob = [0.0] * len(self.langlist)

        # Re-seeding makes the sequence of random draws depend only on the seed.
        self.random.seed(self.seed)
        for _ in range(self.n_trial):
            prob = self._init_probability()
            alpha = self.alpha + self.random.gauss(0.0, 1.0) * self.ALPHA_WIDTH

            i = 0
            while True:
                self._update_lang_prob(prob, self.random.choice(ngrams), alpha)
                # Normalise and test for convergence on every fifth update only.
                if i % 5 == 0:
                    if self._normalize_prob(prob) > self.CONV_THRESHOLD or i >= self.ITERATION_LIMIT:
                        break
                    if self.verbose:
                        six.print_('>', self._sort_probability(prob))
                i += 1
            # Accumulate the mean over all trials.
            for j, p in enumerate(prob):
                self.langprob[j] += p / self.n_trial
            if self.verbose:
                six.print_('==>', self._sort_probability(prob))

    def _init_probability(self):
        '''Initialize the map of language probabilities.

        If a prior map has been set, a copy of it is used as the initial map;
        otherwise every language starts with the same probability.
        '''
        if self.prior_map is not None:
            return list(self.prior_map)
        return [1.0 / len(self.langlist)] * len(self.langlist)

    def _extract_ngrams(self):
        '''Extract the n-grams of the target text that are present in
        `self.word_lang_prob_map`, in order of appearance.
        '''
        sizes = range(1, NGram.N_GRAM + 1)

        result = []
        ngram = NGram()
        for ch in self.text:
            ngram.add_char(ch)
            if ngram.capitalword:
                continue
            for n in sizes:
                # Inlined equivalent of ngram.get(n), kept from the original.
                if len(ngram.grams) < n:
                    break
                w = ngram.grams[-n:]
                if w and w != ' ' and w in self.word_lang_prob_map:
                    result.append(w)
        return result

    def _update_lang_prob(self, prob, word, alpha):
        '''Update language probabilities with N-gram string(N=1,2,3).

        Multiplies every entry of `prob` in place by the probability of `word`
        for that language plus the smoothing weight `alpha / BASE_FREQ`.

        Returns False, leaving `prob` untouched, when `word` is None or
        unknown; returns True otherwise.
        '''
        if word is None or word not in self.word_lang_prob_map:
            return False

        lang_prob_map = self.word_lang_prob_map[word]
        if self.verbose:
            six.print_('%s(%s): %s' % (word, self._unicode_encode(word), self._word_prob_to_string(lang_prob_map)))

        weight = alpha / self.BASE_FREQ
        for i in range(len(prob)):
            prob[i] *= weight + lang_prob_map[i]
        return True

    def _word_prob_to_string(self, prob):
        '''Format the probabilities that are at least 0.00001 as a string of
        ' lang:probability' items (used by the verbose output).
        '''
        return ''.join(
            ' %s:%.5f' % (lang, p)
            for lang, p in zip(self.langlist, prob)
            if p >= 0.00001
        )

    def _normalize_prob(self, prob):
        '''Normalize probabilities in place so that they sum to 1 and return
        the maximum probability, which is used to check convergence.
        '''
        maxp, sump = 0.0, sum(prob)
        for i, value in enumerate(prob):
            p = value / sump
            if maxp < p:
                maxp = p
            prob[i] = p
        return maxp

    def _sort_probability(self, prob):
        '''Return Language objects for the probabilities above PROB_THRESHOLD,
        sorted in descending order.
        '''
        result = [Language(lang, p) for (lang, p) in zip(self.langlist, prob) if p > self.PROB_THRESHOLD]
        result.sort(reverse=True)
        return result

    def _unicode_encode(self, word):
        r'''Return `word` with every character at or above U+0080 replaced by
        an escape sequence (used by the verbose output).

        Characters up to U+FFFF are written as \uXXXX and characters beyond
        that as \UXXXXXXXX, so that no hexadecimal digit is ever dropped.
        '''
        pieces = []
        for ch in word:
            code = ord(ch)
            if code < 0x80:
                pieces.append(ch)
            elif code <= 0xFFFF:
                pieces.append('\\u%04x' % code)
            else:
                pieces.append('\\U%08x' % code)
        return ''.join(pieces)
|