93 lines
3 KiB
Python
93 lines
3 KiB
Python
|
# -*- coding: utf-8 -*-
"""
    pygments.regexopt
    ~~~~~~~~~~~~~~~~~

    An algorithm that generates optimized regexes for matching long lists of
    literal strings.

    :copyright: Copyright 2006-2017 by the Pygments team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""
|
||
|
|
||
|
import re
|
||
|
from re import escape
|
||
|
from os.path import commonprefix
|
||
|
from itertools import groupby
|
||
|
from operator import itemgetter
|
||
|
|
||
|
# Characters that must be backslash-escaped inside a regex character set:
# ``^`` (negation), ``\`` (escape), ``-`` (range) and ``]`` (set terminator).
# ``[`` is escaped as well because an unescaped ``[`` at the start of a set
# makes Python 3.7+ emit a "Possible nested set" FutureWarning.
CS_ESCAPE = re.compile(r'[\[\^\\\-\]]')
# Key function returning the first item of a sequence.
FIRST_ELEMENT = itemgetter(0)
|
||
|
|
||
|
|
||
|
def make_charset(letters):
    """Return a regex character set (``[...]``) matching any of *letters*.

    *letters* is an iterable of strings that are joined together; every
    character matched by ``CS_ESCAPE`` is prefixed with a backslash so that
    it is taken literally inside the set.
    """
    joined = ''.join(letters)
    escaped = CS_ESCAPE.sub(lambda match: '\\' + match.group(), joined)
    return '[{0}]'.format(escaped)
|
||
|
|
||
|
|
||
|
def regex_opt_inner(strings, open_paren):
    """Return a regex that matches any string in the sorted list of strings.

    *strings* must already be sorted, so that an empty string (if any) comes
    first and strings sharing a first character are adjacent.

    *open_paren* is the group opener to wrap the result in (for example
    ``'('`` or ``'(?:'``); when it is non-empty a matching ``')'`` is
    appended.  When it is ``''`` the result is returned unwrapped and may be
    a bare alternation (``a|b``), so the caller has to embed it in a group
    of its own.

    Returns the pattern as a string; an empty list yields ``''``.
    """
    close_paren = ')' if open_paren else ''
    if not strings:
        return ''
    first = strings[0]
    if len(strings) == 1:
        return open_paren + escape(first) + close_paren
    if not first:
        # The empty string is one of the alternatives: match the remaining
        # strings optionally.
        return (open_paren + regex_opt_inner(strings[1:], '(?:')
                + '?' + close_paren)
    if len(first) == 1:
        # Multiple one-char strings?  Make a charset out of them.
        oneletter = [s for s in strings if len(s) == 1]
        if len(oneletter) > 1:
            rest = [s for s in strings if len(s) != 1]
            if rest:
                # Longer strings are tried before the charset.
                return (open_paren + regex_opt_inner(rest, '') + '|'
                        + make_charset(oneletter) + close_paren)
            return open_paren + make_charset(oneletter) + close_paren
    prefix = commonprefix(strings)
    if prefix:
        # All strings share a prefix: emit it once and recurse on the tails.
        # Stripping a common prefix keeps the list sorted.
        plen = len(prefix)
        return (open_paren + escape(prefix)
                + regex_opt_inner([s[plen:] for s in strings], '(?:')
                + close_paren)
    # Is there a common suffix?  It is the common prefix of the reversed
    # strings.
    suffix = commonprefix([s[::-1] for s in strings])
    if suffix:
        # The heads have to be re-sorted before recursing.
        slen = len(suffix)
        return (open_paren
                + regex_opt_inner(sorted(s[:-slen] for s in strings), '(?:')
                + escape(suffix[::-1]) + close_paren)
    # Last resort: split into the strings starting with the same character
    # as the first one and all the others, and recurse on both groups.
    return (open_paren
            + '|'.join(regex_opt_inner(list(group), '')
                       for _, group in groupby(strings,
                                               lambda s: s[0] == first[0]))
            + close_paren)
|
||
|
|
||
|
|
||
|
def regex_opt(strings, prefix='', suffix=''):
    """Return a regex pattern that matches any string in the given list.

    The strings to match must be literal strings, not regexes.  They will be
    regex-escaped.  Duplicate strings are ignored.

    *prefix* and *suffix* are pre- and appended to the final regex.

    The result is the pattern as a string, wrapped in one capturing group;
    it is not compiled.
    """
    # Sorting is required by regex_opt_inner.  Duplicates are dropped first
    # because they would only add redundant alternatives (e.g. ``[aa]``) and
    # can produce doubled punctuation inside a charset (``[||]``), which
    # newer Python versions warn about.
    strings = sorted(set(strings))
    return prefix + regex_opt_inner(strings, '(') + suffix
|