1 # -*- encoding: utf-8 -*-
3 from __future__ import print_function, unicode_literals
6 __all__ = ['remove_decoration', 'decorated_match']
8 # for range \u00c0 \u0179
9 char_to_alternatives_lower = {
31 # The lower() function doesn't work for these chars
32 char_to_alternatives_upper = {
# Combined table: base character -> string of its decorated variants.
# Every lower-case entry is stored together with its upper-cased twin; the
# upper-only table is then merged in unchanged.
char_to_alternatives = {}
for plain, decorated in six.iteritems(char_to_alternatives_lower):
    char_to_alternatives.update({
        plain: decorated,
        plain.upper(): decorated.upper(),
    })
char_to_alternatives.update(char_to_alternatives_upper)
# Reverse lookup: decorated variant -> base character.
alternative_to_char = {}
for plain, decorated in six.iteritems(char_to_alternatives_lower):
    for variant in decorated:
        # Register the variant and its upper-cased form in one go.
        alternative_to_char.update({
            variant: plain,
            variant.upper(): plain.upper(),
        })
for plain, decorated in six.iteritems(char_to_alternatives_upper):
    # Upper-only entries are reversed as they are, without case conversion.
    alternative_to_char.update(dict.fromkeys(decorated, plain))
51 # ligatures (only two chars supported)
52 ligatures_expansions_lower = {
54 # 'ij': 'ij', buggy: see http://en.wikipedia.org/wiki/Typographic_ligature
# Ligature -> expansion, for both cases. The upper-case ligature expands to
# the lower-case expansion with only its first letter upper-cased.
ligatures_expansions = {}
for lig, expanded in six.iteritems(ligatures_expansions_lower):
    ligatures_expansions[lig] = expanded
    head, tail = expanded[0], expanded[1:]
    ligatures_expansions[lig.upper()] = head.upper() + tail
# Reverse lookup: expansion -> ligature, for both cases. The capitalised
# expansion (first letter upper-cased) maps to the upper-case ligature.
ligatures_contractions = {}
for lig, expanded in six.iteritems(ligatures_expansions_lower):
    ligatures_contractions[expanded] = lig
    capitalised = expanded[0].upper() + expanded[1:]
    ligatures_contractions[capitalised] = lig.upper()
# Strip decorations from txt. The visible steps map a character `l` back to
# its base character and then expand it if it is a ligature.
# NOTE(review): the statements that bind `l` and build the return value are
# not visible in this excerpt -- presumably a per-character loop; confirm.
68 def remove_decoration(txt):
# Decorated variant -> base character; characters without an entry are kept.
71 l = alternative_to_char.get(l, l)
# Ligature -> its expansion; anything that is not a ligature is kept.
72 l = ligatures_expansions.get(l, l)
# Build the match text for a single character c: the character itself
# followed by its decorated alternatives, if it has any.
# NOTE(review): how `casesensitive` is applied and what is returned are not
# visible in this excerpt.
76 def decorated_match_single_char(c, casesensitive=False):
# With unicode_literals in effect, type('') is the unicode text type, so this
# rejects byte strings. NOTE(review): asserts are stripped under -O.
77 assert type(c) == type('')
# c plus all of its decorated variants; '' when c has no entry in the table.
80 result = c + char_to_alternatives.get(c, '')
# Build a regular-expression string that matches txt regardless of decoration
# (accents, ligatures); with casesensitive=False upper-case ligatures are
# accepted too.
# NOTE(review): the loop header, the branch conditions and the return
# statement are not visible in this excerpt.
90 def decorated_match(txt, casesensitive=False):
# With unicode_literals in effect, type('') is the unicode text type.
91 assert type(txt) == type('')
# Normalise first so the pattern is built from the undecorated text.
93 txt = remove_decoration(txt)
98 c1 = txt[i] # current character
99 c12 = txt[i:i+2] # current and next characters; on the last iteration this holds a single char, which never matches a two-char expansion, and that is fine
# Two-character expansion -> ligature, or None when c12 is not a known expansion.
100 ligature = ligatures_contractions.get(c12, None)
# Alternation group: the ligature itself, optionally its upper-case form, or
# the two characters matched individually.
102 result += '('+ligature
103 if not casesensitive:
104 result += '|'+ligature.upper()
105 result += '|'+decorated_match_single_char(c12[0], casesensitive) \
106 +decorated_match_single_char(c12[1], casesensitive) \
108 i += 1 # skip next character, we already did both
# Plain single-character match (presumably the non-ligature branch -- confirm).
110 result += decorated_match_single_char(c1, casesensitive)
# Command-line entry point: prints either the undecorated form of the given
# string or a regular expression matching its decorated variants.
# NOTE(review): the conditions selecting between the alternatives below
# (presumably options.regexp and a Python 2/3 check) are not visible in this
# excerpt.
115 if __name__ == '__main__':
117 from optparse import OptionParser
118 parser = OptionParser(usage='%prog [options] string')
# Charset used to decode the arguments and encode the output in the
# byte-string variants below.
119 parser.add_option('--charset', help="set charset. default=%default", action='store', dest='charset', default='utf-8')
120 parser.add_option('-r', '--regexp', help="generate regular expression.", action='store_true', dest='regexp')
# -i clears `casesensitive` (store_false); it is True unless -i is given.
121 parser.add_option('-i', help="used with -r, make regexp case insensitive.", action='store_false', dest='casesensitive', default=True)
122 (options, args) = parser.parse_args()
# Error message for a missing positional argument, written to stderr.
125 print('Missing required parameter. Try "Œuf"', file=sys.stderr)
# Two ways of building the input text: join the arguments directly, or join
# them as bytes and decode with the selected charset.
# NOTE(review): `input` shadows the builtin of the same name.
128 input = ' '.join(args)
130 input = unicode(b' '.join(args), options.charset)
#print("input:", input) # Œuf
#print("undecorated:", remove_decoration(input)) # Oeuf
#print("regex:", decorated_match(input)) # (œ|Œ|[oòóôöøōŏőOÒÓÔÖØŌŎŐ][eèéêëēĕėęěEÈÉÊËĒĔĖĘĚ])[uùúûüũūŭůűųUÙÚÛÜŨŪŬŮŰŲ][fF]
# Regular-expression output: printed as text, or encoded with the charset.
136 print(decorated_match(input, options.casesensitive))
138 print(decorated_match(input, options.casesensitive).encode(options.charset))
# Undecorated output: printed as text, or encoded with the charset.
141 print(remove_decoration(input))
143 print(remove_decoration(input).encode(options.charset))