2 # -*- encoding: utf-8 -*-
4 from __future__ import print_function, unicode_literals
7 __all__ = ['remove_decoration', 'decorated_match']
9 # for range \u00c0 \u0179
10 char_to_alternatives_lower = {
32 # The lower() function doesn't work for these chars
33 char_to_alternatives_upper = {
# Base character -> string of its decorated variants, for both cases.
char_to_alternatives = {}
for char, alternatives in six.iteritems(char_to_alternatives_lower):
    # Each lower-case entry also yields its upper-case counterpart, derived
    # with upper() on both the key and the variants.
    char_to_alternatives.update((
        (char, alternatives),
        (char.upper(), alternatives.upper()),
    ))
# The explicit upper-case table is applied last and stored as listed, so its
# entries win over anything derived above.
char_to_alternatives.update(char_to_alternatives_upper)
# Reverse lookup: decorated character -> base character.
alternative_to_char = {}
for char, alternatives in six.iteritems(char_to_alternatives_lower):
    for alternative in alternatives:
        # Register the variant as listed together with its upper-case form.
        alternative_to_char.update((
            (alternative, char),
            (alternative.upper(), char.upper()),
        ))
for char, alternatives in six.iteritems(char_to_alternatives_upper):
    # Entries of the explicit upper-case table are stored exactly as listed,
    # without any case conversion.
    alternative_to_char.update(
        (alternative, char) for alternative in alternatives
    )
52 # ligatures (only two chars supported)
53 ligatures_expansions_lower = {
55 # 'ij': 'ij', buggy: see http://en.wikipedia.org/wiki/Typographic_ligature
# Ligature -> expansion, for both cases.
ligatures_expansions = {}
for ligature, expansion in six.iteritems(ligatures_expansions_lower):
    # The upper-case ligature expands to the same letters with only the
    # first one capitalised; the rest of the expansion is left untouched.
    ligatures_expansions.update((
        (ligature, expansion),
        (ligature.upper(), expansion[0].upper() + expansion[1:]),
    ))
# Reverse lookup: expansion -> ligature.
ligatures_contractions = {}
for ligature, expansion in six.iteritems(ligatures_expansions_lower):
    # Two spellings are registered per ligature: the expansion as listed,
    # and the expansion with its first letter capitalised, which maps to the
    # upper-case ligature.
    ligatures_contractions.update((
        (expansion, ligature),
        (expansion[0].upper() + expansion[1:], ligature.upper()),
    ))
# remove_decoration(txt): replace decorated characters and ligatures by their
# plain equivalents.
# Only part of the body is visible in this excerpt: a value named `l` is
# first looked up in alternative_to_char (decorated character -> base
# character) and then in ligatures_expansions (ligature -> expansion).
# NOTE(review): presumably `l` is each character of `txt` in turn and the
# converted pieces are joined and returned -- the loop and return lines are
# not shown here; confirm.
69 def remove_decoration(txt):
# Decorated character -> base character; left unchanged when not in the table.
72 l = alternative_to_char.get(l, l)
# Ligature -> its expansion; left unchanged when `l` is not a ligature.
73 l = ligatures_expansions.get(l, l)
# decorated_match_single_char(c, casesensitive=False): build the pattern
# fragment for the single character `c` and its decorated variants.
# Only part of the body is visible in this excerpt.
# NOTE(review): judging from the sample output quoted in the script section
# of this file, the characters are presumably wrapped in a character class
# such as "[fF]", with `casesensitive` deciding whether the other case is
# included -- confirm against the lines not shown here.
77 def decorated_match_single_char(c, casesensitive=False):
# The argument must be a text string: unicode_literals is in effect in this
# file, so type('') is the text type.
78 assert type(c) == type('')
# Start with the character itself, followed by its decorated variants
# (nothing is appended when the character has no entry in the table).
81 result = c + char_to_alternatives.get(c, '')
# decorated_match(txt, casesensitive=False): build a pattern string matching
# `txt` whatever decorations or ligatures are used to write it.
# Example quoted in the script section of this file, for the input "Œuf":
#   (œ|Œ|[oòó...OÒÓ...][eèé...EÈÉ...])[uùú...UÙÚ...][fF]
# Only part of the body is visible in this excerpt: the loop header, the
# branch conditions and the return statement are not shown.
91 def decorated_match(txt, casesensitive=False):
# The argument must be a text string: unicode_literals is in effect in this
# file, so type('') is the text type.
92 assert type(txt) == type('')
# Work on the undecorated form, so that the pattern is built from base
# characters and expanded ligatures only.
94 txt = remove_decoration(txt)
99 c1 = txt[i] # current character
100 c12 = txt[i:i+2] # both current and next characters. Contains a single char on last iteration so that it never matches, that is OK
# Is this pair of characters the expansion of a known ligature?
101 ligature = ligatures_contractions.get(c12, None)
# Ligature case: open a group whose alternatives are the ligature itself,
# its upper-case form (only when not case sensitive), and the two characters
# matched one by one.
# NOTE(review): presumably guarded by a test on `ligature`, with the group
# closed on the continuation line that is not shown -- confirm.
103 result += '('+ligature
104 if not casesensitive:
105 result += '|'+ligature.upper()
106 result += '|'+decorated_match_single_char(c12[0], casesensitive) \
107 +decorated_match_single_char(c12[1], casesensitive) \
109 i += 1 # skip next character, we already did both
# NOTE(review): presumably the non-ligature branch -- the current character
# is matched on its own.
111 result += decorated_match_single_char(c1, casesensitive)
# Script entry point: a small command-line front end for this module.
# Only part of the block is visible in this excerpt; the conditions that
# select between the alternative lines below are not shown.
116 if __name__ == '__main__':
118 from optparse import OptionParser
119 parser = OptionParser(usage='%prog [options] string')
# Charset used below both to decode the arguments and to encode the output.
120 parser.add_option('--charset', help="set charset. default=%default", action='store', dest='charset', default='utf-8')
121 parser.add_option('-r', '--regexp', help="generate regular expression.", action='store_true', dest='regexp')
# Note the inversion: casesensitive defaults to True and -i stores False.
122 parser.add_option('-i', help="used with -r, make regexp case insensitive.", action='store_false', dest='casesensitive', default=True)
123 (options, args) = parser.parse_args()
# Usage hint, written to stderr.
# NOTE(review): presumably printed when no argument was given -- confirm.
126 print('Missing required parameter. Try "Œuf"', file=sys.stderr)
# Two ways of building the input text: decoding the joined byte-string
# arguments with the chosen charset, or joining arguments that are already
# text.
# NOTE(review): `unicode` is a Python 2 builtin, so this is presumably a
# Python 2 / Python 3 switch -- confirm.
129 input = unicode(b' '.join(args), options.charset)
131 input = ' '.join(args)
132 #print("input:", input) # Œuf
133 #print("undecorated:", remove_decoration(input)) # Oeuf
134 #print("regex:", decorated_match(input)) # (œ|Œ|[oòóôöøōŏőOÒÓÔÖØŌŎŐ][eèéêëēĕėęěEÈÉÊËĒĔĖĘĚ])[uùúûüũūŭůűųUÙÚÛÜŨŪŬŮŰŲ][fF]
# Regular-expression output, in an encoded and a plain-text variant.
# NOTE(review): presumably selected by -r -- confirm.
137 print(decorated_match(input, options.casesensitive).encode(options.charset))
139 print(decorated_match(input, options.casesensitive))
# Undecorated output, again in an encoded and a plain-text variant.
142 print(remove_decoration(input).encode(options.charset))
144 print(remove_decoration(input))