2 # -*- encoding: utf-8 -*-
4 __all__ = ['remove_decoration', 'decorated_match']
# Table of decorated variants, keyed by lower-case character: each value is
# a string whose individual characters are the alternatives of the key (the
# code further down iterates over the value character by character).
# NOTE(review): the dictionary entries are not visible in this excerpt.
6 # Covers the range \u00c0 to \u0179.
7 char_to_alternatives_lower={
# Entries of this second table are copied verbatim into the derived tables
# further down instead of being generated with upper().
# NOTE(review): the dictionary entries are not visible in this excerpt.
29 # lower() does not work for these characters.
30 char_to_alternatives_upper={
# Same table as char_to_alternatives_lower, extended with upper-case keys.
char_to_alternatives = {}
for char, alternatives in char_to_alternatives_lower.iteritems():
    # The lower-case entry goes in first, then its upper-cased counterpart
    # (both the key and the string of alternatives are upper-cased).
    char_to_alternatives.update((
        (char, alternatives),
        (char.upper(), alternatives.upper()),
    ))
# The hand-written upper-case entries are copied as-is and applied last, so
# they replace any generated entry that has the same key.
char_to_alternatives.update(char_to_alternatives_upper)
# Reverse lookup: decorated character -> the character it is an alternative of.
alternative_to_char = {}
for char, alternatives in char_to_alternatives_lower.iteritems():
    for alternative in alternatives:
        # Register the alternative as written, then its upper-cased form
        # pointing at the upper-cased base character.
        alternative_to_char.update((
            (alternative, char),
            (alternative.upper(), char.upper()),
        ))
for char, alternatives in char_to_alternatives_upper.iteritems():
    # Every character of the alternatives string maps back to char, verbatim.
    alternative_to_char.update(dict.fromkeys(alternatives, char))
# Table mapping a ligature character to its expansion. The code that consumes
# it only ever looks at two-character slices of the text.
# NOTE(review): the active dictionary entries are not visible in this excerpt.
49 # Ligatures (only two chars supported)
50 ligatures_expansions_lower = {
# The entry below is deliberately disabled.
52 # u'ij': u'ij', buggy: see http://en.wikipedia.org/wiki/Typographic_ligature
# Same table as ligatures_expansions_lower, extended with upper-case ligatures.
ligatures_expansions = {}
for ligature, expansion in ligatures_expansions_lower.iteritems():
    # An upper-case ligature expands to the same text with only its first
    # character upper-cased; the rest of the expansion is left untouched.
    ligatures_expansions.update((
        (ligature, expansion),
        (ligature.upper(), expansion[0].upper() + expansion[1:]),
    ))
# Reverse lookup: expansion text -> ligature character.
ligatures_contractions = {}
for ligature, expansion in ligatures_expansions_lower.iteritems():
    # The expansion with its first character upper-cased contracts to the
    # upper-case ligature.
    ligatures_contractions.update((
        (expansion, ligature),
        (expansion[0].upper() + expansion[1:], ligature.upper()),
    ))
# Replaces decorated characters by their base character and ligature
# characters by their expansion, using the alternative_to_char and
# ligatures_expansions tables.
# NOTE(review): the lines that bind l, accumulate the result and return it
# are not visible in this excerpt; presumably l is each character of txt in
# turn -- confirm against the full file.
66 def remove_decoration(txt):
# Map a decorated character to its base character; unknown characters are
# kept unchanged.
69 l = alternative_to_char.get(l, l)
# Expand a ligature character; anything that is not a ligature is kept
# unchanged.
70 l = ligatures_expansions.get(l, l)
# Returns a regular-expression character class ("[...]") made of c followed
# by the alternatives registered for it in char_to_alternatives.
# NOTE(review): several lines of this function are not visible in this
# excerpt, so the way casesensitive influences the result cannot be
# confirmed from here. The sample output quoted in the __main__ section
# lists both lower- and upper-case variants in each class.
74 def decorated_match_single_char(c, casesensitive=False):
# Only unicode objects are accepted (exact type check, not isinstance).
75 assert type(c)==unicode
# c itself comes first, followed by its alternatives; a character without
# any registered alternative contributes only itself.
78 result = c + char_to_alternatives.get(c, u'')
84 return u'['+result+u']'
# Builds a regular-expression string for txt out of per-character classes
# (see decorated_match_single_char) and, where two consecutive characters are
# a known ligature expansion, a group that also accepts the ligature itself.
# NOTE(review): the initialisation of result and i, the loop header, the
# if/else around the ligature handling and the final return are not visible
# in this excerpt -- confirm the control flow against the full file.
88 def decorated_match(txt, casesensitive=False):
# Only unicode objects are accepted (exact type check, not isinstance).
89 assert type(txt)==unicode
# Work on the undecorated text, so the table lookups below see base
# characters and expanded ligatures.
91 txt = remove_decoration(txt)
96 c1 = txt[i] # current character
97 c12 = txt[i:i+2] # both current and next characters. Contains a single char on last iteration so that it never matches, that is OK
# ligature is None when the two-character slice is not a known expansion.
98 ligature = ligatures_contractions.get(c12, None)
# Alternation group: the ligature character, its upper-case form when the
# match is case-insensitive, or the two characters matched one by one.
100 result += u'('+ligature
101 if not casesensitive:
102 result += u'|'+ligature.upper()
103 result += u'|'+decorated_match_single_char(c12[0], casesensitive) \
104 +decorated_match_single_char(c12[1], casesensitive) \
106 i+=1 # skip next character, we already did both
# Plain case: a single character class for the current character.
108 result += decorated_match_single_char(c1, casesensitive)
# Command-line entry point: joins the positional arguments with spaces,
# decodes them with the selected charset, and prints either the generated
# regular expression or the undecorated text, encoded with that same charset.
# NOTE(review): the argument check and the branch choosing between the two
# print statements at the end are not visible in this excerpt; presumably
# options.regexp selects between them -- confirm against the full file.
113 if __name__ == '__main__':
115 from optparse import OptionParser
116 parser = OptionParser(usage='%prog [options] string')
117 parser.add_option('--charset', help="set charset. default=%default", action='store', dest='charset', default='utf-8')
118 parser.add_option('-r', '--regexp', help="generate regular expression.", action='store_true', dest='regexp')
# -i stores False into casesensitive, which is True by default.
119 parser.add_option('-i', help="used with -r, make regexp case insensitive.", action='store_false', dest='casesensitive', default=True)
120 (options, args) = parser.parse_args()
# Error message written to stderr; its condition is not visible here.
123 print >> sys.stderr, u'Missing required parameter. Try "Œuf"'
# Note: this name shadows the input() builtin for the rest of the script.
125 input = unicode(' '.join(args), options.charset)
126 #print "input:", input # Œuf
127 #print "undecorated:", remove_decoration(input) # Oeuf
128 #print "regex:", decorated_match(input) # (œ|Œ|[oòóôöøōŏőOÒÓÔÖØŌŎŐ][eèéêëēĕėęěEÈÉÊËĒĔĖĘĚ])[uùúûüũūŭůűųUÙÚÛÜŨŪŬŮŰŲ][fF]
# Regular-expression output.
130 print decorated_match(input, options.casesensitive).encode(options.charset)
# Undecorated-text output.
132 print remove_decoration(input).encode(options.charset)