2 # -*- encoding: utf-8 -*-
# Names exported by `from <module> import *`.
4 __all__ = ['remove_decoration', 'decorated_match']
6 # Covers the range \u00c0 to \u0179.
# Table keyed by a character; each value is used further down as a string of
# alternative characters (it is iterated character by character and upper-cased).
7 char_to_alternatives_lower={
29 # Characters for which the lower() function doesn't work
30 char_to_alternatives_upper={
# Combined table: every entry of char_to_alternatives_lower, the upper-cased
# twin of each of those entries, and finally the explicit upper-case entries.
char_to_alternatives = {}
for base_char in char_to_alternatives_lower:
    variants = char_to_alternatives_lower[base_char]
    char_to_alternatives[base_char] = variants
    char_to_alternatives[base_char.upper()] = variants.upper()
# Explicit upper-case entries are applied last, so they override derived ones.
char_to_alternatives.update(char_to_alternatives_upper)
# Reverse table: maps each alternative character back to the character it
# stands for, in both the lower-case and the upper-case form.
alternative_to_char = {}
for base_char, variants in char_to_alternatives_lower.iteritems():
    upper_base = base_char.upper()
    for variant in variants:
        alternative_to_char[variant] = base_char
        alternative_to_char[variant.upper()] = upper_base
# Explicit upper-case entries are reversed as-is, without any case conversion.
for base_char, variants in char_to_alternatives_upper.iteritems():
    alternative_to_char.update((variant, base_char) for variant in variants)
49 # Ligatures and their expansions (only two-character expansions are supported)
50 ligatures_expansions_lower = {
52 # u'ij': u'ij', disabled because buggy: see http://en.wikipedia.org/wiki/Typographic_ligature
# Ligature -> expansion, for the lower-case ligatures and for their upper-case
# forms; an upper-case ligature expands with only its first letter capitalised.
ligatures_expansions = {}
for lig, expanded in ligatures_expansions_lower.iteritems():
    ligatures_expansions[lig] = expanded
    head, tail = expanded[0], expanded[1:]
    ligatures_expansions[lig.upper()] = head.upper() + tail
# Expansion -> ligature (the reverse table), for the lower-case expansions and
# for the variants whose first letter is capitalised.
ligatures_contractions = {}
for lig, expanded in ligatures_expansions_lower.iteritems():
    ligatures_contractions[expanded] = lig
    capitalised = expanded[0].upper() + expanded[1:]
    ligatures_contractions[capitalised] = lig.upper()
# Strip decorations from txt using the reverse tables.
# The example in the __main__ section suggests u'Œuf' becomes u'Oeuf'.
66 def remove_decoration(txt):
# Map a decorated character to its base character; unknown values are kept as-is.
# Presumably l is one character of txt at a time -- TODO confirm.
69 l = alternative_to_char.get(l, l)
# Then expand a ligature into its multi-character form; again unknown values pass through.
70 l = ligatures_expansions.get(l, l)
# Build a regex character class for the single character c.
# casesensitive: presumably controls whether the other-case variants are also
# included -- verify against the full body.
74 def decorated_match_single_char(c, casesensitive=False):
# Only unicode objects are accepted (exact type check, enforced with an assert).
75 assert type(c)==unicode
# Start from the character itself followed by its alternatives, if it has any.
78 result = c + char_to_alternatives.get(c, u'')
# Wrap the collected characters in [...] so that any one of them matches.
84 return u'['+result+u']'
# Build a regex fragment for txt, combining ligature alternatives with the
# per-character classes produced by decorated_match_single_char().
# The example in the __main__ section shows the kind of pattern produced for u'Œuf'.
88 def decorated_match(txt, casesensitive=False):
# Only unicode objects are accepted (exact type check, enforced with an assert).
89 assert type(txt)==unicode
# Normalise first, so the scan below only sees undecorated text with expanded ligatures.
91 txt = remove_decoration(txt)
96 c1 = txt[i] # current character
97 c12 = txt[i:i+2] # current and next characters; on the last iteration it holds a single character, which never matches, and that is OK
# Look the pair up as a ligature expansion; None means the pair is not one.
98 ligature = ligatures_contractions.get(c12, None)
# Alternation: the ligature itself (plus its upper-case form unless
# casesensitive), or the two characters matched separately.
100 result += u'('+ligature
101 if not casesensitive:
102 result += u'|'+ligature.upper()
103 result += u'|'+decorated_match_single_char(c12[0], casesensitive) \
104 +decorated_match_single_char(c12[1], casesensitive) \
106 i+=1 # skip next character, we already did both
108 result += decorated_match_single_char(c1, casesensitive)
# Command-line demo: prints the input, its undecorated form and the matching regex.
113 if __name__ == '__main__':
# Usage hint, written to stderr.
116 print >> sys.stderr, u'Missing required parameter. Try "Œuf"'
# Join all command-line arguments with spaces and decode them from UTF-8.
# NOTE: the local name `input` shadows the builtin of the same name.
118 input = unicode(' '.join(sys.argv[1:]), 'utf-8')
119 print "input:", input # Œuf
120 print "undecorated:", remove_decoration(input) # Oeuf
121 print "regex:", decorated_match(input) # (œ|Œ|[oòóôöøōŏőOÒÓÔÖØŌŎŐ][eèéêëēĕėęěEÈÉÊËĒĔĖĘĚ])[uùúûüũūŭůűųUÙÚÛÜŨŪŬŮŰŲ][fF]