2 # -*- encoding: utf-8 -*-
4 __all__ = ['remove_decoration', 'decorated_match']
6 # for range \u00c0 \u0179
7 char_to_alternatives_lower={
# Case-complete version of char_to_alternatives_lower: each base letter keeps
# its string of decorated alternatives, and the upper-cased base letter is
# mapped to the upper-cased string of alternatives.
char_to_alternatives = {}
for base_letter, variants in char_to_alternatives_lower.iteritems():
    for key, value in ((base_letter, variants),
                       (base_letter.upper(), variants.upper())):
        char_to_alternatives[key] = value
# Reverse lookup of char_to_alternatives_lower: maps every decorated
# character (in both cases) back to its undecorated base letter.
alternative_to_char = {}
for char, alternatives in char_to_alternatives_lower.iteritems():
    for alternative in alternatives:
        alternative_to_char[alternative] = char
        upper_alternative = alternative.upper()
        # upper() returns caseless characters unchanged. Skip those, otherwise
        # the lower-case mapping stored just above would be overwritten with
        # the upper-case base letter.
        if upper_alternative != alternative:
            alternative_to_char[upper_alternative] = char.upper()
41 # ligatures (only two chars supported)
42 ligatures_expansions_lower = {
44 # u'ij': u'ij', buggy: see http://en.wikipedia.org/wiki/Typographic_ligature
# Case-complete version of ligatures_expansions_lower: each ligature maps to
# its expansion, and the upper-case ligature maps to the expansion with only
# its first letter capitalised.
ligatures_expansions = {}
for ligature, expansion in ligatures_expansions_lower.iteritems():
    ligatures_expansions[ligature] = expansion
    upper_ligature = ligature.upper()
    # A ligature without a distinct upper-case form is returned unchanged by
    # upper(). Skip it so the lower-case expansion stored just above is not
    # replaced by the capitalised one.
    if upper_ligature != ligature:
        ligatures_expansions[upper_ligature] = expansion[0].upper()+expansion[1:]
# Reverse of the ligature table: maps an expansion back to its ligature.
# The expansion with a capitalised first letter maps to the upper-case
# ligature.
ligatures_contractions = {}
for lig, expanded in ligatures_expansions_lower.iteritems():
    ligatures_contractions[expanded] = lig
    head, tail = expanded[0], expanded[1:]
    ligatures_contractions[head.upper() + tail] = lig.upper()
58 def remove_decoration(txt):
61 l = alternative_to_char.get(l, l)
62 l = ligatures_expansions.get(l, l)
66 def decorated_match_single_char(c, casesensitive=False):
67 assert type(c)==unicode
70 result = c + char_to_alternatives.get(c, u'')
76 return u'['+result+u']'
80 def decorated_match(txt, casesensitive=False):
81 assert type(txt)==unicode
83 txt = remove_decoration(txt)
88 c1 = txt[i] # current character
89 c12 = txt[i:i+2] # both current and next characters. Contains a single char on last iteration so that it never matches, that is OK
90 ligature = ligatures_contractions.get(c12, None)
92 result += u'('+ligature
94 result += u'|'+ligature.upper()
95 result += u'|'+decorated_match_single_char(c12[0], casesensitive) \
96 +decorated_match_single_char(c12[1], casesensitive) \
98 i+=1 # skip next character, we allready did both
100 result += decorated_match_single_char(c1, casesensitive)
105 if __name__ == '__main__':
108 print >> sys.stderr, u'Missing required parameter. Try "Œuf"'
110 input = unicode(' '.join(sys.argv[1:]), 'utf-8')
111 print "input:", input # Œuf
112 print "undecorated:", remove_decoration(input) # Oeuf
113 print "regex:", decorated_match(input) # (œ|Œ|[oòóôöøōŏőOÒÓÔÖØŌŎŐ][eèéêëēĕėęěEÈÉÊËĒĔĖĘĚ])[uùúûüũūŭůűųUÙÚÛÜŨŪŬŮŰŲ][fF]