Added license
[decoratedstr.git] / decoratedstr.py
1 #!/usr/bin/env python
2 # -*- encoding: utf-8 -*-
3
4 __all__ = ['remove_decoration', 'decorated_match']
5
6 # for range \u00c0 \u0179
char_to_alternatives_lower = {
    u'a': u'àáâãäåāăą',
    u'c': u'çćĉċč',
    u'd': u'ďđ',
    u'e': u'èéêëēĕėęě',
    u'g': u'ĝğġģ',
    u'h': u'ĥħ',
    u'i': u'ìíîïĩīĭįı',
    u'j': u'ĵ',
    u'k': u'ķ',
    u'l': u'ĺļľŀł',
    # \u0149 ("n preceded by apostrophe") is written as an escape so that it
    # stays a single code point whatever normalisation the file goes through.
    u'n': u'ñńņň\u0149ŋ',
    u'o': u'òóôöøōŏő',
    u'r': u'ŕŗř',
    u's': u'śŝşš',
    u't': u'ţťŧ',
    u'u': u'ùúûüũūŭůűų',
    u'w': u'ŵ',
    u'y': u'ýÿŷ',
    u'z': u'źżž',
}

char_to_alternatives = {}  # base letter -> decorated variants, both cases
alternative_to_char = {}   # reverse: decorated variant -> base letter
for char, alternatives in char_to_alternatives_lower.items():
    upper_char = char.upper()
    upper_alternatives = []
    char_to_alternatives[char] = alternatives
    for alternative in alternatives:
        alternative_to_char[alternative] = char
        upper = alternative.upper()
        # Only register an upper-case variant when it really is one:
        #  - some letters have no single-character upper case (\u0149 stays
        #    unchanged or expands to two characters depending on the Python
        #    version); registering it would overwrite the lower-case entry
        #    or create a key that can never match a single character;
        #  - dotless i upper-cases to the plain base letter 'I', which is not
        #    a decorated variant of itself.
        if len(upper) != 1 or upper in (alternative, upper_char):
            continue
        upper_alternatives.append(upper)
        alternative_to_char[upper] = upper_char
    char_to_alternatives[upper_char] = u''.join(upper_alternatives)
40
41 # ligatures (only two chars supported)
ligatures_expansions_lower = {
    u'æ': u'ae',
    # u'ij': u'ij', disabled because buggy (presumably meant the U+0133 "ij"
    # ligature): see http://en.wikipedia.org/wiki/Typographic_ligature
    u'œ': u'oe',
}

ligatures_expansions = {}    # ligature -> expansion, lower and upper case
ligatures_contractions = {}  # reverse: expansion -> ligature
# items() rather than iteritems(): it works on both Python 2 and Python 3.
for ligature, expansion in ligatures_expansions_lower.items():
    # The upper-case ligature expands to a capitalised pair ('Oe', not 'OE').
    capitalized = expansion[0].upper() + expansion[1:]
    ligatures_expansions[ligature] = expansion
    ligatures_expansions[ligature.upper()] = capitalized
    ligatures_contractions[expansion] = ligature
    ligatures_contractions[capitalized] = ligature.upper()
57
def remove_decoration(txt):
    """Return txt with decorations stripped.

    Each character found in alternative_to_char is replaced by its base
    letter, and each character found in ligatures_expansions is replaced by
    its two-letter expansion. All other characters are kept unchanged.
    """
    def undecorate(letter):
        base = alternative_to_char.get(letter, letter)
        return ligatures_expansions.get(base, base)

    return u''.join(undecorate(letter) for letter in txt)
65
def decorated_match_single_char(c, casesensitive=False):
    """Return a regex fragment matching c and its decorated variants.

    c is a single unicode character. When casesensitive is false, c is
    lower-cased first and the upper-case forms of every candidate are
    accepted too.

    Returns a character class such as u'[eè...EÈ...]' when there is more
    than one candidate. A lone character is returned regex-escaped, so that
    metacharacters like '.', '(' or '*' match themselves instead of being
    interpreted as regex syntax.

    Raises AssertionError if c is not a unicode string.
    """
    import re  # imported locally so the function is self-contained

    # type(u'') is `unicode` on Python 2 and `str` on Python 3.
    assert isinstance(c, type(u''))
    if not casesensitive:
        c = c.lower()
    result = c + char_to_alternatives.get(c, u'')
    if not casesensitive:
        u = result.upper()
        if result != u:
            result += u
    if len(result) > 1:
        # Only letters (base, decorated, upper-cased) end up in a class, so
        # nothing needs escaping here.
        return u'[' + result + u']'
    return re.escape(result)
79
def decorated_match(txt, casesensitive=False):
    """Return a regex pattern matching txt whatever its decorations.

    txt is first stripped with remove_decoration (and lower-cased unless
    casesensitive is true). Each remaining character is turned into a
    fragment by decorated_match_single_char. A pair of characters found in
    ligatures_contractions becomes a group that accepts either the ligature
    itself or the two separate characters.

    Raises AssertionError if txt is not a unicode string.
    """
    # type(u'') is `unicode` on Python 2 and `str` on Python 3.
    assert isinstance(txt, type(u''))
    txt = remove_decoration(txt)
    if not casesensitive:
        txt = txt.lower()

    fragments = []
    i = 0
    while i < len(txt):
        # Current and next characters. On the last iteration this is a single
        # character, which never matches a two-letter expansion.
        pair = txt[i:i+2]
        ligature = ligatures_contractions.get(pair)
        if ligature:
            choices = [ligature]
            if not casesensitive:
                choices.append(ligature.upper())
            choices.append(decorated_match_single_char(pair[0], casesensitive)
                           + decorated_match_single_char(pair[1], casesensitive))
            fragments.append(u'(' + u'|'.join(choices) + u')')
            i += 2  # both characters of the pair are already handled
        else:
            fragments.append(decorated_match_single_char(txt[i], casesensitive))
            i += 1
    return u''.join(fragments)
103
104
if __name__ == '__main__':
    # Command-line demo: prints the input, its undecorated form and the
    # generated regex. Written without print statements so that the module
    # can be compiled (and therefore imported) by Python 3 as well as Python 2.
    import sys

    arguments = sys.argv[1:]
    if not arguments:
        sys.stderr.write(u'Missing required parameter. Try "Œuf"\n')
        sys.exit(1)
    text = ' '.join(arguments)
    if isinstance(text, bytes):
        # Python 2 passes arguments as byte strings; decode them as UTF-8.
        text = text.decode('utf-8')
    # Expected output for the sample input is shown at the end of each line.
    print(u'input: ' + text)                           # Œuf
    print(u'undecorated: ' + remove_decoration(text))  # Oeuf
    print(u'regex: ' + decorated_match(text))          # (œ|Œ|[oòóôöøōŏőOÒÓÔÖØŌŎŐ][eèéêëēĕėęěEÈÉÊËĒĔĖĘĚ])[uùúûüũūŭůűųUÙÚÛÜŨŪŬŮŰŲ][fF]