Added command-line case-insensitive option for regexp
[decoratedstr.git] / decoratedstr.py
1 #!/usr/bin/env python
2 # -*- encoding: utf-8 -*-
3
4 __all__ = ['remove_decoration', 'decorated_match']
5
# Plain lower-case Latin letters and their decorated variants, taken from the
# range \u00c0 - \u0179.
# U+0140 and U+0149 are written as escapes: both have a compatibility
# decomposition, so Unicode normalization of this file would silently split
# them into two characters (e.g. U+0149 becomes U+02BC followed by 'n').
char_to_alternatives_lower = {
    u'a': u'àáâãäåāăą',
    u'c': u'çćĉċč',
    u'd': u'ďđ',
    u'e': u'èéêëēĕėęě',
    u'g': u'ĝğġģ',
    u'h': u'ĥħ',
    u'i': u'ìíîïĩīĭįı',
    u'j': u'ĵ',
    u'k': u'ķ',
    u'l': u'ĺļľ\u0140ł',
    u'n': u'ñńņň\u0149ŋ',
    u'o': u'òóôõöøōŏő',
    u'r': u'ŕŗř',
    u's': u'śŝşš',
    u't': u'ţťŧ',
    u'u': u'ùúûüũūŭůűų',
    u'w': u'ŵ',
    u'y': u'ýÿŷ',
    u'z': u'źżž',
}

# Upper-case-only variants: lower() does not work for these characters, so
# they cannot be derived from the lower-case table above.
char_to_alternatives_upper = {
    u'I': u'İ',
}

# Same as char_to_alternatives_lower, but with upper-case letters too.
char_to_alternatives = {}
for char, alternatives in char_to_alternatives_lower.items():
    upper_char = char.upper()
    upper_alternatives = u''
    for alternative in alternatives:
        upper = alternative.upper()
        if len(upper) != 1:
            # No single-character upper case (U+0149 on Python 3): keep the
            # variant itself, which is what Python 2's upper() returns.
            upper = alternative
        if upper != upper_char:
            # Dotless i upper-cases to the plain base letter 'I'; a letter
            # must not be listed as its own decorated variant.
            upper_alternatives += upper
    char_to_alternatives[char] = alternatives
    char_to_alternatives[upper_char] = upper_alternatives
for char, alternatives in char_to_alternatives_upper.items():
    # Extend rather than replace, so the derived upper-case variants survive.
    char_to_alternatives[char] = char_to_alternatives.get(char, u'') + alternatives

# Reverse mapping: decorated variant -> plain letter.
alternative_to_char = {}
for char, alternatives in char_to_alternatives_lower.items():
    upper_char = char.upper()
    for alternative in alternatives:
        alternative_to_char[alternative] = char
        upper = alternative.upper()
        # Only register a real, distinct upper-case variant: a variant without
        # one must keep mapping to the lower-case letter, and the plain base
        # letter itself must not become a key.
        if len(upper) == 1 and upper != alternative and upper != upper_char:
            alternative_to_char[upper] = upper_char
for char, alternatives in char_to_alternatives_upper.items():
    for alternative in alternatives:
        alternative_to_char[alternative] = char
48
# Ligatures and their two-letter expansions (only two chars supported).
ligatures_expansions_lower = {
    u'æ': u'ae',
    # u'\u0133' (the "ij" ligature) -> u'ij' is deliberately left out, buggy:
    # see http://en.wikipedia.org/wiki/Typographic_ligature
    u'œ': u'oe',
}

ligatures_expansions = {}    # ligature -> expansion, upper-case ligatures too
ligatures_contractions = {}  # reverse: expansion -> ligature
for ligature, expansion in ligatures_expansions_lower.items():
    # An upper-case ligature expands to a capitalized pair ("Oe", not "OE").
    capitalized = expansion[0].upper() + expansion[1:]
    ligatures_expansions[ligature] = expansion
    ligatures_expansions[ligature.upper()] = capitalized
    ligatures_contractions[expansion] = ligature
    ligatures_contractions[capitalized] = ligature.upper()
    # Text written entirely in capitals ("OEUF") must also be recognised as a
    # possible ligature when matching case-sensitively.
    ligatures_contractions[expansion.upper()] = ligature.upper()
65
def remove_decoration(txt):
    """Return txt with decorated letters and ligatures replaced.

    Every character found in alternative_to_char is replaced by its plain
    letter, then every character found in ligatures_expansions is replaced
    by its expansion. All other characters are copied unchanged. The result
    is always a unicode string.
    """
    def undecorate(char):
        # Letter lookup first, ligature lookup on its result.
        plain = alternative_to_char.get(char, char)
        return ligatures_expansions.get(plain, plain)

    return u''.join(undecorate(char) for char in txt)
73
def decorated_match_single_char(c, casesensitive=False):
    """Return a regexp fragment matching c and its decorated variants.

    c             -- a single unicode character.
    casesensitive -- when false (the default), both the lower-case and the
                     upper-case forms of c and of its variants are matched.

    When several characters can match, the result is a character class such
    as u'[...]'. Otherwise it is the character itself, backslash-escaped if
    it is a regexp metacharacter so that it only matches literally.
    An AssertionError is raised if c is not a unicode string.
    """
    assert isinstance(c, type(u''))  # unicode on Python 2, str on Python 3
    if casesensitive:
        candidates = c + char_to_alternatives.get(c, u'')
    else:
        lower = c.lower()
        candidates = lower + char_to_alternatives.get(lower, u'')
        # Upper-case each candidate on its own and ignore multi-character
        # results, so the character class never receives stray characters.
        uppers = [char.upper() for char in candidates]
        candidates += u''.join(upper for upper in uppers if len(upper) == 1)
        # Variants that exist only in upper case are listed under the
        # upper-case letter and cannot be reached through upper().
        candidates += char_to_alternatives.get(lower.upper(), u'')

    # Remove duplicates, keeping first-seen order.
    unique = []
    for char in candidates:
        if char not in unique:
            unique.append(char)

    if len(unique) > 1:
        return u'[' + u''.join(unique) + u']'
    single = u''.join(unique)
    # The emptiness test matters: u'' is "in" every string.
    if single and single in u'\\.^$*+?{}[]|()':
        return u'\\' + single
    return single
87
def decorated_match(txt, casesensitive=False):
    """Return a regexp matching txt whatever the decoration of its letters.

    txt           -- unicode string to build the expression from.
    casesensitive -- when false (the default), txt is lower-cased and the
                     expression matches both cases.

    txt is first undecorated with remove_decoration(). Each pair of
    characters that can be contracted into a ligature becomes a group
    u'(ligature|...)' whose last alternative matches the two letters
    separately; every other character is handled by
    decorated_match_single_char().
    An AssertionError is raised if txt is not a unicode string.
    """
    assert isinstance(txt, type(u''))  # unicode on Python 2, str on Python 3
    txt = remove_decoration(txt)
    if not casesensitive:
        txt = txt.lower()

    parts = []
    i = 0
    while i < len(txt):
        # Current and next characters. On the last iteration this holds a
        # single character, which never matches a two-letter expansion.
        pair = txt[i:i+2]
        ligature = ligatures_contractions.get(pair)
        if ligature:
            choices = [ligature]
            if not casesensitive:
                choices.append(ligature.upper())
            choices.append(decorated_match_single_char(pair[0], casesensitive)
                           + decorated_match_single_char(pair[1], casesensitive))
            parts.append(u'(' + u'|'.join(choices) + u')')
            i += 2  # both characters of the pair are done
        else:
            parts.append(decorated_match_single_char(txt[i], casesensitive))
            i += 1
    return u''.join(parts)
111
112
if __name__ == '__main__':
    # Command line front end: print the undecorated form of the arguments,
    # or with -r a regular expression matching them.
    # Example: "Œuf" gives "Oeuf"; with -r -i the expression starts with
    # "(œ|Œ|[o...][e...])" followed by one character class per letter.
    import sys
    from optparse import OptionParser

    parser = OptionParser(usage='%prog [options] string')
    parser.add_option('--charset', help="set charset. default=%default", action='store', dest='charset', default='utf-8')
    parser.add_option('-r', '--regexp', help="generate regular expression.", action='store_true', dest='regexp')
    parser.add_option('-i', help="used with -r, make regexp case insensitive.", action='store_false', dest='casesensitive', default=True)
    (options, args) = parser.parse_args()

    # Write encoded bytes ourselves: sys.stdout.buffer / sys.stderr.buffer on
    # Python 3, the plain file objects on Python 2.
    stdout = getattr(sys.stdout, 'buffer', sys.stdout)
    stderr = getattr(sys.stderr, 'buffer', sys.stderr)

    if not args:
        # The message is not ASCII; encoding it explicitly avoids a
        # UnicodeEncodeError when stderr has no usable encoding.
        message = u'Missing required parameter. Try "Œuf"'
        stderr.write(message.encode(options.charset, 'replace') + b'\n')
        sys.exit(1)

    text = ' '.join(args)
    if isinstance(text, bytes):
        # Python 2 passes the arguments as encoded bytes.
        text = text.decode(options.charset)

    if options.regexp:
        output = decorated_match(text, options.casesensitive)
    else:
        output = remove_decoration(text)
    stdout.write(output.encode(options.charset) + b'\n')