u'z': u'źżž',
}
+# Chars for which the lower() function doesn't work; kept as upper-case-only entries
+char_to_alternatives_upper={
+ u'I': u'İ', # capital I with a dot above
+}
char_to_alternatives={} # idem, but with upper case too
for char,alternatives in char_to_alternatives_lower.iteritems():
char_to_alternatives[char] = alternatives
char_to_alternatives[char.upper()] = alternatives.upper()
-
+for char,alternatives in char_to_alternatives_upper.iteritems(): # upper-case-only entries
+ char_to_alternatives[char] = alternatives # stored as-is: no case-converted twin is added here
alternative_to_char = {} # reverse
for char,alternatives in char_to_alternatives_lower.iteritems():
for alternative in alternatives:
alternative_to_char[alternative] = char
alternative_to_char[alternative.upper()] = char.upper()
+for char,alternatives in char_to_alternatives_upper.iteritems(): # reverse mapping for the upper-case-only entries
+ for alternative in alternatives:
+ alternative_to_char[alternative] = char # stored as-is: no case-converted twin is added here
# ligatures (only two chars supported)
ligatures_expansions_lower = {
if __name__ == '__main__':
import sys
- if len(sys.argv)<2:
+ from optparse import OptionParser
+ parser = OptionParser(usage='%prog [options] string')
+ parser.add_option('--charset', help="set charset. default=%default", action='store', dest='charset', default='utf-8') # used both to decode the arguments and to encode the output
+ parser.add_option('-r', '--regexp', help="generate regular expression.", action='store_true', dest='regexp') # selects decorated_match() output instead of remove_decoration()
+ (options, args) = parser.parse_args() # args: the positional arguments left once the options are consumed
+
+ if not args:
print >> sys.stderr, u'Missing required parameter. Try "Œuf"'
sys.exit(1)
- input = unicode(' '.join(sys.argv[1:]), 'utf-8')
- print "input:", input # Œuf
- print "undecorated:", remove_decoration(input) # Oeuf
- print "regex:", decorated_match(input) # (œ|Œ|[oòóôöøōŏőOÒÓÔÖØŌŎŐ][eèéêëēĕėęěEÈÉÊËĒĔĖĘĚ])[uùúûüũūŭůűųUÙÚÛÜŨŪŬŮŰŲ][fF]
+ input = unicode(' '.join(args), options.charset) # join the positional arguments with spaces and decode them using --charset
+ #print "input:", input # Œuf
+ #print "undecorated:", remove_decoration(input) # Oeuf
+ #print "regex:", decorated_match(input) # (œ|Œ|[oòóôöøōŏőOÒÓÔÖØŌŎŐ][eèéêëēĕėęěEÈÉÊËĒĔĖĘĚ])[uùúûüũūŭůűųUÙÚÛÜŨŪŬŮŰŲ][fF]
+ if options.regexp: # -r/--regexp given: print the regular expression
+ print decorated_match(input).encode(options.charset) # encoded back to the same charset as the input
+ else: # default: print the string with decorations removed
+ print remove_decoration(input).encode(options.charset) # encoded back to the same charset as the input