X-Git-Url: https://git.nirgal.com/?p=decoratedstr.git;a=blobdiff_plain;f=decoratedstr.py;h=1dc8737028190caf7a99f3a9ac66320ddc7e78fc;hp=866497ccab144e4888611716f65e4247753b37a3;hb=6ef7d1c732fe4046d0f56233c7e6682f82da139d;hpb=b8308fec3a934c9ebe5f8b0376b307400905d542;ds=sidebyside diff --git a/decoratedstr.py b/decoratedstr.py index 866497c..1dc8737 100755 --- a/decoratedstr.py +++ b/decoratedstr.py @@ -26,17 +26,25 @@ char_to_alternatives_lower={ u'z': u'źżž', } +# This chars lower() function doesn't work +char_to_alternatives_upper={ + u'I': u'İ', +} char_to_alternatives={} # idem, but with upper case too for char,alternatives in char_to_alternatives_lower.iteritems(): char_to_alternatives[char] = alternatives char_to_alternatives[char.upper()] = alternatives.upper() - +for char,alternatives in char_to_alternatives_upper.iteritems(): + char_to_alternatives[char] = alternatives alternative_to_char = {} # reverse for char,alternatives in char_to_alternatives_lower.iteritems(): for alternative in alternatives: alternative_to_char[alternative] = char alternative_to_char[alternative.upper()] = char.upper() +for char,alternatives in char_to_alternatives_upper.iteritems(): + for alternative in alternatives: + alternative_to_char[alternative] = char # ligatures (only two chars supported) ligatures_expansions_lower = { @@ -104,10 +112,20 @@ def decorated_match(txt, casesensitive=False): if __name__ == '__main__': import sys - if len(sys.argv)<2: + from optparse import OptionParser + parser = OptionParser(usage='%prog [options] string') + parser.add_option('--charset', help="set charset. default=%default", action='store', dest='charset', default='utf-8') + parser.add_option('-r', '--regexp', help="generate regular expression.", action='store_true', dest='regexp') + (options, args) = parser.parse_args() + + if not args: print >> sys.stderr, u'Missing required parameter. Try "Œuf"' sys.exit(1) - input = unicode(' '.join(sys.argv[1:]), 'utf-8') - print "input:", input # Œuf - print "undecorated:", remove_decoration(input) # Oeuf - print "regex:", decorated_match(input) # (œ|Œ|[oòóôöøōŏőOÒÓÔÖØŌŎŐ][eèéêëēĕėęěEÈÉÊËĒĔĖĘĚ])[uùúûüũūŭůűųUÙÚÛÜŨŪŬŮŰŲ][fF] + input = unicode(' '.join(args), options.charset) + #print "input:", input # Œuf + #print "undecorated:", remove_decoration(input) # Oeuf + #print "regex:", decorated_match(input) # (œ|Œ|[oòóôöøōŏőOÒÓÔÖØŌŎŐ][eèéêëēĕėęěEÈÉÊËĒĔĖĘĚ])[uùúûüũūŭůűųUÙÚÛÜŨŪŬŮŰŲ][fF] + if options.regexp: + print decorated_match(input).encode(options.charset) + else: + print remove_decoration(input).encode(options.charset)