#!/usr/bin/env python
# -*- encoding: utf-8 -*-
+from __future__ import print_function, unicode_literals
+import six
+
__all__ = ['remove_decoration', 'decorated_match']
# for range \u00c0 \u0179
-char_to_alternatives_lower={
- u'a': u'àáâãäåāăą',
- u'c': u'çćĉċč',
- u'd': u'ďđ',
- u'e': u'èéêëēĕėęě',
- u'g': u'ĝğġģ',
- u'h': u'ĥħ',
- u'i': u'ìíîïĩīĭįı',
- u'j': u'ĵ',
- u'k': u'ķ',
- u'l': u'ĺļľŀł',
- u'n': u'ñńņňʼnŋ',
- u'o': u'òóôöøōŏő',
- u'r': u'ŕŗř',
- u's': u'śŝşš',
- u't': u'ţťŧ',
- u'u': u'ùúûüũūŭůűų',
- u'w': u'ŵ',
- u'y': u'ýÿŷ',
- u'z': u'źżž',
+char_to_alternatives_lower = {
+ 'a': 'àáâãäåāăą',
+ 'c': 'çćĉċč',
+ 'd': 'ďđ',
+ 'e': 'èéêëēĕėęě',
+ 'g': 'ĝğġģ',
+ 'h': 'ĥħ',
+ 'i': 'ìíîïĩīĭįı',
+ 'j': 'ĵ',
+ 'k': 'ķ',
+ 'l': 'ĺļľŀł',
+ 'n': 'ñńņňʼnŋ',
+ 'o': 'òóôöøōŏő',
+ 'r': 'ŕŗř',
+ 's': 'śŝşš',
+ 't': 'ţťŧ',
+ 'u': 'ùúûüũūŭůűų',
+ 'w': 'ŵ',
+ 'y': 'ýÿŷ',
+ 'z': 'źżž',
}
# This chars lower() function doesn't work
-char_to_alternatives_upper={
- u'I': u'İ',
+char_to_alternatives_upper = {
+ 'I': 'İ',
}
-char_to_alternatives={} # idem, but with upper case too
-for char,alternatives in char_to_alternatives_lower.iteritems():
+char_to_alternatives = {} # idem, but with upper case too
+for char, alternatives in six.iteritems(char_to_alternatives_lower):
char_to_alternatives[char] = alternatives
char_to_alternatives[char.upper()] = alternatives.upper()
-for char,alternatives in char_to_alternatives_upper.iteritems():
+for char, alternatives in six.iteritems(char_to_alternatives_upper):
char_to_alternatives[char] = alternatives
alternative_to_char = {} # reverse
-for char,alternatives in char_to_alternatives_lower.iteritems():
+for char, alternatives in six.iteritems(char_to_alternatives_lower):
for alternative in alternatives:
alternative_to_char[alternative] = char
alternative_to_char[alternative.upper()] = char.upper()
-for char,alternatives in char_to_alternatives_upper.iteritems():
+for char, alternatives in six.iteritems(char_to_alternatives_upper):
for alternative in alternatives:
alternative_to_char[alternative] = char
# ligatures (only two chars supported)
ligatures_expansions_lower = {
- u'æ': u'ae',
- # u'ij': u'ij', buggy: see http://en.wikipedia.org/wiki/Typographic_ligature
- u'œ': u'oe',
+ 'æ': 'ae',
+ # 'ij': 'ij', buggy: see http://en.wikipedia.org/wiki/Typographic_ligature
+ 'œ': 'oe',
}
ligatures_expansions = {} # idem, but with upper case too
-for ligature, expansion in ligatures_expansions_lower.iteritems():
+for ligature, expansion in six.iteritems(ligatures_expansions_lower):
ligatures_expansions[ligature] = expansion
ligatures_expansions[ligature.upper()] = expansion[0].upper()+expansion[1:]
ligatures_contractions = {} # reverse
-for ligature, expansion in ligatures_expansions_lower.iteritems():
+for ligature, expansion in six.iteritems(ligatures_expansions_lower):
ligatures_contractions[expansion] = ligature
- ligatures_contractions[expansion[0].upper()+expansion[1:]] = ligature.upper()
+ ligatures_contractions[expansion[0].upper()+expansion[1:]] = ligature.upper()
def remove_decoration(txt):
+ """Return txt with decorated letters and ligatures replaced by plain ones.
+
+ Each character is first looked up in alternative_to_char (e.g. 'é' -> 'e'),
+ then in ligatures_expansions (e.g. 'œ' -> 'oe'); a character found in
+ neither table is kept unchanged.
+ """
- result = u''
+ result = ''
for l in txt:
l = alternative_to_char.get(l, l)
l = ligatures_expansions.get(l, l)
+ # Append the converted character; otherwise the loop's work is discarded
+ # and the function always returns an empty string.
+ result += l
return result
def decorated_match_single_char(c, casesensitive=False):
+ """Return a regex fragment matching c or any of its decorated variants.
+
+ c must be a text (unicode) string; it is expected to be a single character.
+ When casesensitive is false, c is lowercased and the upper-case forms of
+ the resulting characters are appended as well. The fragment is wrapped in
+ [] when it holds more than one character, otherwise it is returned bare.
+ NOTE: c is not regex-escaped, so metacharacters are emitted as-is.
+ """
- assert type(c)==unicode
+ assert isinstance(c, six.text_type)
if not casesensitive:
c = c.lower()
- result = c + char_to_alternatives.get(c, u'')
+ result = c + char_to_alternatives.get(c, '')
if not casesensitive:
u = result.upper()
if result != u:
result += u
- if len(result)>1:
- return u'['+result+u']'
+ if len(result) > 1:
+ return '['+result+']'
else:
return result
def decorated_match(txt, casesensitive=False):
- assert type(txt)==unicode
- result = u''
+ assert type(txt) == type('')
+ result = ''
txt = remove_decoration(txt)
if not casesensitive:
txt = txt.lower()
i = 0
- while i<len(txt):
+ while i < len(txt):
c1 = txt[i] # current character
c12 = txt[i:i+2] # both current and next characters. Contains a single char on last iteration so that it never matches, that is OK
ligature = ligatures_contractions.get(c12, None)
if ligature:
- result += u'('+ligature
+ result += '('+ligature
if not casesensitive:
- result += u'|'+ligature.upper()
- result += u'|'+decorated_match_single_char(c12[0], casesensitive) \
+ result += '|'+ligature.upper()
+ result += '|'+decorated_match_single_char(c12[0], casesensitive) \
+decorated_match_single_char(c12[1], casesensitive) \
- +u')'
- i+=1 # skip next character, we allready did both
+ +')'
+ i += 1 # skip next character, we already did both
else:
result += decorated_match_single_char(c1, casesensitive)
i += 1
(options, args) = parser.parse_args()
if not args:
- print >> sys.stderr, u'Missing required parameter. Try "Œuf"'
+ print('Missing required parameter. Try "Œuf"', file=sys.stderr)
sys.exit(1)
- input = unicode(' '.join(args), options.charset)
- #print "input:", input # Œuf
- #print "undecorated:", remove_decoration(input) # Oeuf
- #print "regex:", decorated_match(input) # (œ|Œ|[oòóôöøōŏőOÒÓÔÖØŌŎŐ][eèéêëēĕėęěEÈÉÊËĒĔĖĘĚ])[uùúûüũūŭůűųUÙÚÛÜŨŪŬŮŰŲ][fF]
+ if six.PY3:
+ input = ' '.join(args)
+ else:
+ input = unicode(b' '.join(args), options.charset)
+ #print("input:", input) # Œuf
+ #print("undecorated:", remove_decoration(input)) # Oeuf
+ #print("regex:", decorated_match(input)) # (œ|Œ|[oòóôöøōŏőOÒÓÔÖØŌŎŐ][eèéêëēĕėęěEÈÉÊËĒĔĖĘĚ])[uùúûüũūŭůűųUÙÚÛÜŨŪŬŮŰŲ][fF]
if options.regexp:
- print decorated_match(input, options.casesensitive).encode(options.charset)
+ if six.PY3:
+ print(decorated_match(input, options.casesensitive))
+ else:
+ print(decorated_match(input, options.casesensitive).encode(options.charset))
else:
- print remove_decoration(input).encode(options.charset)
+ if six.PY3:
+ print(remove_decoration(input))
+ else:
+ print(remove_decoration(input).encode(options.charset))