First release
author Jean-Michel Nirgal Vourgère <jmv@nirgal.com>
Tue, 20 Jan 2009 15:11:09 +0000 (15:11 +0000)
committer Jean-Michel Nirgal Vourgère <jmv@nirgal.com>
Tue, 20 Jan 2009 15:11:09 +0000 (15:11 +0000)
debian/changelog [new file with mode: 0644]
debian/compat [new file with mode: 0644]
debian/control [new file with mode: 0644]
debian/install [new file with mode: 0644]
debian/rules [new file with mode: 0755]
decoratedstr.py [new file with mode: 0755]

diff --git a/debian/changelog b/debian/changelog
new file mode 100644 (file)
index 0000000..dd653de
--- /dev/null
@@ -0,0 +1,5 @@
+python-decoratedstr (0.1) UNRELEASED; urgency=low
+
+  * Initial release.
+
+ -- Jean-Michel Vourgère <jmv_deb@nirgal.com>  Tue, 20 Jan 2009 11:21:39 +0100
diff --git a/debian/compat b/debian/compat
new file mode 100644 (file)
index 0000000..7ed6ff8
--- /dev/null
@@ -0,0 +1 @@
+5
diff --git a/debian/control b/debian/control
new file mode 100644 (file)
index 0000000..a3e15cd
--- /dev/null
@@ -0,0 +1,13 @@
+Source: python-decoratedstr
+Section: python
+Priority: optional
+Maintainer: Jean-Michel Vourgère <jmv_deb@nirgal.com>
+Build-Depends: debhelper (>= 5)
+Build-Depends-Indep: python-support (>= 0.5.3)
+Standards-Version: 3.8.0
+
+Package: python-decoratedstr
+Architecture: all
+Depends: ${python:Depends}, ${misc:Depends}
+Description: Python library for decorated strings
+ Provide functions for accentuated characters and ligatures handling
diff --git a/debian/install b/debian/install
new file mode 100644 (file)
index 0000000..91c10c1
--- /dev/null
@@ -0,0 +1 @@
+decoratedstr.py /usr/share/python-support/python-decoratedstr/
diff --git a/debian/rules b/debian/rules
new file mode 100755 (executable)
index 0000000..8cca214
--- /dev/null
@@ -0,0 +1,80 @@
+#!/usr/bin/make -f
+# Sample debian/rules that uses debhelper.
+# GNU copyright 1997 to 1999 by Joey Hess.
+
+# Uncomment this to turn on verbose mode.
+#export DH_VERBOSE=1
+
+#CFLAGS = -g
+#ifneq (,$(findstring noopt,$(DEB_BUILD_OPTIONS)))
+#      CFLAGS += -O0
+#else
+#      CFLAGS += -O2
+#endif
+
+# Nothing has to be compiled: the package only ships decoratedstr.py.
+# The stamp file records that the (empty) build step already ran.
+build: build-stamp
+build-stamp:
+       dh_testdir
+       touch $@
+
+# Remove the build stamp, then let debhelper delete what it generated.
+clean:
+       dh_testdir
+       dh_testroot
+       rm -f build-stamp
+       dh_clean
+
+# Prepare an empty package build tree.  The module itself is copied later
+# by dh_install (driven by debian/install) from the binary-* targets, so no
+# upstream "make install" step is needed here.
+install: build
+       dh_testdir
+       dh_testroot
+       # -k keeps the files listed by earlier steps while emptying the tree.
+       dh_clean -k
+       dh_installdirs
+
+# Build architecture-independent files here.
+# python-decoratedstr is declared "Architecture: all" in debian/control, so
+# the whole packaging sequence belongs to binary-indep: an indep-only build
+# must produce the package and an arch-only build must not.
+binary-indep: build install
+       dh_testdir
+       dh_testroot
+       dh_install -i -X.svn
+       dh_installdocs -i
+       dh_installexamples -i
+       dh_installmenu -i
+       dh_installinfo -i
+       dh_installchangelogs -i
+       dh_link -i
+       dh_compress -i
+       dh_fixperms -i
+       dh_pysupport -i
+       dh_installdeb -i
+       dh_gencontrol -i
+       dh_md5sums -i
+       dh_builddeb -i
+
+# Build architecture-dependent files here.
+binary-arch: build install
+# Nothing to do: this source builds no architecture-dependent package
+# (hence no dh_strip / dh_shlibdeps either).
+
+# "binary" builds every package by running both binary-* targets.
+binary: binary-indep binary-arch
+# These names are actions, not files: make must run them even if a file
+# with the same name exists in the source tree.
+.PHONY: build clean binary-indep binary-arch binary install
diff --git a/decoratedstr.py b/decoratedstr.py
new file mode 100755 (executable)
index 0000000..866497c
--- /dev/null
@@ -0,0 +1,113 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+__all__ = ['remove_decoration', 'decorated_match']
+
+# Lower-case base letter -> string of its decorated variants,
+# for range \u00c0 \u0179.
+# Only lower case is listed here; the upper-case entries are derived from
+# this table by the loops that follow it.
+char_to_alternatives_lower={
+    u'a': u'àáâãäåāăą',
+    u'c': u'çćĉċč',
+    u'd': u'ďđ',
+    u'e': u'èéêëēĕėęě',
+    u'g': u'ĝğġģ',
+    u'h': u'ĥħ',
+    u'i': u'ìíîïĩīĭįı',
+    u'j': u'ĵ',
+    u'k': u'ķ',
+    u'l': u'ĺļľŀł',
+    # \u0149 is the single character "n preceded by apostrophe"; it is
+    # written as an escape so it cannot decay into an apostrophe plus 'n'.
+    u'n': u'ñńņň\u0149ŋ',
+    u'o': u'òóôõöøōŏő',
+    u'r': u'ŕŗř',
+    u's': u'śŝşš',
+    u't': u'ţťŧ',
+    u'u': u'ùúûüũūŭůűų',
+    u'w': u'ŵ',
+    u'y': u'ýÿŷ',
+    u'z': u'źżž',
+}
+
+# Same content as char_to_alternatives_lower, plus one upper-case entry
+# (upper-cased key and upper-cased variants) for every base letter.
+char_to_alternatives = {}
+for char, alternatives in char_to_alternatives_lower.items():
+    char_to_alternatives.update({
+        char: alternatives,
+        char.upper(): alternatives.upper(),
+    })
+
+
+# Reverse table: decorated variant -> plain base letter, in both cases.
+alternative_to_char = {}
+for char,alternatives in char_to_alternatives_lower.iteritems():
+    for alternative in alternatives:
+        alternative_to_char[alternative] = char
+        upper_alternative = alternative.upper()
+        # A variant that has no upper-case form of its own upper-cases to
+        # itself; registering it again would overwrite the entry above and
+        # map the lower-case variant to the upper-case base letter.
+        if upper_alternative != alternative:
+            alternative_to_char[upper_alternative] = char.upper()
+
+# ligatures (only two chars supported)
+# Maps each lower-case ligature to the two plain letters it stands for.
+ligatures_expansions_lower = {
+    u'æ': u'ae',
+    # u'\u0133': u'ij', (the "ij" ligature) buggy: see http://en.wikipedia.org/wiki/Typographic_ligature
+    u'œ': u'oe',
+}
+
+# Ligature -> expansion for both cases: an upper-case ligature expands to
+# the capitalised form of the lower-case expansion (first letter only).
+ligatures_expansions = {}
+for ligature, expansion in ligatures_expansions_lower.items():
+    ligatures_expansions.update({
+        ligature: expansion,
+        ligature.upper(): expansion[0].upper()+expansion[1:],
+    })
+
+# Reverse lookup: expansion -> ligature, for the lower-case expansion and
+# for its capitalised form (which maps to the upper-case ligature).
+ligatures_contractions = {}
+for ligature, expansion in ligatures_expansions_lower.items():
+    ligatures_contractions.update({
+        expansion: ligature,
+        expansion[0].upper()+expansion[1:]: ligature.upper(),
+    })
+
+def remove_decoration(txt):
+    """
+    Return txt as a unicode string in which every decorated letter known to
+    alternative_to_char is replaced by its plain base letter and every
+    ligature known to ligatures_expansions is replaced by its expansion.
+    Characters found in neither table are copied unchanged.
+    """
+    def plain(letter):
+        # First strip the decoration, then expand a possible ligature.
+        base = alternative_to_char.get(letter, letter)
+        return ligatures_expansions.get(base, base)
+    return u''.join([plain(letter) for letter in txt])
+
+def decorated_match_single_char(c, casesensitive=False):
+    """
+    Return a regular expression fragment matching the character c and its
+    decorated variants.
+
+    c must be a unicode string. Unless casesensitive is true, c is
+    lower-cased first and the upper-case form of every candidate is accepted
+    as well.
+    Several candidates are returned as a bracketed character class. A lone
+    character is returned bare, preceded by a backslash when it is a regular
+    expression metacharacter.
+    """
+    assert type(c)==unicode
+    if not casesensitive:
+        c = c.lower()
+    result = c + char_to_alternatives.get(c, u'')
+    if not casesensitive:
+        u = result.upper()
+        if result != u:
+            result += u
+    if len(result)>1:
+        return u'['+result+u']'
+    # A lone character ends up outside of any character class: escape it so
+    # that input such as u'.', u'+' or u'(' is matched literally instead of
+    # being interpreted (or rejected) by the regular expression engine.
+    if result and result in u'\\.^$*+?{}[]|()':
+        return u'\\'+result
+    return result
+
+def decorated_match(txt, casesensitive=False):
+    """
+    Build a regular expression, as a unicode string, matching txt whatever
+    the decorations used to write it.
+
+    txt must be a unicode string. Its decorations are removed first and,
+    unless casesensitive is true, it is lower-cased. Each pair of characters
+    that can be contracted into a ligature becomes an alternation between the
+    ligature (in both cases when case is ignored) and the two separate
+    characters; every other character is handled by
+    decorated_match_single_char.
+    """
+    assert type(txt)==unicode
+    txt = remove_decoration(txt)
+    if not casesensitive:
+        txt = txt.lower()
+    pieces = []
+    pos = 0
+    length = len(txt)
+    while pos<length:
+        # Current and next characters; only one character is left at the end
+        # of the text, which never matches a ligature expansion.
+        pair = txt[pos:pos+2]
+        ligature = ligatures_contractions.get(pair, None)
+        if not ligature:
+            pieces.append(decorated_match_single_char(txt[pos], casesensitive))
+            pos += 1
+            continue
+        choices = [ligature]
+        if not casesensitive:
+            choices.append(ligature.upper())
+        choices.append(decorated_match_single_char(pair[0], casesensitive)
+                       +decorated_match_single_char(pair[1], casesensitive))
+        pieces.append(u'('+u'|'.join(choices)+u')')
+        pos += 2 # both characters of the pair were already handled
+    return u''.join(pieces)
+
+
+# Command line demo: print the undecorated form and the matching regular
+# expression for the text given as arguments (decoded as UTF-8).
+if __name__ == '__main__':
+    import sys
+    words = sys.argv[1:]
+    if not words:
+        print >> sys.stderr, u'Missing required parameter. Try "Œuf"'
+        sys.exit(1)
+    text = unicode(' '.join(words), 'utf-8')
+    # The trailing comments sketch the output obtained for the argument "Œuf".
+    print "input:", text                            # Œuf
+    print "undecorated:", remove_decoration(text)   # Oeuf
+    print "regex:", decorated_match(text)           # (œ|Œ|[oò...OÒ...][eè...EÈ...])[uù...UÙ...][fF]