From 4b2e5c1108076f72d1e3e7ea37aea88ac732aa39 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Jean-Michel=20Nirgal=20Vourg=C3=A8re?= Date: Wed, 18 Mar 2009 11:37:35 +0000 Subject: [PATCH 1/1] First release --- debian/changelog | 5 + debian/compat | 1 + debian/control | 15 ++ debian/copyright | 15 ++ debian/install | 1 + debian/rules | 80 +++++++ entities.txt | 526 +++++++++++++++++++++++++++++++++++++++++++++++ htmlentities.py | 453 ++++++++++++++++++++++++++++++++++++++++ 8 files changed, 1096 insertions(+) create mode 100644 debian/changelog create mode 100644 debian/compat create mode 100644 debian/control create mode 100644 debian/copyright create mode 100644 debian/install create mode 100755 debian/rules create mode 100644 entities.txt create mode 100755 htmlentities.py diff --git a/debian/changelog b/debian/changelog new file mode 100644 index 0000000..205e466 --- /dev/null +++ b/debian/changelog @@ -0,0 +1,5 @@ +python-htmlentities (0.1) UNRELEASED; urgency=low + + * Initial release. + + -- Jean-Michel Vourgère Wed, 18 Mar 2009 12:34:44 +0100 diff --git a/debian/compat b/debian/compat new file mode 100644 index 0000000..7ed6ff8 --- /dev/null +++ b/debian/compat @@ -0,0 +1 @@ +5 diff --git a/debian/control b/debian/control new file mode 100644 index 0000000..9c7532a --- /dev/null +++ b/debian/control @@ -0,0 +1,15 @@ +Source: python-htmlentities +Section: python +Priority: optional +Maintainer: Jean-Michel Vourgère +Build-Depends: debhelper (>> 3.0.0) +Build-Depends-Indep: python-support (>= 0.5.3) +Standards-Version: 3.8.0 + +Package: python-htmlentities +Section: python +Architecture: all +Depends: python +Description: Python library for XML character entities + Provide functions for resolving character entities such as &amp;amp; or &amp;nbsp;. + It emulates Internet Explorer resolution and supports buggy input. 
diff --git a/debian/copyright b/debian/copyright new file mode 100644 index 0000000..22691c8 --- /dev/null +++ b/debian/copyright @@ -0,0 +1,15 @@ +Intellectual property is a human plague causing much misery throughout the world. +You are free to copy / modify / redistribute / resell these files. + +© 2009 Jean-Michel Vourgère + +THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT +NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/debian/install b/debian/install new file mode 100644 index 0000000..c689cfd --- /dev/null +++ b/debian/install @@ -0,0 +1 @@ +htmlentities.py /usr/share/python-support/python-htmlentities/ diff --git a/debian/rules b/debian/rules new file mode 100755 index 0000000..6962cc6 --- /dev/null +++ b/debian/rules @@ -0,0 +1,80 @@ +#!/usr/bin/make -f +# Sample debian/rules that uses debhelper. +# GNU copyright 1997 to 1999 by Joey Hess. + +# Uncomment this to turn on verbose mode. +#export DH_VERBOSE=1 + +#CFLAGS = -g +#ifneq (,$(findstring noopt,$(DEB_BUILD_OPTIONS))) +# CFLAGS += -O0 +#else +# CFLAGS += -O2 +#endif + +build: build-stamp +build-stamp: + dh_testdir + + # Add here commands to compile the package. + #-$(MAKE) + #docbook-to-man debian/gentoo.sgml > gentoo.1 + + touch build-stamp + +clean: + dh_testdir + dh_testroot + rm -f build-stamp + + # Add here commands to clean up after the build process. 
+ #$(MAKE) clean + + dh_clean + +install: build + dh_testdir + dh_testroot + dh_clean -k + dh_installdirs + + # Add here commands to install the package into debian/curcy. + #$(MAKE) install DESTDIR=$(CURDIR)/debian/curcy + +# Build architecture-independent files here. +binary-indep: build install + dh_testdir + dh_testroot + dh_install -X.svn +# dh_installdebconf + dh_installdocs entities.txt + dh_installexamples + dh_installmenu +# dh_installlogrotate +# dh_installemacsen +# dh_installpam +# dh_installmime +# dh_installinit +# dh_installcron +# dh_installman man/* + dh_installinfo + dh_installchangelogs + dh_link + dh_strip + dh_compress + dh_fixperms + dh_pysupport +# dh_makeshlibs + dh_installdeb +# dh_perl + dh_shlibdeps + dh_gencontrol + dh_md5sums + dh_builddeb + +# We have nothing to do by default. + +# Build architecture-dependent files here. +binary-arch: build install +binary: binary-indep binary-arch +.PHONY: build clean binary-indep binary-arch binary install diff --git a/entities.txt b/entities.txt new file mode 100644 index 0000000..5059c43 --- /dev/null +++ b/entities.txt @@ -0,0 +1,526 @@ +Taken from DTD from w3c.org (HTML 4.01) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/htmlentities.py b/htmlentities.py new file mode 100755 index 0000000..abc1bc8 --- /dev/null +++ b/htmlentities.py @@ -0,0 +1,453 @@ 
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""Resolve and generate HTML character entities.

Public functions:

* ``resolve(text)``    -- replace ``&name;`` / ``&#NNN;`` / ``&#xHH;``
  references in *text* with the characters they stand for.
* ``expand(text)``     -- replace markup-significant, control and non-ASCII
  characters in *text* with decimal numeric references.
* ``cleanCDATA(text)`` -- ``resolve``, collapse runs of white space, then
  ``expand``.

The resolver is deliberately tolerant of sloppy input: entities whose code is
at most 255 are recognised even without the trailing ``;`` and numeric codes
in the range 127-159 are remapped as if they were Windows-1252 bytes.

The module runs unchanged on Python 2 and Python 3.
"""

import re

__all__ = ['resolve', 'expand', 'cleanCDATA']

# Python 2 / Python 3 compatibility aliases.
try:
    _unichr = unichr
    _text_type = unicode
except NameError:
    _unichr = chr
    _text_type = str

# Entity name -> Unicode code point (HTML 4.01 entity sets).
entities = {
    u'nbsp': 160,
    u'iexcl': 161,
    u'cent': 162,
    u'pound': 163,
    u'curren': 164,
    u'yen': 165,
    u'brvbar': 166,
    u'sect': 167,
    u'uml': 168,
    u'copy': 169,
    u'ordf': 170,
    u'laquo': 171,
    u'not': 172,
    u'shy': 173,
    u'reg': 174,
    u'macr': 175,
    u'deg': 176,
    u'plusmn': 177,
    u'sup2': 178,
    u'sup3': 179,
    u'acute': 180,
    u'micro': 181,
    u'para': 182,
    u'middot': 183,
    u'cedil': 184,
    u'sup1': 185,
    u'ordm': 186,
    u'raquo': 187,
    u'frac14': 188,
    u'frac12': 189,
    u'frac34': 190,
    u'iquest': 191,
    u'Agrave': 192,
    u'Aacute': 193,
    u'Acirc': 194,
    u'Atilde': 195,
    u'Auml': 196,
    u'Aring': 197,
    u'AElig': 198,
    u'Ccedil': 199,
    u'Egrave': 200,
    u'Eacute': 201,
    u'Ecirc': 202,
    u'Euml': 203,
    u'Igrave': 204,
    u'Iacute': 205,
    u'Icirc': 206,
    u'Iuml': 207,
    u'ETH': 208,
    u'Ntilde': 209,
    u'Ograve': 210,
    u'Oacute': 211,
    u'Ocirc': 212,
    u'Otilde': 213,
    u'Ouml': 214,
    u'times': 215,
    u'Oslash': 216,
    u'Ugrave': 217,
    u'Uacute': 218,
    u'Ucirc': 219,
    u'Uuml': 220,
    u'Yacute': 221,
    u'THORN': 222,
    u'szlig': 223,
    u'agrave': 224,
    u'aacute': 225,
    u'acirc': 226,
    u'atilde': 227,
    u'auml': 228,
    u'aring': 229,
    u'aelig': 230,
    u'ccedil': 231,
    u'egrave': 232,
    u'eacute': 233,
    u'ecirc': 234,
    u'euml': 235,
    u'igrave': 236,
    u'iacute': 237,
    u'icirc': 238,
    u'iuml': 239,
    u'eth': 240,
    u'ntilde': 241,
    u'ograve': 242,
    u'oacute': 243,
    u'ocirc': 244,
    u'otilde': 245,
    u'ouml': 246,
    u'divide': 247,
    u'oslash': 248,
    u'ugrave': 249,
    u'uacute': 250,
    u'ucirc': 251,
    u'uuml': 252,
    u'yacute': 253,
    u'thorn': 254,
    u'yuml': 255,
    u'fnof': 402,
    u'Alpha': 913,
    u'Beta': 914,
    u'Gamma': 915,
    u'Delta': 916,
    u'Epsilon': 917,
    u'Zeta': 918,
    u'Eta': 919,
    u'Theta': 920,
    u'Iota': 921,
    u'Kappa': 922,
    u'Lambda': 923,
    u'Mu': 924,
    u'Nu': 925,
    u'Xi': 926,
    u'Omicron': 927,
    u'Pi': 928,
    u'Rho': 929,
    u'Sigma': 931,
    u'Tau': 932,
    u'Upsilon': 933,
    u'Phi': 934,
    u'Chi': 935,
    u'Psi': 936,
    u'Omega': 937,
    u'alpha': 945,
    u'beta': 946,
    u'gamma': 947,
    u'delta': 948,
    u'epsilon': 949,
    u'zeta': 950,
    u'eta': 951,
    u'theta': 952,
    u'iota': 953,
    u'kappa': 954,
    u'lambda': 955,
    u'mu': 956,
    u'nu': 957,
    u'xi': 958,
    u'omicron': 959,
    u'pi': 960,
    u'rho': 961,
    u'sigmaf': 962,
    u'sigma': 963,
    u'tau': 964,
    u'upsilon': 965,
    u'phi': 966,
    u'chi': 967,
    u'psi': 968,
    u'omega': 969,
    u'thetasym': 977,
    u'upsih': 978,
    u'piv': 982,
    u'bull': 8226,
    u'hellip': 8230,
    u'prime': 8242,
    u'Prime': 8243,
    u'oline': 8254,
    u'frasl': 8260,
    u'weierp': 8472,
    u'image': 8465,
    u'real': 8476,
    u'trade': 8482,
    u'alefsym': 8501,
    u'larr': 8592,
    u'uarr': 8593,
    u'rarr': 8594,
    u'darr': 8595,
    u'harr': 8596,
    u'crarr': 8629,
    u'lArr': 8656,
    u'uArr': 8657,
    u'rArr': 8658,
    u'dArr': 8659,
    u'hArr': 8660,
    u'forall': 8704,
    u'part': 8706,
    u'exist': 8707,
    u'empty': 8709,
    u'nabla': 8711,
    u'isin': 8712,
    u'notin': 8713,
    u'ni': 8715,
    u'prod': 8719,
    u'sum': 8721,
    u'minus': 8722,
    u'lowast': 8727,
    u'radic': 8730,
    u'prop': 8733,
    u'infin': 8734,
    u'ang': 8736,
    u'and': 8743,
    u'or': 8744,
    u'cap': 8745,
    u'cup': 8746,
    u'int': 8747,
    u'there4': 8756,
    u'sim': 8764,
    u'cong': 8773,
    u'asymp': 8776,
    u'ne': 8800,
    u'equiv': 8801,
    u'le': 8804,
    u'ge': 8805,
    u'sub': 8834,
    u'sup': 8835,
    u'nsub': 8836,
    u'sube': 8838,
    u'supe': 8839,
    u'oplus': 8853,
    u'otimes': 8855,
    u'perp': 8869,
    u'sdot': 8901,
    u'lceil': 8968,
    u'rceil': 8969,
    u'lfloor': 8970,
    u'rfloor': 8971,
    u'lang': 9001,
    u'rang': 9002,
    u'loz': 9674,
    u'spades': 9824,
    u'clubs': 9827,
    u'hearts': 9829,
    u'diams': 9830,
    u'quot': 34,
    u'amp': 38,
    u'lt': 60,
    u'gt': 62,
    u'OElig': 338,
    u'oelig': 339,
    u'Scaron': 352,
    u'scaron': 353,
    u'Yuml': 376,
    u'circ': 710,
    u'tilde': 732,
    u'ensp': 8194,
    u'emsp': 8195,
    u'thinsp': 8201,
    u'zwnj': 8204,
    u'zwj': 8205,
    u'lrm': 8206,
    u'rlm': 8207,
    u'ndash': 8211,
    u'mdash': 8212,
    u'lsquo': 8216,
    u'rsquo': 8217,
    u'sbquo': 8218,
    u'ldquo': 8220,
    u'rdquo': 8221,
    u'bdquo': 8222,
    u'dagger': 8224,
    u'Dagger': 8225,
    u'permil': 8240,
    u'lsaquo': 8249,
    u'rsaquo': 8250,
    u'euro': 8364,
}

# Entities whose code point is <= 255 are also recognised when the closing
# ';' is missing (e.g. "&amp" or "&eacutex").
entities_autocomplete = dict(
    (name, code) for name, code in entities.items() if code <= 255)

# Length of the longest known entity name; bounds how far resolve() scans
# after an '&' while looking for the closing ';'.
longestEntityLen = max(len(name) for name in entities)

# Characters in range 127-159 are illegal, but they are sometimes wrongly
# used in web pages. Internet Explorer assumes they come from the Microsoft
# extension to Latin-1 (ISO 8859-1), a.k.a. CP1252. To be clean, they are
# remapped to their real Unicode values.
# Unknown codes are translated into a space.
# Index 0 of the table corresponds to code 127.
iso88591_remap = [
    32,    # 127: ???
    8364,  # 128: Euro symbol
    32,    # 129: ???
    8218,  # 130: Single Low-9 Quotation Mark
    402,   # 131: Latin Small Letter F With Hook
    8222,  # 132: Double Low-9 Quotation Mark
    8230,  # 133: Horizontal Ellipsis
    8224,  # 134: Dagger
    8225,  # 135: Double Dagger
    710,   # 136: Modifier Letter Circumflex Accent
    8240,  # 137: Per Mille Sign
    352,   # 138: Latin Capital Letter S With Caron
    8249,  # 139: Single Left-Pointing Angle Quotation Mark
    338,   # 140: Latin Capital Ligature OE
    32,    # 141: ???
    381,   # 142: Latin Capital Letter Z With Caron
    32,    # 143: ???
    32,    # 144: ???
    8216,  # 145: Left Single Quotation Mark
    8217,  # 146: Right Single Quotation Mark
    8220,  # 147: Left Double Quotation Mark
    8221,  # 148: Right Double Quotation Mark
    8226,  # 149: Bullet
    8211,  # 150: En Dash
    8212,  # 151: Em Dash
    732,   # 152: Small Tilde
    8482,  # 153: Trade Mark Sign
    353,   # 154: Latin Small Letter S With Caron
    8250,  # 155: Single Right-Pointing Angle Quotation Mark
    339,   # 156: Latin Small Ligature OE
    32,    # 157: ???
    382,   # 158: Latin Small Letter Z With Caron
    376,   # 159: Latin Capital Letter Y With Diaeresis
]

# Code points of '"', '&', '<' and '>': always escaped by expand().
_MARKUP_CODES = frozenset((34, 38, 60, 62))

# A run of the white-space characters that cleanCDATA() collapses.
_WHITE_RUN = re.compile(u'[ \r\n\t]+')


def checkForUnicodeReservedChar(value):
    """Return the code point that should actually be used for *value*.

    * values >= 0xFFFE are replaced by the code of ``'?'``;
    * values in 127..159 are looked up in ``iso88591_remap``;
    * every other value is returned unchanged.
    """
    if value >= 0xfffe:
        return ord('?')
    if value < 127 or value > 159:
        return value
    return iso88591_remap[value - 127]


def expand(text):
    """Return *text* with special characters written as ``&#NNN;``.

    Each character's code is first passed through
    ``checkForUnicodeReservedChar``. The resulting code is emitted as a
    decimal numeric reference when it is a control code (< 32), non-ASCII
    (> 127) or one of ``& < > "``; otherwise the character for that code is
    emitted literally, so codes remapped to a space or to ``'?'`` come out
    as a space or ``'?'`` rather than as the original raw character.
    """
    pieces = []
    for char in text:
        code = checkForUnicodeReservedChar(ord(char))
        if code < 32 or code > 127 or code in _MARKUP_CODES:
            pieces.append(u'&#%d;' % code)
        else:
            pieces.append(_unichr(code))
    return u''.join(pieces)


def _parse_numeric_reference(text, pos, length):
    """Parse the ``&#...`` reference starting at ``text[pos]``.

    The caller guarantees that ``text[pos + 1]`` is ``'#'`` and that
    ``text[pos + 2]`` exists. Returns ``(character, next_pos)`` on success
    and ``None`` when the sequence is not a usable reference.
    """
    marker = text[pos + 2]
    if marker in (u'x', u'X') and pos < length - 3:
        # Hexadecimal form: a terminating ';' is required.
        end = text.find(u';', pos + 3)
        if end == -1:
            return None
        try:
            value = int(text[pos + 3:end], 16)
        except ValueError:
            return None
        next_pos = end + 1
    else:
        # Decimal form: read digits up to the first non-digit; a ';' right
        # after the digits is consumed, anything else is left in place.
        end = pos + 2
        while end < length and u'0' <= text[end] <= u'9':
            end += 1
        if end == pos + 2:
            return None  # no digit at all
        value = int(text[pos + 2:end])
        if end < length and text[end] == u';':
            next_pos = end + 1
        else:
            next_pos = end
    if value < 0:
        # int() accepts a sign in the hexadecimal form; a negative number
        # is not a code point and must not reach unichr()/chr().
        return None
    return _unichr(checkForUnicodeReservedChar(value)), next_pos


def _match_named_entity(text, pos, length):
    """Match a named entity after the ``'&'`` at ``text[pos]``.

    A complete ``name;`` found within ``longestEntityLen`` characters wins.
    Otherwise the shortest leading name present in ``entities_autocomplete``
    is used and scanning resumes right after that name. Returns
    ``(code_point, next_pos)`` or ``None``.
    """
    window = longestEntityLen + 1  # 1 more for ';'
    name = u''
    cursor = pos + 1
    prefix_hit = None  # first (code, next_pos) found in the autocomplete set
    while cursor < length and cursor - pos <= window:
        char = text[cursor]
        if char == u';':
            code = entities.get(name, 0)
            if code:
                return code, cursor + 1
            # Unknown "name;": fall back to an earlier prefix, if any.
            return prefix_hit
        if prefix_hit is None:
            code = entities_autocomplete.get(name, 0)
            if code:
                # Remember it but keep scanning: a longer, properly
                # terminated name (e.g. "notin;" vs "not") takes precedence.
                prefix_hit = (code, cursor)
        name += char
        cursor += 1
    if prefix_hit is None:
        # Ran out of text or of window without meeting ';'.
        code = entities_autocomplete.get(name, 0)
        if code:
            prefix_hit = (code, cursor)
    return prefix_hit


def resolve(text):
    """Return *text* with character references replaced by characters.

    Handles named entities (``&eacute;``), decimal references (``&#233;``)
    and hexadecimal references (``&#xE9;``). Numeric codes go through
    ``checkForUnicodeReservedChar``. Sequences that cannot be resolved are
    copied unchanged. An ``'&'`` in the last two characters of *text* is
    never resolved.
    """
    length = len(text)
    pieces = []
    pos = 0
    while True:
        start = pos
        pos = text.find(u'&', start)
        if pos == -1 or pos >= length - 2:
            # No more '&', or too close to the end to be a reference.
            break
        # From here on the next two characters are known to exist.
        pieces.append(text[start:pos])
        if text[pos + 1] == u'#':
            match = _parse_numeric_reference(text, pos, length)
            if match is not None:
                char, pos = match
                pieces.append(char)
                continue
        else:
            match = _match_named_entity(text, pos, length)
            if match is not None:
                code, pos = match
                pieces.append(_unichr(code))
                continue
        # Not a reference: keep the '&' and resume right after it.
        pieces.append(u'&')
        pos += 1
    pieces.append(text[start:])
    return u''.join(pieces)


def cleanCDATA(text):
    """Normalise *text* for use as character data.

    Resolves entities, replaces every run of spaces, ``\\r``, ``\\n`` and
    ``\\t`` with a single space, then expands the result back into numeric
    references.
    """
    return expand(_WHITE_RUN.sub(u' ', resolve(text)))


def _main():
    """Command-line driver: print ``cleanCDATA`` of the joined arguments."""
    import sys
    if len(sys.argv) < 2:
        sys.stderr.write(u"Missing required parameter. Try '&test'\n")
        return 1
    raw = ' '.join(sys.argv[1:])
    if not isinstance(raw, _text_type):
        raw = raw.decode('utf-8')  # Python 2: argv holds byte strings
    # expand() only ever emits ASCII, so this is safe on any stdout encoding.
    sys.stdout.write(u'cleanCDATA: ' + cleanCDATA(raw) + u'\n')
    return 0


if __name__ == '__main__':
    import sys
    sys.exit(_main())