First version of banque postale scripts
author: Jean-Michel Nirgal Vourgère <jmv@nirgal.com>
Sun, 8 May 2011 10:57:27 +0000 (10:57 +0000)
committer: Jean-Michel Nirgal Vourgère <jmv@nirgal.com>
Sun, 8 May 2011 10:57:27 +0000 (10:57 +0000)
go.py [new file with mode: 0755]
html_parser.py [new file with mode: 0755]
htmlentities.py [new file with mode: 0755]

diff --git a/go.py b/go.py
new file mode 100755 (executable)
index 0000000..cfcf88a
--- /dev/null
+++ b/go.py
@@ -0,0 +1,200 @@
+#!/usr/bin/env python3
+
+import os
+import time
+import re
+import logging
+from datetime import datetime
+import urllib.request
+from http.cookiejar import CookieJar
+from subprocess import Popen, PIPE
+
+import html_parser
+import htmlentities
+
+BASE_URL = 'https://voscomptesenligne.labanquepostale.fr'
+WSOST_PREFIX = '/wsost/OstBrokerWeb/loginform?imgid='
+
+LOCAL_DIR = '/home/nirgal/banquepostale/'
+
def get_login_password():
    """Read the credentials from the LOCAL_DIR/config file.

    The file contains lines beginning with the literal words 'login' and
    'password'; the rest of the line (stripped) is the value.
    Returns a (login, password) tuple; either may be None when absent.
    """
    login = None
    password = None
    # 'with' guarantees the file handle is closed (the original leaked it).
    with open(LOCAL_DIR + 'config') as config_file:
        for line in config_file.read().splitlines():
            if line.startswith('login'):
                login = line[len('login'):].strip()
            elif line.startswith('password'):
                password = line[len('password'):].strip()
    return login, password
+
__opener__ = None  # lazily-built shared opener so cookies persist across calls
def httpopen(url, post_data=None):
    """Fetch *url* through a shared cookie-aware opener.

    post_data: optional POST body (str or bytes); when falsy, a GET is issued.
    Returns the response object from urllib.

    Fixed: Python 3's urllib requires POST data as bytes, but callers build
    it as str; encode it here so the request does not raise TypeError.
    """
    if post_data:
        logging.debug('HTTP POST %s %s', url, post_data)
    else:
        logging.debug('HTTP GET %s', url)
    global __opener__
    if __opener__ is None:
        # First call: build an opener that keeps the session cookies.
        cookiejar = CookieJar()
        __opener__ = urllib.request.build_opener()
        __opener__.add_handler(urllib.request.HTTPCookieProcessor(cookiejar))
    if isinstance(post_data, str):
        post_data = post_data.encode('utf-8')
    http_response = __opener__.open(url, post_data)
    return http_response
+
def sleep(seconds):
    """Thin wrapper around time.sleep() that logs the pause at debug level."""
    logging.debug('Waiting %s seconds', seconds)
    time.sleep(seconds)
+
def main():
    '''
    Download all the accounts csv data and store them in LOCAL_DIR
    Return a list of filenames
    '''
    result = []

    logging.info('Downloading password form')
    httpresponse = httpopen(BASE_URL + '/voscomptes/canalXHTML/securite/authentification/recupererPointEntree-identif.ea')
    html = httpresponse.read().decode('iso8859-1')
    #logging.debug(httpresponse.info())
    open('login.html', 'w', encoding='iso8859-1').write(html)

    # The login page shows a virtual keypad of 10 digit images in a random
    # layout; download each image so it can be OCRed below.
    root = html_parser.html_parse(html)
    for img in html_parser.get_elem(root, 'img'):
        src = img.attributes.get('src', '')
        if src.startswith(WSOST_PREFIX):
            # NOTE(review): keeps a single character after the prefix; this
            # assumes image ids are the single digits 0-9 — confirm.
            img_id = src[len(WSOST_PREFIX)]
            httpresponse = httpopen(BASE_URL + src)
            img = httpresponse.read()
            open(img_id+'.gif', 'wb').write(img)

    # OCR each keypad image to build a map: real digit -> keypad position.
    # Equivalent shell:
    # for i in 0 1 2 3 4 5 6 7 8 9; do convert $i.gif -crop 20x20+5+5 pnm:- | gocr -C 0-9 -i -; done
    xlt_password = {}
    for img_id in "0123456789":
        proc_convert = Popen('convert %s.gif -crop 20x20+5+5 pnm:-' % img_id,
            shell=True, stdout=PIPE)
        proc_gocr = Popen('gocr -C 0-9 -i -',
            shell=True, stdin=proc_convert.stdout, stdout=PIPE)
        output = proc_gocr.communicate()[0]
        output = output.decode('utf-8').strip()
        xlt_password[output] = img_id

    LOGIN, PASSWORD = get_login_password()

    # Translate the real password into the keypad positions to click.
    shuffled_password = ''
    for c in PASSWORD:
        shuffled_password += xlt_password[c]
    logging.info("shuffled_password: %s", shuffled_password)

    sleep(10) # We are not supermen

    post_data='urlbackend=%2Fvoscomptes%2FcanalXHTML%2Fsecurite%2Fauthentification%2FrecupererPointEntree-identif.ea%3Forigin%3Dparticuliers&origin=particuliers&password=' + shuffled_password + '&cv=true&cvvs=&username=' + LOGIN
    httpresponse = httpopen(BASE_URL + '/wsost/OstBrokerWeb/auth', post_data)
    html = httpresponse.read().decode('iso8859-1')
    open('welcome.html', 'w', encoding='iso8859-1').write(html)

    # Walk the post-login redirect chain; each page must link to the next.
    assert 'initialiser-identif.ea' in html
    httpresponse = httpopen(BASE_URL + '/voscomptes/canalXHTML/securite/authentification/initialiser-identif.ea')
    html = httpresponse.read().decode('iso8859-1')
    open('welcome2.html', 'w', encoding='iso8859-1').write(html)

    assert 'verifierMotDePasse-identif.ea' in html
    httpresponse = httpopen(BASE_URL + '/voscomptes/canalXHTML/securite/authentification/verifierMotDePasse-identif.ea')
    html = httpresponse.read().decode('iso8859-1')
    open('welcome3.html', 'w', encoding='iso8859-1').write(html)

    assert 'init-synthese.ea' in html
    httpresponse = httpopen(BASE_URL + '/voscomptes/canalXHTML/comptesCommun/synthese_assurancesEtComptes/init-synthese.ea')
    html = httpresponse.read().decode('iso8859-1')
    open('welcome4.html', 'w', encoding='iso8859-1').write(html)
    sleep(3)

    # The summary page links each account like:
    # ../../CCP/releves_ccp/menuReleve-releve_ccp.ea?compte.numero=*******&amp;typeRecherche=1
    root = html_parser.html_parse(html)
    for a in html_parser.get_elem(root, 'a'):
        href = a.attributes.get('href', '')
        href = htmlentities.resolve(href)
        # Raw string: the original relied on '\.' surviving as-is in a
        # plain literal, which is fragile and warns on newer Pythons.
        match = re.match(r'\.\./\.\./(...)/.*compte.numero=(.*)&typeRecherche=(.*)', href)
        if match:
            logging.debug(href)
            cpttype, cptnum, searchtype = match.group(1), match.group(2), match.group(3)

            logging.info('Found account type %s: %s', cpttype, cptnum)

            httpresponse = httpopen(BASE_URL + '/voscomptes/canalXHTML/' + href[len('../../'):])
            html = httpresponse.read().decode('iso8859-1')
            open(cptnum+'-init.html', 'w', encoding='iso8859-1').write(html)
            sleep(4)

            httpresponse = httpopen(BASE_URL + '/voscomptes/canalXHTML/comptesCommun/telechargementMouvement/init-telechargementMouvements.ea?compte.numero=' + cptnum + '&typeRecherche='+ searchtype +'&typeMouvements=' + cpttype)
            html = httpresponse.read().decode('iso8859-1')
            open(cptnum+'-init2.html', 'w', encoding='iso8859-1').write(html)
            sleep(4)

            httpresponse = httpopen(BASE_URL + '/voscomptes/canalXHTML/comptesCommun/telechargementMouvement/detailCompte2-telechargementMouvements.ea')
            html = httpresponse.read().decode('iso8859-1')
            open(cptnum+'-confirm.html', 'w', encoding='iso8859-1').write(html)
            sleep(9)

            # Find the CSV download form. Fixed: in the original, 'url'
            # stayed unbound when no form matched (the guard sat inside the
            # form loop and only continued that inner loop).
            root = html_parser.html_parse(html)
            url = None
            for form in html_parser.get_elem(root, 'form'):
                if form.attributes.get('id', None) == 'formConfirmAgain':
                    url = form.attributes['action']
            if not url:
                logging.critical("Can't find link to download csv")
                continue

            # e.g. preparerRecherche-telechargementMouvements.ea?ts=... POST 'format=CSV&duree='
            httpresponse = httpopen(BASE_URL + '/voscomptes/canalXHTML/comptesCommun/telechargementMouvement/' + url, 'format=CSV&duree=')
            filename = LOCAL_DIR + cptnum + '.' + datetime.now().strftime('%Y%m%dT%H%M%S') + '.csv'
            csvdata = httpresponse.read().decode('iso8859-1')
            logging.info('Save CSV data to %s', filename)
            open(filename, 'w', encoding='utf-8').write(csvdata)
            # Fixed: the docstring promises a list of the downloaded files,
            # but the original never appended to it.
            result.append(filename)
            sleep(9)

    logging.info('Disconnecting')
    httpresponse = httpopen(BASE_URL + '/voscomptes/canalXHTML/securite/deconnexion/init-deconnexion.ea')
    html = httpresponse.read().decode('iso8859-1')
    open('bye.html', 'w', encoding='iso8859-1').write(html)

    logging.info('Disconnected')
    return result
+
if __name__ == '__main__':
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option('-d', '--debug',
        action='store_true', dest='debug', default=False,
        help="debug mode")
    (options, args) = parser.parse_args()

    loglevel = logging.DEBUG if options.debug else logging.INFO
    logging.basicConfig(level=loglevel, format='%(asctime)s %(levelname)s %(message)s')

    # Downloaded files hold banking data: make them owner-readable only.
    os.umask(0o077)
    TMP_DIR = LOCAL_DIR + 'tmp/'
    # exist_ok replaces the original's fragile check against hard-coded
    # errno 17 (EEXIST); any other failure still raises.
    os.makedirs(TMP_DIR, exist_ok=True)
    os.chdir(TMP_DIR)

    main()
diff --git a/html_parser.py b/html_parser.py
new file mode 100755 (executable)
index 0000000..ab33204
--- /dev/null
@@ -0,0 +1,351 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+import sys, htmlentities
+from optparse import OptionParser
+
# When True, the parser reports mismatched/auto-closed tags on stderr.
# Overridden from the command line by --verbose-parser below.
VERBOSE_PARSER = False

TI_EMPTY    = 1 # there's not content in these tags, ie assume <tagname ... / >
# Per-tag behaviour flags, keyed by lower-case tag name.
# Looked up through Tag.get_tag_info(); unknown tags default to 0.
taginfo = {
    'meta': TI_EMPTY,
    'link': TI_EMPTY,
    'br':  TI_EMPTY,
    'img':  TI_EMPTY,
    'hr':  TI_EMPTY,
}
+
class Node:
    """Base class for parse-tree elements (tags and text leaves)."""

    class Flags:
        ROOT    = 1  # the unique tree root; there can be only one
        CLOSING = 2  # a closing tag such as </b>; the parser consumes and drops these
        CLOSED  = 4  # explicitly closed; uncleaned output only emits a closing tag then

    def __init__(self):
        self.father = None   # parent Node, filled in by the parser
        self.children = []   # child Nodes, in document order
        self.flags = 0       # bitwise OR of Node.Flags values
+
class Tag(Node):
    """An HTML element: a lower-cased name plus an attribute dict."""

    def __init__(self):
        Node.__init__(self)
        self.name = ''        # tag name, always lower case
        self.attributes = {}  # attribute name -> value ('' when valueless)

    def get_tag_info(self):
        """Return the TI_* flags registered for this tag name (0 if none)."""
        return taginfo.get(self.name, 0)

    def __repr__(self):
        # Rebuild something close to the original markup, with backslash
        # and double-quote escaped inside attribute values.
        parts = ['<']
        if self.flags & Node.Flags.CLOSING:
            parts.append('/')
        parts.append(self.name)
        for key, value in self.attributes.items():
            parts.append(' ' + key)
            if value:
                escaped = value.replace('\\', '\\\\').replace('"', '\\"')
                parts.append('="' + escaped + '"')
        parts.append('>')
        return ''.join(parts)
+
class Leaf(Node):
    """A text (CDATA) node; HTML entities are resolved at construction."""
    # TODO: rename this to CDATA or similar

    def __init__(self, text):
        Node.__init__(self)
        self.text = htmlentities.resolve(text)

    def __repr__(self):
        # FIXME: should this be escaped?
        return self.text
+
+
def html_lexer(page):
    """
    Generator over *page* (a str of HTML) yielding Tag and Leaf nodes,
    with father/children still unset (the parser links them afterwards).

    Tags are scanned by a small character-level state machine:
    INIT -> NAME -> ATT_NAME -> ATT_EQUALS -> ATT_VALUE / ATT_VALUE_QUOTED.
    """
    buf = page # buffer
    pos = 0 # everything before that position has already been parsed
    l = len(buf) # constant length
    state = 0  # NOTE(review): unused; get_next_tag keeps its own state

    def buffind(token):
        # find() wrapper returning None instead of -1 on failure
        r = buf.find(token, pos)
        if r==-1:
            return None
        return r

    def get_next_tag():
        # Scan one tag starting at buf[pos] (which is '<').
        # Returns (Tag, position after '>'), or (None, pos) at end of string.
        state = 'INIT'
        state_white_skiping = False  # when True, skip whitespace before the next state
        p = pos # will start with skipping '<'
        tag = Tag()
        while True:
            p += 1
            if p>=l: # EOS
                return None, p # what about last?
            c = buf[p]
           
            if state_white_skiping:
                if ord(c)<=32:
                    continue
                else:
                    state_white_skiping = False
                
            if state == 'INIT':
                # just after '<': '/' marks a closing tag, else name begins
                if c == '/':
                    tag.flags += Node.Flags.CLOSING
                    continue
                elif c == '>':
                    return tag, p+1
                else:
                    state = 'NAME'
                    tag.name += c.lower()
                    continue
            elif state == 'NAME':
                # accumulating the tag name until whitespace, '/' or '>'
                if ord(c)<=32 or c=='/':
                    state = 'ATT_NAME'
                    att_name = ''
                    state_white_skiping = True
                    continue
                elif c == '>':
                    return tag, p+1
                else:
                    tag.name += c.lower()
                    continue
            elif state == 'ATT_NAME':
                # accumulating an attribute name until whitespace, '=' or '>'
                if ord(c)<=32:
                    state = 'ATT_EQUALS'
                    state_white_skiping = True
                    continue
                elif c == '=':
                    state = 'ATT_VALUE'
                    state_white_skiping = True
                    att_value = ''
                    continue
                elif c == '>':
                    if att_name != '':
                        tag.attributes[att_name] = ''
                    return tag, p+1
                else:   
                    att_name += c.lower()
                    continue
            elif state == 'ATT_EQUALS':
                # after an attribute name + whitespace: either '=' follows,
                # or a new attribute starts (previous one was valueless)
                if ord(c)<=32:
                    continue
                elif c == '=':
                    state = 'ATT_VALUE'
                    state_white_skiping = True
                    att_value = ''
                    continue
                elif c == '>':
                    if att_name != '':
                        tag.attributes[att_name] = ''
                    return tag, p+1
                else:
                    if att_name != '':
                        tag.attributes[att_name] = ''
                    state = 'ATT_NAME'
                    att_name = c.lower()
                    state_white_skiping = True
                    continue
            elif state == 'ATT_VALUE':
                # unquoted value, unless the first char is a quote
                if att_value == '': # first char
                    if c == '"' or c == "'":
                        att_value_escape = c
                        state = 'ATT_VALUE_QUOTED'
                        continue
                if ord(c)<32:
                    tag.attributes[att_name] = att_value
                    state = 'ATT_NAME'
                    state_white_skiping = True
                    att_name = ''
                    continue
                elif c == '>':
                    tag.attributes[att_name] = att_value
                    return tag, p+1
                else:
                    att_value += c
                    continue
            elif state == 'ATT_VALUE_QUOTED':
                # inside quotes: everything is literal until the matching quote
                if c == att_value_escape:
                    tag.attributes[att_name] = att_value
                    state = 'ATT_NAME'
                    state_white_skiping = True
                    att_name = ''
                    continue
                else:
                    att_value += c
                    continue

    while True:
        # get next tag position
        # TODO: check it's a real tag and not a fragment that should added to that leafnode
        pt1 = buffind('<')
        if pt1 != pos:
            # text before the next '<' (or the trailing text) is a Leaf
            yield Leaf(buf[pos:pt1])
            if pt1 is None:
                return
        pos = pt1
        
        tag, pos = get_next_tag()
        # NOTE(review): a truncated trailing tag makes get_next_tag return
        # (None, p), so a None can be yielded here — confirm callers cope.
        yield tag
+
+
def html_parse(page):
    """
    This function fetches the nodes from the lexer and assemble them in a node tree.

    Returns the synthetic root Tag (flags=ROOT) whose children are the
    top-level nodes of *page*. Closing tags are matched against the chain
    of open ancestors; unmatched ones are discarded, and a closing tag may
    auto-close intermediate tags (e.g. unclosed <li>s) on its way up.
    """
    root = Tag()
    root.flags = Node.Flags.ROOT
    father = root  # current innermost open element
    for node in html_lexer(page):
        if isinstance(node, Leaf):
            # text attaches to the current element
            node.father = father
            father.children.append(node)
        elif node.flags & Node.Flags.CLOSING:
            # closing tag: walk up the ancestors looking for the match
            newfather = father
            while True:
                # TODO: optimize with Node.Flags.ROOT
                if newfather is None:
                    #TODO: log.debug()
                    if VERBOSE_PARSER:
                        print('Closing tag', node, 'does not match any opening tag. Discarding.', file=sys.stderr)
                    break
                if newfather.name == node.name:
                    newfather.flags |= Node.Flags.CLOSED
                    if VERBOSE_PARSER:
                        if newfather != father:
                            # everything between father and newfather was auto-closed
                            print('Closing tag', node, 'has auto-closed other nodes', end=' ', file=sys.stderr)
                            deb = father
                            while deb != newfather:
                                print(deb, end=' ', file=sys.stderr)
                                deb = deb.father
                            print(file=sys.stderr)
                    father = newfather.father
                    break
                newfather = newfather.father
        else:
            # opening tag: attach, and descend into it unless it is a
            # content-less tag (TI_EMPTY: br, img, meta, ...)
            node.father = father
            father.children.append(node)
            if not node.get_tag_info() & TI_EMPTY:
                father = node
    return root
+
+
def print_idented_tree(node, identation_level=-1):
    """Recursively pretty-print *node* and its subtree, indented by depth."""
    # The synthetic root carries no markup, so it is skipped; starting at
    # -1 makes its direct children print at indentation level 0.
    if not node.flags & Node.Flags.ROOT:
        print('   ' * identation_level + repr(node))
    for child in node.children:
        print_idented_tree(child, identation_level + 1)
    # Only emit a closing tag when one was actually seen in the input.
    if isinstance(node, Tag) and (node.flags & Node.Flags.CLOSED):
        print('   ' * identation_level + '</' + node.name + '>')
+
def print_lexer_tree(p):
    """Dump the raw lexer stream of page *p*, indented by nesting depth."""
    depth = 0
    for item in html_lexer(p):
        is_tag = isinstance(item, Tag)
        if is_tag and item.flags & Node.Flags.CLOSING:
            depth -= 1
        print('   ' * depth, end=' ')
        if is_tag and not item.flags & Node.Flags.CLOSING:
            depth += 1
        print(repr(item))
+
+
def get_elem(root, tagname):
    """
    Return every element below *root* whose name is *tagname*,
    without descending into the children of a match.
    """
    if isinstance(root, Leaf):
        return []          # text nodes have no name and no element children
    if root.name == tagname:
        return [root]      # matched: do not search inside this element
    found = []
    for child in root.children:
        found.extend(get_elem(child, tagname))
    return found
+        
+
def split_table(table):
    """
    Return the content of *table* as a list (rows) of lists (cells),
    each cell being the td element itself.
    """
    return [get_elem(tr, 'td') for tr in get_elem(table, 'tr')]
+
def split_table_r_to_leaf(root):
    """
    Recursively split tables as described in split_table.
    Cells that contain no further table collapse to their merged leaf
    text; nested tables become nested lists.
    """
    tables = get_elem(root, 'table')
    if not tables:
        # no table below here: flatten everything into a single string
        return get_merged_leaf_content(root)
    return [
        [[split_table_r_to_leaf(cell) for cell in row]
         for row in split_table(table)]
        for table in tables
    ]
+        
+
def get_merged_leaf_content(root):
    """
    Return all the leaf text below *root* concatenated into one string.
    """
    if isinstance(root, Leaf):
        return root.text
    return ''.join(get_merged_leaf_content(child) for child in root.children)
+
+
if __name__ == "__main__":
    parser = OptionParser()
    parser.add_option("--dump-lexer", help="Debug: Dump idented lexer output", action='store_true', dest='lexer_dump', default=False)
    parser.add_option("--dump-parser", help="Debug: Dump idented parser output", action='store_true', dest='parser_dump', default=False)
    parser.add_option("--verbose-parser", help="Debug: Verbose parser errors", action='store_true', dest='verbose_parser', default=False)
    (options, args) = parser.parse_args()

    try:
        filename = args[0]
    except IndexError:
        print('Need a filename', file=sys.stderr)
        sys.exit(-1)

    VERBOSE_PARSER = options.verbose_parser
    # Fixed: the Python 2 builtin file() no longer exists in Python 3;
    # the 'with' also closes the handle (the original leaked it).
    with open(filename, encoding='utf-8') as f:
        p = f.read()

    if options.lexer_dump:
        print_lexer_tree(p)
        sys.exit(0)

    if options.parser_dump:
        root = html_parse(p)
        print_idented_tree(root)
        sys.exit(0)
diff --git a/htmlentities.py b/htmlentities.py
new file mode 100755 (executable)
index 0000000..42afcf0
--- /dev/null
@@ -0,0 +1,198 @@
+#!/usr/bin/env python3
+# -*- encoding: utf-8 -*-
+
__all__ = ['resolve', 'expand', 'cleanCDATA']

from html.entities import name2codepoint as entities

# Entities that are recognized even without their trailing ';'
# (browser-style leniency), restricted to those mapping to Latin-1
# code points (value <= 255). Also record the longest entity name so
# resolve() knows when to stop scanning.
entities_autocomplete = {}
longestEntityLen = 0
for key,value in entities.items():
    if value<=255:
        entities_autocomplete[key] = value
    l = len(key)
    if l>longestEntityLen:
        longestEntityLen = l
+
# Characters in range 127-159 are illegal in HTML, but they are sometimes
# wrongly used in web pages. Internet Explorer interprets them through
# Windows-1252 (CP1252), Microsoft's superset of ISO 8859-1.
# To be clean, we remap them to their real Unicode code points here.
# Unknown codes are translated into a space.
# Indexed by (code point - 127); used by checkForUnicodeReservedChar().
iso88591_remap = [
       32,             # 127: ???
       8364,   # 128: Euro symbol
       32,             # 129: ???
       8218,   # 130: Single Low-9 Quotation Mark
       402,    # 131: Latin Small Letter F With Hook
       8222,   # 132: Double Low-9 Quotation Mark
       8230,   # 133: Horizontal Ellipsis
       8224,   # 134: Dagger
       8225,   # 135: Double Dagger
       710,    # 136: Modifier Letter Circumflex Accent
       8240,   # 137: Per Mille Sign
       352,    # 138: Latin Capital Letter S With Caron
       8249,   # 139: Single Left-Pointing Angle Quotation Mark
       338,    # 140: Latin Capital Ligature OE
       32,             # 141: ???
       381,    # 142: Latin Capital Letter Z With Caron
       32,             # 143: ???
       32,             # 144: ???
       8216,   # 145: Left Single Quotation Mark
       8217,   # 146: Right Single Quotation Mark
       8220,   # 147: Left Double Quotation Mark
       8221,   # 148: Right Double Quotation Mark
       8226,   # 149: Bullet
       8211,   # 150: En Dash
       8212,   # 151: Em Dash
       732,    # 152: Small Tilde
       8482,   # 153: Trade Mark Sign
       353,    # 154: Latin Small Letter S With Caron
       8250,   # 155: Single Right-Pointing Angle Quotation Mark
       339,    # 156: Latin Small Ligature OE
       32,             # 157: ???
       382,    # 158: Latin Small Letter Z With Caron
       376             # 159: Latin Capital Letter Y With Diaeresis
]
+
+
def checkForUnicodeReservedChar(value):
    """Map a code point to a safe, standard Unicode code point.

    Reserved code points (>= U+FFFE) become '?'; the illegal-but-common
    range 127-159 goes through iso88591_remap (CP1252-style recovery);
    everything else is returned unchanged.
    """
    if value >= 0xfffe:
        return ord('?')
    if 127 <= value <= 159:
        return iso88591_remap[value - 127]
    return value
+
def expand(text):
    """Escape *text* into pure-ASCII HTML using numeric character references."""
    out = []
    for char in text:
        code = checkForUnicodeReservedChar(ord(char))
        # escape control chars, markup-significant chars and non-ASCII
        if code < 32 or code > 127 or char in '&<>"':
            out.append('&#' + str(code) + ';')
        else:
            out.append(char)
    return ''.join(out)
+
def resolve(text):
    """Replace HTML entities in *text* with their characters.

    Handles named entities ('&amp;'), decimal ('&#65;') and hexadecimal
    ('&#x41;') references, emulating browser leniency: missing ';' is
    tolerated for Latin-1 named entities and decimal references, and code
    points 127-159 are remapped CP1252-style. Malformed sequences are
    left unchanged.
    """
    pos = 0
    result = ''
    l = len(text)
    while True:
        prevpos = pos
        pos = text.find('&', prevpos)
        if pos == -1:
            # no more '&': done
            break

        if pos >= l-2:
            # too short: an entity needs at least two chars after '&'
            break
        # here we are sure the next two chars exist

        result += text[prevpos:pos]
        c = text[pos+1]
        if c == '#':
            # numeric character reference: the code point is given raw
            c = text[pos+2]
            # Fixed: parenthesized the condition. The original
            # "c == 'x' or c == 'X' and pos < l-3" bound the 'and' to the
            # 'X' test only, which did not match the intent.
            if (c == 'x' or c == 'X') and pos < l-3:
                tmppos = text.find(';', pos+3)
                if tmppos != -1:
                    s = text[pos+3: tmppos]
                    try:
                        value = int(s, 16)
                        value = checkForUnicodeReservedChar(value) # remap unicode char if in range 127-159
                        result += chr(value)
                        pos = tmppos + 1
                        continue # ok, we did it
                    except ValueError:
                        # pos is not updated, so the original escape-like sequence is kept unchanged
                        pass
            else:
                # the given code point is decimal
                # IE behavior: parse until the first non-digit char, no conversion if that fails
                sb = ''
                tmppos = pos+2
                while True:
                    if tmppos >= l:
                        break # out of range
                    c = text[tmppos]
                    if c == ';':
                        tmppos += 1
                        break
                    if c<'0' or c>'9':
                        break
                    sb += c
                    tmppos += 1
                try:
                    value = int(sb)
                    value = checkForUnicodeReservedChar(value) # remap unicode char if in range 127-159
                    result += chr(value)
                    pos = tmppos
                    continue # ok, we did it
                except ValueError:
                    # pos is not updated, so the original escape-like sequence is kept unchanged
                    pass
        else:
            # here the first character is not a '#'
            # let's try the known html entities

            sb = ''
            tmppos = pos + 1
            while True:
                if tmppos >= l or tmppos-pos > longestEntityLen + 1: # 1 more for ';'
                    c2 = entities_autocomplete.get(sb, 0)
                    break
                c = text[tmppos]
                if c == ';':
                    tmppos += 1
                    c2 = entities.get(sb, 0)
                    break
                # no ';' yet: accept a bare Latin-1 entity name (IE leniency)
                c2 = entities_autocomplete.get(sb, 0)
                if c2:
                    break
                sb += c
                tmppos += 1
            if c2:
                result += chr(c2)
                pos = tmppos
                continue # ok, we did it

        result += '&' # something went wrong, just emit the '&' and move on
        pos += 1

    result += text[prevpos:]
    return result
+
def cleanCDATA(text):
    """
    Resolve entities, collapse each run of whitespace (space, \\r, \\n,
    \\t) into a single space, then expand back into escaped ASCII HTML.
    """
    collapsed = []
    previous_was_white = False  # False so a leading white survives as one space
    for c in resolve(text):
        if c in ' \r\n\t':
            if not previous_was_white:
                collapsed.append(' ')
                previous_was_white = True
        else:
            collapsed.append(c)
            previous_was_white = False

    return expand(''.join(collapsed))
+
if __name__ == '__main__':
    import sys
    if len(sys.argv) < 2:
        print("Missing required parameter. Try '&amp;test'", file=sys.stderr)
        sys.exit(1)
    # join all command-line arguments into one input string
    text = ' '.join(sys.argv[1:])
    print('cleanCDATA:', cleanCDATA(text))
+