--- /dev/null
+#!/usr/bin/env python3
+
+import os
+import time
+import re
+import logging
+from datetime import datetime
+import urllib.request
+from http.cookiejar import CookieJar
+from subprocess import Popen, PIPE
+
+import html_parser
+import htmlentities
+
+BASE_URL = 'https://voscomptesenligne.labanquepostale.fr'
+WSOST_PREFIX = '/wsost/OstBrokerWeb/loginform?imgid='
+
+LOCAL_DIR = '/home/nirgal/banquepostale/'
+
+def get_login_password():
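+    '''
+    Read the credentials from LOCAL_DIR/config.
+    Assumed format (inferred from the parsing below), one entry per line:
+        login myusername
+        password 123456
+    '''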
+    config = open(LOCAL_DIR + 'config').read()
+    login = None
+    password = None
+ for line in config.splitlines():
+ if line.startswith('login'):
+ login = line[len('login'):].strip()
+ elif line.startswith('password'):
+ password = line[len('password'):].strip()
+ return login, password
+
+__opener__ = None
+def httpopen(url, post_data=None):
+    '''
+    Open an URL through a shared opener that keeps the session cookies.
+    post_data may be a str; urllib needs bytes, so it is encoded here.
+    '''
+    if post_data:
+        logging.debug('HTTP POST %s %s', url, post_data)
+    else:
+        logging.debug('HTTP GET %s', url)
+    global __opener__
+    if __opener__ is None:
+        cookiejar = CookieJar()
+        __opener__ = urllib.request.build_opener()
+        __opener__.add_handler(urllib.request.HTTPCookieProcessor(cookiejar))
+    if isinstance(post_data, str):
+        post_data = post_data.encode('iso8859-1')  # the site talks latin-1
+    http_response = __opener__.open(url, post_data)
+    return http_response
+
+def sleep(seconds):
+ logging.debug('Waiting %s seconds', seconds)
+ time.sleep(seconds)
+
+def main():
+    '''
+    Download each account's CSV data and store it in LOCAL_DIR.
+    Return the list of downloaded filenames
+    '''
+ result = []
+
+ logging.info('Downloading password form')
+ httpresponse = httpopen(BASE_URL + '/voscomptes/canalXHTML/securite/authentification/recupererPointEntree-identif.ea')
+ html = httpresponse.read().decode('iso8859-1')
+ #logging.debug(httpresponse.info())
+ open('login.html', 'w', encoding='iso8859-1').write(html)
+
+ root = html_parser.html_parse(html)
+ #html_parser.print_idented_tree(root)
+    for img in html_parser.get_elem(root, 'img'):
+        src = img.attributes.get('src', '')
+        if src.startswith(WSOST_PREFIX):
+            # keypad image ids are single digits 0-9; drop any extra query parameters
+            img_id = src[len(WSOST_PREFIX):].split('&')[0]
+            httpresponse = httpopen(BASE_URL + src)
+            img_data = httpresponse.read()
+            open(img_id + '.gif', 'wb').write(img_data)
+
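+    # The login page shows a virtual keypad of ten digit images in a random
+    # order. OCR each saved image (ImageMagick crops it, gocr reads the digit)
+    # to build a map from the displayed digit to the image id, so the real
+    # password can be translated into keypad positions below.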
+ xlt_password = {}
+ for img_id in "0123456789":
+ proc_convert=Popen('convert %s.gif -crop 20x20+5+5 pnm:-' % img_id,
+ shell=True, stdout=PIPE)
+ proc_gocr=Popen('gocr -C 0-9 -i -',
+ shell=True, stdin=proc_convert.stdout, stdout=PIPE)
+ output = proc_gocr.communicate()[0]
+ output = output.decode('utf-8')
+ output = output.strip()
+ #print("image #%s is %s" % (img_id, output))
+ xlt_password[output] = img_id
+
+ LOGIN, PASSWORD = get_login_password()
+
+ shuffled_password = ''
+ for c in PASSWORD:
+ shuffled_password += xlt_password[c]
+ logging.info("shuffled_password: %s", shuffled_password)
+ #for i in 0 1 2 3 4 5 6 7 8 9; do convert $i.gif -crop 20x20+5+5 pnm:- | gocr -C 0-9 -i -; done
+
+ sleep(10) # We are not supermen
+
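+    # Authenticate: the password is submitted as keypad image ids
+    # (shuffled_password), not as the real digits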
+    post_data = ('urlbackend=%2Fvoscomptes%2FcanalXHTML%2Fsecurite%2Fauthentification'
+                 '%2FrecupererPointEntree-identif.ea%3Forigin%3Dparticuliers'
+                 '&origin=particuliers'
+                 '&password=' + shuffled_password +
+                 '&cv=true&cvvs=&username=' + LOGIN)
+ httpresponse = httpopen(BASE_URL + '/wsost/OstBrokerWeb/auth', post_data)
+ html = httpresponse.read().decode('iso8859-1')
+ #print(httpresponse.info())
+ open('welcome.html', 'w', encoding='iso8859-1').write(html)
+
+ assert 'initialiser-identif.ea' in html
+ httpresponse = httpopen(BASE_URL + '/voscomptes/canalXHTML/securite/authentification/initialiser-identif.ea')
+ html = httpresponse.read().decode('iso8859-1')
+ #print(httpresponse.info())
+ open('welcome2.html', 'w', encoding='iso8859-1').write(html)
+
+ assert 'verifierMotDePasse-identif.ea' in html
+ httpresponse = httpopen(BASE_URL + '/voscomptes/canalXHTML/securite/authentification/verifierMotDePasse-identif.ea')
+ html = httpresponse.read().decode('iso8859-1')
+ #print(httpresponse.info())
+ open('welcome3.html', 'w', encoding='iso8859-1').write(html)
+
+ assert 'init-synthese.ea' in html
+ httpresponse = httpopen(BASE_URL + '/voscomptes/canalXHTML/comptesCommun/synthese_assurancesEtComptes/init-synthese.ea')
+ html = httpresponse.read().decode('iso8859-1')
+ #print(httpresponse.info())
+ open('welcome4.html', 'w', encoding='iso8859-1').write(html)
+ sleep(3)
+
+ root = html_parser.html_parse(html)
+ for a in html_parser.get_elem(root, 'a'):
+ href = a.attributes.get('href', '')
+ href = htmlentities.resolve(href)
+        match = re.match(r'\.\./\.\./(...)/.*compte\.numero=(.*)&typeRecherche=(.*)', href)
+ if match:
+ logging.debug(href)
+ #../../CCP/releves_ccp/menuReleve-releve_ccp.ea?compte.numero=*******&typeRecherche=1
+ # https://voscomptesenligne.labanquepostale.fr/voscomptes/canalXHTML/CCP/releves_ccp/menuReleve-releve_ccp.ea?compte.numero=*******&typeRecherche=1
+ cpttype, cptnum, searchtype = match.group(1), match.group(2), match.group(3)
+
+            logging.info('Found account type %s: %s', cpttype, cptnum)
+
+ httpresponse = httpopen(BASE_URL + '/voscomptes/canalXHTML/' + href[len('../../'):])
+ html = httpresponse.read().decode('iso8859-1')
+ open(cptnum+'-init.html', 'w', encoding='iso8859-1').write(html)
+ sleep(4)
+
+ # https://voscomptesenligne.labanquepostale.fr/voscomptes/canalXHTML/comptesCommun/telechargementMouvement/init-telechargementMouvements.ea?compte.numero=*********&typeRecherche=1&typeMouvements=CCP
+ httpresponse = httpopen(BASE_URL + '/voscomptes/canalXHTML/comptesCommun/telechargementMouvement/init-telechargementMouvements.ea?compte.numero=' + cptnum + '&typeRecherche='+ searchtype +'&typeMouvements=' + cpttype)
+ html = httpresponse.read().decode('iso8859-1')
+ #print(httpresponse.info())
+ open(cptnum+'-init2.html', 'w', encoding='iso8859-1').write(html)
+ sleep(4)
+
+ httpresponse = httpopen(BASE_URL + '/voscomptes/canalXHTML/comptesCommun/telechargementMouvement/detailCompte2-telechargementMouvements.ea')
+ html = httpresponse.read().decode('iso8859-1')
+ #print(httpresponse.info())
+ open(cptnum+'-confirm.html', 'w', encoding='iso8859-1').write(html)
+ sleep(9)
+
+ root = html_parser.html_parse(html)
+ #html_parser.print_idented_tree(root)
+            for form in html_parser.get_elem(root, 'form'):
+                if form.attributes.get('id', None) == 'formConfirmAgain':
+                    url = form.attributes.get('action', '')
+                    if not url:
+                        logging.critical("Can't find link to download csv")
+                        continue
+
+ # /voscomptes/canalXHTML/comptesCommun/telechargementMouvement/preparerRecherche-telechargementMouvements.ea?ts=1304816124318 POST 'format=CSV&duree='
+                    httpresponse = httpopen(BASE_URL + '/voscomptes/canalXHTML/comptesCommun/telechargementMouvement/' + url, 'format=CSV&duree=')
+                    filename = LOCAL_DIR + cptnum + '.' + datetime.now().strftime('%Y%m%dT%H%M%S') + '.csv'
+                    csvdata = httpresponse.read().decode('iso8859-1')
+                    logging.info('Save CSV data to %s', filename)
+                    open(filename, 'w', encoding='utf-8').write(csvdata)
+                    result.append(filename)
+                    sleep(9)
+
+ logging.info('Disconnecting')
+ httpresponse = httpopen(BASE_URL + '/voscomptes/canalXHTML/securite/deconnexion/init-deconnexion.ea')
+ html = httpresponse.read().decode('iso8859-1')
+ open('bye.html', 'w', encoding='iso8859-1').write(html)
+
+ logging.info('Disconnected')
+ return result
+
+if __name__ == '__main__':
+ from optparse import OptionParser
+ parser = OptionParser()
+ parser.add_option('-d', '--debug',
+ action='store_true', dest='debug', default=False,
+ help="debug mode")
+ (options, args) = parser.parse_args()
+
+ if options.debug:
+ loglevel = logging.DEBUG
+ else:
+ loglevel = logging.INFO
+ logging.basicConfig(level=loglevel, format='%(asctime)s %(levelname)s %(message)s')
+
+ os.umask(0o077)
+ TMP_DIR = LOCAL_DIR + 'tmp/'
+    try:
+        os.mkdir(TMP_DIR)
+    except FileExistsError:
+        pass
+ os.chdir(TMP_DIR)
+
+ main()
--- /dev/null
+#!/usr/bin/env python3
+# -*- encoding: utf-8 -*-
+
+import sys, htmlentities
+from optparse import OptionParser
+
+VERBOSE_PARSER = False
+
+TI_EMPTY = 1 # these tags have no content, i.e. assume <tagname ... />
+taginfo = {
+ 'meta': TI_EMPTY,
+ 'link': TI_EMPTY,
+ 'br': TI_EMPTY,
+ 'img': TI_EMPTY,
+ 'hr': TI_EMPTY,
+}
+
+class Node:
+ class Flags:
+        ROOT = 1    # this is the root node. There can be only one root
+        CLOSING = 2 # a closing tag such as </b>. The parser discards these once they have popped the tree
+        CLOSED = 4  # this tag was explicitly closed. Uncleaned output only emits a closing tag when this flag is set
+
+ def __init__(self):
+ self.father = None
+ self.children = []
+ self.flags = 0
+
+class Tag(Node):
+ def __init__(self):
+ Node.__init__(self)
+ self.name = ''
+ self.attributes = {}
+
+    def get_tag_info(self):
+        """
+        Return the TI_* flags based on the tag name
+        """
+        return taginfo.get(self.name, 0)
+
+ def __repr__(self):
+ #assert self.name != u''
+ result = '<'
+ if self.flags & Node.Flags.CLOSING:
+ result += '/'
+ result += self.name
+ for k,v in self.attributes.items():
+ #result += u' (('+k+u'))'
+ result += ' '+k
+ if v:
+ result += '="'+v.replace('\\', '\\\\').replace('"', '\\"')+'"'
+ result += '>'
+ return result
+
+ #def __repr__(self):
+ # return 'Tag'+unicode(self).encode('utf8')
+
+class Leaf(Node):
+ # TODO: rename this to CDATA or whatever
+ def __init__(self, text):
+ Node.__init__(self)
+ self.text = htmlentities.resolve(text)
+ def __repr__(self):
+ return self.text # FIXME escape ?
+ #def __repr__(self):
+ # return 'Leaf<'+repr(self.text.encode('utf8'))+'>'
+
+
+def html_lexer(page):
+    """
+    This generator yields Nodes with father/children unset
+    """
+    buf = page  # buffer
+    pos = 0     # everything before this position has already been parsed
+    l = len(buf)  # constant length
+
+ def buffind(token):
+ r = buf.find(token, pos)
+ if r==-1:
+ return None
+ return r
+
+ def get_next_tag():
+ state = 'INIT'
+ state_white_skiping = False
+ p = pos # will start with skipping '<'
+ tag = Tag()
+ while True:
+ p += 1
+            if p >= l:  # EOS while inside a tag: give up on this fragment
+                return None, p
+ c = buf[p]
+
+ if state_white_skiping:
+ if ord(c)<=32:
+ continue
+ else:
+ state_white_skiping = False
+
+ if state == 'INIT':
+ if c == '/':
+                    tag.flags |= Node.Flags.CLOSING
+ continue
+ elif c == '>':
+ return tag, p+1
+ else:
+ state = 'NAME'
+ tag.name += c.lower()
+ continue
+ elif state == 'NAME':
+ if ord(c)<=32 or c=='/':
+ state = 'ATT_NAME'
+ att_name = ''
+ state_white_skiping = True
+ continue
+ elif c == '>':
+ return tag, p+1
+ else:
+ tag.name += c.lower()
+ continue
+ elif state == 'ATT_NAME':
+ if ord(c)<=32:
+ state = 'ATT_EQUALS'
+ state_white_skiping = True
+ continue
+ elif c == '=':
+ state = 'ATT_VALUE'
+ state_white_skiping = True
+ att_value = ''
+ continue
+ elif c == '>':
+ if att_name != '':
+ tag.attributes[att_name] = ''
+ return tag, p+1
+ else:
+ att_name += c.lower()
+ continue
+ elif state == 'ATT_EQUALS':
+ if ord(c)<=32:
+ continue
+ elif c == '=':
+ state = 'ATT_VALUE'
+ state_white_skiping = True
+ att_value = ''
+ continue
+ elif c == '>':
+ if att_name != '':
+ tag.attributes[att_name] = ''
+ return tag, p+1
+ else:
+ if att_name != '':
+ tag.attributes[att_name] = ''
+ state = 'ATT_NAME'
+ att_name = c.lower()
+ state_white_skiping = True
+ continue
+ elif state == 'ATT_VALUE':
+ if att_value == '': # first char
+ if c == '"' or c == "'":
+ att_value_escape = c
+ state = 'ATT_VALUE_QUOTED'
+ continue
+                if ord(c) <= 32:
+ tag.attributes[att_name] = att_value
+ state = 'ATT_NAME'
+ state_white_skiping = True
+ att_name = ''
+ continue
+ elif c == '>':
+ tag.attributes[att_name] = att_value
+ return tag, p+1
+ else:
+ att_value += c
+ continue
+ elif state == 'ATT_VALUE_QUOTED':
+ if c == att_value_escape:
+ tag.attributes[att_name] = att_value
+ state = 'ATT_NAME'
+ state_white_skiping = True
+ att_name = ''
+ continue
+ else:
+ att_value += c
+ continue
+
+    while True:
+        # find the next tag opening
+        # TODO: check it is a real tag and not a fragment that should be added to the leaf node
+        pt1 = buffind('<')
+        if pt1 is None:
+            if pos < l:  # trailing text after the last tag
+                yield Leaf(buf[pos:])
+            return
+        if pt1 != pos:
+            yield Leaf(buf[pos:pt1])
+        pos = pt1
+
+        tag, pos = get_next_tag()
+        if tag is not None:  # None means the buffer ended inside an unterminated tag
+            yield tag
+
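+# Example (sketch): feeding a small fragment through the lexer
+#   list(html_lexer('<a href="x">hi</a>'))  ->  [<a href="x">, hi, </a>]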
+
+def html_parse(page):
+    """
+    Fetch the nodes from the lexer and assemble them into a node tree
+    """
+ root = Tag()
+ root.flags = Node.Flags.ROOT
+ father = root
+ for node in html_lexer(page):
+ if isinstance(node, Leaf):
+ node.father = father
+ father.children.append(node)
+ elif node.flags & Node.Flags.CLOSING:
+ # change current father
+ newfather = father
+ while True:
+ # TODO: optimize with Node.Flags.ROOT
+ if newfather is None:
+ #TODO: log.debug()
+ if VERBOSE_PARSER:
+ print('Closing tag', node, 'does not match any opening tag. Discarding.', file=sys.stderr)
+ break
+ if newfather.name == node.name:
+ newfather.flags |= Node.Flags.CLOSED
+ if VERBOSE_PARSER:
+ if newfather != father:
+ print('Closing tag', node, 'has auto-closed other nodes', end=' ', file=sys.stderr)
+ deb = father
+ while deb != newfather:
+ print(deb, end=' ', file=sys.stderr)
+ deb = deb.father
+ print(file=sys.stderr)
+ father = newfather.father
+ break
+ newfather = newfather.father
+ else:
+ node.father = father
+ father.children.append(node)
+ #print 'node=',node,'info=',node.get_tag_info()
+ if not node.get_tag_info() & TI_EMPTY:
+ father = node
+ #print 'node=',node,'father=',father
+ return root
+
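+# Example (sketch): parse a fragment and walk the resulting tree
+#   root = html_parse('<p>Hello <b>world</b></p>')
+#   root.children[0]           ->  <p>
+#   root.children[0].children  ->  [Hello , <b>]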
+
+def print_idented_tree(node, identation_level=-1):
+ if not node.flags & Node.Flags.ROOT:
+ print(' '*identation_level+repr(node))
+ for c in node.children:
+ print_idented_tree(c, identation_level+1)
+ if isinstance(node, Tag) and (node.flags&Node.Flags.CLOSED):
+ print(' '*identation_level+'</'+node.name+'>')
+
+def print_lexer_tree(p):
+ identing = 0
+ for item in html_lexer(p):
+ if isinstance(item, Tag) and item.flags & Node.Flags.CLOSING:
+ identing -= 1
+ print(' '*identing, end=' ')
+ if isinstance(item, Tag) and not item.flags & Node.Flags.CLOSING:
+ identing += 1
+ print(repr(item))
+
+
+def get_elem(root, tagname):
+    """
+    Return all the elements whose name matches,
+    without searching inside the children of those matches
+    """
+ if isinstance(root, Leaf):
+ return []
+ if root.name == tagname:
+ return [ root ]
+ results = []
+ for node in root.children:
+ match = get_elem(node, tagname)
+ if match:
+ results += match
+ return results
+
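+# Example (sketch):
+#   get_elem(html_parse('<div><p>a</p><p>b</p></div>'), 'p')  ->  [<p>, <p>]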
+
+def split_table(table):
+    """
+    Return the table content as a list of rows, each row a list of cells
+    """
+ ctr = []
+ for tr in get_elem(table, 'tr'):
+ ctd = []
+ for td in get_elem(tr, 'td'):
+ ctd += [ td ]
+ ctr.append(ctd)
+ return ctr
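+
+# Example (sketch): one row of two cells
+#   table = get_elem(html_parse('<table><tr><td>a</td><td>b</td></tr></table>'), 'table')[0]
+#   split_table(table)  ->  [[<td>, <td>]]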
+
+def split_table_r_to_leaf(root):
+    """
+    Recursively split tables as described in split_table.
+    Returns only leaf text, or nested lists for sub-tables
+    """
+ result = []
+ tables = get_elem(root, 'table')
+ if len(tables)==0:
+ return get_merged_leaf_content(root)
+ for table in tables:
+ rrow = []
+ for row in split_table(table):
+ rcol = []
+ for col in row:
+ subr = split_table_r_to_leaf(col)
+ rcol.append(subr)
+ rrow.append(rcol)
+ result.append(rrow)
+ return result
+
+
+def get_merged_leaf_content(root):
+    """
+    Return all the leaf content aggregated in a single string
+    """
+ if isinstance(root, Leaf):
+ return root.text
+
+ result = ''
+ for node in root.children:
+ result += get_merged_leaf_content(node)
+ return result
+
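+# Example (sketch):
+#   get_merged_leaf_content(html_parse('<p>Hello <b>world</b></p>'))  ->  'Hello world'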
+
+if __name__ == "__main__":
+ parser = OptionParser()
+    parser.add_option("--dump-lexer", help="Debug: Dump indented lexer output", action='store_true', dest='lexer_dump', default=False)
+    parser.add_option("--dump-parser", help="Debug: Dump indented parser output", action='store_true', dest='parser_dump', default=False)
+ parser.add_option("--verbose-parser", help="Debug: Verbose parser errors", action='store_true', dest='verbose_parser', default=False)
+ (options, args) = parser.parse_args()
+
+ try:
+ filename = args[0]
+ except IndexError:
+ print('Need a filename', file=sys.stderr)
+ sys.exit(-1)
+
+ VERBOSE_PARSER = options.verbose_parser
+    p = open(filename, encoding='utf-8').read()
+
+ if options.lexer_dump:
+ print_lexer_tree(p)
+ sys.exit(0)
+
+ if options.parser_dump:
+ root = html_parse(p)
+ print_idented_tree(root)
+ sys.exit(0)
+
--- /dev/null
+#!/usr/bin/env python3
+# -*- encoding: utf-8 -*-
+
+__all__ = ['resolve', 'expand', 'cleanCDATA']
+
+from html.entities import name2codepoint as entities
+
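+# Entities that map to Latin-1 codepoints (<= 255) are recognized by browsers
+# even without the trailing ';' (e.g. '&eacute' -> 'é'); collect them here so
+# resolve() can auto-complete the same way.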
+entities_autocomplete = {}
+longestEntityLen = 0
+for key,value in entities.items():
+ if value<=255:
+ entities_autocomplete[key] = value
+ l = len(key)
+ if l>longestEntityLen:
+ longestEntityLen = l
+
+# Characters in the range 127-159 are illegal in ISO-8859-1, but web pages sometimes use them anyway.
+# Internet Explorer assumes they come from Windows-1252 (CP1252), Microsoft's extension of Latin-1.
+# However, to be clean, we must remap them to their real Unicode values.
+# Unknown codes are translated into a space
+iso88591_remap = [
+ 32, # 127: ???
+ 8364, # 128: Euro symbol
+ 32, # 129: ???
+ 8218, # 130: Single Low-9 Quotation Mark
+ 402, # 131: Latin Small Letter F With Hook
+ 8222, # 132: Double Low-9 Quotation Mark
+ 8230, # 133: Horizontal Ellipsis
+ 8224, # 134: Dagger
+ 8225, # 135: Double Dagger
+ 710, # 136: Modifier Letter Circumflex Accent
+ 8240, # 137: Per Mille Sign
+ 352, # 138: Latin Capital Letter S With Caron
+ 8249, # 139: Single Left-Pointing Angle Quotation Mark
+ 338, # 140: Latin Capital Ligature OE
+ 32, # 141: ???
+ 381, # 142: Latin Capital Letter Z With Caron
+ 32, # 143: ???
+ 32, # 144: ???
+ 8216, # 145: Left Single Quotation Mark
+ 8217, # 146: Right Single Quotation Mark
+ 8220, # 147: Left Double Quotation Mark
+ 8221, # 148: Right Double Quotation Mark
+ 8226, # 149: Bullet
+ 8211, # 150: En Dash
+ 8212, # 151: Em Dash
+ 732, # 152: Small Tilde
+ 8482, # 153: Trade Mark Sign
+ 353, # 154: Latin Small Letter S With Caron
+ 8250, # 155: Single Right-Pointing Angle Quotation Mark
+ 339, # 156: Latin Small Ligature OE
+ 32, # 157: ???
+ 382, # 158: Latin Small Letter Z With Caron
+ 376 # 159: Latin Capital Letter Y With Diaeresis
+]
+
+
+def checkForUnicodeReservedChar(value):
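+    '''
+    Remap a codepoint: values in 127-159 go through iso88591_remap
+    (CP1252 -> Unicode), reserved values >= U+FFFE become '?',
+    anything else is returned unchanged
+    '''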
+ if value >= 0xfffe:
+ return ord('?')
+ if value < 127 or value > 159:
+ return value
+ return iso88591_remap[value-127]
+
+def expand(text):
+ result = ''
+ for c in text:
+ oc = ord(c)
+ oc = checkForUnicodeReservedChar(oc)
+ if oc<32 or c=='&' or c=='<' or c=='>' or c=='"' or oc>127:
+ result += '&#'+str(oc)+';'
+ else:
+ result += c
+ return result
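+
+# Example (sketch): markup-significant and non-ASCII chars become numeric entities
+#   expand('<a&b>')  ->  '&#60;a&#38;b&#62;'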
+
+def resolve(text):
+ pos = 0
+ result = ''
+ l = len(text)
+ while True:
+ prevpos = pos
+ pos = text.find('&', prevpos)
+ if pos == -1:
+ ## print "No more &"
+ break
+
+        if pos >= l-2:
+            ## print "Too short"
+            break
+ # here we are sure the next two chars exist
+
+ result += text[prevpos:pos]
+ c = text[pos+1]
+ if c == '#':
+ ## print "numeric entity"
+            # This looks like a char whose Unicode value is given raw
+            c = text[pos+2]
+            if (c == 'x' or c == 'X') and pos < l-3:
+ tmppos = text.find(';', pos+3)
+ if tmppos != -1:
+ s = text[pos+3: tmppos]
+ try:
+ value = int(s, 16)
+ value = checkForUnicodeReservedChar(value) # remap unicode char if in range 127-159
+ result += chr(value)
+ pos = tmppos + 1
+ continue # ok, we did it
+ except ValueError:
+ # there pos is not updated so that the original escape-like sequence is kept unchanged
+ pass
+ else:
+                # the given Unicode value is decimal
+                # IE behavior: parse until the first non-digit char; no conversion if the number is malformed
+ sb = ''
+ tmppos = pos+2
+ while True:
+ if tmppos >= l:
+ break # out of range
+ c = text[tmppos]
+ if c == ';':
+ tmppos += 1
+ break
+ if c<'0' or c>'9':
+ break
+ sb += c
+ tmppos += 1
+ try:
+ value = int(sb)
+                    value = checkForUnicodeReservedChar(value) # remap unicode char if in range 127-159
+ result += chr(value)
+ pos = tmppos
+ continue # ok, we did it
+ except ValueError:
+ # there pos is not updated so that the original escape-like sequence is kept unchanged
+ pass
+ else:
+ # here the first character is not a '#'
+ # let's try the known html entities
+
+ sb = ''
+ tmppos = pos + 1
+ while True:
+ if tmppos >= l or tmppos-pos > longestEntityLen + 1: # 1 more for ';'
+ c2 = entities_autocomplete.get(sb, 0)
+ break
+ c = text[tmppos]
+ if c == ';':
+ tmppos += 1
+ c2 = entities.get(sb, 0)
+ break
+ c2 = entities_autocomplete.get(sb, 0)
+ if c2:
+ break
+ sb += c
+ tmppos += 1
+ if c2:
+ result += chr(c2)
+ pos = tmppos
+ continue # ok, we did it
+
+        result += '&' # something went wrong, just skip this '&'
+ pos += 1
+
+ result += text[prevpos:]
+ return result
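+
+# Example (sketch): named, decimal and hex forms all resolve
+#   resolve('&lt;caf&eacute;&gt;')  ->  '<café>'
+#   resolve('&#233;') and resolve('&#xE9;') both give 'é'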
+
+def cleanCDATA(text):
+    """
+    Resolve entities, collapse runs of whitespace (spaces, tabs, newlines)
+    into single spaces, then expand entities back
+    """
+ tmp = resolve(text)
+ result = ''
+ isLastWhite = False # so that first white is not removed
+ for c in tmp:
+ if c in ' \r\n\t':
+ if not isLastWhite:
+ result += ' '
+ isLastWhite = True
+ else:
+ result += c
+ isLastWhite = False
+
+ return expand(result)
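+
+# Example (sketch): whitespace runs collapse, specials are re-escaped
+#   cleanCDATA('Hello\r\n\t  &amp; world')  ->  'Hello &#38; world'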
+
+if __name__ == '__main__':
+ import sys
+ if len(sys.argv)<2:
+ print("Missing required parameter. Try '&test'", file=sys.stderr)
+ sys.exit(1)
+ input = ' '.join(sys.argv[1:])
+ #print 'input:', input
+ #raw = resolve(input)
+ #print 'resolved:', raw
+ #print 'expanded:', expand(raw)
+ print('cleanCDATA:', cleanCDATA(input))
+