From b203a85f8e835b49ebd0a383cdc3e9d56d428730 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Jean-Michel=20Nirgal=20Vourg=C3=A8re?=
Date: Sun, 8 May 2011 10:57:27 +0000
Subject: [PATCH] First version of banque postale scripts

---
 go.py           | 200 +++++++++++++++++++++++++++
 html_parser.py  | 351 ++++++++++++++++++++++++++++++++++++++++++++++++
 htmlentities.py | 198 +++++++++++++++++++++++++++
 3 files changed, 749 insertions(+)
 create mode 100755 go.py
 create mode 100755 html_parser.py
 create mode 100755 htmlentities.py

diff --git a/go.py b/go.py
new file mode 100755
index 0000000..cfcf88a
--- /dev/null
+++ b/go.py
@@ -0,0 +1,200 @@
+#!/usr/bin/env python3
+
+import os
+import time
+import re
+import logging
+from datetime import datetime
+import urllib.request
+from http.cookiejar import CookieJar
+from subprocess import Popen, PIPE
+
+import html_parser
+import htmlentities
+
+BASE_URL = 'https://voscomptesenligne.labanquepostale.fr'
+WSOST_PREFIX = '/wsost/OstBrokerWeb/loginform?imgid='
+
+LOCAL_DIR = '/home/nirgal/banquepostale/'
+
+def get_login_password():
+    # The config file is expected to hold lines like 'login XXXX' and
+    # 'password XXXX' (format inferred from this parser).
+    config = open(LOCAL_DIR + 'config').read()
+    login = None
+    password = None
+    for line in config.splitlines():
+        if line.startswith('login'):
+            login = line[len('login'):].strip()
+        elif line.startswith('password'):
+            password = line[len('password'):].strip()
+    return login, password
+
+__opener__ = None
+def httpopen(url, post_data=None):
+    if post_data:
+        logging.debug('HTTP POST %s %s', url, post_data)
+    else:
+        logging.debug('HTTP GET %s', url)
+    global __opener__
+    if __opener__ is None:
+        cookiejar = CookieJar()
+        __opener__ = urllib.request.build_opener()
+        __opener__.add_handler(urllib.request.HTTPCookieProcessor(cookiejar))
+    if isinstance(post_data, str):
+        post_data = post_data.encode('iso8859-1') # urllib.request wants bytes for POST data
+    http_response = __opener__.open(url, post_data)
+    return http_response
+
+def sleep(seconds):
+    logging.debug('Waiting %s seconds', seconds)
+    time.sleep(seconds)
+
+def main():
+    '''
+    Download all the accounts' csv data and store them in LOCAL_DIR.
+    Return a list of filenames.
+    '''
+    result = []
+
+    logging.info('Downloading password form')
+    httpresponse = httpopen(BASE_URL + '/voscomptes/canalXHTML/securite/authentification/recupererPointEntree-identif.ea')
+    html = httpresponse.read().decode('iso8859-1')
+    #logging.debug(httpresponse.info())
+    open('login.html', 'w', encoding='iso8859-1').write(html)
+
+    root = html_parser.html_parse(html)
+    #html_parser.print_idented_tree(root)
+    for img in html_parser.get_elem(root, 'img'):
+        src = img.attributes.get('src', '')
+        if src.startswith(WSOST_PREFIX):
+            #print(img)
+            img_id = src[len(WSOST_PREFIX)] # the single character right after 'imgid='
+            #print(img_id)
+            httpresponse = httpopen(BASE_URL + src)
+            img = httpresponse.read()
+            open(img_id+'.gif', 'wb').write(img)
+
+    xlt_password = {}
+    for img_id in "0123456789":
+        proc_convert = Popen('convert %s.gif -crop 20x20+5+5 pnm:-' % img_id,
+                shell=True, stdout=PIPE)
+        proc_gocr = Popen('gocr -C 0-9 -i -',
+                shell=True, stdin=proc_convert.stdout, stdout=PIPE)
+        output = proc_gocr.communicate()[0]
+        output = output.decode('utf-8')
+        output = output.strip()
+        #print("image #%s is %s" % (img_id, output))
+        xlt_password[output] = img_id
+
+    LOGIN, PASSWORD = get_login_password()
+
+    shuffled_password = ''
+    for c in PASSWORD:
+        shuffled_password += xlt_password[c]
+    logging.info("shuffled_password: %s", shuffled_password)
+    #for i in 0 1 2 3 4 5 6 7 8 9; do convert $i.gif -crop 20x20+5+5 pnm:- | gocr -C 0-9 -i -; done
+
+    sleep(10) # We are not supermen
+
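+    # The password is sent as keypad positions rather than digits:
+    # xlt_password, built above from the gocr output, maps each real digit
+    # to the id of the keypad image that displays it.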
+    post_data = 'urlbackend=%2Fvoscomptes%2FcanalXHTML%2Fsecurite%2Fauthentification%2FrecupererPointEntree-identif.ea%3Forigin%3Dparticuliers&origin=particuliers&password=' + shuffled_password + '&cv=true&cvvs=&username=' + LOGIN
+    httpresponse = httpopen(BASE_URL + '/wsost/OstBrokerWeb/auth', post_data)
+    html = httpresponse.read().decode('iso8859-1')
+    #print(httpresponse.info())
+    open('welcome.html', 'w', encoding='iso8859-1').write(html)
+
+    assert 'initialiser-identif.ea' in html
+    httpresponse = httpopen(BASE_URL + '/voscomptes/canalXHTML/securite/authentification/initialiser-identif.ea')
+    html = httpresponse.read().decode('iso8859-1')
+    #print(httpresponse.info())
+    open('welcome2.html', 'w', encoding='iso8859-1').write(html)
+
+    assert 'verifierMotDePasse-identif.ea' in html
+    httpresponse = httpopen(BASE_URL + '/voscomptes/canalXHTML/securite/authentification/verifierMotDePasse-identif.ea')
+    html = httpresponse.read().decode('iso8859-1')
+    #print(httpresponse.info())
+    open('welcome3.html', 'w', encoding='iso8859-1').write(html)
+
+    assert 'init-synthese.ea' in html
+    httpresponse = httpopen(BASE_URL + '/voscomptes/canalXHTML/comptesCommun/synthese_assurancesEtComptes/init-synthese.ea')
+    html = httpresponse.read().decode('iso8859-1')
+    #print(httpresponse.info())
+    open('welcome4.html', 'w', encoding='iso8859-1').write(html)
+    sleep(3)
+
+    root = html_parser.html_parse(html)
+    for a in html_parser.get_elem(root, 'a'):
+        href = a.attributes.get('href', '')
+        href = htmlentities.resolve(href)
+        match = re.match(r'\.\./\.\./(...)/.*compte.numero=(.*)&typeRecherche=(.*)', href)
+        if match:
+            logging.debug(href)
+            #../../CCP/releves_ccp/menuReleve-releve_ccp.ea?compte.numero=*******&typeRecherche=1
+            # https://voscomptesenligne.labanquepostale.fr/voscomptes/canalXHTML/CCP/releves_ccp/menuReleve-releve_ccp.ea?compte.numero=*******&typeRecherche=1
+            cpttype, cptnum, searchtype = match.group(1), match.group(2), match.group(3)
+
+            logging.info('Found account type %s: %s', cpttype, cptnum)
+
+            httpresponse = httpopen(BASE_URL + '/voscomptes/canalXHTML/' + href[len('../../'):])
+            html = httpresponse.read().decode('iso8859-1')
+            open(cptnum+'-init.html', 'w', encoding='iso8859-1').write(html)
+            sleep(4)
+
+            # https://voscomptesenligne.labanquepostale.fr/voscomptes/canalXHTML/comptesCommun/telechargementMouvement/init-telechargementMouvements.ea?compte.numero=*********&typeRecherche=1&typeMouvements=CCP
+            httpresponse = httpopen(BASE_URL + '/voscomptes/canalXHTML/comptesCommun/telechargementMouvement/init-telechargementMouvements.ea?compte.numero=' + cptnum + '&typeRecherche=' + searchtype + '&typeMouvements=' + cpttype)
+            html = httpresponse.read().decode('iso8859-1')
+            #print(httpresponse.info())
+            open(cptnum+'-init2.html', 'w', encoding='iso8859-1').write(html)
+            sleep(4)
+
+            httpresponse = httpopen(BASE_URL + '/voscomptes/canalXHTML/comptesCommun/telechargementMouvement/detailCompte2-telechargementMouvements.ea')
+            html = httpresponse.read().decode('iso8859-1')
+            #print(httpresponse.info())
+            open(cptnum+'-confirm.html', 'w', encoding='iso8859-1').write(html)
+            sleep(9)
+
+            root = html_parser.html_parse(html)
+            #html_parser.print_idented_tree(root)
+            url = None
+            for form in html_parser.get_elem(root, 'form'):
+                if form.attributes.get('id', None) == 'formConfirmAgain':
+                    url = form.attributes['action']
+            if not url:
+                logging.critical("Can't find link to download csv")
+                continue
+
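+            # The form action is relative to the telechargementMouvement/
+            # directory (assumption: an empty 'duree' selects the site's
+            # default export period).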
+            # /voscomptes/canalXHTML/comptesCommun/telechargementMouvement/preparerRecherche-telechargementMouvements.ea?ts=1304816124318 POST 'format=CSV&duree='
+            httpresponse = httpopen(BASE_URL + '/voscomptes/canalXHTML/comptesCommun/telechargementMouvement/' + url, 'format=CSV&duree=')
+            filename = LOCAL_DIR + cptnum + '.' + datetime.now().strftime('%Y%m%dT%H%M%S') + '.csv'
+            csvdata = httpresponse.read().decode('iso8859-1')
+            logging.info('Save CSV data to %s', filename)
+            open(filename, 'w', encoding='utf-8').write(csvdata)
+            result.append(filename)
+            sleep(9)
+
+    logging.info('Disconnecting')
+    httpresponse = httpopen(BASE_URL + '/voscomptes/canalXHTML/securite/deconnexion/init-deconnexion.ea')
+    html = httpresponse.read().decode('iso8859-1')
+    open('bye.html', 'w', encoding='iso8859-1').write(html)
+
+    logging.info('Disconnected')
+    return result
+
+if __name__ == '__main__':
+    from optparse import OptionParser
+    parser = OptionParser()
+    parser.add_option('-d', '--debug',
+            action='store_true', dest='debug', default=False,
+            help="debug mode")
+    (options, args) = parser.parse_args()
+
+    if options.debug:
+        loglevel = logging.DEBUG
+    else:
+        loglevel = logging.INFO
+    logging.basicConfig(level=loglevel, format='%(asctime)s %(levelname)s %(message)s')
+
+    os.umask(0o077)
+    TMP_DIR = LOCAL_DIR + 'tmp/'
+    try:
+        os.mkdir(TMP_DIR)
+    except OSError as err:
+        if err.errno != 17: # File exists (EEXIST)
+            raise
+    os.chdir(TMP_DIR)
+
+    main()
diff --git a/html_parser.py b/html_parser.py
new file mode 100755
index 0000000..ab33204
--- /dev/null
+++ b/html_parser.py
@@ -0,0 +1,351 @@
+#!/usr/bin/env python3
+# -*- encoding: utf-8 -*-
+
+import sys, htmlentities
+from optparse import OptionParser
+
+VERBOSE_PARSER = False
+
+TI_EMPTY = 1 # these tags have no content, i.e. assume <br> means <br/>
+taginfo = {
+    'meta': TI_EMPTY,
+    'link': TI_EMPTY,
+    'br': TI_EMPTY,
+    'img': TI_EMPTY,
+    'hr': TI_EMPTY,
+}
+
+class Node:
+    class Flags:
+        ROOT = 1    # this is the root node. There can be only one root
+        CLOSING = 2 # this is a closing tag such as </p>. These tags from the lexer are discarded by the parser
+        CLOSED = 4  # this tag is closed. Uncleaned output will only have a closing tag if that flag is present.
+
+    def __init__(self):
+        self.father = None
+        self.children = []
+        self.flags = 0
+
+class Tag(Node):
+    def __init__(self):
+        Node.__init__(self)
+        self.name = ''
+        self.attributes = {}
+
+    def get_tag_info(self):
+        """
+        Returns TI_ flags based on the name of the tag
+        """
+        return taginfo.get(self.name, 0)
+
+    def __repr__(self):
+        #assert self.name != ''
+        result = '<'
+        if self.flags & Node.Flags.CLOSING:
+            result += '/'
+        result += self.name
+        for k, v in self.attributes.items():
+            #result += ' (('+k+'))'
+            result += ' '+k
+            if v:
+                result += '="'+v.replace('\\', '\\\\').replace('"', '\\"')+'"'
+        result += '>'
+        return result
+
+    #def __repr__(self):
+    #    return 'Tag'+unicode(self).encode('utf8')
+
+class Leaf(Node):
+    # TODO: rename this to CDATA or whatever
+    def __init__(self, text):
+        Node.__init__(self)
+        self.text = htmlentities.resolve(text)
+    def __repr__(self):
+        return self.text # FIXME escape ?
+    #def __repr__(self):
+    #    return 'Leaf<'+repr(self.text.encode('utf8'))+'>'
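+
+# Example of the lexer contract (sketch): for the fragment
+#     <p class="x">hi</p>
+# html_lexer() yields Tag <p class="x">, Leaf 'hi', then Tag </p> with the
+# CLOSING flag set; father/children are not set at this stage.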
+
+def html_lexer(page):
+    """
+    This iterator yields Nodes with father/children unset
+    """
+    buf = page # buffer
+    pos = 0 # everything before that position has already been parsed
+    l = len(buf) # constant length
+    state = 0
+
+    def buffind(token):
+        r = buf.find(token, pos)
+        if r == -1:
+            return None
+        return r
+
+    def get_next_tag():
+        state = 'INIT'
+        state_white_skiping = False
+        p = pos # will start with skipping '<'
+        tag = Tag()
+        while True:
+            p += 1
+            if p >= l: # EOS
+                return None, p # what about last?
+            c = buf[p]
+
+            if state_white_skiping:
+                if ord(c) <= 32:
+                    continue
+                else:
+                    state_white_skiping = False
+
+            if state == 'INIT':
+                if c == '/':
+                    tag.flags += Node.Flags.CLOSING
+                    continue
+                elif c == '>':
+                    return tag, p+1
+                else:
+                    state = 'NAME'
+                    tag.name += c.lower()
+                    continue
+            elif state == 'NAME':
+                if ord(c) <= 32 or c == '/':
+                    state = 'ATT_NAME'
+                    att_name = ''
+                    state_white_skiping = True
+                    continue
+                elif c == '>':
+                    return tag, p+1
+                else:
+                    tag.name += c.lower()
+                    continue
+            elif state == 'ATT_NAME':
+                if ord(c) <= 32:
+                    state = 'ATT_EQUALS'
+                    state_white_skiping = True
+                    continue
+                elif c == '=':
+                    state = 'ATT_VALUE'
+                    state_white_skiping = True
+                    att_value = ''
+                    continue
+                elif c == '>':
+                    if att_name != '':
+                        tag.attributes[att_name] = ''
+                    return tag, p+1
+                else:
+                    att_name += c.lower()
+                    continue
+            elif state == 'ATT_EQUALS':
+                if ord(c) <= 32:
+                    continue
+                elif c == '=':
+                    state = 'ATT_VALUE'
+                    state_white_skiping = True
+                    att_value = ''
+                    continue
+                elif c == '>':
+                    if att_name != '':
+                        tag.attributes[att_name] = ''
+                    return tag, p+1
+                else:
+                    if att_name != '':
+                        tag.attributes[att_name] = ''
+                    state = 'ATT_NAME'
+                    att_name = c.lower()
+                    state_white_skiping = True
+                    continue
+            elif state == 'ATT_VALUE':
+                if att_value == '': # first char
+                    if c == '"' or c == "'":
+                        att_value_escape = c
+                        state = 'ATT_VALUE_QUOTED'
+                        continue
+                if ord(c) <= 32:
+                    tag.attributes[att_name] = att_value
+                    state = 'ATT_NAME'
+                    state_white_skiping = True
+                    att_name = ''
+                    continue
+                elif c == '>':
+                    tag.attributes[att_name] = att_value
+                    return tag, p+1
+                else:
+                    att_value += c
+                    continue
+            elif state == 'ATT_VALUE_QUOTED':
+                if c == att_value_escape:
+                    tag.attributes[att_name] = att_value
+                    state = 'ATT_NAME'
+                    state_white_skiping = True
+                    att_name = ''
+                    continue
+                else:
+                    att_value += c
+                    continue
+
+    while True:
+        # get next tag position
+        # TODO: check it's a real tag and not a fragment that should be added to the leaf node
+        pt1 = buffind('<')
+        if pt1 != pos:
+            yield Leaf(buf[pos:pt1]) # pt1 == None yields the whole tail
+        if pt1 is None:
+            return
+        pos = pt1
+
+        tag, pos = get_next_tag()
+        if tag is None: # EOS in the middle of a tag
+            return
+        yield tag
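+
+# Example of the closing logic (sketch): in '<ul><li>a<li>b</ul>', the second
+# <li> becomes a child of the first one, since nothing closed it; '</ul>' then
+# walks up the ancestor chain, flags the <ul> as CLOSED and moves the insertion
+# point back to the <ul>'s father. Unmatched closing tags are discarded.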
+
+def html_parse(page):
+    """
+    This function fetches the nodes from the lexer and assembles them into a node tree
+    """
+    root = Tag()
+    root.flags = Node.Flags.ROOT
+    father = root
+    for node in html_lexer(page):
+        if isinstance(node, Leaf):
+            node.father = father
+            father.children.append(node)
+        elif node.flags & Node.Flags.CLOSING:
+            # change current father
+            newfather = father
+            while True:
+                # TODO: optimize with Node.Flags.ROOT
+                if newfather is None:
+                    #TODO: log.debug()
+                    if VERBOSE_PARSER:
+                        print('Closing tag', node, 'does not match any opening tag. Discarding.', file=sys.stderr)
+                    break
+                if newfather.name == node.name:
+                    newfather.flags |= Node.Flags.CLOSED
+                    if VERBOSE_PARSER:
+                        if newfather != father:
+                            print('Closing tag', node, 'has auto-closed other nodes', end=' ', file=sys.stderr)
+                            deb = father
+                            while deb != newfather:
+                                print(deb, end=' ', file=sys.stderr)
+                                deb = deb.father
+                            print(file=sys.stderr)
+                    father = newfather.father
+                    break
+                newfather = newfather.father
+        else:
+            node.father = father
+            father.children.append(node)
+            #print('node=', node, 'info=', node.get_tag_info())
+            if not node.get_tag_info() & TI_EMPTY:
+                father = node
+            #print('node=', node, 'father=', father)
+    return root
+
+
+def print_idented_tree(node, identation_level=-1):
+    if not node.flags & Node.Flags.ROOT:
+        print(' '*identation_level+repr(node))
+    for c in node.children:
+        print_idented_tree(c, identation_level+1)
+    if isinstance(node, Tag) and (node.flags & Node.Flags.CLOSED):
+        print(' '*identation_level+'</'+node.name+'>')
+
+def print_lexer_tree(p):
+    identing = 0
+    for item in html_lexer(p):
+        if isinstance(item, Tag) and item.flags & Node.Flags.CLOSING:
+            identing -= 1
+        print(' '*identing, end=' ')
+        if isinstance(item, Tag) and not item.flags & Node.Flags.CLOSING:
+            identing += 1
+        print(repr(item))
+
+
+def get_elem(root, tagname):
+    """
+    Returns all the elements whose name matches,
+    but does not search within the children of those matches
+    """
+    if isinstance(root, Leaf):
+        return []
+    if root.name == tagname:
+        return [ root ]
+    results = []
+    for node in root.children:
+        match = get_elem(node, tagname)
+        if match:
+            results += match
+    return results
+
+
+def split_table(table):
+    """
+    Returns table content as a list (rows) of lists (columns)
+    """
+    ctr = []
+    for tr in get_elem(table, 'tr'):
+        ctd = []
+        for td in get_elem(tr, 'td'):
+            ctd += [ td ]
+        ctr.append(ctd)
+    return ctr
+
+def split_table_r_to_leaf(root):
+    """
+    Recursively split tables as described in split_table
+    Only returns leaf text, or lists for sub-tables
+    """
+    result = []
+    tables = get_elem(root, 'table')
+    if len(tables) == 0:
+        return get_merged_leaf_content(root)
+    for table in tables:
+        rrow = []
+        for row in split_table(table):
+            rcol = []
+            for col in row:
+                subr = split_table_r_to_leaf(col)
+                rcol.append(subr)
+            rrow.append(rcol)
+        result.append(rrow)
+    return result
+
+
+def get_merged_leaf_content(root):
+    """
+    Returns all the leaf content aggregated in a string
+    """
+    if isinstance(root, Leaf):
+        return root.text
+
+    result = ''
+    for node in root.children:
+        result += get_merged_leaf_content(node)
+    return result
+
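+# Example (sketch): for a page containing
+#     <table><tr><td>a</td><td>b</td></tr></table>
+# split_table_r_to_leaf(html_parse(page)) returns [[['a', 'b']]]: a list of
+# tables, each one a list of rows, each one a list of cell contents.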
+
+if __name__ == "__main__":
+    parser = OptionParser()
+    parser.add_option("--dump-lexer", help="Debug: Dump indented lexer output", action='store_true', dest='lexer_dump', default=False)
+    parser.add_option("--dump-parser", help="Debug: Dump indented parser output", action='store_true', dest='parser_dump', default=False)
+    parser.add_option("--verbose-parser", help="Debug: Verbose parser errors", action='store_true', dest='verbose_parser', default=False)
+    (options, args) = parser.parse_args()
+
+    try:
+        filename = args[0]
+    except IndexError:
+        print('Need a filename', file=sys.stderr)
+        sys.exit(-1)
+
+    VERBOSE_PARSER = options.verbose_parser
+    p = open(filename, encoding='utf-8').read()
+
+    if options.lexer_dump:
+        print_lexer_tree(p)
+        sys.exit(0)
+
+    if options.parser_dump:
+        root = html_parse(p)
+        print_idented_tree(root)
+        sys.exit(0)
diff --git a/htmlentities.py b/htmlentities.py
new file mode 100755
index 0000000..42afcf0
--- /dev/null
+++ b/htmlentities.py
@@ -0,0 +1,198 @@
+#!/usr/bin/env python3
+# -*- encoding: utf-8 -*-
+
+__all__ = ['resolve', 'expand', 'cleanCDATA']
+
+from html.entities import name2codepoint as entities
+
+entities_autocomplete = {}
+longestEntityLen = 0
+for key, value in entities.items():
+    if value <= 255:
+        entities_autocomplete[key] = value
+    l = len(key)
+    if l > longestEntityLen:
+        longestEntityLen = l
+
+# Characters in range 127-159 are illegal, but they are sometimes wrongly used in web pages
+# Internet Explorer assumes they come from the Microsoft extension to Latin-1 (ISO 8859-1), aka CP1252
+# However, to be clean, we must remap them to their real unicode values
+# Unknown codes are translated into a space
+iso88591_remap = [
+    32,   # 127: ???
+    8364, # 128: Euro symbol
+    32,   # 129: ???
+    8218, # 130: Single Low-9 Quotation Mark
+    402,  # 131: Latin Small Letter F With Hook
+    8222, # 132: Double Low-9 Quotation Mark
+    8230, # 133: Horizontal Ellipsis
+    8224, # 134: Dagger
+    8225, # 135: Double Dagger
+    710,  # 136: Modifier Letter Circumflex Accent
+    8240, # 137: Per Mille Sign
+    352,  # 138: Latin Capital Letter S With Caron
+    8249, # 139: Single Left-Pointing Angle Quotation Mark
+    338,  # 140: Latin Capital Ligature OE
+    32,   # 141: ???
+    381,  # 142: Latin Capital Letter Z With Caron
+    32,   # 143: ???
+    32,   # 144: ???
+    8216, # 145: Left Single Quotation Mark
+    8217, # 146: Right Single Quotation Mark
+    8220, # 147: Left Double Quotation Mark
+    8221, # 148: Right Double Quotation Mark
+    8226, # 149: Bullet
+    8211, # 150: En Dash
+    8212, # 151: Em Dash
+    732,  # 152: Small Tilde
+    8482, # 153: Trade Mark Sign
+    353,  # 154: Latin Small Letter S With Caron
+    8250, # 155: Single Right-Pointing Angle Quotation Mark
+    339,  # 156: Latin Small Ligature OE
+    32,   # 157: ???
+    382,  # 158: Latin Small Letter Z With Caron
+    376   # 159: Latin Capital Letter Y With Diaeresis
+]
+
+
+def checkForUnicodeReservedChar(value):
+    if value >= 0xfffe:
+        return ord('?')
+    if value < 127 or value > 159:
+        return value
+    return iso88591_remap[value-127]
+
+def expand(text):
+    result = ''
+    for c in text:
+        oc = ord(c)
+        oc = checkForUnicodeReservedChar(oc)
+        if oc < 32 or c == '&' or c == '<' or c == '>' or c == '"' or oc > 127:
+            result += '&#'+str(oc)+';'
+        else:
+            result += c
+    return result
+
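+# A few expected values (sketch):
+#     resolve('caf&eacute; &#233; &#xE9;') == 'café é é'
+#     expand('café') == 'caf&#233;'
+#     cleanCDATA('a \r\n b') == 'a b'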
+
+def resolve(text):
+    pos = 0
+    result = ''
+    l = len(text)
+    while True:
+        prevpos = pos
+        pos = text.find('&', prevpos)
+        if pos == -1:
+            ## print("No more &")
+            break
+
+        if pos >= l-2:
+            ## print("Too short")
+            break
+        # here we are sure the next two chars exist
+
+        result += text[prevpos:pos]
+        c = text[pos+1]
+        if c == '#':
+            ## print("numeric entity")
+            # This looks like a char whose unicode value is given raw
+            c = text[pos+2]
+            if (c == 'x' or c == 'X') and pos < l-3:
+                tmppos = text.find(';', pos+3)
+                if tmppos != -1:
+                    s = text[pos+3:tmppos]
+                    try:
+                        value = int(s, 16)
+                        value = checkForUnicodeReservedChar(value) # remap unicode char if in range 127-159
+                        result += chr(value)
+                        pos = tmppos + 1
+                        continue # ok, we did it
+                    except ValueError:
+                        # here pos is not updated, so that the original escape-like sequence is kept unchanged
+                        pass
+            else:
+                # the given unicode value is decimal
+                # IE behavior: parse until the first non-digit char, no conversion if that fails
+                sb = ''
+                tmppos = pos+2
+                while True:
+                    if tmppos >= l:
+                        break # out of range
+                    c = text[tmppos]
+                    if c == ';':
+                        tmppos += 1
+                        break
+                    if c < '0' or c > '9':
+                        break
+                    sb += c
+                    tmppos += 1
+                try:
+                    value = int(sb)
+                    value = checkForUnicodeReservedChar(value) # remap unicode char if in range 127-159
+                    result += chr(value)
+                    pos = tmppos
+                    continue # ok, we did it
+                except ValueError:
+                    # here pos is not updated, so that the original escape-like sequence is kept unchanged
+                    pass
+        else:
+            # here the first character is not a '#'
+            # let's try the known html entities
+            sb = ''
+            tmppos = pos + 1
+            while True:
+                if tmppos >= l or tmppos-pos > longestEntityLen + 1: # 1 more for ';'
+                    c2 = entities_autocomplete.get(sb, 0)
+                    break
+                c = text[tmppos]
+                if c == ';':
+                    tmppos += 1
+                    c2 = entities.get(sb, 0)
+                    break
+                c2 = entities_autocomplete.get(sb, 0)
+                if c2:
+                    break
+                sb += c
+                tmppos += 1
+            if c2:
+                result += chr(c2)
+                pos = tmppos
+                continue # ok, we did it
+
+        result += '&' # something went wrong, just skip this '&'
+        pos += 1
+
+    result += text[prevpos:]
+    return result
+
+def cleanCDATA(text):
+    """
+    resolve entities
+    collapse runs of whitespace (\r, \n, \t) into single spaces
+    expand back entities
+    """
+    tmp = resolve(text)
+    result = ''
+    isLastWhite = False # so that the first white is not removed
+    for c in tmp:
+        if c in ' \r\n\t':
+            if not isLastWhite:
+                result += ' '
+                isLastWhite = True
+        else:
+            result += c
+            isLastWhite = False
+
+    return expand(result)
+
+if __name__ == '__main__':
+    import sys
+    if len(sys.argv) < 2:
+        print("Missing required parameter. Try '&test'", file=sys.stderr)
+        sys.exit(1)
+    input = ' '.join(sys.argv[1:])
+    #print('input:', input)
+    #raw = resolve(input)
+    #print('resolved:', raw)
+    #print('expanded:', expand(raw))
+    print('cleanCDATA:', cleanCDATA(input))
+
-- 
2.30.2