From: Jean-Michel Nirgal Vourgère Date: Sat, 30 Apr 2011 21:43:27 +0000 (+0000) Subject: New script to get info from europa web site X-Git-Url: https://git.nirgal.com/?p=ais.git;a=commitdiff_plain;h=00dc20d33737ea2596d5c01485b2c2ac249088fc;hp=6fce5abca21a774481751667f18849159aa22ad4 New script to get info from europa web site --- diff --git a/bin/extras/europa.py b/bin/extras/europa.py new file mode 100755 index 0000000..ad4c503 --- /dev/null +++ b/bin/extras/europa.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from __future__ import division +import sys +import os +import logging +import urllib2 +from time import sleep +from ais.html_parser import * + +DOWNLOAD_SLEEP_TIME = 10 +EUROPA_DIR = '/var/lib/ais/europa/' + +# POST http://ec.europa.eu/fisheries/fleet/index.cfm?method=Search.ListSearchSimple +# pays=index.cfm%3Fmethod%3DSearch.SearchSimple%26country%3D&ss_char_main_gear=&ss_Period=A&ss_P_date_from=&ss_indent_CFR=&ss_indent_name=&ss_indent_ext_mark=&ss_indent_reg_nr=&ss_ident_IRCS=MHXK6&search_type=simple&ss_indent_search_id=8982&nbr_event_disp=50 + +def europa_get_html(callsign): + list_file = EUROPA_DIR+callsign+'-list.html' + detail_file = EUROPA_DIR+callsign+'-detail.html' + + if os.path.exists(detail_file): + html = file(detail_file).read() + return unicode(html, 'utf-8') + + if os.path.exists(list_file): + return None + + # STEP 1: get a search ID + + request = urllib2.Request('http://ec.europa.eu/fisheries/fleet/index.cfm?method=Search.SearchSimple&country=') + request.add_header('User-Agent', 'Mozilla/5.0 (X11; U; Linux i686; fr; rv:1.9.0.7) Gecko/2009032018 Firefox/3.0.6 (Debian-3.0.6-1)') + uo = urllib2.urlopen(request) + html = uo.read() + uo.close() + html = unicode(html, 'utf-8') + + root = html_parse(html) + ss_indent_search_id = None + for tag in get_elem(root, 'input'): + if tag.attributes.get('name', None) == 'ss_indent_search_id': + ss_indent_search_id = tag.attributes.get('value', None) + break + assert ss_indent_search_id is not None, 'Can\'t get a search_id' + logging.info('ss_indent_search_id=%s', ss_indent_search_id) + + sleep(DOWNLOAD_SLEEP_TIME) + + # STEP 2: get the list of results + + request = urllib2.Request('http://ec.europa.eu/fisheries/fleet/index.cfm?method=Search.ListSearchSimple') + request.add_header('User-Agent', 'Mozilla/5.0 (X11; U; Linux i686; fr; rv:1.9.0.7) Gecko/2009032018 Firefox/3.0.6 (Debian-3.0.6-1)') + uo = urllib2.urlopen(request, u'pays=index.cfm%3Fmethod%3DSearch.SearchSimple%26country%3D&ss_char_main_gear=&ss_Period=A&ss_P_date_from=&ss_indent_CFR=&ss_indent_name=&ss_indent_ext_mark=&ss_indent_reg_nr=&ss_ident_IRCS=' + callsign + u'&search_type=simple&ss_indent_search_id=' + ss_indent_search_id + u'&nbr_event_disp=50') + html = uo.read() + uo.close() + file(list_file, 'w').write(html) + html = unicode(html, 'utf-8') + + if u'No data found' in html: + logging.error('No data found for ' + callsign) + return + + root = html_parse(html) + lastevent_url = None + for tag in get_elem(root, 'a'): + tagurl = tag.attributes.get('href', '') + if tagurl.startswith(u'index.cfm?method=Search.DetailSearchSimple&event_key='): + lastevent_url = tagurl + assert lastevent_url, 'Internal error: Not appropriate URI found.' + logging.info('Last event is on %s', lastevent_url) + + sleep(DOWNLOAD_SLEEP_TIME) + + # STEP 2: get the last results in html + + request = urllib2.Request('http://ec.europa.eu/fisheries/fleet/' + lastevent_url) + request.add_header('User-Agent', 'Mozilla/5.0 (X11; U; Linux i686; fr; rv:1.9.0.7) Gecko/2009032018 Firefox/3.0.6 (Debian-3.0.6-1)') + uo = urllib2.urlopen(request) + html = uo.read() + uo.close() + file(detail_file, 'w').write(html) + html = unicode(html, 'utf-8') + + sleep(DOWNLOAD_SLEEP_TIME) + + return html + + + +def europa_get(callsign): + html = europa_get_html(callsign) + if not html: + return None + root = html_parse(html) + result = {} + for li in get_elem(root, 'li'): + elem0 = li.children[0] + elem1 = li.children[1] + txt0 = get_merged_leaf_content(elem0).replace(u'\xa0', ' ').strip(u'\t\n :') + txt1 = get_merged_leaf_content(elem1).replace(u'\xa0', ' ').strip(u'\t\n :') + result[txt0] = txt1 + return result + + +if __name__ == '__main__': + from optparse import OptionParser + parser = OptionParser(usage='%prog [options] callsign [callsign] ...') + parser.add_option('-d', '--debug', + action='store_true', dest='debug', default=False, + help="debug mode") + parser.add_option('--download-sleep', help="how many seconds do we sleep after each download. default=%default", action='store', type='int', dest='sleep', default=DOWNLOAD_SLEEP_TIME) + (options, args) = parser.parse_args() + + if len(args) == 0: + print >> sys.stderr, "Need at least a parameter" + sys.exit(1) + + DOWNLOAD_SLEEP_TIME = options.sleep + + if options.debug: + loglevel = logging.DEBUG + else: + loglevel = logging.INFO + + logging.basicConfig(level=loglevel, format='%(asctime)s %(levelname)s %(message)s') + + keys = [ + u'IRCS', + u'Vessel Name', + u'Country Code', + u'Port Code', + u'LOA', + u'Tonnage GT', + u'Year of Construction', + u'Entry Service Year', + u'Main Gear type', + u'Secondary Gear type', + u'External Marking', + ] + for callsign in args: + info = europa_get(callsign) + if not info: + continue + for key in keys: + print key, ':', info[key] + print