New script to get info from the Europa web site
author	Jean-Michel Nirgal Vourgère <jmv@nirgal.com>
Sat, 30 Apr 2011 21:43:27 +0000 (21:43 +0000)
committer	Jean-Michel Nirgal Vourgère <jmv@nirgal.com>
Sat, 30 Apr 2011 21:43:27 +0000 (21:43 +0000)
bin/extras/europa.py [new file with mode: 0755]

diff --git a/bin/extras/europa.py b/bin/extras/europa.py
new file mode 100755 (executable)
index 0000000..ad4c503
--- /dev/null
@@ -0,0 +1,146 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
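+"""Fetch vessel information from the fisheries fleet pages on ec.europa.eu.
+
+Usage: europa.py [options] callsign [callsign] ...
+"""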
+
+from __future__ import division
+import sys
+import os
+import logging
+import urllib2
+from time import sleep
+from ais.html_parser import *
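+# html_parse(), get_elem() and get_merged_leaf_content() used below come from
+# the wildcard import above.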
+
+DOWNLOAD_SLEEP_TIME = 10
+EUROPA_DIR = '/var/lib/ais/europa/'
+USER_AGENT = 'Mozilla/5.0 (X11; U; Linux i686; fr; rv:1.9.0.7) Gecko/2009032018 Firefox/3.0.6 (Debian-3.0.6-1)'
+
+# POST http://ec.europa.eu/fisheries/fleet/index.cfm?method=Search.ListSearchSimple
+# pays=index.cfm%3Fmethod%3DSearch.SearchSimple%26country%3D&ss_char_main_gear=&ss_Period=A&ss_P_date_from=&ss_indent_CFR=&ss_indent_name=&ss_indent_ext_mark=&ss_indent_reg_nr=&ss_ident_IRCS=MHXK6&search_type=simple&ss_indent_search_id=8982&nbr_event_disp=50
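+#
+# The other two endpoints used below are method=Search.SearchSimple (the search
+# form, which provides a fresh ss_indent_search_id) and
+# method=Search.DetailSearchSimple (the detail page of a single event).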
+
+def europa_get_html(callsign):
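+    """Return the detail page HTML for callsign as a unicode string.
+
+    Pages are cached in EUROPA_DIR: if the detail page is already on disk it
+    is returned without hitting the network; if only the list page is on disk,
+    None is returned and the site is not queried again.
+    """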
+    list_file = EUROPA_DIR+callsign+'-list.html'
+    detail_file = EUROPA_DIR+callsign+'-detail.html'
+
+    if os.path.exists(detail_file):
+        html = file(detail_file).read()
+        return unicode(html, 'utf-8')
+
+    if os.path.exists(list_file):
+        return None
+
+    # STEP 1: get a search ID
+
+    request = urllib2.Request('http://ec.europa.eu/fisheries/fleet/index.cfm?method=Search.SearchSimple&country=')
+    request.add_header('User-Agent', USER_AGENT)
+    uo = urllib2.urlopen(request)
+    html = uo.read()
+    uo.close()
+    html = unicode(html, 'utf-8')
+
+    root = html_parse(html)
+    ss_indent_search_id = None
+    for tag in get_elem(root, 'input'):
+        if tag.attributes.get('name', None) == 'ss_indent_search_id':
+            ss_indent_search_id = tag.attributes.get('value', None)
+            break
+    assert ss_indent_search_id is not None, 'Can\'t get a search_id'
+    logging.info('ss_indent_search_id=%s', ss_indent_search_id)
+
+    sleep(DOWNLOAD_SLEEP_TIME)
+
+    # STEP 2: get the list of results
+
+    request = urllib2.Request('http://ec.europa.eu/fisheries/fleet/index.cfm?method=Search.ListSearchSimple')
+    request.add_header('User-Agent', USER_AGENT)
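+    # The POST body mirrors the sample query documented at the top of this
+    # file, with ss_ident_IRCS and ss_indent_search_id filled in; the callsign
+    # is assumed to be plain ASCII since no URL-encoding is applied.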
+    data = (u'pays=index.cfm%3Fmethod%3DSearch.SearchSimple%26country%3D'
+        u'&ss_char_main_gear=&ss_Period=A&ss_P_date_from=&ss_indent_CFR='
+        u'&ss_indent_name=&ss_indent_ext_mark=&ss_indent_reg_nr='
+        u'&ss_ident_IRCS=' + callsign
+        + u'&search_type=simple&ss_indent_search_id=' + ss_indent_search_id
+        + u'&nbr_event_disp=50')
+    uo = urllib2.urlopen(request, data)
+    html = uo.read()
+    uo.close()
+    file(list_file, 'w').write(html)
+    html = unicode(html, 'utf-8')
+
+    if u'No data found' in html:
+        logging.error('No data found for %s', callsign)
+        return None
+
+    root = html_parse(html)
+    lastevent_url = None
+    for tag in get_elem(root, 'a'):
+        tagurl = tag.attributes.get('href', '')
+        if tagurl.startswith(u'index.cfm?method=Search.DetailSearchSimple&event_key='):
+            lastevent_url = tagurl
+    assert lastevent_url, 'Internal error: no appropriate URI found.'
+    logging.info('Last event is on %s', lastevent_url)
+
+    sleep(DOWNLOAD_SLEEP_TIME)
+
+    # STEP 3: get the detail page for the last event
+
+    request = urllib2.Request('http://ec.europa.eu/fisheries/fleet/' + lastevent_url)
+    request.add_header('User-Agent', USER_AGENT)
+    uo = urllib2.urlopen(request)
+    html = uo.read()
+    uo.close()
+    file(detail_file, 'w').write(html)
+    html = unicode(html, 'utf-8')
+
+    sleep(DOWNLOAD_SLEEP_TIME)
+
+    return html
+
+
+
+def europa_get(callsign):
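+    """Return a dict mapping the detail page labels (u'IRCS', u'Vessel Name',
+    ...) to their values, or None when no detail page could be fetched."""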
+    html = europa_get_html(callsign)
+    if not html:
+        return None
+    root = html_parse(html)
+    result = {}
+    for li in get_elem(root, 'li'):
+        elem0 = li.children[0]
+        elem1 = li.children[1]
+        txt0 = get_merged_leaf_content(elem0).replace(u'\xa0', ' ').strip(u'\t\n :')
+        txt1 = get_merged_leaf_content(elem1).replace(u'\xa0', ' ').strip(u'\t\n :')
+        result[txt0] = txt1
+    return result
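+# Illustrative call (MHXK6 is the callsign from the sample POST above; the
+# values shown here are made up):
+#   >>> europa_get(u'MHXK6')
+#   {u'IRCS': u'MHXK6', u'Vessel Name': u'EXAMPLE', ...}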
+
+
+if __name__ == '__main__':
+    from optparse import OptionParser
+    parser = OptionParser(usage='%prog [options] callsign [callsign] ...')
+    parser.add_option('-d', '--debug',
+        action='store_true', dest='debug', default=False,
+        help="debug mode")
+    parser.add_option('--download-sleep',
+        action='store', type='int', dest='sleep', default=DOWNLOAD_SLEEP_TIME,
+        help="seconds to sleep after each download. default=%default")
+    (options, args) = parser.parse_args()
+
+    if len(args) == 0:
+        print >> sys.stderr, "Need at least one callsign"
+        sys.exit(1)
+
+    DOWNLOAD_SLEEP_TIME = options.sleep
+
+    if options.debug:
+        loglevel = logging.DEBUG
+    else:
+        loglevel = logging.INFO
+
+    logging.basicConfig(level=loglevel, format='%(asctime)s %(levelname)s %(message)s')
+
+    keys = [
+        u'IRCS',
+        u'Vessel Name',
+        u'Country Code',
+        u'Port Code',
+        u'LOA',
+        u'Tonnage GT',
+        u'Year of Construction',
+        u'Entry Service Year',
+        u'Main Gear type',
+        u'Secondary Gear type',
+        u'External Marking',
+    ]
+    for callsign in args:
+        info = europa_get(callsign)
+        if not info:
+            continue
+        for key in keys:
+            print key, ':', info.get(key, u'')  # missing fields print as empty
+        print