ad4c503d804af087547da7826142b58dca0ad029
[ais.git] / bin / extras / europa.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import division
5 import sys
6 import os
7 import logging
8 import urllib2
9 from time import sleep
10 from ais.html_parser import *
11
# Number of seconds to sleep after each download from the remote site.
DOWNLOAD_SLEEP_TIME = 10
# Directory holding the per-callsign '-list.html' / '-detail.html' pages
# saved by europa_get_html.
EUROPA_DIR = '/var/lib/ais/europa/'
14
15 # POST http://ec.europa.eu/fisheries/fleet/index.cfm?method=Search.ListSearchSimple
16 # pays=index.cfm%3Fmethod%3DSearch.SearchSimple%26country%3D&ss_char_main_gear=&ss_Period=A&ss_P_date_from=&ss_indent_CFR=&ss_indent_name=&ss_indent_ext_mark=&ss_indent_reg_nr=&ss_ident_IRCS=MHXK6&search_type=simple&ss_indent_search_id=8982&nbr_event_disp=50
17
# Browser-like User-Agent header sent with every request.
_EUROPA_USER_AGENT = 'Mozilla/5.0 (X11; U; Linux i686; fr; rv:1.9.0.7) Gecko/2009032018 Firefox/3.0.6 (Debian-3.0.6-1)'
# Base URL of the fleet register; relative links found in result pages are
# appended to it.
_EUROPA_BASE_URL = 'http://ec.europa.eu/fisheries/fleet/'


def _europa_fetch(url, data=None):
    """Download url and return the raw (undecoded) response body.

    data, when given, is passed to urllib2.urlopen as the request body.
    The response object is closed even if reading it fails.
    """
    request = urllib2.Request(url)
    request.add_header('User-Agent', _EUROPA_USER_AGENT)
    uo = urllib2.urlopen(request, data)
    try:
        return uo.read()
    finally:
        uo.close()


def _europa_quote(value):
    """Return value percent-encoded for a form-encoded POST body."""
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    import urllib
    if isinstance(value, unicode):
        # quote_plus needs bytes for non-ASCII text.
        # NOTE(review): UTF-8 is assumed for the form encoding -- confirm.
        value = value.encode('utf-8')
    return urllib.quote_plus(value)


def _europa_save(path, content):
    """Write content to path, closing the file even if the write fails."""
    fd = open(path, 'w')
    try:
        fd.write(content)
    finally:
        fd.close()


def europa_get_html(callsign):
    """Return the detail page of the last event listed for callsign.

    Pages are cached under EUROPA_DIR:
    - if '<callsign>-detail.html' exists, its content is returned without
      any network access;
    - if only '<callsign>-list.html' exists, None is returned without any
      network access (an earlier lookup saved the list but no detail page).

    Otherwise three requests are made, sleeping DOWNLOAD_SLEEP_TIME seconds
    after each one: fetch the search form to obtain a search id, post the
    search for the callsign (saved as the list file), then fetch the detail
    page of the last matching event link (saved as the detail file).

    Returns the detail page as unicode, or None when the result list
    contains 'No data found' or only a list file is cached.

    Raises AssertionError when no search id or no detail link can be found
    in the downloaded pages. Errors from urllib2 and file access propagate.
    """
    list_file = EUROPA_DIR + callsign + '-list.html'
    detail_file = EUROPA_DIR + callsign + '-detail.html'

    if os.path.exists(detail_file):
        fd = open(detail_file)
        try:
            return unicode(fd.read(), 'utf-8')
        finally:
            fd.close()

    if os.path.exists(list_file):
        return None

    # STEP 1: get a search ID

    html = _europa_fetch(_EUROPA_BASE_URL + 'index.cfm?method=Search.SearchSimple&country=')
    html = unicode(html, 'utf-8')

    root = html_parse(html)
    ss_indent_search_id = None
    for tag in get_elem(root, 'input'):
        if tag.attributes.get('name', None) == 'ss_indent_search_id':
            ss_indent_search_id = tag.attributes.get('value', None)
            break
    # Explicit raise instead of an assert statement: asserts are removed
    # under "python -O", which would defer the failure to a confusing
    # TypeError when building the POST body below.
    if ss_indent_search_id is None:
        raise AssertionError('Can\'t get a search_id')
    logging.info('ss_indent_search_id=%s', ss_indent_search_id)

    sleep(DOWNLOAD_SLEEP_TIME)

    # STEP 2: get the list of results

    # The variable parts are percent-encoded so that characters such as
    # '&', '=' or spaces cannot corrupt the form fields. Plain alphanumeric
    # values are sent unchanged.
    post_data = (
        'pays=index.cfm%3Fmethod%3DSearch.SearchSimple%26country%3D'
        '&ss_char_main_gear=&ss_Period=A&ss_P_date_from=&ss_indent_CFR='
        '&ss_indent_name=&ss_indent_ext_mark=&ss_indent_reg_nr='
        '&ss_ident_IRCS=' + _europa_quote(callsign) +
        '&search_type=simple'
        '&ss_indent_search_id=' + _europa_quote(ss_indent_search_id) +
        '&nbr_event_disp=50')
    html = _europa_fetch(_EUROPA_BASE_URL + 'index.cfm?method=Search.ListSearchSimple', post_data)
    # The raw bytes are saved before decoding, so the list file exists even
    # if decoding or a later step fails.
    _europa_save(list_file, html)
    html = unicode(html, 'utf-8')

    if u'No data found' in html:
        logging.error('No data found for %s', callsign)
        return None

    root = html_parse(html)
    lastevent_url = None
    for tag in get_elem(root, 'a'):
        tagurl = tag.attributes.get('href', '')
        if tagurl.startswith(u'index.cfm?method=Search.DetailSearchSimple&event_key='):
            # No break: the last matching link in the page is the one kept.
            lastevent_url = tagurl
    if not lastevent_url:
        raise AssertionError('Internal error: Not appropriate URI found.')
    logging.info('Last event is on %s', lastevent_url)

    sleep(DOWNLOAD_SLEEP_TIME)

    # STEP 3: get the last results in html

    html = _europa_fetch(_EUROPA_BASE_URL + lastevent_url)
    _europa_save(detail_file, html)
    html = unicode(html, 'utf-8')

    sleep(DOWNLOAD_SLEEP_TIME)

    return html
87
88
89
def europa_get(callsign):
    """Return the fields of the detail page for callsign as a dictionary.

    The page comes from europa_get_html; None is returned when that yields
    nothing. For every 'li' element of the parsed page, the merged text of
    its first child becomes a key and the merged text of its second child
    the matching value. A label that occurs again replaces the earlier
    entry.
    """
    page = europa_get_html(callsign)
    if not page:
        return None

    def tidy(node):
        # Turn non-breaking spaces into plain spaces, then trim tabs,
        # newlines, spaces and colons from both ends.
        merged = get_merged_leaf_content(node)
        return merged.replace(u'\xa0', ' ').strip(u'\t\n :')

    fields = {}
    for item in get_elem(html_parse(page), 'li'):
        # Both children are fetched before any text is extracted.
        label_node = item.children[0]
        value_node = item.children[1]
        label = tidy(label_node)
        fields[label] = tidy(value_node)
    return fields
103
104
105 if __name__ == '__main__':
106     from optparse import OptionParser
107     parser = OptionParser(usage='%prog [options] callsign [callsign] ...')
108     parser.add_option('-d', '--debug',
109         action='store_true', dest='debug', default=False,
110         help="debug mode")
111     parser.add_option('--download-sleep', help="how many seconds do we sleep after each download. default=%default", action='store', type='int', dest='sleep', default=DOWNLOAD_SLEEP_TIME)
112     (options, args) = parser.parse_args()
113
114     if len(args) == 0:
115         print >> sys.stderr, "Need at least a parameter"
116         sys.exit(1)
117
118     DOWNLOAD_SLEEP_TIME = options.sleep
119
120     if options.debug:
121         loglevel = logging.DEBUG
122     else:
123         loglevel = logging.INFO
124
125     logging.basicConfig(level=loglevel, format='%(asctime)s %(levelname)s %(message)s')
126
127     keys = [
128         u'IRCS',
129         u'Vessel Name',
130         u'Country Code',
131         u'Port Code',
132         u'LOA',
133         u'Tonnage GT',
134         u'Year of Construction',
135         u'Entry Service Year',
136         u'Main Gear type',
137         u'Secondary Gear type',
138         u'External Marking',
139     ]
140     for callsign in args:
141         info = europa_get(callsign)
142         if not info:
143             continue
144         for key in keys:
145             print key, ':', info[key]
146         print