Improved ec.europa.eu web grabbing
[ais.git] / bin / extras / europa.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import division
5 import sys
6 import os
7 import logging
8 import urllib2
9 from time import sleep
10 from ais.html_parser import *
11
# Number of seconds to sleep after each download from the remote site.
# The command-line option --download-sleep overrides this value.
DOWNLOAD_SLEEP_TIME = 10
# Directory holding the saved pages: <callsign>-list.html and
# <callsign>-detail.html are written here by europa_get_html().
EUROPA_DIR = '/var/lib/ais/europa/'

# Example of the search request (URL, then form body) that europa_get_html()
# reproduces, here for callsign MHXK6 and search id 8982:
# POST http://ec.europa.eu/fisheries/fleet/index.cfm?method=Search.ListSearchSimple
# pays=index.cfm%3Fmethod%3DSearch.SearchSimple%26country%3D&ss_char_main_gear=&ss_Period=A&ss_P_date_from=&ss_indent_CFR=&ss_indent_name=&ss_indent_ext_mark=&ss_indent_reg_nr=&ss_ident_IRCS=MHXK6&search_type=simple&ss_indent_search_id=8982&nbr_event_disp=50
17
# Base URL of the fleet register; every page fetched below lives under it.
_EUROPA_BASE_URL = 'http://ec.europa.eu/fisheries/fleet/'
# Browser-like User-Agent sent with every request.
_EUROPA_USER_AGENT = 'Mozilla/5.0 (X11; U; Linux i686; fr; rv:1.9.0.7) Gecko/2009032018 Firefox/3.0.6 (Debian-3.0.6-1)'


def _europa_fetch(url, data=None):
    """Return the raw body of `url`; POST `data` when given, GET otherwise."""
    request = urllib2.Request(url)
    request.add_header('User-Agent', _EUROPA_USER_AGENT)
    uo = urllib2.urlopen(request, data)
    try:
        return uo.read()
    finally:
        # Release the connection even when read() fails.
        uo.close()


def _europa_write_file(filename, content):
    """Write the byte string `content` to `filename`, always closing it."""
    # try/finally rather than `with`, so this also runs on old Python 2.
    f = open(filename, 'wb')
    try:
        f.write(content)
    finally:
        f.close()


def _europa_quote(value):
    """Return `value` percent-encoded for a form-urlencoded POST body."""
    import urllib  # local import: only this helper needs it
    if isinstance(value, unicode):
        # quote_plus() only handles byte strings reliably.
        value = value.encode('utf-8')
    return urllib.quote_plus(value)


def europa_get_html(callsign):
    """Return the detail page of the last event listed for `callsign`.

    The page is looked up in three HTTP requests (fetch a search id, post
    the search, fetch the last detail link of the result list), sleeping
    DOWNLOAD_SLEEP_TIME seconds after each one. The result list and the
    detail page are saved under EUROPA_DIR and reused on later calls.

    Returns the detail page as a unicode string, or None when the search
    reports no data or when a result list was saved earlier without a
    detail page.

    Raises AssertionError when the search id or the detail link cannot be
    found in the downloaded pages. Errors raised by urllib2 or by file
    access are not caught here.
    """
    list_file = EUROPA_DIR + callsign + '-list.html'
    detail_file = EUROPA_DIR + callsign + '-detail.html'

    if os.path.exists(detail_file):
        f = open(detail_file, 'rb')
        try:
            html = f.read()
        finally:
            f.close()
        return unicode(html, 'utf-8')

    if os.path.exists(list_file):
        # An earlier call saved the result list but no detail page:
        # do not query the site again for this callsign.
        return None

    # STEP 1: get a search ID

    html = _europa_fetch(_EUROPA_BASE_URL + 'index.cfm?method=Search.SearchSimple&country=')
    html = unicode(html, 'utf-8')

    root = html_parse(html)
    ss_indent_search_id = None
    for tag in get_elem(root, 'input'):
        if tag.attributes.get('name', None) == 'ss_indent_search_id':
            ss_indent_search_id = tag.attributes.get('value', None)
            break
    # Explicit raise instead of `assert`, so the check survives python -O.
    if ss_indent_search_id is None:
        raise AssertionError('Can\'t get a search_id')
    logging.info('ss_indent_search_id=%s', ss_indent_search_id)

    sleep(DOWNLOAD_SLEEP_TIME)

    # STEP 2: get the list of results

    # The two variable fields are percent-encoded so that a callsign
    # containing '&', '=', spaces... cannot corrupt the form body.
    post_data = (
        'pays=index.cfm%3Fmethod%3DSearch.SearchSimple%26country%3D'
        '&ss_char_main_gear=&ss_Period=A&ss_P_date_from=&ss_indent_CFR='
        '&ss_indent_name=&ss_indent_ext_mark=&ss_indent_reg_nr='
        '&ss_ident_IRCS=' + _europa_quote(callsign) +
        '&search_type=simple'
        '&ss_indent_search_id=' + _europa_quote(ss_indent_search_id) +
        '&nbr_event_disp=50')
    html = _europa_fetch(_EUROPA_BASE_URL + 'index.cfm?method=Search.ListSearchSimple', post_data)
    # Saved before any parsing: its presence is what short-circuits later calls.
    _europa_write_file(list_file, html)
    html = unicode(html, 'utf-8')

    if u'No data found' in html:
        logging.error('No data found for %s', callsign)
        return None

    root = html_parse(html)
    lastevent_url = None
    for tag in get_elem(root, 'a'):
        tagurl = tag.attributes.get('href', '') or ''
        if tagurl.startswith(u'index.cfm?method=Search.DetailSearchSimple&event_key='):
            # No break: the last matching link on the page wins.
            lastevent_url = tagurl
    if not lastevent_url:
        raise AssertionError('Internal error: Not appropriate URI found.')
    logging.info('Last event is on %s', lastevent_url)

    sleep(DOWNLOAD_SLEEP_TIME)

    # STEP 3: get the last results in html

    html = _europa_fetch(_EUROPA_BASE_URL + lastevent_url)
    _europa_write_file(detail_file, html)
    html = unicode(html, 'utf-8')

    sleep(DOWNLOAD_SLEEP_TIME)

    return html
87
88
89
def europa_get(callsign):
    """Return the fields of the detail page for `callsign` as a dict.

    The page comes from europa_get_html(). For every 'li' element returned
    by get_elem(), the merged text of its first child becomes a key and the
    merged text of its second child the matching value; non-breaking
    spaces are turned into plain spaces and surrounding tabs, newlines,
    spaces and colons are removed from both.

    Returns None when no page is available, otherwise the (possibly empty)
    dict. 'li' elements with fewer than two children are skipped.
    """
    html = europa_get_html(callsign)
    if not html:
        return None

    def clean_text(elem):
        text = get_merged_leaf_content(elem)
        return text.replace(u'\xa0', ' ').strip(u'\t\n :')

    result = {}
    for li in get_elem(html_parse(html), 'li'):
        children = li.children
        if len(children) < 2:
            # Not a "label / value" item: indexing it would raise IndexError.
            logging.debug('Skipping list item with %d child(ren)', len(children))
            continue
        result[clean_text(children[0])] = clean_text(children[1])
    return result
103
104
def _strip_code_prefix(value):
    """Return what follows the first u' - ' in `value`.

    Empty values and values without that separator are returned unchanged.
    """
    if not value:
        return value
    parts = value.split(u' - ', 1)
    if len(parts) == 2:
        return parts[1]
    return value


def normalize_info(info, callsign):
    """Clean up in place the field dict of one vessel and return it.

    - u'IRCS' is set to `callsign` when it is missing or empty.
    - u'Country Code', u'Port Code', u'Main Gear type',
      u'Secondary Gear type' and u'Public aid' lose their leading
      "CODE - " part; a value without that separator (for instance an
      empty one) is kept as is.
    - u'LOA' and u'Tonnage GT' get their decimal comma replaced by a dot;
      trailing spaces and 'T' characters are removed from u'Tonnage GT'.

    Raises KeyError when one of the fields other than u'IRCS' is missing.
    """
    if not info.get(u'IRCS'):
        info[u'IRCS'] = callsign

    prefixed_keys = (
        u'Country Code',
        u'Port Code',
        u'Main Gear type',
        u'Secondary Gear type',
        u'Public aid',
    )
    for key in prefixed_keys:
        info[key] = _strip_code_prefix(info[key])

    info[u'LOA'] = info[u'LOA'].replace(u',', u'.')
    info[u'Tonnage GT'] = info[u'Tonnage GT'].replace(u',', u'.').rstrip(' T')
    return info
117
def myprint(string):
    """Write `string` to standard output as UTF-8; no newline is added."""
    utf8_bytes = string.encode('utf-8')
    out = sys.stdout
    out.write(utf8_bytes)
120
if __name__ == '__main__':
    # Command-line entry point: print the normalized register fields of
    # every callsign given as argument, as "key: value" lines or, with
    # --table, as one tab-separated row per callsign after a header row.
    from optparse import OptionParser
    parser = OptionParser(usage='%prog [options] callsign [callsign] ...')
    parser.add_option('-d', '--debug',
        action='store_true', dest='debug', default=False,
        help="debug mode")
    parser.add_option('--download-sleep',
        action='store', type='int', dest='sleep', default=DOWNLOAD_SLEEP_TIME,
        help="how many seconds do we sleep after each download. default=%default")
    parser.add_option('--table',
        action='store_true', dest='table_format', default=False,
        help='Format in table')

    (options, args) = parser.parse_args()

    if not args:
        sys.stderr.write("Need at least a parameter\n")
        sys.exit(1)

    # Rebinds the module-level value that europa_get_html() reads when it
    # sleeps after each download.
    DOWNLOAD_SLEEP_TIME = options.sleep

    if options.debug:
        loglevel = logging.DEBUG
    else:
        loglevel = logging.INFO

    logging.basicConfig(level=loglevel, format='%(asctime)s %(levelname)s %(message)s')

    # Fields printed for each vessel, in output order.
    keys = [
        u'IRCS',
        u'Vessel Name',
        u'Country Code',
        u'Port Code',
        u'LOA',
        u'Tonnage GT',
        u'Year of Construction',
        u'Entry Service Year',
        u'Main Gear type',
        u'Secondary Gear type',
        u'External Marking',
        u'Public aid',
    ]
    if options.table_format:
        myprint(u'\t'.join(keys) + u'\n')
    for callsign in args:
        info = europa_get(callsign)
        if not info:
            continue
        info = normalize_info(info, callsign)
        # A field absent from the page is printed empty instead of
        # aborting the whole run with a KeyError.
        values = [info.get(key, u'') for key in keys]
        if options.table_format:
            myprint(u'\t'.join(values) + u'\n')
        else:
            for key, value in zip(keys, values):
                myprint(key + u': ' + value + u'\n')
            myprint(u'\n')