2 # -*- coding: utf-8 -*-
4 from __future__ import division
10 from ais.html_parser import *
# Delay, in seconds, handed to sleep() after each download step of
# europa_get_html(); the --download-sleep command-line option overrides it.
12 DOWNLOAD_SLEEP_TIME = 10
# Directory prefix of the cached '<callsign>-list.html' and
# '<callsign>-detail.html' pages. File names are appended by plain string
# concatenation, so the trailing slash is required.
13 EUROPA_DIR = '/var/lib/ais/europa/'
15 # POST http://ec.europa.eu/fisheries/fleet/index.cfm?method=Search.ListSearchSimple
16 # pays=index.cfm%3Fmethod%3DSearch.SearchSimple%26country%3D&ss_char_main_gear=&ss_Period=A&ss_P_date_from=&ss_indent_CFR=&ss_indent_name=&ss_indent_ext_mark=&ss_indent_reg_nr=&ss_ident_IRCS=MHXK6&search_type=simple&ss_indent_search_id=8982&nbr_event_disp=50
_EUROPA_FLEET_URL = 'http://ec.europa.eu/fisheries/fleet/'
_EUROPA_USER_AGENT = 'Mozilla/5.0 (X11; U; Linux i686; fr; rv:1.9.0.7) Gecko/2009032018 Firefox/3.0.6 (Debian-3.0.6-1)'


def _europa_read_file(path):
    """Return the raw content of *path*, always closing the file."""
    handle = open(path)
    try:
        return handle.read()
    finally:
        handle.close()


def _europa_write_file(path, content):
    """Write *content* to *path*, always closing the file."""
    handle = open(path, 'w')
    try:
        handle.write(content)
    finally:
        handle.close()


def _europa_fetch(page, data=None):
    """Request *page* (relative to the fleet register) and return the raw body.

    The request is a POST when *data* is given, a GET otherwise.
    """
    request = urllib2.Request(_EUROPA_FLEET_URL + page)
    request.add_header('User-Agent', _EUROPA_USER_AGENT)
    uo = urllib2.urlopen(request, data)
    try:
        return uo.read()
    finally:
        uo.close()


def europa_get_html(callsign):
    """Return the detail page of the vessel with radio call sign *callsign*.

    Pages are cached under EUROPA_DIR as '<callsign>-list.html' (search
    results) and '<callsign>-detail.html' (vessel details):

    1. If the detail page is cached, it is returned without any download.
    2. Otherwise the result list is taken from the cache or downloaded
       (a first request obtains a search id, a second one posts the search).
    3. The last link of the list that points to a detail page is downloaded,
       cached and returned.

    The function sleeps DOWNLOAD_SLEEP_TIME seconds after each download step.

    Returns the page as a unicode string, or None when the register answers
    'No data found'.

    Raises AssertionError when no search id or no detail link can be found.
    """
    from urllib import quote

    list_file = EUROPA_DIR + callsign + '-list.html'
    detail_file = EUROPA_DIR + callsign + '-detail.html'

    if os.path.exists(detail_file):
        return unicode(_europa_read_file(detail_file), 'utf-8')

    if os.path.exists(list_file):
        # NOTE(review): reading the cached list back mirrors the detail-page
        # cache above -- confirm this matches the upstream behaviour.
        html = _europa_read_file(list_file)
    else:
        # STEP 1: get a search ID
        html = unicode(_europa_fetch('index.cfm?method=Search.SearchSimple&country='), 'utf-8')

        root = html_parse(html)
        ss_indent_search_id = None
        for tag in get_elem(root, 'input'):
            if tag.attributes.get('name', None) == 'ss_indent_search_id':
                ss_indent_search_id = tag.attributes.get('value', None)
        # Raised explicitly so that the check survives `python -O`.
        if ss_indent_search_id is None:
            raise AssertionError('Can\'t get a search_id')
        logging.info('ss_indent_search_id=%s', ss_indent_search_id)

        sleep(DOWNLOAD_SLEEP_TIME)

        # STEP 2: get the list of results
        # The call sign is URL-encoded so that it cannot break the form body.
        html = _europa_fetch(
            'index.cfm?method=Search.ListSearchSimple',
            u'pays=index.cfm%3Fmethod%3DSearch.SearchSimple%26country%3D&ss_char_main_gear=&ss_Period=A&ss_P_date_from=&ss_indent_CFR=&ss_indent_name=&ss_indent_ext_mark=&ss_indent_reg_nr=&ss_ident_IRCS=' + quote(callsign) + u'&search_type=simple&ss_indent_search_id=' + ss_indent_search_id + u'&nbr_event_disp=50')
        _europa_write_file(list_file, html)
    html = unicode(html, 'utf-8')

    if u'No data found' in html:
        logging.error('No data found for ' + callsign)
        # NOTE(review): returning None here is presumed -- confirm callers.
        return None

    root = html_parse(html)
    # Keep the last matching link: no `break` on purpose.
    lastevent_url = None
    for tag in get_elem(root, 'a'):
        tagurl = tag.attributes.get('href', '')
        if tagurl.startswith(u'index.cfm?method=Search.DetailSearchSimple&event_key='):
            lastevent_url = tagurl
    if not lastevent_url:
        raise AssertionError('Internal error: Not appropriate URI found.')
    logging.info('Last event is on %s', lastevent_url)

    sleep(DOWNLOAD_SLEEP_TIME)

    # STEP 3: get the last results in html
    html = _europa_fetch(lastevent_url)
    _europa_write_file(detail_file, html)
    html = unicode(html, 'utf-8')

    sleep(DOWNLOAD_SLEEP_TIME)

    return html
def europa_get(callsign):
    """Return the details of the vessel with call sign *callsign* as a dict.

    The detail page obtained from europa_get_html() is parsed and every
    <li> element contributes one entry: the text of its first child is the
    key, the text of its second child is the value. In both texts,
    non-breaking spaces become plain spaces and surrounding tabs, newlines,
    spaces and colons are removed.

    Returns None when europa_get_html() yields no page.
    """
    def clean(elem):
        text = get_merged_leaf_content(elem)
        return text.replace(u'\xa0', ' ').strip(u'\t\n :')

    html = europa_get_html(callsign)
    if html is None:
        return None

    root = html_parse(html)
    # NOTE(review): the label -> value mapping is inferred from the caller,
    # which reads info[key] with labels such as u'Year of Construction'.
    result = {}
    for li in get_elem(root, 'li'):
        elem0 = li.children[0]
        elem1 = li.children[1]
        result[clean(elem0)] = clean(elem1)
    return result
# Command-line entry point: takes one or more callsigns as positional
# arguments, looks each one up with europa_get() and prints fields of the
# result.
# NOTE(review): this section looks like a lossy extract -- every line carries
# a stray leading number, indentation is gone and several statements appear to
# be missing (see the notes below). Confirm against the upstream file.
105 if __name__ == '__main__':
106 from optparse import OptionParser
107 parser = OptionParser(usage='%prog [options] callsign [callsign] ...')
# -d/--debug: boolean flag stored in options.debug, False by default.
# NOTE(review): this call is never closed in the visible code; a help=
# argument and the closing parenthesis seem to be missing.
108 parser.add_option('-d', '--debug',
109 action='store_true', dest='debug', default=False,
# --download-sleep: integer stored in options.sleep, defaulting to
# DOWNLOAD_SLEEP_TIME.
111 parser.add_option('--download-sleep', help="how many seconds do we sleep after each download. default=%default", action='store', type='int', dest='sleep', default=DOWNLOAD_SLEEP_TIME)
112 (options, args) = parser.parse_args()
# NOTE(review): presumably guarded by a missing `if not args:` test and
# followed by an exit -- confirm.
115 print >> sys.stderr, "Need at least a parameter"
# Replace the module-level delay with the value given on the command line.
118 DOWNLOAD_SLEEP_TIME = options.sleep
# NOTE(review): the two assignments below presumably belong to the branches of
# a missing `if options.debug:` / `else:` -- as shown, INFO always wins.
121 loglevel = logging.DEBUG
123 loglevel = logging.INFO
125 logging.basicConfig(level=loglevel, format='%(asctime)s %(levelname)s %(message)s')
# NOTE(review): the three literals below look like entries of a sequence of
# field labels whose opening, closing and remaining entries are not visible.
134 u'Year of Construction',
135 u'Entry Service Year',
137 u'Secondary Gear type',
# Look up every callsign given on the command line.
140 for callsign in args:
141 info = europa_get(callsign)
# NOTE(review): `key` is not bound anywhere in the visible code; presumably a
# missing loop iterates over the labels above -- confirm.
145 print key, ':', info[key]