31868456d8ba2a8ad74579319950f030cc50f2a4
[ais.git] / bin / extras / marinetraffic.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import division
5 import sys, os, urllib2, time
6 from pprint import pprint
7 from datetime import datetime, date
8 from time import sleep
9 from optparse import OptionParser
10
11 from ais.db import *
12 from ais.ntools import clean_alnum, clean_ais_charset, open_with_mkdirs, datetime_to_timestamp
13 from ais.common import *
14 from ais.html_parser import *
15
# Seconds to sleep after each HTTP download; overridden by --download-sleep.
DOWNLOAD_SLEEP_TIME = 10
# When True nothing is downloaded and only cached files are used (--no-download).
DISABLE_DOWNLOAD = False
# Root of the on-disk cache of downloaded pages; files live in one
# sub-directory per date (YYYYMMDD).
MARINETRAFFIC_DIR = '/var/lib/ais/marinetraffic/'
19
20
21 def go_summary(reference_date, mmsi):
22     def get_raw_summary(html):
23         root = html_parse(html)
24         divs = get_elem(root, 'div')
25         divdetail = divs[3]
26         
27         info_raw = {}
28         section = u''
29         boldtext = u''
30         text = u''
31         for node in divdetail.children:
32             if isinstance(node, Tag) and node.name == 'h1':
33                 info_raw[u'name'] = get_inner_text(node)
34                 continue
35             if isinstance(node, Tag) and node.name == 'h2':
36                 if boldtext or text:
37                     info_raw[section][boldtext] = text
38                     boldtext = text = u''
39                 section = get_inner_text(node)
40                 info_raw[section] = {}
41                 continue
42         
43             if isinstance(node, Tag) and node.name == 'br':
44                 if boldtext or text:
45                     info_raw[section][boldtext] = text
46                     boldtext = text = u''
47             elif isinstance(node, Tag) and node.name == 'b':
48                 if boldtext or text:
49                     info_raw[section][boldtext] = text
50                     boldtext = text = u''
51                 boldtext = get_inner_text(node)
52             else:
53                 text += get_inner_text(node)
54         if boldtext or text:
55             info_raw[section][boldtext] = text
56             boldtext = text = u''
57         return info_raw
58     
59     def qualify_summary(info_raw):
60         #pprint(info_raw)
61         info = {}
62         info['name'] = info_raw['name']
63     
64         try:
65             details = info_raw[u"Vessel's Details:"]
66         except KeyError:
67             details = info_raw[u"Vessel's Details"]
68         info['callsign'] = clean_alnum(details[u'Call Sign:'].encode('ascii', 'replace'))
69         tmp = details.get(u'IMO:', None)
70         if tmp:
71             tmp = tmp.replace(u',', u'').strip()
72             if tmp != u'999999999':
73                 info['imo'] = tmp.replace(u', ', u'').strip()
74         tmp = details.get(u'Length x Breadth:', None)
75         if tmp:
76             length, breadth = tmp.replace(u'm', u'').split('X')
77             info['length'], info['breadth'] = int(length), int(breadth)
78         info['mmsi'] = details[u'MMSI:'].strip()
79         tmp = details.get(u'Ship Type:', None)
80         if tmp:
81             tmp = tmp.strip()
82             reverse_types = {
83                 u'Fishing': 30,
84                 #u'Towing': 31,
85                 #u'Towing': 32,
86                 u'Dredger': 33,
87                 u'Dive Vessel': 34,
88                 u'Military Ops': 35,
89                 u'Sailing Vessel': 36,
90                 u'Pleasure Craft': 37,
91                 u'Pilot Vessel': 50,
92                 u'SAR': 51,
93                 u'Tug': 52,
94                 u'Port Tender': 53,
95                 u'Anti-Pollution': 54,
96                 u'Law Enforce': 55,
97                 #u'Local Vessel': 56,
98                 #u'Local Vessel': 57,
99                 u'Medical Trans': 58,
100                 u'Special Craft': 59,
101                 # Cargo is repported for types 70, 75, 76 .. 79
102                 u'Cargo - Hazard A (Major)': 71,
103                 u'Cargo - Hazard B': 72,
104                 u'Cargo - Hazard C (Minor)': 73,
105                 u'Cargo - Hazard D (Recognizable)': 74,
106                 u'Tanker - Hazard A (Major)': 81,
107                 u'Tanker - Hazard B': 82,
108                 u'Tanker - Hazard C (Minor)': 83,
109                 u'Tanker - Hazard D (Recognizable)': 84,
110             }
111             _type = reverse_types.get(tmp, None)
112             if _type is not None:
113                 info['type'] = _type
114             else:
115                 print >> sys.stderr , "NOTICE: can't properly qualify ship of type", tmp
116         # TODO year built .... ?
117     
118         try:
119             voyage = info_raw[u'Voyage Related Info (Last Received):']
120         except KeyError:
121             voyage = info_raw[u'Voyage Related Info (Last Received)']
122         tmp = voyage.get(u'Destination:', None)
123         if tmp:
124             tmp = tmp.strip()
125             if tmp != 'CLASS B':
126                 info['destination'] = tmp
127         tmp = voyage.get(u'Draught:', None)
128         if tmp:
129             info['draught'] = float(tmp.replace(u'm', u''))
130         tmp = voyage.get(u'ETA:', None)
131         if tmp:
132             tmp = tmp.strip()
133             try:
134                 tmp = datetime.strptime(tmp, '%Y-%m-%d %H:%M')
135             except ValueError:
136                 print "Failed to parse ETA date. Trying old format ...",
137                 tmp = datetime.strptime(tmp, '%d/%m/%Y %H:%M:%S')
138                 print "Success"
139
140             if tmp != datetime(1900, 1, 1):
141                 info['eta'] = tmp.strftime('%m%d%H%M')
142         tmp = voyage.get(u'Info Received:', None)
143         if tmp:
144             voyage_updated = tmp.split(u'(')[0].strip()
145             try:
146                 info['voyage_updated'] = datetime.strptime(voyage_updated, '%Y-%m-%d %H:%M')
147             except ValueError:
148                 print "Failed to parse voyage updated date. Trying old format ...",
149                 info['voyage_updated'] = datetime.strptime(voyage_updated, '%d/%m/%Y %H:%M:%S')
150                 print "Success"
151
152     
153         return info
154
155     filename = MARINETRAFFIC_DIR+reference_date+'/'+mmsi+'-sum.html'
156     if not os.path.exists(filename):
157         if DISABLE_DOWNLOAD:
158             print >> sys.stderr, filename, 'not found and downloads disabled.'
159             return False
160         request = urllib2.Request('http://www.marinetraffic.com/ais/shipdetails.aspx?MMSI='+mmsi)
161         request.add_header('User-Agent', 'Mozilla/5.0 (X11; U; Linux i686; fr; rv:1.9.0.7) Gecko/2009032018 Firefox/3.0.6 (Debian-3.0.6-1)')
162         uo = urllib2.urlopen(request)
163         html = uo.read()
164         uo.close()
165         sleep(DOWNLOAD_SLEEP_TIME)
166
167         f = open_with_mkdirs(filename, 'w')
168         f.write(html)
169         f.close()
170     else:
171         html = file(filename).read()
172     html = unicode(html, 'utf8')
173
174     if u'The requested service is unavailable.' in html or u'Η λειτουργία που ζητήσατε δεν είναι διαθέσιμη.' in html:
175         print >> sys.stderr, 'WARNING: The requested service is unavailable.'
176         os.unlink(filename)
177         return False
178     if u'Non-existent Vessel' in html:
179         print >> sys.stderr, 'WARNING: Vessel unknown'
180         return False
181
182     info_raw = get_raw_summary(html)
183     info = qualify_summary(info_raw)
184     #pprint(info)
185  
186     assert info['mmsi'] == mmsi
187     
188     def warning(*args):
189         print >> sys.stderr, "WARNING:"
190         for arg in args:
191             print >> sys.stderr, args,
192         print >> sys.stderr
193
194     voyage_updated = info.get('voyage_updated', None)
195     if voyage_updated:
196         timestamp = datetime_to_timestamp(voyage_updated)
197     else:
198         timestamp = datetime_to_timestamp(datetime.strptime(reference_date, '%Y%m%d'))
199     imo = int(info.get('imo', 0))
200     if imo >= 1 << 31:
201         warning('imo', imo, 'is too big')
202         imo = 0
203     name = info.get('name', u'').encode('utf8')
204     if len(name) > 20:
205         warning('name', name, 'is too big, truncating')
206         name = name[:20]
207     name = clean_ais_charset(name)
208     callsign = clean_alnum(info.get('callsign', u'').encode('utf8'))
209     type = info.get('type', 0)
210     if type < 0 or type > 100:
211         type = 0 #TODO check
212     eta = info.get('eta', u'00002460')
213     if len(eta)==8:
214         eta_M = int(eta[0:2])
215         eta_D = int(eta[2:4])
216         eta_h = int(eta[4:6])
217         eta_m = int(eta[6:8])
218     else:
219         eta_M = eta_D = 0
220         eta_h = 24
221         eta_m = 60
222     draught = int(info.get('draught', 0)*10)
223     destination = clean_ais_charset(info.get('destination', u'').encode('utf8'))
224     
225     add_nmea5_partial(mmsi, timestamp, imo, name, callsign, type, 0, 0, 0, 0, eta_M, eta_D, eta_h, eta_m, draught, destination, 'MTWW')
226
227     return True
228
229
230 def import_last_pos(reference_date, mmsi, page=None):
231     if page is None or page == 1:
232         filename = MARINETRAFFIC_DIR+reference_date+'/'+mmsi+'-mov.html'
233     else:
234         filename = MARINETRAFFIC_DIR+reference_date+'/'+mmsi+'-mov'+str(page)+'.html'
235
236     if not os.path.exists(filename):
237         if DISABLE_DOWNLOAD:
238             print >> sys.stderr, filename, 'not found and downloads disabled.'
239             return False
240
241         if page is None or page == 1:
242             request = urllib2.Request('http://www.marinetraffic.com/ais/datasheet.aspx?datasource=ITINERARIES&MMSI='+mmsi)
243         else:
244             request = urllib2.Request('http://www.marinetraffic.com/ais/datasheet.aspx?datasource=ITINERARIES&MMSI='+mmsi+'&orderby=MINTIME&sort_order=DESC&var_page='+str(page))
245         request.add_header('User-Agent', 'Mozilla/5.0 (X11; U; Linux i686; fr; rv:1.9.0.7) Gecko/2009032018 Firefox/3.0.6 (Debian-3.0.6-1)')
246         uo = urllib2.urlopen(request)
247         html = uo.read()
248         uo.close()
249         sleep(DOWNLOAD_SLEEP_TIME)
250
251         f = open_with_mkdirs(filename, 'w')
252         f.write(html)
253         f.close()
254     else:
255         html = file(filename).read()
256     html = unicode(html, 'utf8')
257
258     if u'No Records Found' in html:
259         print >> sys.stderr, 'NOTICE: No Records found.'
260         return
261     if u'The requested service is unavailable.' in html or u'Η λειτουργία που ζητήσατε δεν είναι διαθέσιμη.' in html:
262         print >> sys.stderr, 'WARNING: The requested service is unavailable.'
263         os.unlink(filename)
264         return
265
266     root = html_parse(html)
267     table = get_elem(root, u'table')[0]
268
269     infosrc = u'MT'
270     
271     # Now, import each track
272     for row in table.children[1:]: # ignore first line with headers
273         line = row.children
274
275         latlong = get_merged_leaf_content(line[5])
276         latlong = latlong.replace(u'\xa0', u'').strip()
277         if latlong:
278             lon, lat = latlong.split(' ')
279         
280         show_on_map = line[8]
281         assert get_merged_leaf_content(show_on_map).strip() == 'Show on Map'
282         link = show_on_map.children[0].children[0].attributes['href']
283         tmp = link.split(u'?', 2)
284         assert tmp[0] == u'default.aspx'
285         tmp = tmp[1]
286         tmp = tmp.split(u'&')
287         assert len(tmp)==3
288         assert tmp[0] == u'zoom=9'
289         assert tmp[1] == u'oldmmsi='+mmsi
290         tmp = tmp[2]
291         assert tmp.startswith(u'olddate=')
292         dt = tmp[len(u'olddate='):]
293
294         isodt = datetime.strptime(dt, '%m/%d/%Y %I:%M:%S %p')
295
296
297         if latlong:
298             speed = float(get_merged_leaf_content(line[6]))
299             course = float(get_merged_leaf_content(line[7]))
300             #print dt, isodt, lat, long, speed, course
301
302             strmmsi = mmsi
303             timestamp = datetime_to_timestamp(isodt)
304             status = AIS_STATUS_NOT_AVAILABLE
305             rot = AIS_ROT_NOT_AVAILABLE
306             sog = int(speed*AIS_SOG_SCALE)
307             latitude = int(float(lat)*AIS_LATLON_SCALE)
308             longitude = int(float(lon)*AIS_LATLON_SCALE)
309             cog = int(course*AIS_COG_SCALE)
310             heading = AIS_NO_HEADING
311             source = 'MTWW'
312             add_nmea1(strmmsi, timestamp, status, rot, sog, latitude, longitude, cog, heading, source)
313
314         import_track(mmsi, dt, isodt)
315
316     if 'Next page' in html and page is not 2:
317         print 'There is another page!'
318         return True
319     else:
320         return False
321
322
def import_track(mmsi, dt, isodt):
    """Import the MarineTraffic track (XML) of a vessel around a given date.

    The XML is read from MARINETRAFFIC_DIR/<YYYYMMDD of isodt>/<mmsi>-trk.xml
    when that file exists; otherwise it is downloaded and cached there,
    unless DISABLE_DOWNLOAD is set, in which case nothing is imported.
    Every <pos> element is stored with add_nmea1() (source 'MTTR').

    mmsi  -- vessel MMSI, as a string.
    dt    -- the date exactly as found in the site's 'olddate=' link
             parameter; it is sent back to the site as is (spaces are
             replaced by %20).
    isodt -- the same date as a datetime; only used to name the cache
             directory.

    Returns None.
    """
    filename = MARINETRAFFIC_DIR+isodt.strftime('%Y%m%d')+'/'+mmsi+'-trk.xml'
    if not os.path.exists(filename):
        if DISABLE_DOWNLOAD:
            print >> sys.stderr, filename, 'not found and downloads disabled.'
            return

        url = 'http://www.marinetraffic.com/ais/gettrackxml.aspx?mmsi=%s&date=%s' % (mmsi, dt.replace(' ','%20'))
        request = urllib2.Request(url)
        request.add_header('User-Agent', 'Mozilla/5.0 (X11; U; Linux i686; fr; rv:1.9.0.7) Gecko/2009032018 Firefox/3.0.6 (Debian-3.0.6-1)')
        request.add_header('Accept' , 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
        request.add_header('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.7')
        request.add_header('Referer', 'http://www.marinetraffic.com/ais/default.aspx?zoom=9&oldmmsi=%(mmsi)s+&olddate=%(date)s' % { 'mmsi': mmsi, 'date': dt.replace(' ', '%20') })
        uo = urllib2.urlopen(request)
        try:
            html = uo.read()
        finally:
            uo.close()
        sleep(DOWNLOAD_SLEEP_TIME)

        # Keep a copy so that the track is not downloaded again.
        f = open_with_mkdirs(filename, 'w')
        try:
            f.write(html)
        finally:
            f.close()
    else:
        f = open(filename)
        try:
            html = f.read()
        finally:
            f.close()

    xml = unicode(html, 'utf8')
    for node in html_lexer(xml):
        if not (isinstance(node, Tag) and node.name==u'pos'):
            continue
        attrs = node.attributes
        updated = attrs['timestamp']
        lat = float(attrs['lat'])
        lon = float(attrs['lon'])
        course = float(attrs['course'])
        # The speed attribute is divided by 10 before being scaled.
        speed = float(attrs['speed'])/10.

        strmmsi = mmsi
        timestamp = datetime_to_timestamp(datetime.strptime(updated, '%Y-%m-%dT%H:%M:%S'))
        status = AIS_STATUS_NOT_AVAILABLE
        rot = AIS_ROT_NOT_AVAILABLE
        sog = int(speed*AIS_SOG_SCALE)
        latitude = int(lat*AIS_LATLON_SCALE)
        longitude = int(lon*AIS_LATLON_SCALE)
        # Scale before truncating so that fractions of a degree are kept
        # and the result is an integer.
        cog = int(course*AIS_COG_SCALE)
        heading = AIS_NO_HEADING
        source = 'MTTR'
        add_nmea1(strmmsi, timestamp, status, rot, sog, latitude, longitude, cog, heading, source)
373     
374     
375
376 if __name__ == '__main__':
377     parser = OptionParser(usage='%prog [options] mmsi [mmsi]...')
378     parser.add_option('--no-download', help="don't download any file", action='store_true', dest='no_download', default=False)
379     parser.add_option('--download-sleep', help="how many seconds do we sleep after each download. default=%default", action='store', type='int', dest='sleep', default=DOWNLOAD_SLEEP_TIME)
380     parser.add_option('--debug-sql', help="print all sql queries to stdout before running them", action='store_true', dest='debug_sql', default=False)
381     parser.add_option('--date', help="force reference date. default=%default\nDo NOT use without --date", action='store', dest='reference_date', default=datetime.utcnow().date().strftime('%Y%m%d'))
382     parser.add_option('--print-mmsi', help="prints each mmsi before it's processed", action='store_true', dest='print_mmsi', default=False)
383     (options, args) = parser.parse_args()
384
385     if len(args)==0:
386         print >> sys.stderr, "Need parameters"
387         sys.exit(1)
388
389     DISABLE_DOWNLOAD = options.no_download
390     DOWNLOAD_SLEEP_TIME = options.sleep
391     if options.debug_sql:
392         sql_setdebug(True)
393     reference_date = options.reference_date
394
395     for mmsi in args:
396         while len(mmsi) and mmsi[-1] in '\r\n':
397             mmsi = mmsi[:-1]
398         if not mmsi.isdigit():
399             print 'MMSI', mmsi, 'is not numeric. Ignoring.'
400             continue
401         if options.print_mmsi:
402             print 'MMSI', mmsi
403         found = go_summary(reference_date, mmsi)
404         if found:
405             page = 1
406             while True:
407                 if import_last_pos(reference_date, mmsi, page):
408                     page += 1
409                 else:
410                     break