Fix for marinetraffic web change
[ais.git] / bin / extras / marinetraffic.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import division
5 import sys, os, urllib2, time
6 from pprint import pprint
7 from datetime import datetime, date
8 from time import sleep
9 from optparse import OptionParser
10
11 from ais.db import *
12 from ais.ntools import clean_alnum, clean_ais_charset, open_with_mkdirs, datetime_to_timestamp
13 from ais.common import *
14 from ais.html_parser import *
15
# Number of seconds to sleep after each download from marinetraffic.
# Overridden by the --download-sleep command line option.
DOWNLOAD_SLEEP_TIME = 10
# When True, files missing from the local cache are not downloaded and the
# import of that file is skipped. Set by the --no-download option.
DISABLE_DOWNLOAD = False
# Root of the local cache of downloaded pages; files are stored in one
# sub-directory per date ('%Y%m%d').
MARINETRAFFIC_DIR = '/var/lib/ais/marinetraffic/'
19
20
21 def go_summary(reference_date, mmsi):
22     def get_raw_summary(html):
23         root = html_parse(html)
24         divs = get_elem(root, 'div')
25         divdetail = divs[3]
26         #print_idented_tree(divdetail, 0)
27         info_raw = {}
28         section = u''
29         boldtext = u''
30         text = u''
31         for node in divdetail.children:
32             if isinstance(node, Tag) and node.name == 'h1':
33                 info_raw[u'name'] = get_inner_text(node)
34                 continue
35             if isinstance(node, Tag) and node.name == 'h2':
36                 if boldtext or text:
37                     if not section:
38                         print >> sys.stderr, "WARNING: section is empty for setting", boldtext, "=", text
39                     else:
40                         info_raw[section][boldtext] = text
41                     boldtext = text = u''
42                 section = get_inner_text(node)
43                 info_raw[section] = {}
44                 continue
45         
46             if isinstance(node, Tag) and node.name == 'br':
47                 if boldtext or text:
48                     info_raw[section][boldtext] = text
49                     boldtext = text = u''
50             elif isinstance(node, Tag) and node.name == 'b':
51                 if boldtext or text:
52                     if not section:
53                         print >> sys.stderr, "WARNING: section is empty for setting", boldtext, "=", text
54                     else:
55                         info_raw[section][boldtext] = text
56                     boldtext = text = u''
57                 boldtext = get_inner_text(node)
58             else:
59                 text += get_inner_text(node)
60         if boldtext or text:
61             info_raw[section][boldtext] = text
62             boldtext = text = u''
63         return info_raw
64     
65     def qualify_summary(info_raw):
66         #pprint(info_raw)
67         info = {}
68         info['name'] = info_raw['name']
69     
70         try:
71             details = info_raw[u"Vessel's Details:"]
72         except KeyError:
73             details = info_raw[u"Vessel's Details"]
74         info['callsign'] = clean_alnum(details[u'Call Sign:'].encode('ascii', 'replace'))
75         tmp = details.get(u'IMO:', None)
76         if tmp:
77             tmp = tmp.replace(u',', u'').strip()
78             if tmp != u'999999999':
79                 info['imo'] = tmp.replace(u', ', u'').strip()
80         tmp = details.get(u'Length x Breadth:', None)
81         if tmp:
82             length, breadth = tmp.replace(u'm', u'').split('X')
83             info['length'], info['breadth'] = int(length), int(breadth)
84         info['mmsi'] = details[u'MMSI:'].strip()
85         tmp = details.get(u'Ship Type:', None)
86         if tmp:
87             tmp = tmp.strip()
88             reverse_types = {
89                 u'Fishing': 30,
90                 #u'Towing': 31,
91                 #u'Towing': 32,
92                 u'Dredger': 33,
93                 u'Dive Vessel': 34,
94                 u'Military Ops': 35,
95                 u'Sailing Vessel': 36,
96                 u'Pleasure Craft': 37,
97                 u'Pilot Vessel': 50,
98                 u'SAR': 51,
99                 u'Tug': 52,
100                 u'Port Tender': 53,
101                 u'Anti-Pollution': 54,
102                 u'Law Enforce': 55,
103                 #u'Local Vessel': 56,
104                 #u'Local Vessel': 57,
105                 u'Medical Trans': 58,
106                 u'Special Craft': 59,
107                 # Cargo is repported for types 70, 75, 76 .. 79
108                 u'Cargo - Hazard A (Major)': 71,
109                 u'Cargo - Hazard B': 72,
110                 u'Cargo - Hazard C (Minor)': 73,
111                 u'Cargo - Hazard D (Recognizable)': 74,
112                 u'Tanker - Hazard A (Major)': 81,
113                 u'Tanker - Hazard B': 82,
114                 u'Tanker - Hazard C (Minor)': 83,
115                 u'Tanker - Hazard D (Recognizable)': 84,
116             }
117             _type = reverse_types.get(tmp, None)
118             if _type is not None:
119                 info['type'] = _type
120             else:
121                 print >> sys.stderr , "NOTICE: can't properly qualify ship of type", tmp
122         # TODO year built .... ?
123     
124         try:
125             voyage = info_raw[u'Voyage Related Info (Last Received):']
126         except KeyError:
127             voyage = info_raw[u'Voyage Related Info (Last Received)']
128         tmp = voyage.get(u'Destination:', None)
129         if tmp:
130             tmp = tmp.strip()
131             if tmp != 'CLASS B':
132                 info['destination'] = tmp
133         tmp = voyage.get(u'Draught:', None)
134         if tmp:
135             info['draught'] = float(tmp.replace(u'm', u''))
136         tmp = voyage.get(u'ETA:', None)
137         if tmp:
138             tmp = tmp.strip()
139             try:
140                 tmp = datetime.strptime(tmp, '%Y-%m-%d %H:%M')
141             except ValueError:
142                 print "Failed to parse ETA date. Trying old format ...",
143                 tmp = datetime.strptime(tmp, '%d/%m/%Y %H:%M:%S')
144                 print "Success"
145
146             if tmp != datetime(1900, 1, 1):
147                 info['eta'] = tmp.strftime('%m%d%H%M')
148         tmp = voyage.get(u'Info Received:', None)
149         if tmp:
150             voyage_updated = tmp.split(u'(')[0].strip()
151             try:
152                 info['voyage_updated'] = datetime.strptime(voyage_updated, '%Y-%m-%d %H:%M')
153             except ValueError:
154                 print "Failed to parse voyage updated date. Trying old format ...",
155                 info['voyage_updated'] = datetime.strptime(voyage_updated, '%d/%m/%Y %H:%M:%S')
156                 print "Success"
157
158     
159         return info
160
161     filename = MARINETRAFFIC_DIR+reference_date+'/'+mmsi+'-sum.html'
162     if not os.path.exists(filename):
163         if DISABLE_DOWNLOAD:
164             print >> sys.stderr, filename, 'not found and downloads disabled.'
165             return False
166         request = urllib2.Request('http://www.marinetraffic.com/ais/shipdetails.aspx?MMSI='+mmsi)
167         request.add_header('User-Agent', 'Mozilla/5.0 (X11; U; Linux i686; fr; rv:1.9.0.7) Gecko/2009032018 Firefox/3.0.6 (Debian-3.0.6-1)')
168         uo = urllib2.urlopen(request)
169         html = uo.read()
170         uo.close()
171         sleep(DOWNLOAD_SLEEP_TIME)
172
173         f = open_with_mkdirs(filename, 'w')
174         f.write(html)
175         f.close()
176     else:
177         html = file(filename).read()
178     html = unicode(html, 'utf8')
179
180     if u'The requested service is unavailable.' in html or u'Η λειτουργία που ζητήσατε δεν είναι διαθέσιμη.' in html:
181         print >> sys.stderr, 'WARNING: The requested service is unavailable.'
182         os.unlink(filename)
183         return False
184     if u'Non-existent Vessel' in html:
185         print >> sys.stderr, 'WARNING: Vessel unknown'
186         return False
187
188     info_raw = get_raw_summary(html)
189     info = qualify_summary(info_raw)
190     pprint(info)
191  
192     assert info['mmsi'] == mmsi
193     
194     def warning(*args):
195         print >> sys.stderr, "WARNING:"
196         for arg in args:
197             print >> sys.stderr, args,
198         print >> sys.stderr
199
200     voyage_updated = info.get('voyage_updated', None)
201     if voyage_updated:
202         timestamp = datetime_to_timestamp(voyage_updated)
203     else:
204         timestamp = datetime_to_timestamp(datetime.strptime(reference_date, '%Y%m%d'))
205     imo = int(info.get('imo', 0))
206     if imo >= 1 << 31:
207         warning('imo', imo, 'is too big')
208         imo = 0
209     name = info.get('name', u'').encode('utf8')
210     if len(name) > 20:
211         warning('name', name, 'is too big, truncating')
212         name = name[:20]
213     name = clean_ais_charset(name)
214     callsign = clean_alnum(info.get('callsign', u'').encode('utf8'))
215     type = info.get('type', 0)
216     if type < 0 or type > 100:
217         type = 0 #TODO check
218     eta = info.get('eta', u'00002460')
219     if len(eta)==8:
220         eta_M = int(eta[0:2])
221         eta_D = int(eta[2:4])
222         eta_h = int(eta[4:6])
223         eta_m = int(eta[6:8])
224     else:
225         eta_M = eta_D = 0
226         eta_h = 24
227         eta_m = 60
228     draught = int(info.get('draught', 0)*10)
229     destination = clean_ais_charset(info.get('destination', u'').encode('utf8'))
230     
231     add_nmea5_partial(mmsi, timestamp, imo, name, callsign, type, 0, 0, 0, 0, eta_M, eta_D, eta_h, eta_m, draught, destination, 'MTWW')
232
233     return True
234
235
236 def import_last_pos(reference_date, mmsi, page=None):
237     if page is None or page == 1:
238         filename = MARINETRAFFIC_DIR+reference_date+'/'+mmsi+'-mov.html'
239     else:
240         filename = MARINETRAFFIC_DIR+reference_date+'/'+mmsi+'-mov'+str(page)+'.html'
241
242     if not os.path.exists(filename):
243         if DISABLE_DOWNLOAD:
244             print >> sys.stderr, filename, 'not found and downloads disabled.'
245             return False
246
247         if page is None or page == 1:
248             request = urllib2.Request('http://www.marinetraffic.com/ais/datasheet.aspx?datasource=ITINERARIES&MMSI='+mmsi)
249         else:
250             request = urllib2.Request('http://www.marinetraffic.com/ais/datasheet.aspx?datasource=ITINERARIES&MMSI='+mmsi+'&orderby=MINTIME&sort_order=DESC&var_page='+str(page))
251         request.add_header('User-Agent', 'Mozilla/5.0 (X11; U; Linux i686; fr; rv:1.9.0.7) Gecko/2009032018 Firefox/3.0.6 (Debian-3.0.6-1)')
252         uo = urllib2.urlopen(request)
253         html = uo.read()
254         uo.close()
255         sleep(DOWNLOAD_SLEEP_TIME)
256
257         f = open_with_mkdirs(filename, 'w')
258         f.write(html)
259         f.close()
260     else:
261         html = file(filename).read()
262     html = unicode(html, 'utf8')
263
264     if u'No Records Found' in html:
265         print >> sys.stderr, 'NOTICE: No Records found.'
266         return
267     if u'The requested service is unavailable.' in html or u'Η λειτουργία που ζητήσατε δεν είναι διαθέσιμη.' in html:
268         print >> sys.stderr, 'WARNING: The requested service is unavailable.'
269         os.unlink(filename)
270         return
271
272     root = html_parse(html)
273     table = get_elem(root, u'table')[0]
274
275     infosrc = u'MT'
276     
277     # Now, import each track
278     for row in table.children[1:]: # ignore first line with headers
279         line = row.children
280
281         latlong = get_merged_leaf_content(line[5])
282         latlong = latlong.replace(u'\xa0', u'').strip()
283         if latlong:
284             lat, lon = latlong.split(' ')
285         
286         show_on_map = line[8]
287         assert get_merged_leaf_content(show_on_map).strip() == 'Show on Map'
288         link = show_on_map.children[0].children[0].attributes['href']
289         tmp = link.split(u'?', 2)
290         assert tmp[0] == u'default.aspx'
291         tmp = tmp[1]
292         tmp = tmp.split(u'&')
293         assert len(tmp)==3
294         assert tmp[0] == u'zoom=9'
295         assert tmp[1] == u'oldmmsi='+mmsi
296         tmp = tmp[2]
297         assert tmp.startswith(u'olddate=')
298         dt = tmp[len(u'olddate='):]
299
300         isodt = datetime.strptime(dt, '%m/%d/%Y %I:%M:%S %p')
301
302
303         if latlong:
304             speed = float(get_merged_leaf_content(line[6]))
305             course = float(get_merged_leaf_content(line[7]))
306             #print dt, isodt, lat, long, speed, course
307
308             strmmsi = mmsi
309             timestamp = datetime_to_timestamp(isodt)
310             status = AIS_STATUS_NOT_AVAILABLE
311             rot = AIS_ROT_NOT_AVAILABLE
312             sog = int(speed*AIS_SOG_SCALE)
313             latitude = int(float(lat)*AIS_LATLON_SCALE)
314             longitude = int(float(lon)*AIS_LATLON_SCALE)
315             cog = int(course*AIS_COG_SCALE)
316             heading = AIS_NO_HEADING
317             source = 'MTWW'
318             print strmmsi, timestamp, status, rot, sog, latitude, longitude, cog, heading, source
319             add_nmea1(strmmsi, timestamp, status, rot, sog, latitude, longitude, cog, heading, source)
320
321         import_track(mmsi, dt, isodt)
322
323     if 'Next page' in html and page is not 2:
324         print 'There is another page!'
325         return True
326     else:
327         return False
328
329
330 def import_track(mmsi, dt, isodt):
331     filename = MARINETRAFFIC_DIR+isodt.strftime('%Y%m%d')+'/'+mmsi+'-trk.xml'
332     if not os.path.exists(filename):
333         if DISABLE_DOWNLOAD:
334             print >> sys.stderr, filename, 'not found and downloads disabled.'
335             return
336
337         url = 'http://www.marinetraffic.com/ais/gettrackxml.aspx?mmsi=%s&date=%s' % (mmsi, dt.replace(' ','%20'))
338         request = urllib2.Request(url)
339         request.add_header('User-Agent', 'Mozilla/5.0 (X11; U; Linux i686; fr; rv:1.9.0.7) Gecko/2009032018 Firefox/3.0.6 (Debian-3.0.6-1)')
340         request.add_header('Accept' , 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
341         request.add_header('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.7')
342         request.add_header('Referer', 'http://www.marinetraffic.com/ais/default.aspx?zoom=9&oldmmsi=%(mmsi)s+&olddate=%(date)s' % { 'mmsi': mmsi, 'date': dt.replace(' ', '%20') })
343         uo = urllib2.urlopen(request)
344         html = uo.read()
345         uo.close()
346         sleep(DOWNLOAD_SLEEP_TIME)
347
348         f = open_with_mkdirs(filename, 'w')
349         f.write(html)
350         f.close()
351     else:
352         html = file(filename).read()
353
354     #print filename
355     xml = unicode(html, 'utf8')
356     info = { 'mmsi': mmsi, 'infosrc': u'MT' }
357     for node in html_lexer(xml):
358         if isinstance(node, Tag) and node.name==u'pos':
359             info['updated'] = node.attributes['timestamp']
360             info['lat'] = float(node.attributes['lat'])
361             info['lon'] = float(node.attributes['lon'])
362             info['course'] = float(node.attributes['course'])
363             info['speed'] = float(node.attributes['speed'])/10.
364             strmmsi = mmsi
365             timestamp = datetime_to_timestamp(datetime.strptime(info['updated'], '%Y-%m-%dT%H:%M:%S'))
366             status = AIS_STATUS_NOT_AVAILABLE
367             rot = AIS_ROT_NOT_AVAILABLE
368             sog = int(info['speed']*AIS_SOG_SCALE)
369             latitude = int(float(info['lat'])*AIS_LATLON_SCALE)
370             longitude = int(float(info['lon'])*AIS_LATLON_SCALE)
371             cog = int(info['course'])*AIS_COG_SCALE
372             heading = AIS_NO_HEADING
373             source = 'MTTR'
374             print datetime.utcfromtimestamp(timestamp),
375             for i in strmmsi, timestamp, status, rot, sog, latitude, longitude, cog, heading, source:
376                 print repr(i),
377             print
378             add_nmea1(strmmsi, timestamp, status, rot, sog, latitude, longitude, cog, heading, source)
379     #dbcommit()
380     
381     
382
383 if __name__ == '__main__':
384     parser = OptionParser(usage='%prog [options] mmsi [mmsi]...')
385     parser.add_option('--no-download', help="don't download any file", action='store_true', dest='no_download', default=False)
386     parser.add_option('--download-sleep', help="how many seconds do we sleep after each download. default=%default", action='store', type='int', dest='sleep', default=DOWNLOAD_SLEEP_TIME)
387     parser.add_option('--debug-sql', help="print all sql queries to stdout before running them", action='store_true', dest='debug_sql', default=False)
388     parser.add_option('--date', help="force reference date. default=%default\nDo NOT use without --date", action='store', dest='reference_date', default=datetime.utcnow().date().strftime('%Y%m%d'))
389     parser.add_option('--print-mmsi', help="prints each mmsi before it's processed", action='store_true', dest='print_mmsi', default=False)
390     (options, args) = parser.parse_args()
391
392     if len(args)==0:
393         print >> sys.stderr, "Need parameters"
394         sys.exit(1)
395
396     DISABLE_DOWNLOAD = options.no_download
397     DOWNLOAD_SLEEP_TIME = options.sleep
398     if options.debug_sql:
399         sql_setdebug(True)
400     reference_date = options.reference_date
401
402     for mmsi in args:
403         while len(mmsi) and mmsi[-1] in '\r\n':
404             mmsi = mmsi[:-1]
405         if not mmsi.isdigit():
406             print 'MMSI', mmsi, 'is not numeric. Ignoring.'
407             continue
408         if options.print_mmsi:
409             print 'MMSI', mmsi
410         found = go_summary(reference_date, mmsi)
411         if found:
412             page = 1
413             while True:
414                 if import_last_pos(reference_date, mmsi, page):
415                     page += 1
416                 else:
417                     break