Added old scripts iccat & marine traffic to subversion
[ais.git] / bin / extras / marinetraffic.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 DOWNLOAD_SLEEP_TIME = 10
5 DISABLE_DOWNLOAD = False
6 MARINETRAFFIC_DIR = '/var/lib/ais/marinetraffic/'
7
8 import sys, os, urllib2, time
9 from pprint import pprint
10 from datetime import datetime, date
11 from time import sleep
12 from optparse import OptionParser
13
14 from ais.db import *
15 from ais.ntools import clean_alnum, clean_ais_charset, open_with_mkdirs, datetime_to_timestamp
16 from ais.common import *
17 from ais.html_parser import *
18
19
20 def go_summary(reference_date, mmsi):
21     def get_raw_summary(html):
22         root = html_parse(html)
23         divs = get_elem(root, 'div')
24         divdetail = divs[3]
25         
26         info_raw = {}
27         section = u''
28         boldtext = u''
29         text = u''
30         for node in divdetail.children:
31             if isinstance(node, Tag) and node.name == 'h1':
32                 info_raw[u'name'] = get_inner_text(node)
33                 continue
34             if isinstance(node, Tag) and node.name == 'h2':
35                 if boldtext or text:
36                     info_raw[section][boldtext] = text
37                     boldtext = text = u''
38                 section = get_inner_text(node)
39                 info_raw[section] = {}
40                 continue
41         
42             if isinstance(node, Tag) and node.name == 'br':
43                 if boldtext or text:
44                     info_raw[section][boldtext] = text
45                     boldtext = text = u''
46             elif isinstance(node, Tag) and node.name == 'b':
47                 if boldtext or text:
48                     info_raw[section][boldtext] = text
49                     boldtext = text = u''
50                 boldtext = get_inner_text(node)
51             else:
52                 text += get_inner_text(node)
53         if boldtext or text:
54             info_raw[section][boldtext] = text
55             boldtext = text = u''
56         return info_raw
57     
58     def qualify_summary(info_raw):
59         #pprint(info_raw)
60         info = {}
61         info['name'] = info_raw['name']
62     
63         try:
64             details = info_raw[u"Vessel's Details:"]
65         except KeyError:
66             details = info_raw[u"Vessel's Details"]
67         info['callsign'] = clean_alnum(details[u'Call Sign:'].encode('ascii', 'replace'))
68         tmp = details.get(u'IMO:', None)
69         if tmp:
70             tmp = tmp.replace(u',', u'').strip()
71             if tmp != u'999999999':
72                 info['imo'] = tmp.replace(u', ', u'').strip()
73         tmp = details.get(u'Length x Breadth:', None)
74         if tmp:
75             length, breadth = tmp.replace(u'm', u'').split('X')
76             info['length'], info['breadth'] = int(length), int(breadth)
77         info['mmsi'] = details[u'MMSI:'].strip()
78         tmp = details.get(u'Ship Type:', None)
79         if tmp:
80             tmp = tmp.strip()
81             reverse_types = {
82                 u'Fishing': 30,
83                 #u'Towing': 31,
84                 #u'Towing': 32,
85                 u'Dredger': 33,
86                 u'Dive Vessel': 34,
87                 u'Military Ops': 35,
88                 u'Sailing Vessel': 36,
89                 u'Pleasure Craft': 37,
90                 u'Pilot Vessel': 50,
91                 u'SAR': 51,
92                 u'Tug': 52,
93                 u'Port Tender': 53,
94                 u'Anti-Pollution': 54,
95                 u'Law Enforce': 55,
96                 #u'Local Vessel': 56,
97                 #u'Local Vessel': 57,
98                 u'Medical Trans': 58,
99                 u'Special Craft': 59,
100                 # Cargo is repported for types 70, 75, 76 .. 79
101                 u'Cargo - Hazard A (Major)': 71,
102                 u'Cargo - Hazard B': 72,
103                 u'Cargo - Hazard C (Minor)': 73,
104                 u'Cargo - Hazard D (Recognizable)': 74,
105                 u'Tanker - Hazard A (Major)': 81,
106                 u'Tanker - Hazard B': 82,
107                 u'Tanker - Hazard C (Minor)': 83,
108                 u'Tanker - Hazard D (Recognizable)': 84,
109             }
110             _type = reverse_types.get(tmp, None)
111             if _type is not None:
112                 info['type'] = _type
113             else:
114                 print >> sys.stderr , "NOTICE: can't properly qualify ship of type", tmp
115         # TODO year built .... ?
116     
117         try:
118             voyage = info_raw[u'Voyage Related Info (Last Received):']
119         except KeyError:
120             voyage = info_raw[u'Voyage Related Info (Last Received)']
121         tmp = voyage.get(u'Destination:', None)
122         if tmp:
123             tmp = tmp.strip()
124             if tmp != 'CLASS B':
125                 info['destination'] = tmp
126         tmp = voyage.get(u'Draught:', None)
127         if tmp:
128             info['draught'] = float(tmp.replace(u'm', u''))
129         tmp = voyage.get(u'ETA:', None)
130         if tmp:
131             tmp = tmp.strip()
132             try:
133                 tmp = datetime.strptime(tmp, '%Y-%m-%d %H:%M')
134             except ValueError:
135                 print "Failed to parse ETA date. Trying old format ...",
136                 tmp = datetime.strptime(tmp, '%d/%m/%Y %H:%M:%S')
137                 print "Success"
138
139             if tmp != datetime(1900, 1, 1):
140                 info['eta'] = tmp.strftime('%m%d%H%M')
141         tmp = voyage.get(u'Info Received:', None)
142         if tmp:
143             voyage_updated = tmp.split(u'(')[0].strip()
144             try:
145                 info['voyage_updated'] = datetime.strptime(voyage_updated, '%Y-%m-%d %H:%M')
146             except ValueError:
147                 print "Failed to parse voyage updated date. Trying old format ...",
148                 info['voyage_updated'] = datetime.strptime(voyage_updated, '%d/%m/%Y %H:%M:%S')
149                 print "Success"
150
151     
152         return info
153
154     filename = MARINETRAFFIC_DIR+reference_date+'/'+mmsi+'-sum.html'
155     if not os.path.exists(filename):
156         if DISABLE_DOWNLOAD:
157             print >> sys.stderr, filename, 'not found and downloads disabled.'
158             return False
159         request = urllib2.Request('http://www.marinetraffic.com/ais/shipdetails.aspx?MMSI='+mmsi)
160         request.add_header('User-Agent', 'Mozilla/5.0 (X11; U; Linux i686; fr; rv:1.9.0.7) Gecko/2009032018 Firefox/3.0.6 (Debian-3.0.6-1)')
161         uo = urllib2.urlopen(request)
162         html = uo.read()
163         uo.close()
164         sleep(DOWNLOAD_SLEEP_TIME)
165
166         f = open_with_mkdirs(filename, 'w')
167         f.write(html)
168         f.close()
169     else:
170         html = file(filename).read()
171     html = unicode(html, 'utf8')
172
173     if u'The requested service is unavailable.' in html or u'Η λειτουργία που ζητήσατε δεν είναι διαθέσιμη.' in html:
174         print >> sys.stderr, 'WARNING: The requested service is unavailable.'
175         os.unlink(filename)
176         return False
177     if u'Non-existent Vessel' in html:
178         print >> sys.stderr, 'WARNING: Vessel unknown'
179         return False
180
181     info_raw = get_raw_summary(html)
182     info = qualify_summary(info_raw)
183     #pprint(info)
184  
185     assert info['mmsi'] == mmsi
186     
187     def warning(*args):
188         print >> sys.stderr, "WARNING:"
189         for arg in args:
190             print >> sys.stderr, args,
191         print >> sys.stderr
192
193     voyage_updated = info.get('voyage_updated', None)
194     if voyage_updated:
195         timestamp = datetime_to_timestamp(voyage_updated)
196     else:
197         timestamp = datetime_to_timestamp(datetime.strptime(reference_date, '%Y%m%d'))
198     imo = int(info.get('imo', 0))
199     if imo >= 1 << 31:
200         warning('imo', imo, 'is too big')
201         imo = 0
202     name = info.get('name', u'').encode('utf8')
203     if len(name) > 20:
204         warning('name', name, 'is too big, truncating')
205         name = name[:20]
206     name = clean_ais_charset(name)
207     callsign = clean_alnum(info.get('callsign', u'').encode('utf8'))
208     type = info.get('type', 0)
209     if type < 0 or type > 100:
210         type = 0 #TODO check
211     eta = info.get('eta', u'00002460')
212     if len(eta)==8:
213         eta_M = int(eta[0:2])
214         eta_D = int(eta[2:4])
215         eta_h = int(eta[4:6])
216         eta_m = int(eta[6:8])
217     else:
218         eta_M = eta_D = 0
219         eta_h = 24
220         eta_m = 60
221     draught = int(info.get('draught', 0)*10)
222     destination = clean_ais_charset(info.get('destination', u'').encode('utf8'))
223     
224     add_nmea5_partial(mmsi, timestamp, imo, name, callsign, type, 0, 0, 0, 0, eta_M, eta_D, eta_h, eta_m, draught, destination, 'MTWW')
225
226     return True
227
228
229 def import_last_pos(reference_date, mmsi, page=None):
230     if page is None or page == 1:
231         filename = MARINETRAFFIC_DIR+reference_date+'/'+mmsi+'-mov.html'
232     else:
233         filename = MARINETRAFFIC_DIR+reference_date+'/'+mmsi+'-mov'+str(page)+'.html'
234
235     if not os.path.exists(filename):
236         if DISABLE_DOWNLOAD:
237             print >> sys.stderr, filename, 'not found and downloads disabled.'
238             return False
239
240         if page is None or page == 1:
241             request = urllib2.Request('http://www.marinetraffic.com/ais/datasheet.aspx?datasource=ITINERARIES&MMSI='+mmsi)
242         else:
243             request = urllib2.Request('http://www.marinetraffic.com/ais/datasheet.aspx?datasource=ITINERARIES&MMSI='+mmsi+'&orderby=MINTIME&sort_order=DESC&var_page='+str(page))
244         request.add_header('User-Agent', 'Mozilla/5.0 (X11; U; Linux i686; fr; rv:1.9.0.7) Gecko/2009032018 Firefox/3.0.6 (Debian-3.0.6-1)')
245         uo = urllib2.urlopen(request)
246         html = uo.read()
247         uo.close()
248         sleep(DOWNLOAD_SLEEP_TIME)
249
250         f = open_with_mkdirs(filename, 'w')
251         f.write(html)
252         f.close()
253     else:
254         html = file(filename).read()
255     html = unicode(html, 'utf8')
256
257     if u'No Records Found' in html:
258         print >> sys.stderr, 'NOTICE: No Records found.'
259         return
260     if u'The requested service is unavailable.' in html or u'Η λειτουργία που ζητήσατε δεν είναι διαθέσιμη.' in html:
261         print >> sys.stderr, 'WARNING: The requested service is unavailable.'
262         os.unlink(filename)
263         return
264
265     root = html_parse(html)
266     table = get_elem(root, u'table')[0]
267
268     infosrc = u'MT'
269     
270     # Now, import each track
271     for row in table.children[1:]: # ignore first line with headers
272         line = row.children
273
274         latlong = get_merged_leaf_content(line[5])
275         latlong = latlong.replace(u'\xa0', u'').strip()
276         if latlong:
277             lon, lat = latlong.split(' ')
278         
279         show_on_map = line[8]
280         assert get_merged_leaf_content(show_on_map).strip() == 'Show on Map'
281         link = show_on_map.children[0].children[0].attributes['href']
282         tmp = link.split(u'?', 2)
283         assert tmp[0] == u'default.aspx'
284         tmp = tmp[1]
285         tmp = tmp.split(u'&')
286         assert len(tmp)==3
287         assert tmp[0] == u'zoom=9'
288         assert tmp[1] == u'oldmmsi='+mmsi
289         tmp = tmp[2]
290         assert tmp.startswith(u'olddate=')
291         dt = tmp[len(u'olddate='):]
292
293         isodt = datetime.strptime(dt, '%m/%d/%Y %I:%M:%S %p')
294
295
296         if latlong:
297             speed = float(get_merged_leaf_content(line[6]))
298             course = float(get_merged_leaf_content(line[7]))
299             #print dt, isodt, lat, long, speed, course
300
301             strmmsi = mmsi
302             timestamp = datetime_to_timestamp(isodt)
303             status = AIS_STATUS_NOT_AVAILABLE
304             rot = AIS_ROT_NOT_AVAILABLE
305             sog = int(speed*AIS_SOG_SCALE)
306             latitude = int(float(lat)*AIS_LATLON_SCALE)
307             longitude = int(float(lon)*AIS_LATLON_SCALE)
308             cog = int(course*AIS_COG_SCALE)
309             heading = AIS_NO_HEADING
310             source = 'MTWW'
311             add_nmea1(strmmsi, timestamp, status, rot, sog, latitude, longitude, cog, heading, source)
312
313         import_track(mmsi, dt, isodt)
314
315     if 'Next page' in html and page is not 2:
316         print 'There is another page!'
317         return True
318     else:
319         return False
320
321
def import_track(mmsi, dt, isodt):
    """Fetch (or load from cache) the XML track of a vessel around a given
    date and import each <pos> point via add_nmea1.

    mmsi -- the vessel's MMSI as a numeric string
    dt -- the date as the 'MM/DD/YYYY HH:MM:SS AM/PM' string used in the URL
    isodt -- the same date as a datetime, used for the cache directory name
    """
    filename = MARINETRAFFIC_DIR+isodt.strftime('%Y%m%d')+'/'+mmsi+'-trk.xml'
    if not os.path.exists(filename):
        if DISABLE_DOWNLOAD:
            print >> sys.stderr, filename, 'not found and downloads disabled.'
            return

        url = 'http://www.marinetraffic.com/ais/gettrackxml.aspx?mmsi=%s&date=%s' % (mmsi, dt.replace(' ','%20'))
        request = urllib2.Request(url)
        # Mimic a regular browser request (including Referer) so the server
        # serves the track XML.
        request.add_header('User-Agent', 'Mozilla/5.0 (X11; U; Linux i686; fr; rv:1.9.0.7) Gecko/2009032018 Firefox/3.0.6 (Debian-3.0.6-1)')
        request.add_header('Accept' , 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
        request.add_header('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.7')
        request.add_header('Referer', 'http://www.marinetraffic.com/ais/default.aspx?zoom=9&oldmmsi=%(mmsi)s+&olddate=%(date)s' % { 'mmsi': mmsi, 'date': dt.replace(' ', '%20') })
        uo = urllib2.urlopen(request)
        html = uo.read()
        uo.close()
        sleep(DOWNLOAD_SLEEP_TIME)  # be polite to the remote server

        f = open_with_mkdirs(filename, 'w')
        f.write(html)
        f.close()
    else:
        html = file(filename).read()

    #print filename
    xml = unicode(html, 'utf8')
    info = { 'mmsi': mmsi, 'infosrc': u'MT' }
    for node in html_lexer(xml):
        if isinstance(node, Tag) and node.name==u'pos':
            info['updated'] = node.attributes['timestamp']
            info['lat'] = float(node.attributes['lat'])
            info['lon'] = float(node.attributes['lon'])
            info['course'] = float(node.attributes['course'])
            info['speed'] = float(node.attributes['speed'])/10.  # feed is in 1/10 knots
            strmmsi = mmsi
            timestamp = datetime_to_timestamp(datetime.strptime(info['updated'], '%Y-%m-%dT%H:%M:%S'))
            status = AIS_STATUS_NOT_AVAILABLE
            rot = AIS_ROT_NOT_AVAILABLE
            sog = int(info['speed']*AIS_SOG_SCALE)
            latitude = int(float(info['lat'])*AIS_LATLON_SCALE)
            longitude = int(float(info['lon'])*AIS_LATLON_SCALE)
            # BUGFIX: was int(info['course'])*AIS_COG_SCALE, which truncated
            # the course to whole degrees BEFORE scaling; scale first to match
            # the computation in import_last_pos.
            cog = int(info['course']*AIS_COG_SCALE)
            heading = AIS_NO_HEADING
            source = 'MTTR'
            #print datetime.utcfromtimestamp(timestamp),
            #for i in strmmsi, timestamp, status, rot, sog, latitude, longitude, cog, heading, source:
            #    print repr(i),
            #print
            add_nmea1(strmmsi, timestamp, status, rot, sog, latitude, longitude, cog, heading, source)
    #dbcommit()
372     
373     
374
375 if __name__ == '__main__':
376     parser = OptionParser(usage='%prog [options] mmsi [mmsi]...')
377     parser.add_option('--no-download', help="don't download any file", action='store_true', dest='no_download', default=False)
378     parser.add_option('--download-sleep', help="how many seconds do we sleep after each download. default=%default", action='store', type='int', dest='sleep', default=DOWNLOAD_SLEEP_TIME)
379     parser.add_option('--debug-sql', help="print all sql queries to stdout before running them", action='store_true', dest='debug_sql', default=False)
380     parser.add_option('--date', help="force reference date. default=%default\nDo NOT use without --date", action='store', dest='reference_date', default=datetime.utcnow().date().strftime('%Y%m%d'))
381     parser.add_option('--print-mmsi', help="prints each mmsi before it's processed", action='store_true', dest='print_mmsi', default=False)
382     (options, args) = parser.parse_args()
383
384     if len(args)==0:
385         print >> sys.stderr, "Need parameters"
386         sys.exit(1)
387
388     DISABLE_DOWNLOAD = options.no_download
389     DOWNLOAD_SLEEP_TIME = options.sleep
390     if options.debug_sql:
391         sql_setdebug(True)
392     reference_date = options.reference_date
393
394     for mmsi in args:
395         while len(mmsi) and mmsi[-1] in '\r\n':
396             mmsi = mmsi[:-1]
397         if not mmsi.isdigit():
398             print 'MMSI', mmsi, 'is not numeric. Ignoring.'
399             continue
400         if options.print_mmsi:
401             print 'MMSI', mmsi
402         found = go_summary(reference_date, mmsi)
403         if found:
404             page = 1
405             while True:
406                 if import_last_pos(reference_date, mmsi, page):
407                     page += 1
408                 else:
409                     break