Added old scripts iccat & marine traffic to subversion
authorJean-Michel Nirgal Vourgère <jmv@nirgal.com>
Wed, 11 Aug 2010 13:38:19 +0000 (13:38 +0000)
committerJean-Michel Nirgal Vourgère <jmv@nirgal.com>
Wed, 11 Aug 2010 13:38:19 +0000 (13:38 +0000)
bin/extras/iccat_getrarfilename.py [new file with mode: 0755]
bin/extras/marinetraffic.py [new file with mode: 0755]
cron/iccat.cron

diff --git a/bin/extras/iccat_getrarfilename.py b/bin/extras/iccat_getrarfilename.py
new file mode 100755 (executable)
index 0000000..0b98e47
--- /dev/null
@@ -0,0 +1,22 @@
#!/usr/bin/env python
"""Extract the VesselsZIP rar download link from an ICCAT vessel-record page.

Reads the html file given as first argument, prints every link whose href
starts with /Data/VesselsZIP/, and exits 0 only when exactly one such link
was found, so callers can trust the single printed line.
"""
from ais.html_parser import *

if __name__ == '__main__':
    from optparse import OptionParser
    parser = OptionParser()
    options, args = parser.parse_args()

    # Use open() rather than the deprecated file() builtin, and close the
    # handle instead of leaking it.  The ICCAT page is served in latin-1.
    f = open(args[0])
    html = f.read()
    f.close()
    html = unicode(html, 'iso-8859-1')
    root = html_parse(html)
    links = get_elem(root, 'a')
    count = 0
    for link in links:
        href = link.attributes['href']
        if href.startswith('/Data/VesselsZIP/'):
            sys.stdout.write(href+'\n')  # sys comes from the star import
            count += 1
    # Success only when the link is unambiguous.
    sys.exit(0 if count == 1 else 1)
diff --git a/bin/extras/marinetraffic.py b/bin/extras/marinetraffic.py
new file mode 100755 (executable)
index 0000000..3d721b7
--- /dev/null
@@ -0,0 +1,409 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+DOWNLOAD_SLEEP_TIME = 10
+DISABLE_DOWNLOAD = False
+MARINETRAFFIC_DIR = '/var/lib/ais/marinetraffic/'
+
+import sys, os, urllib2, time
+from pprint import pprint
+from datetime import datetime, date
+from time import sleep
+from optparse import OptionParser
+
+from ais.db import *
+from ais.ntools import clean_alnum, clean_ais_charset, open_with_mkdirs, datetime_to_timestamp
+from ais.common import *
+from ais.html_parser import *
+
+
+def go_summary(reference_date, mmsi):
+    def get_raw_summary(html):
+        root = html_parse(html)
+        divs = get_elem(root, 'div')
+        divdetail = divs[3]
+        
+        info_raw = {}
+        section = u''
+        boldtext = u''
+        text = u''
+        for node in divdetail.children:
+            if isinstance(node, Tag) and node.name == 'h1':
+                info_raw[u'name'] = get_inner_text(node)
+                continue
+            if isinstance(node, Tag) and node.name == 'h2':
+                if boldtext or text:
+                    info_raw[section][boldtext] = text
+                    boldtext = text = u''
+                section = get_inner_text(node)
+                info_raw[section] = {}
+                continue
+        
+            if isinstance(node, Tag) and node.name == 'br':
+                if boldtext or text:
+                    info_raw[section][boldtext] = text
+                    boldtext = text = u''
+            elif isinstance(node, Tag) and node.name == 'b':
+                if boldtext or text:
+                    info_raw[section][boldtext] = text
+                    boldtext = text = u''
+                boldtext = get_inner_text(node)
+            else:
+                text += get_inner_text(node)
+        if boldtext or text:
+            info_raw[section][boldtext] = text
+            boldtext = text = u''
+        return info_raw
+    
+    def qualify_summary(info_raw):
+        #pprint(info_raw)
+        info = {}
+        info['name'] = info_raw['name']
+    
+        try:
+            details = info_raw[u"Vessel's Details:"]
+        except KeyError:
+            details = info_raw[u"Vessel's Details"]
+        info['callsign'] = clean_alnum(details[u'Call Sign:'].encode('ascii', 'replace'))
+        tmp = details.get(u'IMO:', None)
+        if tmp:
+            tmp = tmp.replace(u',', u'').strip()
+            if tmp != u'999999999':
+                info['imo'] = tmp.replace(u', ', u'').strip()
+        tmp = details.get(u'Length x Breadth:', None)
+        if tmp:
+            length, breadth = tmp.replace(u'm', u'').split('X')
+            info['length'], info['breadth'] = int(length), int(breadth)
+        info['mmsi'] = details[u'MMSI:'].strip()
+        tmp = details.get(u'Ship Type:', None)
+        if tmp:
+            tmp = tmp.strip()
+            reverse_types = {
+                u'Fishing': 30,
+                #u'Towing': 31,
+                #u'Towing': 32,
+                u'Dredger': 33,
+                u'Dive Vessel': 34,
+                u'Military Ops': 35,
+                u'Sailing Vessel': 36,
+                u'Pleasure Craft': 37,
+                u'Pilot Vessel': 50,
+                u'SAR': 51,
+                u'Tug': 52,
+                u'Port Tender': 53,
+                u'Anti-Pollution': 54,
+                u'Law Enforce': 55,
+                #u'Local Vessel': 56,
+                #u'Local Vessel': 57,
+                u'Medical Trans': 58,
+                u'Special Craft': 59,
+                # Cargo is repported for types 70, 75, 76 .. 79
+                u'Cargo - Hazard A (Major)': 71,
+                u'Cargo - Hazard B': 72,
+                u'Cargo - Hazard C (Minor)': 73,
+                u'Cargo - Hazard D (Recognizable)': 74,
+                u'Tanker - Hazard A (Major)': 81,
+                u'Tanker - Hazard B': 82,
+                u'Tanker - Hazard C (Minor)': 83,
+                u'Tanker - Hazard D (Recognizable)': 84,
+            }
+            _type = reverse_types.get(tmp, None)
+            if _type is not None:
+                info['type'] = _type
+            else:
+                print >> sys.stderr , "NOTICE: can't properly qualify ship of type", tmp
+        # TODO year built .... ?
+    
+        try:
+            voyage = info_raw[u'Voyage Related Info (Last Received):']
+        except KeyError:
+            voyage = info_raw[u'Voyage Related Info (Last Received)']
+        tmp = voyage.get(u'Destination:', None)
+        if tmp:
+            tmp = tmp.strip()
+            if tmp != 'CLASS B':
+                info['destination'] = tmp
+        tmp = voyage.get(u'Draught:', None)
+        if tmp:
+            info['draught'] = float(tmp.replace(u'm', u''))
+        tmp = voyage.get(u'ETA:', None)
+        if tmp:
+            tmp = tmp.strip()
+            try:
+                tmp = datetime.strptime(tmp, '%Y-%m-%d %H:%M')
+            except ValueError:
+                print "Failed to parse ETA date. Trying old format ...",
+                tmp = datetime.strptime(tmp, '%d/%m/%Y %H:%M:%S')
+                print "Success"
+
+            if tmp != datetime(1900, 1, 1):
+                info['eta'] = tmp.strftime('%m%d%H%M')
+        tmp = voyage.get(u'Info Received:', None)
+        if tmp:
+            voyage_updated = tmp.split(u'(')[0].strip()
+            try:
+                info['voyage_updated'] = datetime.strptime(voyage_updated, '%Y-%m-%d %H:%M')
+            except ValueError:
+                print "Failed to parse voyage updated date. Trying old format ...",
+                info['voyage_updated'] = datetime.strptime(voyage_updated, '%d/%m/%Y %H:%M:%S')
+                print "Success"
+
+    
+        return info
+
+    filename = MARINETRAFFIC_DIR+reference_date+'/'+mmsi+'-sum.html'
+    if not os.path.exists(filename):
+        if DISABLE_DOWNLOAD:
+            print >> sys.stderr, filename, 'not found and downloads disabled.'
+            return False
+        request = urllib2.Request('http://www.marinetraffic.com/ais/shipdetails.aspx?MMSI='+mmsi)
+        request.add_header('User-Agent', 'Mozilla/5.0 (X11; U; Linux i686; fr; rv:1.9.0.7) Gecko/2009032018 Firefox/3.0.6 (Debian-3.0.6-1)')
+        uo = urllib2.urlopen(request)
+        html = uo.read()
+        uo.close()
+        sleep(DOWNLOAD_SLEEP_TIME)
+
+        f = open_with_mkdirs(filename, 'w')
+        f.write(html)
+        f.close()
+    else:
+        html = file(filename).read()
+    html = unicode(html, 'utf8')
+
+    if u'The requested service is unavailable.' in html or u'Η λειτουργία που ζητήσατε δεν είναι διαθέσιμη.' in html:
+        print >> sys.stderr, 'WARNING: The requested service is unavailable.'
+        os.unlink(filename)
+        return False
+    if u'Non-existent Vessel' in html:
+        print >> sys.stderr, 'WARNING: Vessel unknown'
+        return False
+
+    info_raw = get_raw_summary(html)
+    info = qualify_summary(info_raw)
+    #pprint(info)
+    assert info['mmsi'] == mmsi
+    
+    def warning(*args):
+        print >> sys.stderr, "WARNING:"
+        for arg in args:
+            print >> sys.stderr, args,
+        print >> sys.stderr
+
+    voyage_updated = info.get('voyage_updated', None)
+    if voyage_updated:
+        timestamp = datetime_to_timestamp(voyage_updated)
+    else:
+        timestamp = datetime_to_timestamp(datetime.strptime(reference_date, '%Y%m%d'))
+    imo = int(info.get('imo', 0))
+    if imo >= 1 << 31:
+        warning('imo', imo, 'is too big')
+        imo = 0
+    name = info.get('name', u'').encode('utf8')
+    if len(name) > 20:
+        warning('name', name, 'is too big, truncating')
+        name = name[:20]
+    name = clean_ais_charset(name)
+    callsign = clean_alnum(info.get('callsign', u'').encode('utf8'))
+    type = info.get('type', 0)
+    if type < 0 or type > 100:
+        type = 0 #TODO check
+    eta = info.get('eta', u'00002460')
+    if len(eta)==8:
+        eta_M = int(eta[0:2])
+        eta_D = int(eta[2:4])
+        eta_h = int(eta[4:6])
+        eta_m = int(eta[6:8])
+    else:
+        eta_M = eta_D = 0
+        eta_h = 24
+        eta_m = 60
+    draught = int(info.get('draught', 0)*10)
+    destination = clean_ais_charset(info.get('destination', u'').encode('utf8'))
+    
+    add_nmea5_partial(mmsi, timestamp, imo, name, callsign, type, 0, 0, 0, 0, eta_M, eta_D, eta_h, eta_m, draught, destination, 'MTWW')
+
+    return True
+
+
+def import_last_pos(reference_date, mmsi, page=None):
+    if page is None or page == 1:
+        filename = MARINETRAFFIC_DIR+reference_date+'/'+mmsi+'-mov.html'
+    else:
+        filename = MARINETRAFFIC_DIR+reference_date+'/'+mmsi+'-mov'+str(page)+'.html'
+
+    if not os.path.exists(filename):
+        if DISABLE_DOWNLOAD:
+            print >> sys.stderr, filename, 'not found and downloads disabled.'
+            return False
+
+        if page is None or page == 1:
+            request = urllib2.Request('http://www.marinetraffic.com/ais/datasheet.aspx?datasource=ITINERARIES&MMSI='+mmsi)
+        else:
+            request = urllib2.Request('http://www.marinetraffic.com/ais/datasheet.aspx?datasource=ITINERARIES&MMSI='+mmsi+'&orderby=MINTIME&sort_order=DESC&var_page='+str(page))
+        request.add_header('User-Agent', 'Mozilla/5.0 (X11; U; Linux i686; fr; rv:1.9.0.7) Gecko/2009032018 Firefox/3.0.6 (Debian-3.0.6-1)')
+        uo = urllib2.urlopen(request)
+        html = uo.read()
+        uo.close()
+        sleep(DOWNLOAD_SLEEP_TIME)
+
+        f = open_with_mkdirs(filename, 'w')
+        f.write(html)
+        f.close()
+    else:
+        html = file(filename).read()
+    html = unicode(html, 'utf8')
+
+    if u'No Records Found' in html:
+        print >> sys.stderr, 'NOTICE: No Records found.'
+        return
+    if u'The requested service is unavailable.' in html or u'Η λειτουργία που ζητήσατε δεν είναι διαθέσιμη.' in html:
+        print >> sys.stderr, 'WARNING: The requested service is unavailable.'
+        os.unlink(filename)
+        return
+
+    root = html_parse(html)
+    table = get_elem(root, u'table')[0]
+
+    infosrc = u'MT'
+    
+    # Now, import each track
+    for row in table.children[1:]: # ignore first line with headers
+        line = row.children
+
+        latlong = get_merged_leaf_content(line[5])
+        latlong = latlong.replace(u'\xa0', u'').strip()
+        if latlong:
+            lon, lat = latlong.split(' ')
+        
+        show_on_map = line[8]
+        assert get_merged_leaf_content(show_on_map).strip() == 'Show on Map'
+        link = show_on_map.children[0].children[0].attributes['href']
+        tmp = link.split(u'?', 2)
+        assert tmp[0] == u'default.aspx'
+        tmp = tmp[1]
+        tmp = tmp.split(u'&')
+        assert len(tmp)==3
+        assert tmp[0] == u'zoom=9'
+        assert tmp[1] == u'oldmmsi='+mmsi
+        tmp = tmp[2]
+        assert tmp.startswith(u'olddate=')
+        dt = tmp[len(u'olddate='):]
+
+        isodt = datetime.strptime(dt, '%m/%d/%Y %I:%M:%S %p')
+
+
+        if latlong:
+            speed = float(get_merged_leaf_content(line[6]))
+            course = float(get_merged_leaf_content(line[7]))
+            #print dt, isodt, lat, long, speed, course
+
+            strmmsi = mmsi
+            timestamp = datetime_to_timestamp(isodt)
+            status = AIS_STATUS_NOT_AVAILABLE
+            rot = AIS_ROT_NOT_AVAILABLE
+            sog = int(speed*AIS_SOG_SCALE)
+            latitude = int(float(lat)*AIS_LATLON_SCALE)
+            longitude = int(float(lon)*AIS_LATLON_SCALE)
+            cog = int(course*AIS_COG_SCALE)
+            heading = AIS_NO_HEADING
+            source = 'MTWW'
+            add_nmea1(strmmsi, timestamp, status, rot, sog, latitude, longitude, cog, heading, source)
+
+        import_track(mmsi, dt, isodt)
+
+    if 'Next page' in html and page is not 2:
+        print 'There is another page!'
+        return True
+    else:
+        return False
+
+
def import_track(mmsi, dt, isodt):
    """Import a vessel's track xml for the day around dt.

    dt is the marinetraffic-formatted date string ('%m/%d/%Y %I:%M:%S %p')
    used in the download url; isodt is the same instant as a datetime, used
    for the cache path MARINETRAFFIC_DIR/<YYYYMMDD>/<mmsi>-trk.xml.  Every
    <pos> element found is recorded with add_nmea1().
    """
    filename = MARINETRAFFIC_DIR+isodt.strftime('%Y%m%d')+'/'+mmsi+'-trk.xml'
    if not os.path.exists(filename):
        if DISABLE_DOWNLOAD:
            print >> sys.stderr, filename, 'not found and downloads disabled.'
            return

        url = 'http://www.marinetraffic.com/ais/gettrackxml.aspx?mmsi=%s&date=%s' % (mmsi, dt.replace(' ','%20'))
        request = urllib2.Request(url)
        # Mimic a regular browser request; the site refuses plain clients.
        request.add_header('User-Agent', 'Mozilla/5.0 (X11; U; Linux i686; fr; rv:1.9.0.7) Gecko/2009032018 Firefox/3.0.6 (Debian-3.0.6-1)')
        request.add_header('Accept' , 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
        request.add_header('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.7')
        request.add_header('Referer', 'http://www.marinetraffic.com/ais/default.aspx?zoom=9&oldmmsi=%(mmsi)s+&olddate=%(date)s' % { 'mmsi': mmsi, 'date': dt.replace(' ', '%20') })
        uo = urllib2.urlopen(request)
        html = uo.read()
        uo.close()
        sleep(DOWNLOAD_SLEEP_TIME)  # be nice to the remote server

        f = open_with_mkdirs(filename, 'w')
        f.write(html)
        f.close()
    else:
        html = file(filename).read()

    #print filename
    xml = unicode(html, 'utf8')
    info = { 'mmsi': mmsi, 'infosrc': u'MT' }
    for node in html_lexer(xml):
        if isinstance(node, Tag) and node.name==u'pos':
            info['updated'] = node.attributes['timestamp']
            info['lat'] = float(node.attributes['lat'])
            info['lon'] = float(node.attributes['lon'])
            info['course'] = float(node.attributes['course'])
            info['speed'] = float(node.attributes['speed'])/10.
            strmmsi = mmsi
            timestamp = datetime_to_timestamp(datetime.strptime(info['updated'], '%Y-%m-%dT%H:%M:%S'))
            status = AIS_STATUS_NOT_AVAILABLE
            rot = AIS_ROT_NOT_AVAILABLE
            sog = int(info['speed']*AIS_SOG_SCALE)
            latitude = int(float(info['lat'])*AIS_LATLON_SCALE)
            longitude = int(float(info['lon'])*AIS_LATLON_SCALE)
            # BUGFIX: scale before truncating, as import_last_pos does;
            # the old code computed int(course)*AIS_COG_SCALE, dropping
            # the fractional degrees of the course.
            cog = int(info['course']*AIS_COG_SCALE)
            heading = AIS_NO_HEADING
            source = 'MTTR'
            #print datetime.utcfromtimestamp(timestamp),
            #for i in strmmsi, timestamp, status, rot, sog, latitude, longitude, cog, heading, source:
            #    print repr(i),
            #print
            add_nmea1(strmmsi, timestamp, status, rot, sog, latitude, longitude, cog, heading, source)
    #dbcommit()
+    
+    
+
+if __name__ == '__main__':
+    parser = OptionParser(usage='%prog [options] mmsi [mmsi]...')
+    parser.add_option('--no-download', help="don't download any file", action='store_true', dest='no_download', default=False)
+    parser.add_option('--download-sleep', help="how many seconds do we sleep after each download. default=%default", action='store', type='int', dest='sleep', default=DOWNLOAD_SLEEP_TIME)
+    parser.add_option('--debug-sql', help="print all sql queries to stdout before running them", action='store_true', dest='debug_sql', default=False)
+    parser.add_option('--date', help="force reference date. default=%default\nDo NOT use without --date", action='store', dest='reference_date', default=datetime.utcnow().date().strftime('%Y%m%d'))
+    parser.add_option('--print-mmsi', help="prints each mmsi before it's processed", action='store_true', dest='print_mmsi', default=False)
+    (options, args) = parser.parse_args()
+
+    if len(args)==0:
+        print >> sys.stderr, "Need parameters"
+        sys.exit(1)
+
+    DISABLE_DOWNLOAD = options.no_download
+    DOWNLOAD_SLEEP_TIME = options.sleep
+    if options.debug_sql:
+        sql_setdebug(True)
+    reference_date = options.reference_date
+
+    for mmsi in args:
+        while len(mmsi) and mmsi[-1] in '\r\n':
+            mmsi = mmsi[:-1]
+        if not mmsi.isdigit():
+            print 'MMSI', mmsi, 'is not numeric. Ignoring.'
+            continue
+        if options.print_mmsi:
+            print 'MMSI', mmsi
+        found = go_summary(reference_date, mmsi)
+        if found:
+            page = 1
+            while True:
+                if import_last_pos(reference_date, mmsi, page):
+                    page += 1
+                else:
+                    break
index 9171312908f2c866a5509c50f418b5af00bdc2cd..d8efc7f67c9e64485fe1ad0d93c555747871bd65 100755 (executable)
@@ -8,7 +8,7 @@ set -e
 VARDIR=/var/lib/ais/iccat
 HTMLFILE=$VARDIR/`date -u +%Y%m%d`.html
 wget -q http://www.iccat.int/en/vesselsrecord.asp -O $HTMLFILE
-FILENAME=`python -m ais.iccat_getrarfilename $HTMLFILE`
+FILENAME=`python -m ais.extras.iccat_getrarfilename $HTMLFILE`
 RARURL=http://www.iccat.int$FILENAME
 FILENAME=`basename $FILENAME`