Added ST source explanation
[ais.git] / bin / html_parser.py
1 #!/usr/bin/env python
2 # -*- encoding: utf-8 -*-
3
4 from __future__ import division
5 import sys, htmlentities
6 from optparse import OptionParser
7
# When true, html_parse reports unmatched and auto-closed tags on stderr
VERBOSE_PARSER = False

# Tag info flag: there is no content in these tags, ie assume <tagname ... / >
TI_EMPTY = 1

# TI_ flags of the known tags, keyed by tag name (see Tag.get_tag_info)
taginfo = dict.fromkeys((u'meta', u'link', u'br', u'img', u'hr'), TI_EMPTY)
18
class Node:
    """
    Base class of the tree nodes: a link to the father, the list of children
    and a bit field made of Node.Flags values
    """
    class Flags:
        # Bit values, combined in Node.flags
        ROOT    = 1 << 0 # this is the root node. There can be only one root
        CLOSING = 1 << 1 # this is a closing tag such as </b>. These lexer tags are discarded by the parser
        CLOSED  = 1 << 2 # this is closed. Uncleaned output will only have a closing tag if that flag is present.

    def __init__(self):
        self.flags = 0
        self.children = []
        self.father = None
29
class Tag(Node):
    """
    A tag node: a name and a dictionary of attributes
    """
    def __init__(self):
        Node.__init__(self)
        self.attributes = {}
        self.name = u''

    def get_tag_info(self):
        """
        Returns the TI_ flags registered in taginfo for that tag name, 0 if there are none
        """
        return taginfo.get(self.name, 0)

    def __unicode__(self):
        """
        Returns the tag as markup. Attributes whose value is empty are written without ="..."
        """
        #assert self.name != u''
        pieces = [ u'<' ]
        if self.flags & Node.Flags.CLOSING:
            pieces.append(u'/')
        pieces.append(self.name)
        for att_name, att_value in self.attributes.iteritems():
            pieces.append(u' '+att_name)
            if att_value:
                # backslashes first, so that the ones added for the quotes are not doubled
                escaped = att_value.replace(u'\\', u'\\\\').replace(u'"', u'\\"')
                pieces.append(u'="'+escaped+'"')
        pieces.append(u'>')
        return u''.join(pieces)

    def __repr__(self):
        # utf-8 encoded version of the markup
        markup = unicode(self)
        return markup.encode('utf8')
59
class Leaf(Node):
    """
    A text node, that is what the lexer found between two tags
    """
    # TODO: rename this to CDATA or whatever
    def __init__(self, text):
        Node.__init__(self)
        # presumably replaces the HTML entities by their characters; htmlentities
        # is a project module that is not visible from here
        resolved = htmlentities.resolve(text)
        self.text = resolved

    def __unicode__(self):
        # FIXME escape ?
        return self.text

    def __repr__(self):
        # repr() of the utf-8 encoded text
        #return 'Leaf<'+repr(self.text.encode('utf8'))+'>'
        encoded = self.text.encode('utf8')
        return repr(encoded)
70
71
def html_lexer(page):
    """
    That iterator yields Nodes with father/children unset

    The text found between tags is yielded as Leaf, whatever starts with '<'
    is yielded as Tag. Closing tags have Node.Flags.CLOSING set.
    Tag names and attribute names are lower-cased, attribute values are kept
    as they are. Attributes without value get u'' as value.
    A tag that is cut by the end of the page is dropped: nothing is yielded
    for it and the iteration stops.
    """
    buf = page # buffer
    pos = 0 # everything before that position has already been parsed
    l = len(buf) # constant length

    def buffind(token):
        """
        Returns the position of the next token at or after pos, None if there is none
        """
        r = buf.find(token, pos)
        if r==-1:
            return None
        return r

    def get_next_tag():
        """
        Parses the tag that starts at buf[pos], which is expected to be '<'
        Returns (tag, position right after '>')
        Returns (None, position) if the page ends before the '>'
        """
        state = 'INIT'
        state_white_skiping = False # when set, characters <= ' ' are ignored until the next other character
        p = pos # will start with skipping '<'
        tag = Tag()
        att_name = u''
        att_value = u''
        att_value_escape = None # quote character that opened the current quoted value
        while True:
            p += 1
            if p>=l: # EOS
                return None, p
            c = buf[p]

            if state_white_skiping:
                if ord(c)<=32:
                    continue
                state_white_skiping = False

            if state == 'INIT':
                # right after '<'
                if c == u'/':
                    # or-ed rather than added: '<//b>' must not turn CLOSING into CLOSED
                    tag.flags |= Node.Flags.CLOSING
                elif c == u'>':
                    return tag, p+1
                else:
                    state = 'NAME'
                    tag.name += c.lower()
            elif state == 'NAME':
                if ord(c)<=32 or c==u'/':
                    state = 'ATT_NAME'
                    att_name = u''
                    state_white_skiping = True
                elif c == u'>':
                    return tag, p+1
                else:
                    tag.name += c.lower()
            elif state == 'ATT_NAME':
                if ord(c)<=32:
                    # end of the name, an optional '=' may follow
                    state = 'ATT_EQUALS'
                    state_white_skiping = True
                elif c == u'=':
                    state = 'ATT_VALUE'
                    state_white_skiping = True
                    att_value = u''
                elif c == u'>':
                    if att_name != u'':
                        tag.attributes[att_name] = u''
                    return tag, p+1
                else:
                    att_name += c.lower()
            elif state == 'ATT_EQUALS':
                if ord(c)<=32:
                    continue
                elif c == u'=':
                    state = 'ATT_VALUE'
                    state_white_skiping = True
                    att_value = u''
                elif c == u'>':
                    if att_name != u'':
                        tag.attributes[att_name] = u''
                    return tag, p+1
                else:
                    # No '=': previous attribute has no value, c starts a new name.
                    # White skipping must stay off here: the name already has its
                    # first character, so a following blank terminates it
                    # (otherwise '<x a b c>' would give an attribute 'bc').
                    if att_name != u'':
                        tag.attributes[att_name] = u''
                    state = 'ATT_NAME'
                    att_name = c.lower()
            elif state == 'ATT_VALUE':
                if att_value == u'' and (c == u'"' or c == u"'"): # first char
                    att_value_escape = c
                    state = 'ATT_VALUE_QUOTED'
                elif ord(c)<=32:
                    # Any blank, space included, terminates an unquoted value
                    tag.attributes[att_name] = att_value
                    state = 'ATT_NAME'
                    state_white_skiping = True
                    att_name = u''
                elif c == u'>':
                    tag.attributes[att_name] = att_value
                    return tag, p+1
                else:
                    att_value += c
            elif state == 'ATT_VALUE_QUOTED':
                if c == att_value_escape:
                    tag.attributes[att_name] = att_value
                    state = 'ATT_NAME'
                    state_white_skiping = True
                    att_name = u''
                else:
                    # anything else, '>' included, belongs to the value
                    att_value += c

    while True:
        # get next tag position
        # TODO: check it's a real tag and not a fragment that should added to that leafnode
        pt1 = buffind(u'<')
        if pt1 != pos:
            # text before the tag, or up to the end of the page if there is no more tag
            yield Leaf(buf[pos:pt1])
            if pt1 is None:
                return
        pos = pt1

        tag, pos = get_next_tag()
        if tag is None:
            # The page ends in the middle of a tag. Consumers expect Nodes
            # only, so the truncated tag is dropped instead of yielding None.
            return
        yield tag
202
203
def html_parse(page):
    """
    This function fetches the nodes from the lexer and assemble them in a node tree

    Returns the root: a Tag with an empty name and Node.Flags.ROOT set.
    Leaves and opening tags are appended to the current father. An opening tag
    becomes the new father unless it is flagged TI_EMPTY in taginfo.
    A closing tag is not stored: the closest enclosing tag with the same name
    gets Node.Flags.CLOSED and its own father becomes the current father
    again. A closing tag that matches nothing is discarded.
    """
    root = Tag()
    root.flags = Node.Flags.ROOT
    father = root
    for node in html_lexer(page):
        if node is None:
            # Defensive: the lexer may hand back None instead of a Node for a
            # tag that is cut by the end of the page
            continue
        if isinstance(node, Leaf):
            node.father = father
            father.children.append(node)
        elif node.flags & Node.Flags.CLOSING:
            # change current father
            newfather = father
            while True:
                # The root is never a candidate: its name is empty, so '</>'
                # would match it and leave the parser without any father
                if newfather is None or newfather.flags & Node.Flags.ROOT:
                    #TODO: log.debug()
                    if VERBOSE_PARSER:
                        print >> sys.stderr, 'Closing tag', node, 'does not match any opening tag. Discarding.'
                    break
                if newfather.name == node.name:
                    newfather.flags |= Node.Flags.CLOSED
                    if VERBOSE_PARSER:
                        if newfather != father:
                            # list the tags that were still open inside the one being closed
                            print >> sys.stderr, 'Closing tag', node, 'has auto-closed other nodes',
                            deb = father
                            while deb != newfather:
                                print >> sys.stderr, deb,
                                deb = deb.father
                            print >> sys.stderr
                    father = newfather.father
                    break
                newfather = newfather.father
        else:
            node.father = father
            father.children.append(node)
            #print 'node=',node,'info=',node.get_tag_info()
            if not node.get_tag_info() & TI_EMPTY:
                father = node
        #print 'node=',node,'father=',father
    return root
246
247
248 def print_idented_tree(node, identation_level=-1):
249     if not node.flags & Node.Flags.ROOT:
250         print '   '*identation_level+repr(node)
251     for c in node.children:
252         print_idented_tree(c, identation_level+1)
253     if isinstance(node, Tag) and (node.flags&Node.Flags.CLOSED):
254         print '   '*identation_level+'</'+node.name.encode('utf8')+'>'
255
256 def print_lexer_tree(p):
257     identing = 0
258     for item in html_lexer(p):
259         if isinstance(item, Tag) and item.flags & Node.Flags.CLOSING:
260             identing -= 1
261         print '   '*identing,
262         if isinstance(item, Tag) and not item.flags & Node.Flags.CLOSING:
263             identing += 1
264         print repr(item)
265
266
def get_elem(root, tagname):
    """
    Returns the list of the elements named tagname, root included
    The search stops at a matching element: its children are not searched
    """
    if isinstance(root, Leaf):
        return []
    if root.name == tagname:
        return [ root ]
    found = []
    for child in root.children:
        found.extend(get_elem(child, tagname))
    return found
282         
283
def split_table(table):
    """
    Returns table content as a list (rows) of list (columns)
    The rows are the tr elements found by get_elem, the columns are the td nodes of each row
    """
    return [ list(get_elem(tr, u'td')) for tr in get_elem(table, u'tr') ]
295
def split_table_r_to_leaf(root):
    """
    Recursively splits tables as described in split_table
    Without any table below root, returns the merged leaf text of root.
    Otherwise returns one list per table, made of rows, made of cells, each
    cell being split the same way.
    """
    tables = get_elem(root, u'table')
    if not tables:
        return get_merged_leaf_content(root)
    return [
        [ [ split_table_r_to_leaf(cell) for cell in row ]
          for row in split_table(table) ]
        for table in tables ]
315         
316
def get_merged_leaf_content(root):
    """
    Returns the text of all the leaves at or below root, concatenated in tree order
    """
    if isinstance(root, Leaf):
        return root.text
    return u''.join([ get_merged_leaf_content(child) for child in root.children ])


# Same function under another name
get_inner_text = get_merged_leaf_content
331
332
def ets_decode_html(event_html):
    """
    Decodes an event page into a dictionary

    event_html must be a unicode string, this is asserted.
    The page is parsed with html_parse, turned into nested lists with
    split_table_r_to_leaf, then read through hard-coded positions in those
    lists. So this only works for the one page layout it was written for
    (presumably the ETS event page, the layout itself is not visible here).

    The result always has the keys u'GSM No', u'Unit name' and u'SIMICC',
    plus one key per row of the first block. Depending on the blocks found
    it may also have u'cells', u'IncomingData ID', 'decoding', 'debug',
    'positions_related' and 'outgoing_requests'.
    Unknown blocks are printed on standard output and otherwise ignored.

    Raises AssertionError when the input is not unicode or when a table header
    is not the expected one. NOTE(review): a page with another layout will
    most likely fail with IndexError or TypeError on the positional accesses.
    """
    def ets_cleanup(target):
        # Recursively removes the u'\xa0' characters and strips every string of the nested lists
        if isinstance(target, unicode):
            return target.replace(u'\xa0',u'').strip()
        else:
            return [ ets_cleanup(i) for i in target ]

    def ets_print(prefix, target):
        # Debug: prints the nested lists, each line is prefixed with the path of indexes
        has_sublists = False
        # NOTE: that loop also runs when target is a string (over its
        # characters); it never finds a list in that case
        for i in target:
            if isinstance(i, list):
                has_sublists = True
        if isinstance(target, unicode):
            if target:
                print prefix, repr(target.encode('utf8'))
            return
        if not has_sublists:
            # flat list of strings, printed on a single line unless it is just [u'']
            if len(target) == 1 and target[0]==u'':
                return
            print prefix, [ i.encode('utf8') for i in target ]
            return
        for i,sub in enumerate(target):
            ets_print(prefix+u'-'+unicode(i), sub)

    def list_pack(root):
        # Recursively removes the empty items and replaces the lists of one item by that item
        if isinstance(root, unicode):
            return root
        result = []
        for i in root:
            i = list_pack(i)
            if i and i!=u'':
                result.append(i)
        if len(result)==1:
            result = result[0]
        return result

    assert isinstance(event_html, unicode)
    root = html_parse(event_html)
    lists = ets_cleanup(split_table_r_to_leaf(root))
    # position of the interesting table in the page layout
    main = lists[0][1][1]

    result = {}

    # First block: starts with a header whose items 1, 2 and 4 are read below
    header = main[0][0][0][0][0]
    #ets_print(u'header', header)
    result[u'GSM No'] = header[1]
    result[u'Unit name'] = header[2][1:] # skip '-'
    result[u'SIMICC'] = header[4]

    #ets_print(u'block1', main[0])
    # The other rows of the first block are stored as first column -> third column
    # NOTE(review): rows are only checked to have more than 1 item, but item 2 is read
    for row in main[0]:
        if len(row)>1:
            result[row[0]] = row[2]
    del main[0]

    # FIXME move in the main loop below
    # Optional cells block: the first row holds the column names, the other
    # rows are stored as one dictionary per cell
    if main[0][0][0] == u'Cell No.':
        #ets_print('Cells',  main[0])
        cells = []
        for row in main[0][1:]:
            if len(row)<2:
                continue
            cell = {}
            for i, key in enumerate(main[0][0]):
                if not key or not i: # skip "Cell No." column
                    continue
                value = row[i]
                if i==8:
                    # column 8 holds a nested table, only its first text is kept
                    if value: # In some rare case, RSSI is invalid see event 9547789
                        value = value[0][0][0] # RSSI
                cell[key] = value
            cells.append(cell)
        result[u'cells'] = cells
        del main[0]

    # Remaining blocks: the packed first item is the block name
    for block in main:
        blockname = list_pack(block[0])
        assert isinstance(blockname, unicode)
        packed_block = list_pack(block)

        if blockname == u'Decoding of message':
            # packed_block[1] is the table header, the rows follow
            assert packed_block[1] == [ u'Parameter', u'Value', u'Para', u'UC', u'Description' ]
            decoding = []
            for row in packed_block[2:]:
                if not isinstance(row, list):
                    continue # not interested in incomplete information such as 'IncomingData ID' with no data!
                if row[0] == 'IncomingData ID':
                    # stored at the top level of the result, not in the decoding table
                    result[u'IncomingData ID'] = row[1]
                    continue
                if len(row) != len(packed_block[1]):
                    continue # discard lines that have incorrect length, such as RSSI request to propos
                line = {}
                for i, key in enumerate(packed_block[1]):
                    line[key] = row[i]
                decoding.append(line)
            result['decoding'] = decoding

        elif blockname == u'Message contains debug information':
            # every item after the name is expected to be a (key, value) pair
            debug={}
            for k,v in packed_block[1:]:
                debug[k] = v
            result['debug'] = debug
        elif blockname == u'Positions related':
            # packed_block[1] is the table header, the rows follow
            assert packed_block[1] == [ u'ID', u'TimeStamp', u'PosType', u'LAT', u'LON', u'Status' ]
            positions_related=[]
            for row in packed_block[2:]:
                line = {}
                for i, key in enumerate(packed_block[1]):
                    line[key] = row[i]
                positions_related.append(line)
            result['positions_related'] = positions_related
        elif blockname == u'Outgoing requests':
            # This block is read unpacked, so that the columns keep their
            # position: block[1] must be empty, block[2] is the table header
            assert not list_pack(block[1])
            table_header = block[2]
            assert list_pack(table_header) == [ u'Time', u'Outg.ID', u'ParaNo', u'Parameter', u'Value', u'Status', u'User', u'UC' ]

            result_table = []
            for row in block[3:]:
                if row == [ u'' ]:
                    continue # ignore empty lines
                result_line = {}
                for i, key in enumerate(table_header):
                    if not key:
                        continue # ignore empty columns
                    result_line[key] = row[i]
                result_table.append(result_line)

            result['outgoing_requests'] = result_table

        else:
            ets_print(u'unsupported block - ]', packed_block)

    return result
466
if __name__ == "__main__":
    # Command line: [--dump-lexer] [--dump-parser] [--verbose-parser] filename
    # The file is read as utf-8. --dump-lexer has priority over --dump-parser.
    parser = OptionParser()
    parser.add_option("--dump-lexer", dest='lexer_dump', action='store_true', default=False,
                      help="Debug: Dump idented lexer output")
    parser.add_option("--dump-parser", dest='parser_dump', action='store_true', default=False,
                      help="Debug: Dump idented parser output")
    parser.add_option("--verbose-parser", dest='verbose_parser', action='store_true', default=False,
                      help="Debug: Verbose parser errors")
    (options, args) = parser.parse_args()

    if not args:
        print >> sys.stderr, 'Need a filename'
        sys.exit(-1)
    filename = args[0]

    VERBOSE_PARSER = options.verbose_parser
    raw_page = open(filename).read()
    p = unicode(raw_page, 'utf-8')

    if options.lexer_dump:
        print_lexer_tree(p)
        sys.exit(0)

    if options.parser_dump:
        root = html_parse(p)
        print_idented_tree(root)
        sys.exit(0)
491