Support for empty input tag
[ais.git] / bin / html_parser.py
1 #!/usr/bin/env python
2 # -*- encoding: utf-8 -*-
3
4 from __future__ import division
5 import sys, htmlentities
6 from optparse import OptionParser
7
# Debug switch: when true, html_parse reports mismatched closing tags on stderr
VERBOSE_PARSER = False

# Tag-info bit: there is no content in these tags, ie assume <tagname ... / >
TI_EMPTY = 1

# TI_ flags per tag name; names missing from this table carry no flags
taginfo = dict.fromkeys(
    (u'meta', u'link', u'br', u'img', u'hr', u'input'),
    TI_EMPTY)
19
class Node:
    """
    Base class of everything stored in the document tree
    """
    class Flags:
        # Bit masks combined into Node.flags:
        # ROOT:    this is the root node. There can be only one root
        # CLOSING: this is a closing tag such as </b>. These tags from the
        #          lexer are discarded by the parser
        # CLOSED:  this is closed. Uncleaned output will only have a closing
        #          tag if that flag is present
        ROOT, CLOSING, CLOSED = 1, 2, 4

    def __init__(self):
        # No flags, no children and no father until the parser links the node
        self.flags = 0
        self.children = []
        self.father = None
30
class Tag(Node):
    """
    An HTML tag: a name plus a dictionary of attributes
    """
    def __init__(self):
        Node.__init__(self)
        self.attributes = {}
        self.name = u''

    def get_tag_info(self):
        """
        Returns the TI_ flags registered in taginfo for the name of the tag,
        or 0 when the name is not listed
        """
        return taginfo.get(self.name, 0)

    def __unicode__(self):
        """
        Returns the tag as text: <name att="value" ...>, or </name ...> when
        the CLOSING flag is set. Attributes with an empty value are written
        as a bare name.
        """
        #assert self.name != u''
        if self.flags & Node.Flags.CLOSING:
            pieces = [u'</', self.name]
        else:
            pieces = [u'<', self.name]
        for att_name, att_value in self.attributes.iteritems():
            pieces.append(u' '+att_name)
            if att_value:
                # backslashes are doubled first, then double quotes get a backslash
                escaped = att_value.replace(u'\\', u'\\\\')
                escaped = escaped.replace(u'"', u'\\"')
                pieces.append(u'="'+escaped+'"')
        pieces.append(u'>')
        return u''.join(pieces)

    def __repr__(self):
        # utf-8 encoded version of the unicode form
        as_text = unicode(self)
        return as_text.encode('utf8')
60
class Leaf(Node):
    """
    A piece of text found between two tags
    """
    # TODO: rename this to CDATA or whatever

    def __init__(self, text):
        Node.__init__(self)
        # the stored text is whatever htmlentities.resolve makes of the raw text
        resolved = htmlentities.resolve(text)
        self.text = resolved

    def __unicode__(self):
        # FIXME escape ?
        return self.text

    def __repr__(self):
        # repr of the utf-8 encoded text
        encoded = self.text.encode('utf8')
        return repr(encoded)
71
72
def html_lexer(page):
    """
    That iterator yields Nodes with father/children unset

    Text found between tags is yielded as Leaf nodes, everything that starts
    with '<' is yielded as a Tag node. Closing tags such as </b> are Tag nodes
    with Node.Flags.CLOSING set. Tag names and attribute names are lower-cased,
    attribute values are kept as they are. Any character whose code is <= 32
    counts as white space.

    If the page ends in the middle of a tag, that partial tag is dropped and
    the iteration stops.
    """
    buf = page # buffer
    l = len(buf) # constant length

    def get_next_tag(start):
        """
        Reads the tag whose '<' is at position start.
        Returns (tag, position just after '>'), or (None, position) when the
        buffer ends before the closing '>'.
        """
        state = 'INIT'
        state_white_skiping = False
        att_name = u''
        att_value = u''
        att_value_escape = None # quote character that opened the current value
        p = start # will start with skipping '<'
        tag = Tag()
        while True:
            p += 1
            if p >= l: # EOS
                return None, p
            c = buf[p]

            if state_white_skiping:
                if ord(c) <= 32:
                    continue
                state_white_skiping = False

            if state == 'INIT':
                if c == u'/':
                    # |= rather than +=, so a repeated '/' cannot turn the
                    # flag into another one
                    tag.flags |= Node.Flags.CLOSING
                elif c == u'>':
                    return tag, p+1
                else:
                    state = 'NAME'
                    tag.name += c.lower()

            elif state == 'NAME':
                if ord(c) <= 32 or c == u'/':
                    state = 'ATT_NAME'
                    att_name = u''
                    state_white_skiping = True
                elif c == u'>':
                    return tag, p+1
                else:
                    tag.name += c.lower()

            elif state == 'ATT_NAME':
                if ord(c) <= 32:
                    state = 'ATT_EQUALS'
                    state_white_skiping = True
                elif c == u'=':
                    state = 'ATT_VALUE'
                    state_white_skiping = True
                    att_value = u''
                elif c == u'/' or c == u'>':
                    # '/' is the self closing marker of <br />, not part of a
                    # name: both characters end a valueless attribute
                    if att_name != u'':
                        tag.attributes[att_name] = u''
                    if c == u'>':
                        return tag, p+1
                    att_name = u''
                    state_white_skiping = True
                else:
                    att_name += c.lower()

            elif state == 'ATT_EQUALS':
                if ord(c) <= 32:
                    pass # still waiting for '=' or for the next attribute
                elif c == u'=':
                    state = 'ATT_VALUE'
                    state_white_skiping = True
                    att_value = u''
                else:
                    # no '=': the previous attribute has no value
                    if att_name != u'':
                        tag.attributes[att_name] = u''
                    if c == u'>':
                        return tag, p+1
                    state = 'ATT_NAME'
                    if c == u'/':
                        att_name = u''
                        state_white_skiping = True
                    else:
                        # c starts a new name; white space must not be skipped
                        # here, or a one letter name would swallow the next one
                        att_name = c.lower()

            elif state == 'ATT_VALUE':
                if att_value == u'' and c in (u'"', u"'"): # first char
                    att_value_escape = c
                    state = 'ATT_VALUE_QUOTED'
                elif ord(c) <= 32:
                    # white space, including a plain space, ends an unquoted value
                    tag.attributes[att_name] = att_value
                    state = 'ATT_NAME'
                    state_white_skiping = True
                    att_name = u''
                elif c == u'>':
                    tag.attributes[att_name] = att_value
                    return tag, p+1
                else:
                    att_value += c

            elif state == 'ATT_VALUE_QUOTED':
                if c == att_value_escape:
                    tag.attributes[att_name] = att_value
                    state = 'ATT_NAME'
                    state_white_skiping = True
                    att_name = u''
                else:
                    att_value += c

    pos = 0 # everything before that position has already been parsed
    while True:
        # get next tag position
        # TODO: check it's a real tag and not a fragment that should added to that leafnode
        pt1 = buf.find(u'<', pos)
        if pt1 == -1:
            # no more tags: the rest of the buffer is one last leaf
            yield Leaf(buf[pos:])
            return
        if pt1 != pos:
            yield Leaf(buf[pos:pt1])

        tag, pos = get_next_tag(pt1)
        if tag is None:
            # the buffer ended inside a tag: there is nothing usable left
            return
        yield tag
203
204
def html_parse(page):
    """
    This function fetches the nodes from the lexer and assemble them in a node tree

    Returns the root Tag (Node.Flags.ROOT set, empty name). Leaf nodes and
    opening tags are appended to the children of the current father; an
    opening tag becomes the new father unless it is flagged TI_EMPTY. A
    closing tag marks the nearest open tag of the same name as CLOSED and
    makes the father of that tag current again; the closing tag itself is not
    stored. A closing tag that matches no open tag is discarded.
    """
    root = Tag()
    root.flags = Node.Flags.ROOT
    father = root
    for node in html_lexer(page):
        if node is None:
            # Defensive: a lexer may hand out None for a tag cut short by the
            # end of the input; there is nothing to attach
            continue
        if isinstance(node, Leaf):
            node.father = father
            father.children.append(node)
        elif node.flags & Node.Flags.CLOSING:
            # change current father
            # The walk stops at the root: the root must never be closed, even
            # by a closing tag with an empty name such as </>, otherwise
            # father would become None and the next node could not be attached
            newfather = father
            while not newfather.flags & Node.Flags.ROOT:
                if newfather.name == node.name:
                    newfather.flags |= Node.Flags.CLOSED
                    if VERBOSE_PARSER:
                        if newfather is not father:
                            print >> sys.stderr, 'Closing tag', node, 'has auto-closed other nodes',
                            deb = father
                            while deb is not newfather:
                                print >> sys.stderr, deb,
                                deb = deb.father
                            print >> sys.stderr
                    father = newfather.father
                    break
                newfather = newfather.father
            else:
                # reached the root without finding a matching open tag
                #TODO: log.debug()
                if VERBOSE_PARSER:
                    print >> sys.stderr, 'Closing tag', node, 'does not match any opening tag. Discarding.'
        else:
            node.father = father
            father.children.append(node)
            #print 'node=',node,'info=',node.get_tag_info()
            if not node.get_tag_info() & TI_EMPTY:
                father = node
        #print 'node=',node,'father=',father
    return root
247
248
249 def print_idented_tree(node, identation_level=-1):
250     if not node.flags & Node.Flags.ROOT:
251         print '   '*identation_level+repr(node)
252     for c in node.children:
253         print_idented_tree(c, identation_level+1)
254     if isinstance(node, Tag) and (node.flags&Node.Flags.CLOSED):
255         print '   '*identation_level+'</'+node.name.encode('utf8')+'>'
256
257 def print_lexer_tree(p):
258     identing = 0
259     for item in html_lexer(p):
260         if isinstance(item, Tag) and item.flags & Node.Flags.CLOSING:
261             identing -= 1
262         print '   '*identing,
263         if isinstance(item, Tag) and not item.flags & Node.Flags.CLOSING:
264             identing += 1
265         print repr(item)
266
267
def get_elem(root, tagname):
    """
    Returns the list of all the elements whose name is tagname, looking at
    root itself and then down the tree.
    The search does not go below a matching element.
    """
    if isinstance(root, Leaf):
        return []
    if root.name == tagname:
        return [ root ]
    found = []
    for child in root.children:
        found.extend(get_elem(child, tagname))
    return found
283         
284
def split_table(table):
    """
    Returns the table as a list of rows, each row being the list of its td
    elements. Rows are the tr elements found by get_elem.
    """
    return [ list(get_elem(tr, u'td')) for tr in get_elem(table, u'tr') ]
296
def split_table_r_to_leaf(root):
    """
    Recursively splits tables as described in split_table.
    When there is no table below root, returns the merged leaf text of root.
    Otherwise returns one list per table, made of rows, made of columns, each
    column being the result of the same function applied to that cell.
    """
    tables = get_elem(root, u'table')
    if not tables:
        return get_merged_leaf_content(root)
    return [
        [
            [ split_table_r_to_leaf(cell) for cell in row ]
            for row in split_table(table)
        ]
        for table in tables
    ]
316         
317
def get_merged_leaf_content(root):
    """
    Returns the text of all the leaves found below root, concatenated in
    document order. For a Leaf, that is simply its text.
    """
    if isinstance(root, Leaf):
        return root.text
    pieces = [ get_merged_leaf_content(child) for child in root.children ]
    return u''.join(pieces)


# Same function under a second name
get_inner_text = get_merged_leaf_content
332
333
def ets_decode_html(event_html):
    """
    Decodes the HTML of one event page into a dictionary.

    event_html must be a unicode string (asserted). The page is parsed, its
    nested tables are turned into nested lists of cleaned-up text, and known
    parts of that structure are copied into the result:
    - u'GSM No', u'Unit name', u'SIMICC' taken from the header
    - one key per row of the first block (first cell is the key, third cell
      the value)
    - u'cells' when the next block starts with u'Cell No.'
    - 'decoding', u'IncomingData ID', 'debug', 'positions_related',
      'outgoing_requests' depending on the named blocks that follow
    Blocks with any other name are printed on stdout as unsupported.

    Every position used below is hard-coded; an IndexError or AssertionError
    is raised when the page does not have the expected structure.
    NOTE(review): 'ets' presumably names the system producing these pages;
    the page layout itself is not visible from this file.
    """
    def ets_cleanup(target):
        # Recursively removes u'\xa0' and strips every string of the nested lists
        if isinstance(target, unicode):
            return target.replace(u'\xa0',u'').strip()
        else:
            return [ ets_cleanup(i) for i in target ]

    def ets_print(prefix, target):
        # Debug helper: prints a nested list structure on stdout.
        # Non empty strings and flat lists are printed after prefix; a flat
        # list holding only one empty string is skipped; nested lists are
        # walked with '-' and the index added to the prefix.
        has_sublists = False
        for i in target:
            if isinstance(i, list):
                has_sublists = True
        if isinstance(target, unicode):
            if target:
                print prefix, repr(target.encode('utf8'))
            return
        if not has_sublists:
            if len(target) == 1 and target[0]==u'':
                return
            print prefix, [ i.encode('utf8') for i in target ]
            return
        for i,sub in enumerate(target):
            ets_print(prefix+u'-'+unicode(i), sub)

    def list_pack(root):
        # Recursively drops empty items (empty strings, empty lists) and
        # replaces any list left with a single item by that item
        if isinstance(root, unicode):
            return root
        result = []
        for i in root:
            i = list_pack(i)
            if i and i!=u'':
                result.append(i)
        if len(result)==1:
            result = result[0]
        return result

    assert isinstance(event_html, unicode)
    root = html_parse(event_html)
    lists = ets_cleanup(split_table_r_to_leaf(root))
    # first top level table, second row, second cell
    main = lists[0][1][1]
    
    result = {}
    
    header = main[0][0][0][0][0]
    #ets_print(u'header', header)
    result[u'GSM No'] = header[1]
    result[u'Unit name'] = header[2][1:] # skip '-'
    result[u'SIMICC'] = header[4]

    #ets_print(u'block1', main[0])
    # First block: rows with more than one cell are key / value pairs, the
    # value being the third cell
    for row in main[0]:
        if len(row)>1:
            result[row[0]] = row[2]
    del main[0]

    # FIXME move in the main loop below
    if main[0][0][0] == u'Cell No.':
        #ets_print('Cells',  main[0])
        # The first row of the block holds the column names, each following
        # row becomes a dictionary
        cells = []
        for row in main[0][1:]:
            if len(row)<2:
                continue
            cell = {}
            for i, key in enumerate(main[0][0]):
                if not key or not i: # skip "Cell No." column
                    continue
                value = row[i]
                if i==8:
                    if value: # In some rare case, RSSI is invalid see event 9547789
                        value = value[0][0][0] # RSSI
                cell[key] = value
            cells.append(cell)
        result[u'cells'] = cells
        del main[0]

    # Remaining blocks: the packed first row is the name of the block
    for block in main:
        blockname = list_pack(block[0])
        assert isinstance(blockname, unicode)
        packed_block = list_pack(block)

        if blockname == u'Decoding of message':
            # packed_block[1] is the column header, the following rows are data
            assert packed_block[1] == [ u'Parameter', u'Value', u'Para', u'UC', u'Description' ]
            decoding = []
            for row in packed_block[2:]:
                if not isinstance(row, list):
                    continue # not interested in incomplete information such as 'IncomingData ID' with no data!
                if row[0] == 'IncomingData ID':
                    result[u'IncomingData ID'] = row[1]
                    continue
                if len(row) != len(packed_block[1]):
                    continue # discard lines that have incorrect length, such as RSSI request to propos
                line = {}
                for i, key in enumerate(packed_block[1]):
                    line[key] = row[i]
                decoding.append(line)
            result['decoding'] = decoding

        elif blockname == u'Message contains debug information':
            # every row after the name must pack down to exactly a (key, value) pair
            debug={}
            for k,v in packed_block[1:]:
                debug[k] = v
            result['debug'] = debug
        elif blockname == u'Positions related':
            # packed_block[1] is the column header, the following rows are data
            assert packed_block[1] == [ u'ID', u'TimeStamp', u'PosType', u'LAT', u'LON', u'Status' ]
            positions_related=[]
            for row in packed_block[2:]:
                line = {}
                for i, key in enumerate(packed_block[1]):
                    line[key] = row[i]
                positions_related.append(line)
            result['positions_related'] = positions_related
        elif blockname == u'Outgoing requests':
            # Works on the unpacked block so that column positions are kept:
            # row 1 must be empty, row 2 is the column header
            assert not list_pack(block[1])
            table_header = block[2]
            assert list_pack(table_header) == [ u'Time', u'Outg.ID', u'ParaNo', u'Parameter', u'Value', u'Status', u'User', u'UC' ]
            
            result_table = []
            for row in block[3:]:
                if row == [ u'' ]:
                    continue # ignore empty lines
                result_line = {}
                for i, key in enumerate(table_header):
                    if not key:
                        continue # ignore empty columns
                    result_line[key] = row[i]
                result_table.append(result_line)

            result['outgoing_requests'] = result_table

        else:
            ets_print(u'unsupported block - ]', packed_block)

    return result
467
if __name__ == "__main__":
    # Command line entry point: debugging dumps of the lexer or parser output
    # for the file given as first argument
    parser = OptionParser()
    parser.add_option("--dump-lexer", help="Debug: Dump idented lexer output", action='store_true', dest='lexer_dump', default=False)
    parser.add_option("--dump-parser", help="Debug: Dump idented parser output", action='store_true', dest='parser_dump', default=False)
    parser.add_option("--verbose-parser", help="Debug: Verbose parser errors", action='store_true', dest='verbose_parser', default=False)
    (options, args) = parser.parse_args()

    try:
        filename = args[0]
    except IndexError:
        print >> sys.stderr, 'Need a filename'
        sys.exit(-1)

    VERBOSE_PARSER = options.verbose_parser

    # Read the whole file as utf-8; the handle is closed explicitly, even
    # when reading or decoding fails
    source = open(filename)
    try:
        p = unicode(source.read(), 'utf-8')
    finally:
        source.close()

    # --dump-lexer wins over --dump-parser; without any of them nothing is printed
    if options.lexer_dump:
        print_lexer_tree(p)
        sys.exit(0)

    if options.parser_dump:
        root = html_parse(p)
        print_idented_tree(root)
        sys.exit(0)
492