# -*- encoding: utf-8 -*-
from __future__ import division
import sys
import htmlentities
from optparse import OptionParser
TI_EMPTY = 1 # there's no content in these tags, i.e. assume <tagname ... />
        ROOT = 1    # this is the root node; there can be only one root
        CLOSING = 2 # this is a closing tag such as </b>; such tags from the lexer are discarded by the parser
        CLOSED = 4  # this node is closed; uncleaned output will only have a closing tag if this flag is present
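        # Illustrative note: these flags combine and are tested bitwise, e.g.
        #   node.flags |= Node.Flags.CLOSED           # set a flag
        #   if node.flags & Node.Flags.CLOSING: ...   # test a flag
        # so a node that is both root and closed carries ROOT | CLOSED == 5.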
    def get_tag_info(self):
        Returns the TI_ flags based on the tag name
        return taginfo.get(self.name, 0)
    def __unicode__(self):
        #assert self.name != u''
        if self.flags & Node.Flags.CLOSING:
        for k, v in self.attributes.iteritems():
            #result += u' (('+k+u'))'
            result += u'="'+v.replace(u'\\', u'\\\\').replace(u'"', u'\\"')+'"'
        #return 'Tag'+unicode(self).encode('utf8')
        return unicode(self).encode('utf8')
    # TODO: rename this to CDATA or whatever
    def __init__(self, text):
        self.text = htmlentities.resolve(text)
    def __unicode__(self):
        return self.text # FIXME escape ?
        #return 'Leaf<'+repr(self.text.encode('utf8'))+'>'
        return repr(self.text.encode('utf8'))
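# Example (illustrative): Leaf stores character data with HTML entities
# resolved by htmlentities.resolve(), so something like
#   Leaf(u'caf&eacute;').text  ->  u'caf\xe9'
# assuming the htmlentities module decodes named entities as its name suggests.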
    This iterator yields Nodes with father/children unset
    pos = 0 # everything before this position has already been parsed
    l = len(buf) # length of the whole buffer, constant
        r = buf.find(token, pos)
        state_white_skiping = False
        p = pos # will start with skipping '<'
            return None, p # what about last?
            if state_white_skiping:
                state_white_skiping = False
                    tag.flags |= Node.Flags.CLOSING
                    tag.name += c.lower()
            elif state == 'NAME':
                if ord(c) <= 32 or c == u'/':
                    state_white_skiping = True
                    tag.name += c.lower()
            elif state == 'ATT_NAME':
                    state_white_skiping = True
                    state_white_skiping = True
                    tag.attributes[att_name] = u''
                    att_name += c.lower()
            elif state == 'ATT_EQUALS':
                    state_white_skiping = True
                    tag.attributes[att_name] = u''
                    tag.attributes[att_name] = u''
                    state_white_skiping = True
            elif state == 'ATT_VALUE':
                if att_value == u'': # first char
                    if c == u'"' or c == u"'":
                        state = 'ATT_VALUE_QUOTED'
                    tag.attributes[att_name] = att_value
                    state_white_skiping = True
                    tag.attributes[att_name] = att_value
            elif state == 'ATT_VALUE_QUOTED':
                if c == att_value_escape:
                    tag.attributes[att_name] = att_value
                    state_white_skiping = True
        # get the next tag position
        # TODO: check that it's a real tag and not a fragment that should be added to that leaf node
            yield Leaf(buf[pos:pt1])
        tag, pos = get_next_tag()
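# Example (illustrative sketch): html_lexer() flattens markup into a stream of
# nodes with father/children still unset, e.g.
#   for node in html_lexer(u'<p class="x">hi</p>'):
#       print repr(node)
# should yield an opening Tag 'p' with attributes {u'class': u'x'}, a Leaf
# u'hi', and a Tag 'p' carrying Node.Flags.CLOSING; exact whitespace and
# attribute handling follow the state machine above.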
def html_parse(page):
    This function fetches the nodes from the lexer and assembles them into a node tree
    root.flags = Node.Flags.ROOT
    for node in html_lexer(page):
        if isinstance(node, Leaf):
            father.children.append(node)
        elif node.flags & Node.Flags.CLOSING:
            # change the current father
            # TODO: optimize with Node.Flags.ROOT
            if newfather is None:
                print >> sys.stderr, 'Closing tag', node, 'does not match any opening tag. Discarding.'
            if newfather.name == node.name:
                newfather.flags |= Node.Flags.CLOSED
                if newfather != father:
                    print >> sys.stderr, 'Closing tag', node, 'has auto-closed other nodes',
                    while deb != newfather:
                        print >> sys.stderr, deb,
                father = newfather.father
            newfather = newfather.father
            father.children.append(node)
            #print 'node=', node, 'info=', node.get_tag_info()
            if not node.get_tag_info() & TI_EMPTY:
            #print 'node=', node, 'father=', father
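# Example (illustrative): html_parse() is tolerant of sloppy markup; for
# u'<ul><li>a<li>b</ul>' the closing </ul> does not match the innermost open
# node, so the parser walks up the father chain, reports the auto-closed
# nodes on stderr, and makes <ul>'s father the current father again.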
def print_idented_tree(node, identation_level=-1):
    if not node.flags & Node.Flags.ROOT:
        print ' '*identation_level+repr(node)
    for c in node.children:
        print_idented_tree(c, identation_level+1)
    if isinstance(node, Tag) and (node.flags & Node.Flags.CLOSED):
        print ' '*identation_level+'</'+node.name.encode('utf8')+'>'
def print_lexer_tree(p):
    for item in html_lexer(p):
        if isinstance(item, Tag) and item.flags & Node.Flags.CLOSING:
        if isinstance(item, Tag) and not item.flags & Node.Flags.CLOSING:
def get_elem(root, tagname):
    Returns all the elements whose name matches,
    but does not descend into the children of those matches
    if isinstance(root, Leaf):
    if root.name == tagname:
    for node in root.children:
        match = get_elem(node, tagname)
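# Example (illustrative): get_elem() stops at the first matching depth, so on
# a parsed u'<table><tr><td><table>...</table></td></tr></table>' the call
# get_elem(root, u'table') returns only the outer table node; the inner table
# is found by calling get_elem() again on that result.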
def split_table(table):
    Returns table content as a list (rows) of lists (columns)
    for tr in get_elem(table, u'tr'):
        for td in get_elem(tr, u'td'):
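# Example (illustrative, assuming a plain 2x2 table): for the parsed markup
# u'<table><tr><td>a</td><td>b</td></tr><tr><td>c</td><td>d</td></tr></table>'
# split_table() returns two rows of two cells each; the cells are still node
# objects, not text (use get_merged_leaf_content() to flatten them).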
def split_table_r_to_leaf(root):
    Recursively splits tables as described in split_table,
    but only returns leaf text, or nested lists for sub-tables
    tables = get_elem(root, u'table')
        return get_merged_leaf_content(root)
        for row in split_table(table):
                subr = split_table_r_to_leaf(col)
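# Example (illustrative): for a table whose first cell holds plain text and
# whose second cell holds another 1x2 table, split_table_r_to_leaf() should
# return something shaped like
#   [[u'cell text', [[u'inner a', u'inner b']]]]
# plain cells collapse to their merged leaf text, nested tables become nested lists.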
def get_merged_leaf_content(root):
    Returns all the leaf content aggregated into a single string
    if isinstance(root, Leaf):
    for node in root.children:
        result += get_merged_leaf_content(node)
get_inner_text = get_merged_leaf_content
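# Example (illustrative): get_inner_text() strips the markup and keeps only the
# character data, so for a parsed u'<td>Hello <b>world</b></td>' fragment it
# should return u'Hello world'.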
def ets_decode_html(event_html):
    def ets_cleanup(target):
        if isinstance(target, unicode):
            return target.replace(u'\xa0', u'').strip()
        return [ ets_cleanup(i) for i in target ]
    def ets_print(prefix, target):
            if isinstance(i, list):
        if isinstance(target, unicode):
            print prefix, repr(target.encode('utf8'))
        if len(target) == 1 and target[0] == u'':
        print prefix, [ i.encode('utf8') for i in target ]
        for i, sub in enumerate(target):
            ets_print(prefix+u'-'+unicode(i), sub)
        if isinstance(root, unicode):
    assert isinstance(event_html, unicode)
    root = html_parse(event_html)
    lists = ets_cleanup(split_table_r_to_leaf(root))
    main = lists[0][1][1]
    header = main[0][0][0][0][0]
    #ets_print(u'header', header)
    result[u'GSM No'] = header[1]
    result[u'Unit name'] = header[2][1:] # skip '-'
    result[u'SIMICC'] = header[4]
    #ets_print(u'block1', main[0])
        result[row[0]] = row[2]
    # FIXME: move this into the main loop below
    if main[0][0][0] == u'Cell No.':
        #ets_print('Cells', main[0])
        for row in main[0][1:]:
            for i, key in enumerate(main[0][0]):
                if not key or not i: # skip the "Cell No." column
                if value: # in some rare cases RSSI is invalid, see event 9547789
                    value = value[0][0][0] # RSSI
        result[u'cells'] = cells
        blockname = list_pack(block[0])
        assert isinstance(blockname, unicode)
        packed_block = list_pack(block)
        if blockname == u'Decoding of message':
            assert packed_block[1] == [ u'Parameter', u'Value', u'Para', u'UC', u'Description' ]
            for row in packed_block[2:]:
                if not isinstance(row, list):
                    continue # not interested in incomplete information such as 'IncomingData ID' with no data!
                if row[0] == 'IncomingData ID':
                    result[u'IncomingData ID'] = row[1]
                if len(row) != len(packed_block[1]):
                    continue # discard lines that have incorrect length, such as RSSI request to propos
                for i, key in enumerate(packed_block[1]):
                decoding.append(line)
            result['decoding'] = decoding
        elif blockname == u'Message contains debug information':
            for k, v in packed_block[1:]:
            result['debug'] = debug
        elif blockname == u'Positions related':
            assert packed_block[1] == [ u'ID', u'TimeStamp', u'PosType', u'LAT', u'LON', u'Status' ]
            for row in packed_block[2:]:
                for i, key in enumerate(packed_block[1]):
                positions_related.append(line)
            result['positions_related'] = positions_related
        elif blockname == u'Outgoing requests':
            assert not list_pack(block[1])
            table_header = block[2]
            assert list_pack(table_header) == [ u'Time', u'Outg.ID', u'ParaNo', u'Parameter', u'Value', u'Status', u'User', u'UC' ]
            for row in block[3:]:
                    continue # ignore empty lines
                for i, key in enumerate(table_header):
                        continue # ignore empty columns
                    result_line[key] = row[i]
                result_table.append(result_line)
            result['outgoing_requests'] = result_table
            ets_print(u'unsupported block - ]', packed_block)
if __name__ == "__main__":
    parser = OptionParser()
    parser.add_option("--dump-lexer", help="Debug: Dump indented lexer output", action='store_true', dest='lexer_dump', default=False)
    parser.add_option("--dump-parser", help="Debug: Dump indented parser output", action='store_true', dest='parser_dump', default=False)
    parser.add_option("--verbose-parser", help="Debug: Verbose parser errors", action='store_true', dest='verbose_parser', default=False)
    (options, args) = parser.parse_args()
        print >> sys.stderr, 'Need a filename'
    VERBOSE_PARSER = options.verbose_parser
    p = unicode(file(filename).read(), 'utf-8')
    if options.lexer_dump:
    if options.parser_dump:
        print_idented_tree(root)
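# Example invocation (illustrative; the script file name is a placeholder):
#   python ets_parser.py --dump-parser event_page.html
# prints the indented parse tree of the given UTF-8 HTML file, while
# --dump-lexer dumps the indented lexer output instead.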