2 # -*- encoding: utf-8 -*-
4 import sys, htmlentities
5 from optparse import OptionParser
9 TI_EMPTY = 1 # there's not content in these tags, ie assume <tagname ... / >
20 ROOT = 1 # this is the root node. There can be only one root
21 CLOSING = 2 # this is a closing tag such as </b>. This tags from the lexer are discarded by the parser
22 CLOSED = 4 # this is closed. Uncleaned output will only have closing tag if that flag is present.
35 def get_tag_info(self):
37 Returns TI_ flags based on the name of the node
39 return taginfo.get(self.name, 0)
42 #assert self.name != u''
44 if self.flags & Node.Flags.CLOSING:
47 for k,v in self.attributes.items():
48 #result += u' (('+k+u'))'
51 result += '="'+v.replace('\\', '\\\\').replace('"', '\\"')+'"'
56 # return 'Tag'+unicode(self).encode('utf8')
59 # TODO: rename this to CDATA or whatever
60 def __init__(self, text):
62 self.text = htmlentities.resolve(text)
64 return self.text # FIXME escape ?
66 # return 'Leaf<'+repr(self.text.encode('utf8'))+'>'
71 That iterator yields Nodes with father/children unset
74 pos = 0 # everything before that position has already been parsed
75 l = len(buf) # constant length
79 r = buf.find(token, pos)
86 state_white_skiping = False
87 p = pos # will start with skipping '<'
92 return None, p # what about last?
95 if state_white_skiping:
99 state_white_skiping = False
103 tag.flags += Node.Flags.CLOSING
109 tag.name += c.lower()
111 elif state == 'NAME':
112 if ord(c)<=32 or c=='/':
115 state_white_skiping = True
120 tag.name += c.lower()
122 elif state == 'ATT_NAME':
125 state_white_skiping = True
129 state_white_skiping = True
134 tag.attributes[att_name] = ''
137 att_name += c.lower()
139 elif state == 'ATT_EQUALS':
144 state_white_skiping = True
149 tag.attributes[att_name] = ''
153 tag.attributes[att_name] = ''
156 state_white_skiping = True
158 elif state == 'ATT_VALUE':
159 if att_value == '': # first char
160 if c == '"' or c == "'":
162 state = 'ATT_VALUE_QUOTED'
165 tag.attributes[att_name] = att_value
167 state_white_skiping = True
171 tag.attributes[att_name] = att_value
176 elif state == 'ATT_VALUE_QUOTED':
177 if c == att_value_escape:
178 tag.attributes[att_name] = att_value
180 state_white_skiping = True
188 # get next tag position
189 # TODO: check it's a real tag and not a fragment that should be added to that leaf node
192 yield Leaf(buf[pos:pt1])
197 tag, pos = get_next_tag()
201 def html_parse(page):
203 This function fetches the nodes from the lexer and assembles them into a node tree
206 root.flags = Node.Flags.ROOT
208 for node in html_lexer(page):
209 if isinstance(node, Leaf):
211 father.children.append(node)
212 elif node.flags & Node.Flags.CLOSING:
213 # change current father
216 # TODO: optimize with Node.Flags.ROOT
217 if newfather is None:
220 print('Closing tag', node, 'does not match any opening tag. Discarding.', file=sys.stderr)
222 if newfather.name == node.name:
223 newfather.flags |= Node.Flags.CLOSED
225 if newfather != father:
226 print('Closing tag', node, 'has auto-closed other nodes', end=' ', file=sys.stderr)
228 while deb != newfather:
229 print(deb, end=' ', file=sys.stderr)
231 print(file=sys.stderr)
232 father = newfather.father
234 newfather = newfather.father
237 father.children.append(node)
238 #print 'node=',node,'info=',node.get_tag_info()
239 if not node.get_tag_info() & TI_EMPTY:
241 #print 'node=',node,'father=',father
def print_idented_tree(node, identation_level=-1):
    """
    Recursively print node and its descendants, one entry per line.

    Each printed line is prefixed with identation_level spaces, and every
    level of children is printed with one more space than its parent.
    A node carrying the ROOT flag is not printed itself; with the default
    level of -1 its direct children therefore get an empty prefix.
    After the children of a Tag flagged CLOSED, a '</name>' line is
    printed using the tag's own prefix.
    """
    is_root = node.flags & Node.Flags.ROOT
    if not is_root:
        print(' ' * identation_level + repr(node))

    # Descend before emitting the closing line so it follows the subtree.
    for child in node.children:
        print_idented_tree(child, identation_level + 1)

    # Only Tag nodes can produce a closing line.
    if not isinstance(node, Tag):
        return
    if node.flags & Node.Flags.CLOSED:
        prefix = ' ' * identation_level
        print(prefix + '</' + node.name + '>')
253 def print_lexer_tree(p):
255 for item in html_lexer(p):
256 if isinstance(item, Tag) and item.flags & Node.Flags.CLOSING:
258 print(' '*identing, end=' ')
259 if isinstance(item, Tag) and not item.flags & Node.Flags.CLOSING:
264 def get_elem(root, tagname):
266 Returns all the elements whose name matches
267 But not from the children of those
269 if isinstance(root, Leaf):
271 if root.name == tagname:
274 for node in root.children:
275 match = get_elem(node, tagname)
281 def split_table(table):
283 Returns table content as a list (rows) of list (columns)
286 for tr in get_elem(table, 'tr'):
288 for td in get_elem(tr, 'td'):
293 def split_table_r_to_leaf(root):
295 Recursively split tables as described in split_table
296 Only returns leaf text or list for sub tables
299 tables = get_elem(root, 'table')
301 return get_merged_leaf_content(root)
304 for row in split_table(table):
307 subr = split_table_r_to_leaf(col)
314 def get_merged_leaf_content(root):
316 Returns all the leaf content aggregated in a string
318 if isinstance(root, Leaf):
322 for node in root.children:
323 result += get_merged_leaf_content(node)
327 if __name__ == "__main__":
328 parser = OptionParser()
329 parser.add_option("--dump-lexer", help="Debug: Dump idented lexer output", action='store_true', dest='lexer_dump', default=False)
330 parser.add_option("--dump-parser", help="Debug: Dump idented parser output", action='store_true', dest='parser_dump', default=False)
331 parser.add_option("--verbose-parser", help="Debug: Verbose parser errors", action='store_true', dest='verbose_parser', default=False)
332 (options, args) = parser.parse_args()
337 print('Need a filename', file=sys.stderr)
340 VERBOSE_PARSER = options.verbose_parser
341 p = file(filename, encoding='utf-8').read()
343 if options.lexer_dump:
347 if options.parser_dump:
349 print_idented_tree(root)