Ignore GOCR error message when image is empty
[banquepostale.git] / html_parser.py
1 #!/usr/bin/env python
2 # -*- encoding: utf-8 -*-
3
4 import sys, htmlentities
5 from optparse import OptionParser
6
# When true, html_parse() reports unmatched / auto-closing tags on stderr.
VERBOSE_PARSER = False

# Tag-info flag: there is no content in these tags, i.e. assume <tagname ... />.
TI_EMPTY = 1

# TI_* flags per tag name; names missing from this table have no flags.
taginfo = dict.fromkeys(('meta', 'link', 'br', 'img', 'hr'), TI_EMPTY)
17
class Node:
    """Base class of the tree items: holds the father/children links and flags."""

    class Flags:
        """Bit values combined in Node.flags."""
        ROOT = 1     # this is the root node. There can be only one root
        CLOSING = 2  # closing tag such as </b>. The parser discards these lexer tags
        CLOSED = 4   # node was closed. Uncleaned output only has a closing tag if set

    def __init__(self):
        self.flags = 0
        self.children = []
        self.father = None
28
class Tag(Node):
    """A tag node: a name plus a dict mapping attribute names to values."""

    def __init__(self):
        Node.__init__(self)
        self.name = ''
        self.attributes = {}

    def get_tag_info(self):
        """
        Returns the TI_ flags registered for the name of the tag (0 if none)
        """
        return taginfo.get(self.name, 0)

    def __repr__(self):
        # Pieces are collected in output order and joined at the end.
        slash = '/' if self.flags & Node.Flags.CLOSING else ''
        pieces = ['<', slash, self.name]
        for att_name, att_value in self.attributes.items():
            pieces.append(' ' + att_name)
            if att_value:
                # Backslashes are doubled first, then double quotes are escaped.
                escaped = att_value.replace('\\', '\\\\').replace('"', '\\"')
                pieces.append('="' + escaped + '"')
        pieces.append('>')
        return ''.join(pieces)
54
55     #def __repr__(self):
56     #    return 'Tag'+unicode(self).encode('utf8')
57
class Leaf(Node):
    """Text node; the text is stored after going through htmlentities.resolve()."""
    # TODO: rename this to CDATA or whatever

    def __init__(self, text):
        super().__init__()
        self.text = htmlentities.resolve(text)

    def __repr__(self):
        # FIXME escape ?
        return self.text
65     #def __repr__(self):
66     #    return 'Leaf<'+repr(self.text.encode('utf8'))+'>'
67
68
def html_lexer(page):
    """
    That iterator yields Nodes with father/children unset

    Text found between tags is yielded as Leaf, each <...> construct as Tag
    (closing tags carry Node.Flags.CLOSING). Tag and attribute names are
    lower-cased; attribute values are kept verbatim. A tag that is cut short
    by the end of the input is dropped and ends the iteration.
    """
    buf = page # buffer
    pos = 0 # everything before that position has already been parsed
    length = len(buf) # constant length

    def is_white(c):
        # Whitespace and control characters all act as separators inside a tag.
        return ord(c) <= 32

    def get_next_tag():
        """
        Parses the tag whose '<' sits at buf[pos].
        Returns (tag, position just after '>'), or (None, position) when the
        input ends before the closing '>'.
        """
        state = 'INIT'
        skipping_white = False
        att_name = ''
        att_value = ''
        att_value_escape = ''
        p = pos # will start with skipping '<'
        tag = Tag()
        while True:
            p += 1
            if p >= length: # EOS
                return None, p
            c = buf[p]

            if skipping_white:
                if is_white(c):
                    continue
                skipping_white = False

            if state == 'INIT':
                if c == '/':
                    # |= and not +=: a repeated '/' must not add up to CLOSED
                    tag.flags |= Node.Flags.CLOSING
                elif c == '>':
                    return tag, p+1
                else:
                    state = 'NAME'
                    tag.name += c.lower()
            elif state == 'NAME':
                if is_white(c) or c == '/':
                    state = 'ATT_NAME'
                    att_name = ''
                    skipping_white = True
                elif c == '>':
                    return tag, p+1
                else:
                    tag.name += c.lower()
            elif state == 'ATT_NAME':
                if is_white(c):
                    # the name is complete; an optional '=' may still follow
                    state = 'ATT_EQUALS'
                    skipping_white = True
                elif c == '=':
                    state = 'ATT_VALUE'
                    skipping_white = True
                    att_value = ''
                elif c == '>':
                    if att_name != '':
                        tag.attributes[att_name] = ''
                    return tag, p+1
                else:
                    att_name += c.lower()
            elif state == 'ATT_EQUALS':
                if is_white(c):
                    continue
                elif c == '=':
                    state = 'ATT_VALUE'
                    skipping_white = True
                    att_value = ''
                elif c == '>':
                    if att_name != '':
                        tag.attributes[att_name] = ''
                    return tag, p+1
                else:
                    # No '=' came: the previous attribute has no value and c
                    # starts the next name. Whitespace must NOT be skipped
                    # here, otherwise "b c d" would merge 'c' and 'd' into 'cd'.
                    if att_name != '':
                        tag.attributes[att_name] = ''
                    state = 'ATT_NAME'
                    att_name = c.lower()
            elif state == 'ATT_VALUE':
                if att_value == '' and (c == '"' or c == "'"):
                    # first char is a quote: the value runs up to the same quote
                    att_value_escape = c
                    state = 'ATT_VALUE_QUOTED'
                elif is_white(c):
                    # An unquoted value ends at any whitespace, space included
                    # (same test as the other states).
                    tag.attributes[att_name] = att_value
                    state = 'ATT_NAME'
                    skipping_white = True
                    att_name = ''
                elif c == '>':
                    tag.attributes[att_name] = att_value
                    return tag, p+1
                else:
                    att_value += c
            elif state == 'ATT_VALUE_QUOTED':
                if c == att_value_escape:
                    tag.attributes[att_name] = att_value
                    state = 'ATT_NAME'
                    skipping_white = True
                    att_name = ''
                else:
                    att_value += c

    while True:
        # get next tag position
        # TODO: check it's a real tag and not a fragment that should added to that leafnode
        pt1 = buf.find('<', pos)
        if pt1 == -1:
            # no tag left: whatever remains (possibly '') is the last leaf
            yield Leaf(buf[pos:])
            return
        if pt1 != pos:
            yield Leaf(buf[pos:pt1])
        pos = pt1

        tag, pos = get_next_tag()
        if tag is None:
            # Unterminated tag at the end of the input: consumers expect
            # Node instances only, so stop instead of yielding None.
            return
        yield tag
199
200
def html_parse(page):
    """
    This function fetches the nodes from the lexer and assemble them in a node tree

    Returns the root Tag (flagged Node.Flags.ROOT). Leaves and opening tags
    are attached to the current father; an opening tag becomes the new father
    unless its tag info has TI_EMPTY. A closing tag is not stored: it flags
    the nearest matching ancestor as CLOSED and makes that ancestor's father
    current again. A closing tag without a matching ancestor is discarded.
    """
    root = Tag()
    root.flags = Node.Flags.ROOT
    father = root
    for node in html_lexer(page):
        if node is None:
            # Not a Node (the lexer's result for a tag cut short by the end
            # of the input, if it yields one): nothing to attach.
            continue
        if isinstance(node, Leaf):
            node.father = father
            father.children.append(node)
        elif node.flags & Node.Flags.CLOSING:
            # Walk up from the current father to the tag being closed. The
            # root is never a candidate: it has an empty name, and closing it
            # would leave no father to attach the following nodes to.
            match = father
            while (match is not None
                   and not match.flags & Node.Flags.ROOT
                   and match.name != node.name):
                match = match.father
            if match is None or match.flags & Node.Flags.ROOT:
                #TODO: log.debug()
                if VERBOSE_PARSER:
                    print('Closing tag', node, 'does not match any opening tag. Discarding.', file=sys.stderr)
                continue
            match.flags |= Node.Flags.CLOSED
            if VERBOSE_PARSER and match is not father:
                # list the still-open descendants that this tag closes implicitly
                print('Closing tag', node, 'has auto-closed other nodes', end=' ', file=sys.stderr)
                deb = father
                while deb is not match:
                    print(deb, end=' ', file=sys.stderr)
                    deb = deb.father
                print(file=sys.stderr)
            # change current father
            father = match.father
        else:
            node.father = father
            father.children.append(node)
            if not node.get_tag_info() & TI_EMPTY:
                father = node
    return root
243
244
def print_idented_tree(node, identation_level=-1):
    """
    Prints node and its descendants, one per line, three spaces per level
    The root node itself is not printed; tags flagged CLOSED get a closing line
    """
    indent = '   ' * identation_level
    is_root = bool(node.flags & Node.Flags.ROOT)
    if not is_root:
        print(indent + repr(node))
    for child in node.children:
        print_idented_tree(child, identation_level + 1)
    if isinstance(node, Tag) and node.flags & Node.Flags.CLOSED:
        print(indent + '</' + node.name + '>')
252
def print_lexer_tree(p):
    """
    Prints every item yielded by the lexer for p, one per line
    The indentation grows after an opening tag and shrinks before a closing one
    """
    depth = 0
    for item in html_lexer(p):
        is_tag = isinstance(item, Tag)
        is_closing = is_tag and bool(item.flags & Node.Flags.CLOSING)
        if is_closing:
            depth -= 1
        print('   ' * depth, end=' ')
        if is_tag and not is_closing:
            depth += 1
        print(repr(item))
262
263
def get_elem(root, tagname):
    """
    Returns the list of elements under root (root included) whose name is tagname
    The search does not go down into a matching element
    """
    if isinstance(root, Leaf):
        return []
    if root.name == tagname:
        return [root]
    found = []
    for child in root.children:
        found.extend(get_elem(child, tagname))
    return found
279         
280
def split_table(table):
    """
    Returns the table content as a list of rows, each row being the list of
    its 'td' elements (as found by get_elem)
    """
    return [list(get_elem(tr, 'td')) for tr in get_elem(table, 'tr')]
292
def split_table_r_to_leaf(root):
    """
    Recursively splits tables as described in split_table
    Without any table under root, returns the merged leaf text of root;
    otherwise returns one list of rows of columns per table, each column
    being split again the same way
    """
    tables = get_elem(root, 'table')
    if not tables:
        return get_merged_leaf_content(root)
    return [
        [[split_table_r_to_leaf(col) for col in row] for row in split_table(table)]
        for table in tables
    ]
312         
313
def get_merged_leaf_content(root):
    """
    Returns the text of all the leaves under root, concatenated in tree order
    """
    if isinstance(root, Leaf):
        return root.text
    return ''.join(get_merged_leaf_content(child) for child in root.children)
325
326
if __name__ == "__main__":
    # Command line debugging tool: dumps the lexer or parser output for a file.
    parser = OptionParser()
    parser.add_option("--dump-lexer", help="Debug: Dump idented lexer output", action='store_true', dest='lexer_dump', default=False)
    parser.add_option("--dump-parser", help="Debug: Dump idented parser output", action='store_true', dest='parser_dump', default=False)
    parser.add_option("--verbose-parser", help="Debug: Verbose parser errors", action='store_true', dest='verbose_parser', default=False)
    (options, args) = parser.parse_args()

    try:
        filename = args[0]
    except IndexError:
        print('Need a filename', file=sys.stderr)
        sys.exit(-1)

    VERBOSE_PARSER = options.verbose_parser
    # open() and not file(): the file builtin does not exist in Python 3.
    # The with block also closes the handle once the content is read.
    with open(filename, encoding='utf-8') as source_file:
        p = source_file.read()

    if options.lexer_dump:
        print_lexer_tree(p)
        sys.exit(0)

    if options.parser_dump:
        root = html_parse(p)
        print_idented_tree(root)
        sys.exit(0)
351