2 # -*- encoding: utf-8 -*-
4 __all__ = ['resolve', 'expand', 'cleanCDATA']
261 entities_autocomplete = {}
264 for key,value in entities.iteritems():
266 entities_autocomplete[key] = value
268 if l>longestEntityLen:
271 # Characters in range 127-159 are illegal, but they are sometimes wrongly used in web pages
272 # Internet Explorer assumes they are taken from the Microsoft extension to Latin-1 (ISO 8859-1), aka CP1252
273 # However, to be clean, we must remap them to their real unicode values
274 # Unknown codes are translated into a space
277 8364, # 128: Euro symbol
279 8218, # 130: Single Low-9 Quotation Mark
280 402, # 131: Latin Small Letter F With Hook
281 8222, # 132: Double Low-9 Quotation Mark
282 8230, # 133: Horizontal Ellipsis
284 8225, # 135: Double Dagger
285 710, # 136: Modifier Letter Circumflex Accent
286 8240, # 137: Per Mille Sign
287 352, # 138: Latin Capital Letter S With Caron
288 8249, # 139: Single Left-Pointing Angle Quotation Mark
289 338, # 140: Latin Capital Ligature OE
291 381, # 142: Latin Capital Letter Z With Caron
294 8216, # 145: Left Single Quotation Mark
295 8217, # 146: Right Single Quotation Mark
296 8220, # 147: Left Double Quotation Mark
297 8221, # 148: Right Double Quotation Mark
301 732, # 152: Small Tilde
302 8482, # 153: Trade Mark Sign
303 353, # 154: Latin Small Letter S With Caron
304 8250, # 155: Single Right-Pointing Angle Quotation Mark
305 339, # 156: Latin Small Ligature OE
307 382, # 158: Latin Small Letter Z With Caron
308 376 # 159: Latin Capital Letter Y With Diaeresis
312 def checkForUnicodeReservedChar(value):
315 if value < 127 or value > 159:
317 return iso88591_remap[value-127]
324 oc = checkForUnicodeReservedChar(oc)
325 if oc<32 or c==u'&' or c==u'<' or c==u'>' or c==u'"' or oc>127:
326 result += u'&#'+unicode(oc)+u';'
338 pos = text.find(u'&', prevpos)
344 ## print "Too shoort"
346 # here we are sure the next two chars exist
348 result += text[prevpos:pos]
351 ## print "numeric entity"
352 # This looks like a char whose unicode value is given raw
354 if c == u'x' or c == u'X' and pos < l-3:
355 tmppos = text.find(u';', pos+3)
357 s = text[pos+3: tmppos]
360 value = checkForUnicodeReservedChar(value) # remap unicode char if in range 127-159
361 result += unichr(value)
363 continue # ok, we did it
365 # here, pos is not updated so that the original escape-like sequence is kept unchanged
368 # the given unicode value is decimal
369 # IE behavior: parse until a non-digit char, no conversion if this is not
385 value = checkForUnicodeReservedChar(value); # remap unicode char if in range 127-159
386 result += unichr(value)
388 continue # ok, we did it
390 # here, pos is not updated so that the original escape-like sequence is kept unchanged
393 # here the first character is not a '#'
394 # let's try the known html entities
399 if tmppos >= l or tmppos-pos > longestEntityLen + 1: # 1 more for ';'
400 c2 = entities_autocomplete.get(sb, 0)
405 c2 = entities.get(sb, 0)
407 c2 = entities_autocomplete.get(sb, 0)
415 continue # ok, we did it
417 result += u'&' # something went wrong, just skip is '&'
420 result += text[prevpos:]
423 def cleanCDATA(text):
426 removes useless whites, \r, \n and \t with whites
431 was_white = False # so that first white is not removed
441 return expand(result)
444 if __name__ == '__main__':
446 from optparse import OptionParser
447 parser = OptionParser()
448 parser.add_option("-a", "--action", help="action: resolve, expand or clean [default: %default]", action="store", dest="action", choices=['expand', 'resolve', 'clean'], default='clean')
449 (options, args) = parser.parse_args()
451 print >> sys.stderr, u"Missing required parameter. Try '¢est'"
453 input = unicode(' '.join(args), 'utf-8')
454 if options.action=='resolve':
455 print resolve(input).encode('utf-8')
456 elif options.action=='expand':
458 else: # options.action=='clean':
459 print cleanCDATA(input)