2 # -*- encoding: utf-8 -*-
4 __all__ = ['resolve', 'expand', 'cleanCDATA']
6 from html.entities import name2codepoint as entities
8 entities_autocomplete = {}
10 for key,value in entities.items():
12 entities_autocomplete[key] = value
14 if l>longestEntityLen:
17 # Characters in range 127-159 are illegal, but they are sometimes wrongly used in web pages
18 # Internet Explorer assumes they are taken from the Microsoft extension to Latin-1 (ISO 8859-1), aka CP1252
19 # However, to be clean, we must remap them to their real unicode values
20 # Unknown codes are translated into a space
23 8364, # 128: Euro symbol
25 8218, # 130: Single Low-9 Quotation Mark
26 402, # 131: Latin Small Letter F With Hook
27 8222, # 132: Double Low-9 Quotation Mark
28 8230, # 133: Horizontal Ellipsis
30 8225, # 135: Double Dagger
31 710, # 136: Modifier Letter Circumflex Accent
32 8240, # 137: Per Mille Sign
33 352, # 138: Latin Capital Letter S With Caron
34 8249, # 139: Single Left-Pointing Angle Quotation Mark
35 338, # 140: Latin Capital Ligature OE
37 381, # 142: Latin Capital Letter Z With Caron
40 8216, # 145: Left Single Quotation Mark
41 8217, # 146: Right Single Quotation Mark
42 8220, # 147: Left Double Quotation Mark
43 8221, # 148: Right Double Quotation Mark
47 732, # 152: Small Tilde
48 8482, # 153: Trade Mark Sign
49 353, # 154: Latin Small Letter S With Caron
50 8250, # 155: Single Right-Pointing Angle Quotation Mark
51 339, # 156: Latin Small Ligature OE
53 382, # 158: Latin Small Letter Z With Caron
54 376 # 159: Latin Capital Letter Y With Diaeresis
58 def checkForUnicodeReservedChar(value):
61 if value < 127 or value > 159:
63 return iso88591_remap[value-127]
69 oc = checkForUnicodeReservedChar(oc)
70 if oc<32 or c=='&' or c=='<' or c=='>' or c=='"' or oc>127:
71 result += '&#'+str(oc)+';'
82 pos = text.find('&', prevpos)
90 # here we are sure the next two chars exist
92 result += text[prevpos:pos]
95 ## print "numeric entity"
96 # This looks like a char whose unicode value is given raw
98 if c == 'x' or c == 'X' and pos < l-3:
99 tmppos = text.find(';', pos+3)
101 s = text[pos+3: tmppos]
104 value = checkForUnicodeReservedChar(value) # remap unicode char if in range 127-159
107 continue # ok, we did it
109 # here, pos is not updated so that the original escape-like sequence is kept unchanged
112 # the given unicode value is decimal
113 # IE behavior: parse until non digital char, no conversion if this is not
129 value = checkForUnicodeReservedChar(value); # remap unicode char if in range 127-159
132 continue # ok, we did it
134 # here, pos is not updated so that the original escape-like sequence is kept unchanged
137 # here the first character is not a '#'
138 # let's try the known html entities
143 if tmppos >= l or tmppos-pos > longestEntityLen + 1: # 1 more for ';'
144 c2 = entities_autocomplete.get(sb, 0)
149 c2 = entities.get(sb, 0)
151 c2 = entities_autocomplete.get(sb, 0)
159 continue # ok, we did it
161 result += '&' # something went wrong, just skip is '&'
164 result += text[prevpos:]
167 def cleanCDATA(text):
170 removes useless whites, \r, \n and \t with whites
175 isLastWhite = False # so that first white is not removed
185 return expand(result)
187 if __name__ == '__main__':
190 print("Missing required parameter. Try '&test'", file=sys.stderr)
192 input = ' '.join(sys.argv[1:])
193 #print 'input:', input
194 #raw = resolve(input)
195 #print 'resolved:', raw
196 #print 'expanded:', expand(raw)
197 print('cleanCDATA:', cleanCDATA(input))