Mercurial > cgi-bin > hgweb.cgi > curlyq
comparison writer.py @ 0:984876b6a095
Initial commit of first two classes.
author | David Barts <n5jrn@me.com> |
---|---|
date | Thu, 26 Dec 2019 08:09:11 -0800 |
parents | |
children | 091c03f1b2e8 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:984876b6a095 |
---|---|
1 #!/usr/bin/env python3 | |
2 # -*- coding: utf-8 -*- | |
3 | |
4 # A simple HTML writer, so we can process HTML in a streamwise fashion | |
5 # via callbacks, which compared to a document tree tends to be all of: | |
6 # easier to program, uses less memory, and uses less processor time. | |
7 | |
8 # I m p o r t s | |
9 | |
10 import os, sys | |
11 import codecs | |
12 import io | |
13 import html | |
14 | |
15 # V a r i a b l e s | |
16 | |
# Supported encodings are limited to ASCII, ISO-8859-1, and those able to
# represent the complete Unicode repertoire; anything else is more trouble
# than it is worth.  This table maps each supported codec (as returned by
# codecs.lookup) to its normalized encoding name.
_CODECS_TO_NAME = {
    codecs.lookup(name): name
    for name in (
        "US-ASCII",
        "ISO-8859-1",
        "UTF-8",
        "UTF-16",
        "UTF-16LE",
        "UTF-16BE",
        "UTF-32",
        "UTF-32LE",
        "UTF-32BE",
    )
}

# Highest code point directly representable in the limited encodings;
# encodings absent from this table have no such limit recorded here.
_MAXCHAR = {
    "US-ASCII": 127,
    "ISO-8859-1": 255,
}

# HTML defines far more named entities than these, but we are pessimistic
# about what browsers "in the wild" understand, so only the entities that
# XML itself defines are used.

# Entities substituted only when quote characters must be escaped.
_QUOTE_ENTITIES = {
    '"': "quot",
    "'": "apos",
}

# Entities substituted in all escaped text.
_OTHER_ENTITIES = {
    "&": "amp",
    "<": "lt",
    ">": "gt",
}
37 | |
38 # C l a s s e s | |
39 | |
class HtmlStreamWriter(object):
    """
    A simple HTML writer, intended to be used in a streamwise fashion.
    This class takes REASONABLE precautions against writing garbage, but
    does not check every last thing. It will happily write tags like
    "<garb<<age>>>" etc. if you feed it the right garbage in.
    """
    def __init__(self, stream, encoding):
        """
        Initialize this writer. An encoding is mandatory, even though we
        produce character output, because the encoding governs which
        characters we can send on for I/O without entity-escaping them.
        The supplied stream should be buffered or performance will suffer.

        Raises ValueError if the encoding is unknown to the codecs module
        or is not one of the encodings this module supports.
        """
        # Stream we're using is available to the caller as .stream
        self.stream = stream
        try:
            # A codec to use is available to the caller as .codec
            self.codec = codecs.lookup(encoding)
            # Normalized encoding name is available to the caller as .encoding
            self.encoding = _CODECS_TO_NAME[self.codec]
        except LookupError as e:
            # KeyError (unsupported codec) is itself a LookupError, so one
            # clause covers both the unknown and the unsupported case.
            raise ValueError("invalid encoding {0!r}".format(encoding)) from e
        # Characters above this code point get numeric character references.
        self._maxchar = _MAXCHAR.get(self.encoding, 0x7fffffff)

    # html.escape drops the ball, badly. It is too optimistic about what
    # entity names are likely to be understood, and is too stupid to know
    # that ASCII streams need lots of things escaped.
    def _escape(self, string, quote=False):
        """
        Write string to the stream, replacing markup characters (and quote
        characters too, if quote is true) with named entities, and any
        character above the encoding's limit with a decimal character
        reference.
        """
        # Collect the output and write it once; the text produced is the
        # same as writing piecemeal but costs far fewer stream calls.
        pieces = []
        for ch in string:
            if quote and ch in _QUOTE_ENTITIES:
                pieces.append("&" + _QUOTE_ENTITIES[ch] + ";")
            elif ch in _OTHER_ENTITIES:
                pieces.append("&" + _OTHER_ENTITIES[ch] + ";")
            elif ord(ch) > self._maxchar:
                pieces.append("&#" + str(ord(ch)) + ";")
            else:
                pieces.append(ch)
        if pieces:
            self.stream.write("".join(pieces))

    def write_starttag(self, tag, attrs):
        """
        Write a start tag. attrs is an iterable of (name, value) pairs.
        """
        self.stream.write("<")
        self.stream.write(tag)
        self._writeattrs(attrs)
        self.stream.write(">")

    def _writeattrs(self, attrs):
        """
        Write each (name, value) pair as ' name="value"', escaping the
        value. A value of None is written as a bare attribute name.
        """
        for k, v in attrs:
            self.stream.write(" ")
            self.stream.write(k)
            if v is None:
                # html.parser reports valueless attributes (e.g. "disabled")
                # with a value of None; there is nothing to quote or escape.
                continue
            self.stream.write("=\"")
            self._escape(v, quote=True)
            self.stream.write("\"")

    def write_endtag(self, tag):
        """
        Write an end tag.
        """
        self.stream.write("</")
        self.stream.write(tag)
        self.stream.write(">")

    def write_startendtag(self, tag, attrs):
        """
        Write a "start-end" (i.e. empty) tag.
        """
        self.stream.write("<")
        self.stream.write(tag)
        self._writeattrs(attrs)
        self.stream.write("/>")

    def write_data(self, data):
        """
        Write text data.
        """
        self._escape(data)

    def write_raw_data(self, data):
        """
        Write raw data (e.g. style sheets, scripts, etc.)
        """
        self.stream.write(data)

    def write_charref(self, name):
        """
        Write character reference (normally not needed). A name that is a
        decimal number, or "x"/"X" followed by a hexadecimal number, is
        written as a numeric reference ("&#...;"); anything else is
        written as a named entity ("&...;").
        """
        is_number = False
        try:
            int(name)
            is_number = True
        except ValueError:
            pass
        # Accept both hex prefixes: an uppercase "X" is as much a numeric
        # reference as a lowercase one and must not become a named entity.
        if not is_number and name.startswith(("x", "X")):
            try:
                int(name[1:], 16)
                is_number = True
            except ValueError:
                pass
        self.stream.write("&#" if is_number else "&")
        self.stream.write(name)
        self.stream.write(";")

    def write_comment(self, data):
        """
        Write a comment.
        """
        self.stream.write("<!--")
        self.stream.write(data)
        self.stream.write("-->")

    def write_decl(self, decl):
        """
        Write a declaration.
        """
        self.stream.write("<!")
        self.stream.write(decl)
        self.stream.write(">")

    def write_pi(self, data):
        """
        Write a processing instruction.
        """
        self.stream.write("<?")
        self.stream.write(data)
        self.stream.write(">")