annotate src/name/blackcap/clipman/Coerce.kt @ 56:22725d4d7849

An attempt to get it to troff-ize styled text.
author David Barts <n5jrn@me.com>
date Sat, 19 Mar 2022 23:04:40 -0700
parents 0c6c18a733b7
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
31
0c6c18a733b7 Compiles, new menu still a mess.
David Barts <n5jrn@me.com>
parents:
diff changeset
1 /*
0c6c18a733b7 Compiles, new menu still a mess.
David Barts <n5jrn@me.com>
parents:
diff changeset
2 * Font coercion. Would be easier if we didn't have to rummage through
0c6c18a733b7 Compiles, new menu still a mess.
David Barts <n5jrn@me.com>
parents:
diff changeset
3 * a doc tree and could use SAX-style callbacks instead. Would run faster
0c6c18a733b7 Compiles, new menu still a mess.
David Barts <n5jrn@me.com>
parents:
diff changeset
4 * if we didn't have a doc tree. Would use less memory if we didn't have
0c6c18a733b7 Compiles, new menu still a mess.
David Barts <n5jrn@me.com>
parents:
diff changeset
5 * a doc tree. But all HTML parsers these days force a doc tree on you.
0c6c18a733b7 Compiles, new menu still a mess.
David Barts <n5jrn@me.com>
parents:
diff changeset
6 * Sigh.
0c6c18a733b7 Compiles, new menu still a mess.
David Barts <n5jrn@me.com>
parents:
diff changeset
7 */
0c6c18a733b7 Compiles, new menu still a mess.
David Barts <n5jrn@me.com>
parents:
diff changeset
8 package name.blackcap.clipman
0c6c18a733b7 Compiles, new menu still a mess.
David Barts <n5jrn@me.com>
parents:
diff changeset
9
0c6c18a733b7 Compiles, new menu still a mess.
David Barts <n5jrn@me.com>
parents:
diff changeset
10 import org.jsoup.Jsoup
0c6c18a733b7 Compiles, new menu still a mess.
David Barts <n5jrn@me.com>
parents:
diff changeset
11 import org.jsoup.nodes.*
0c6c18a733b7 Compiles, new menu still a mess.
David Barts <n5jrn@me.com>
parents:
diff changeset
12 import java.util.Formatter
0c6c18a733b7 Compiles, new menu still a mess.
David Barts <n5jrn@me.com>
parents:
diff changeset
13 import java.util.logging.Level
0c6c18a733b7 Compiles, new menu still a mess.
David Barts <n5jrn@me.com>
parents:
diff changeset
14 import java.util.logging.Logger
0c6c18a733b7 Compiles, new menu still a mess.
David Barts <n5jrn@me.com>
parents:
diff changeset
15
0c6c18a733b7 Compiles, new menu still a mess.
David Barts <n5jrn@me.com>
parents:
diff changeset
/**
 * Coerce fonts in HTML document.
 *
 * Parses the document, applies the standard scrubbing, strips existing
 * style sheets, styling tags and styling attributes, and installs a style
 * sheet that sets one font for the body and another for the monospaced
 * elements (code, kbd, pre, samp, tt). If scrubbing finds no head element,
 * no style sheet is added but the rest of the clean-up is still performed.
 *
 * @param uncoerced String HTML document in
 * @param pFamily String proportionally-spaced font family
 * @param pSize Float proportionally-spaced font size
 * @param mFamily String monospaced font family
 * @param mSize Float monospaced font size
 * @return HTML document with coerced fonts
 */
fun coerceHTML(uncoerced: String, pFamily: String, pSize: Float, mFamily: String, mSize: Float): String {
    val doc = Jsoup.parse(uncoerced)

    /* apply standard scrubbing */
    val head = _scrub(doc)

    /* remove all stylesheets (and their content) and references to same */
    doc.select("link[rel=stylesheet],style").remove()

    /* add a style sheet of our own; format with a fixed locale so that the
       decimal separator in the font sizes is always ".", whatever the
       default locale happens to be (a "," would corrupt the CSS) */
    head?.appendElement("style")?.appendChild(DataNode(Formatter(java.util.Locale.ROOT).run {
        format("%nbody { font-family: \"%s\"; font-size: %.2f; }%n",
            pFamily, pSize)
        format("code, kbd, pre, samp, tt { font-family: \"%s\"; font-size: %.2f; }%n",
            mFamily, mSize)
        toString()
    }))

    /* remove all styling tags, but keep their content */
    doc.select("basefont,big,div,font,small,span").forEach { discardTag ->
        /* iterate over a snapshot, since moving a child alters the child list */
        discardTag.getChildrenAsArray().forEach { child ->
            discardTag.before(child)
        }
        discardTag.remove()
    }

    /* remove all styling attributes */
    val hitList = arrayOf("bgcolor", "class", "color", "style")
    val selector = hitList.joinToString(prefix = "[", separator = "],[", postfix = "]")
    doc.select(selector).forEach {
        it.removeAll(*hitList)
    }

    /* remove body styling attributes */
    doc.selectFirst(":root>body")?.removeAll("background", "text", "link", "alink", "vlink")

    /* clean up horizontal rules */
    doc.select("hr").forEach {
        it.removeAll("shade", "noshade")
    }

    /* that's all! */
    return _output(doc)
}
0c6c18a733b7 Compiles, new menu still a mess.
David Barts <n5jrn@me.com>
parents:
diff changeset
69
0c6c18a733b7 Compiles, new menu still a mess.
David Barts <n5jrn@me.com>
parents:
diff changeset
/**
 * "Scrub" an HTML document, i.e. make it Java and clipboard-friendly.
 * The input is parsed, the standard scrubbing is applied to the parsed
 * tree, and the tree is then serialized back to a string.
 * @param unscrubbed String HTML document in
 * @return scrubbed HTML document
 */
fun scrub(unscrubbed: String): String =
    Jsoup.parse(unscrubbed).let { parsed ->
        /* scrubbing modifies the tree in place; its return value is not needed here */
        _scrub(parsed)
        _output(parsed)
    }
0c6c18a733b7 Compiles, new menu still a mess.
David Barts <n5jrn@me.com>
parents:
diff changeset
80
0c6c18a733b7 Compiles, new menu still a mess.
David Barts <n5jrn@me.com>
parents:
diff changeset
/**
 * Apply the standard scrubbing to a parsed document, in place: drop
 * doctype and XML declaration nodes, strip all attributes other than
 * lang and dir from the html element, and replace any charset
 * declarations with one of our own.
 * @param doc Document to scrub (modified in place)
 * @return the head element of the document, or null if none was found
 */
private fun _scrub(doc: Document): Element? {
    /* remove any doctype or XML declarations */
    doc.getChildrenAsArray().forEach {
        if (it is DocumentType || it is XmlDeclaration) {
            it.remove()
        }
    }

    /* remove all non-HTML-4.01 attributes from <html>; the Document node
       is not itself the <html> element, so that element is looked up */
    val allowed = setOf("lang", "dir")
    doc.selectFirst(":root")?.let { html ->
        /* collect the keys first so attributes aren't removed mid-iteration */
        html.attributes().map { it.key }
            .filter { it !in allowed }
            .forEach { html.removeAttr(it) }
    }

    /* remove any conflicting charset declarations */
    doc.select("meta[http-equiv=content-type],meta[charset]").remove()

    /* add a charset declaration of our own */
    val head = doc.selectFirst(":root>head")
    if (head == null) {
        LOGGER.log(Level.SEVERE, "no head found!")
        return null
    }
    head.prependElement("meta").run {
        attr("http-equiv", "Content-Type")
        attr("content", "text/html; charset=" + CHARSET_NAME)
    }
    return head
}
0c6c18a733b7 Compiles, new menu still a mess.
David Barts <n5jrn@me.com>
parents:
diff changeset
111
0c6c18a733b7 Compiles, new menu still a mess.
David Barts <n5jrn@me.com>
parents:
diff changeset
/**
 * Serialize a document to a string, after setting its output charset to
 * CHARSET_NAME and its output syntax to XML.
 * @param doc Document to serialize (its output settings are modified)
 * @return the document's outer HTML
 */
private fun _output(doc: Document): String {
    val settings = doc.outputSettings()
    settings.charset(CHARSET_NAME)
    settings.syntax(Document.OutputSettings.Syntax.xml)
    return doc.outerHtml()
}
0c6c18a733b7 Compiles, new menu still a mess.
David Barts <n5jrn@me.com>
parents:
diff changeset
118
0c6c18a733b7 Compiles, new menu still a mess.
David Barts <n5jrn@me.com>
parents:
diff changeset
/**
 * Copies this node's child list into a fresh array, so that callers can
 * add or remove children while iterating without triggering a
 * ConcurrentModificationException. Ah, the joys of the doc tree.
 * @return Array<Node> holding the children, in order
 */
fun Node.getChildrenAsArray(): Array<Node> = childNodes().toTypedArray()
0c6c18a733b7 Compiles, new menu still a mess.
David Barts <n5jrn@me.com>
parents:
diff changeset
127
0c6c18a733b7 Compiles, new menu still a mess.
David Barts <n5jrn@me.com>
parents:
diff changeset
/**
 * Strip each of the named attributes from this element. Names that the
 * element does not carry are skipped.
 * @param hitList names of the attributes to remove
 */
fun Element.removeAll(vararg hitList: String) {
    for (name in hitList) {
        if (!hasAttr(name)) {
            continue
        }
        removeAttr(name)
    }
}