31
|
/*
 * Font coercion. Would be easier if we didn't have to rummage through
 * a doc tree and could use SAX-style callbacks instead. Would run faster
 * if we didn't have a doc tree. Would use less memory if we didn't have
 * a doc tree. But all HTML parsers these days force a doc tree on you.
 * Sigh.
 */
|
|
8 package name.blackcap.clipman
|
|
9
|
|
10 import org.jsoup.Jsoup
|
|
11 import org.jsoup.nodes.*
|
|
12 import java.util.Formatter
|
|
13 import java.util.logging.Level
|
|
14 import java.util.logging.Logger
|
|
15
|
|
/**
 * Coerce fonts in HTML document.
 *
 * Parses the document, applies the standard scrubbing, removes existing
 * style sheets, installs a small style sheet built from the supplied fonts,
 * and then strips styling tags (keeping their content) and styling
 * attributes. If the scrubbing step finds no head element, the new style
 * sheet is not added; the rest of the clean-up still runs.
 *
 * @param uncoerced String HTML document in
 * @param pFamily String proportionally-spaced font family
 * @param pSize Float proportionally-spaced font size
 * @param mFamily String monospaced font family
 * @param mSize Float monospaced font size
 * @return HTML document with coerced fonts
 */
fun coerceHTML(uncoerced: String, pFamily: String, pSize: Float, mFamily: String, mSize: Float): String {
    val doc = Jsoup.parse(uncoerced)

    /* apply standard scrubbing */
    val head = _scrub(doc)

    /* remove all stylesheets (and their content) and references to same */
    doc.select("link[rel=stylesheet],style").remove()

    /*
     * add a style sheet of our own; format with a fixed, locale-neutral
     * Formatter, because the default locale may print the sizes with a
     * decimal comma ("12,00"), which is not a valid CSS number
     */
    head?.appendElement("style")?.appendChild(DataNode(Formatter(java.util.Locale.ROOT).run {
        format("%nbody { font-family: \"%s\"; font-size: %.2f; }%n",
            pFamily, pSize)
        format("code, kbd, pre, samp, tt { font-family: \"%s\"; font-size: %.2f; }%n",
            mFamily, mSize)
        toString()
    }))

    /* remove all styling tags, but keep their content */
    doc.select("basefont,big,div,font,small,span").forEach { discardTag ->
        /* iterate over a snapshot: moving a child alters the live child list */
        discardTag.getChildrenAsArray().forEach { child ->
            discardTag.before(child)
        }
        discardTag.remove()
    }

    /* remove all styling attributes */
    val hitList = arrayOf("bgcolor", "class", "color", "style")
    val selector = hitList.joinToString(prefix = "[", separator = "],[", postfix = "]")
    doc.select(selector).forEach {
        it.removeAll(*hitList)
    }

    /* remove body styling attributes */
    doc.selectFirst(":root>body")?.removeAll("background", "text", "link", "alink", "vlink")

    /* clean up horizontal rules */
    doc.select("hr").forEach {
        it.removeAll("shade", "noshade")
    }

    /* that's all! */
    return _output(doc)
}
|
|
69
|
|
/**
 * "Scrub" an HTML document (make it Java and clipboard-friendly).
 *
 * Parses the input, applies the standard scrubbing to the parsed tree,
 * and serializes the result.
 *
 * @param unscrubbed String HTML document in
 * @return scrubbed HTML document
 */
fun scrub(unscrubbed: String): String =
    Jsoup.parse(unscrubbed).let { parsed ->
        /* scrubbing modifies the tree in place; its return value is not needed here */
        _scrub(parsed)
        _output(parsed)
    }
|
|
80
|
|
/**
 * Apply the standard scrubbing to a parsed document, in place: drop doctype
 * and XML declarations, strip non-HTML-4.01 attributes from the html
 * element, and replace any charset declarations with one of our own.
 *
 * @param doc parsed document to modify
 * @return the document's head element, or null (after logging) if none was found
 */
private fun _scrub(doc: Document): Element? {
    /* remove any doctype or XML declarations */
    doc.getChildrenAsArray().forEach {
        if (it is DocumentType || it is XmlDeclaration) {
            it.remove()
        }
    }

    /*
     * remove all non-HTML-4.01 attributes from <html>; these live on the
     * <html> element itself, not on the Document node that contains it,
     * so look the element up rather than asking the Document
     */
    val allowed = setOf("lang", "dir")
    doc.selectFirst("html")?.let { html ->
        /* collect the keys first so we never modify the attributes while iterating them */
        html.attributes().map { it.key }.filter { it !in allowed }.forEach { key ->
            html.removeAttr(key)
        }
    }

    /* remove any conflicting charset declarations */
    doc.select("meta[http-equiv=content-type],meta[charset]").remove()

    /* add a charset declaration of our own */
    val head = doc.selectFirst(":root>head")
    if (head == null) {
        LOGGER.log(Level.SEVERE, "no head found!")
        return null
    }
    head.prependElement("meta").run {
        attr("http-equiv", "Content-Type")
        attr("content", "text/html; charset=$CHARSET_NAME")
    }
    return head
}
|
|
111
|
|
/**
 * Serialize a document to a string, after setting its output charset to
 * CHARSET_NAME and its output syntax to XML.
 *
 * @param doc document to serialize
 * @return the document's outer HTML
 */
private fun _output(doc: Document): String {
    val settings = doc.outputSettings()
    settings.charset(CHARSET_NAME)
    settings.syntax(Document.OutputSettings.Syntax.xml)
    return doc.outerHtml()
}
|
|
118
|
|
/**
 * Copies this node's children into a freshly allocated array. Iterating
 * the copy while removing or moving children avoids a
 * ConcurrentModificationException. Ah, the joys of the doc tree.
 * @return Array<Node>
 */
fun Node.getChildrenAsArray(): Array<Node> = childNodes().toTypedArray()
|
|
127
|
|
/**
 * Remove each of the named attributes from this element, skipping any
 * the element does not have.
 * @param hitList names of the attributes to remove
 */
fun Element.removeAll(vararg hitList: String) {
    for (name in hitList) {
        if (!hasAttr(name)) {
            continue
        }
        removeAttr(name)
    }
}
|