honk/html.go

193 lines
4.9 KiB
Go
Raw Normal View History

2019-04-09 13:59:33 +02:00
//
// Copyright (c) 2019 Ted Unangst <tedu@tedunangst.com>
//
// Permission to use, copy, modify, and distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
package main
import (
"fmt"
"html/template"
"io"
"log"
"net/url"
"regexp"
"sort"
"strings"
"golang.org/x/net/html"
)
// Tag and attribute lists that drive the sanitizer. init sorts each of
// them so that contains can look entries up with a binary search; keep any
// new list in the same arrangement.
var (
	// permittedtags are the elements that are re-emitted (through writetag)
	// when sanitized HTML is rendered.
	permittedtags = []string{"div", "h1", "h2", "h3", "h4", "h5", "h6",
		"table", "thead", "tbody", "th", "tr", "td",
		"p", "br", "pre", "code", "blockquote",
		"strong", "em", "b", "i", "s", "u", "sup", "del",
		"ol", "ul", "li"}

	// permittedattr are the only attributes writetag copies onto a
	// permitted tag.
	permittedattr = []string{"colspan", "rowspan"}

	// bannedtags are elements that are dropped together with everything
	// nested inside them.
	bannedtags = []string{"script", "style"}
)
// init puts every lookup list into ascending order, which is what the
// binary search in contains requires.
func init() {
	for _, list := range [][]string{permittedtags, permittedattr, bannedtags} {
		sort.Strings(list)
	}
}
// contains reports whether tag is present in array. The lookup is a binary
// search, so array must already be sorted in ascending order.
func contains(array []string, tag string) bool {
	pos := sort.SearchStrings(array, tag)
	if pos >= len(array) {
		// tag would sort after every element.
		return false
	}
	return array[pos] == tag
}
// getattr returns the value of the first attribute on node whose key equals
// attr, or the empty string when node carries no such attribute.
func getattr(node *html.Node, attr string) string {
	for i := 0; i < len(node.Attr); i++ {
		if node.Attr[i].Key == attr {
			return node.Attr[i].Val
		}
	}
	return ""
}
// hasclass reports whether class appears as a space-delimited word in the
// class attribute of node.
func hasclass(node *html.Node, class string) bool {
	// Pad both sides with a space so that only whole words can match.
	haystack := " " + getattr(node, "class") + " "
	needle := " " + class + " "
	return strings.Contains(haystack, needle)
}
// writetag writes the opening tag for node to w. Only attributes listed in
// permittedattr are carried over, with their values HTML-escaped; all other
// attributes are dropped.
func writetag(w io.Writer, node *html.Node) {
	io.WriteString(w, "<")
	io.WriteString(w, node.Data)
	for i := range node.Attr {
		a := &node.Attr[i]
		if !contains(permittedattr, a.Key) {
			continue
		}
		fmt.Fprintf(w, ` %s="%s"`, a.Key, html.EscapeString(a.Val))
	}
	io.WriteString(w, ">")
}
// safehref returns a version of href that is safe to place in the href
// attribute of a link. URLs that fail to parse, and URLs whose scheme is
// anything other than http, https, mailto or empty (a relative reference),
// are turned into a harmless "#BROKEN-" fragment so that schemes such as
// javascript: or data: can never become a clickable link. Accepted URLs are
// returned in the normalized form produced by url.URL.String.
func safehref(href string) string {
	u, err := url.Parse(href)
	if err != nil {
		return "#BROKEN-" + href
	}
	switch u.Scheme {
	case "", "http", "https", "mailto":
		return u.String()
	}
	return "#BROKEN-" + href
}

// render writes a sanitized HTML rendition of node and all of its
// descendants to w.
//
// Elements are handled as follows:
//   - a: re-emitted with only an href (filtered through safehref) and
//     rel=noreferrer.
//   - img: replaced by the text returned from replaceimg.
//   - span: the tag itself is dropped, its children are kept.
//   - iframe: replaced by escaped text that shows the src as a link.
//   - tags in permittedtags: emitted through writetag and closed after
//     their children (br gets no closing tag).
//   - tags in bannedtags: dropped together with their whole subtree.
//   - anything else: the tag is dropped, its children are rendered.
//
// Text nodes are HTML-escaped. A newline is written after every p and div.
func render(w io.Writer, node *html.Node) {
	switch node.Type {
	case html.ElementNode:
		tag := node.Data
		switch {
		case tag == "a":
			href := safehref(getattr(node, "href"))
			fmt.Fprintf(w, `<a href="%s" rel=noreferrer>`, html.EscapeString(href))
		case tag == "img":
			div := replaceimg(node)
			if div != "skip" {
				io.WriteString(w, div)
			}
		case tag == "span":
			// Intentionally empty: drop the tag, keep the children.
		case tag == "iframe":
			src := getattr(node, "src")
			// The visible text is the original src; the link target goes
			// through the same scheme filter as ordinary anchors.
			fmt.Fprintf(w, `&lt;iframe src="<a href="%s">%s</a>"&gt;`,
				html.EscapeString(safehref(src)), html.EscapeString(src))
		case contains(permittedtags, tag):
			writetag(w, node)
		case contains(bannedtags, tag):
			// Skip the element and everything inside it.
			return
		}
	case html.TextNode:
		io.WriteString(w, html.EscapeString(node.Data))
	}
	for c := node.FirstChild; c != nil; c = c.NextSibling {
		render(w, c)
	}
	if node.Type == html.ElementNode {
		tag := node.Data
		if tag == "a" || (contains(permittedtags, tag) && tag != "br") {
			fmt.Fprintf(w, "</%s>", tag)
		}
		if tag == "p" || tag == "div" {
			io.WriteString(w, "\n")
		}
	}
}
// replaceimg returns the text that stands in for an img element. An image
// with class "Emoji" and a non-empty alt attribute becomes its escaped alt
// text; every other image becomes an escaped, literal `<img src="...">`
// string. The title attribute is not used.
func replaceimg(node *html.Node) string {
	if alt := getattr(node, "alt"); alt != "" && hasclass(node, "Emoji") {
		return html.EscapeString(alt)
	}
	placeholder := fmt.Sprintf(`<img src="%s">`, getattr(node, "src"))
	return html.EscapeString(placeholder)
}
// cleannode renders node through render and returns the collected output
// as template.HTML.
func cleannode(node *html.Node) template.HTML {
	out := new(strings.Builder)
	render(out, node)
	return template.HTML(out.String())
}
// cleanstring parses shtml as an HTML document and returns its sanitized
// rendition from cleannode. If parsing fails the error is logged and an
// empty result is returned.
func cleanstring(shtml string) template.HTML {
	root, err := html.Parse(strings.NewReader(shtml))
	if err == nil {
		return cleannode(root)
	}
	log.Printf("error parsing html: %s", err)
	return ""
}
// textonly writes a mostly-plain-text rendition of node and its descendants
// to w. Text nodes are written verbatim (not escaped), links are kept as
// bare <a href="..."> ... </a> markers, images become "<img>", elements in
// bannedtags are skipped along with their subtree, and a newline follows
// every p and div. All other tags are dropped while their children are kept.
func textonly(w io.Writer, node *html.Node) {
	element := node.Type == html.ElementNode
	if element {
		switch node.Data {
		case "a":
			fmt.Fprintf(w, `<a href="%s">`, getattr(node, "href"))
		case "img":
			io.WriteString(w, "<img>")
		default:
			if contains(bannedtags, node.Data) {
				// Neither the element nor its children produce output.
				return
			}
		}
	} else if node.Type == html.TextNode {
		io.WriteString(w, node.Data)
	}

	for child := node.FirstChild; child != nil; child = child.NextSibling {
		textonly(w, child)
	}

	if !element {
		return
	}
	switch node.Data {
	case "a":
		fmt.Fprintf(w, "</%s>", node.Data)
	case "p", "div":
		io.WriteString(w, "\n")
	}
}
// Patterns applied by htmltotext to tidy up the extracted text.
var (
	// re_whitespaceeater matches a newline together with any spaces, tabs
	// or carriage returns directly around it.
	re_whitespaceeater = regexp.MustCompile("[ \t\r]*\n[ \t\r]*")

	// re_blanklineeater matches a run of two or more newlines.
	re_blanklineeater = regexp.MustCompile("\n\n+")

	// re_tabeater matches a run of spaces and/or tabs.
	re_tabeater = regexp.MustCompile("[ \t]+")
)
// htmltotext converts shtml to text by parsing it and running the tree
// through textonly, then tidies the whitespace: blanks around newlines are
// removed, runs of blank lines collapse to a single blank line, runs of
// spaces and tabs collapse to one space, and leading newlines are stripped.
//
// If the input cannot be parsed the error is logged and the empty string is
// returned, matching cleanstring, rather than handing a nil tree to
// textonly.
func htmltotext(shtml template.HTML) string {
	body, err := html.Parse(strings.NewReader(string(shtml)))
	if err != nil {
		log.Printf("error parsing html: %s", err)
		return ""
	}
	var buf strings.Builder
	textonly(&buf, body)
	rv := buf.String()
	// Order matters: trim around newlines first so that blank lines which
	// only contained spaces are seen as consecutive newlines by the next step.
	rv = re_whitespaceeater.ReplaceAllLiteralString(rv, "\n")
	rv = re_blanklineeater.ReplaceAllLiteralString(rv, "\n\n")
	rv = re_tabeater.ReplaceAllLiteralString(rv, " ")
	return strings.TrimLeft(rv, "\n")
}