From 4e225e16f3f7c55d19d983ed3b7a083a302936d8 Mon Sep 17 00:00:00 2001 From: Ted Unangst Date: Wed, 30 Oct 2019 23:05:27 -0400 Subject: [PATCH] dedupe the hoots --- hoot.go | 153 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 80 insertions(+), 73 deletions(-) diff --git a/hoot.go b/hoot.go index 6624f68..8d46dfe 100644 --- a/hoot.go +++ b/hoot.go @@ -33,81 +33,88 @@ var tweetsel = cascadia.MustCompile("p.tweet-text") var linksel = cascadia.MustCompile(".time a.tweet-timestamp") var authorregex = regexp.MustCompile("twitter.com/([^/]+)") -func hootfetcher(hoot string) string { - url := hoot[5:] - if url[0] == ' ' { - url = url[1:] - } - url = strings.Replace(url, "mobile.twitter.com", "twitter.com", -1) - log.Printf("hooterizing %s", url) - req, err := http.NewRequest("GET", url, nil) - if err != nil { - log.Printf("error: %s", err) - return hoot - } - req.Header.Set("User-Agent", "OpenBSD ftp") - req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") - req.Header.Set("Accept-Language", "en-US,en;q=0.9") - resp, err := http.DefaultClient.Do(req) - if err != nil { - log.Printf("error: %s", err) - return hoot - } - defer resp.Body.Close() - if resp.StatusCode != 200 { - log.Printf("error getting %s: %d", url, resp.StatusCode) - return hoot - } - ld, _ := os.Create("lasthoot.html") - r := io.TeeReader(resp.Body, ld) - return hootfixer(r, url) -} - -func hootfixer(r io.Reader, url string) string { - root, err := html.Parse(r) - if err != nil { - log.Printf("error parsing hoot: %s", err) - return url - } - divs := tweetsel.MatchAll(root) - - wantmatch := authorregex.FindStringSubmatch(url) - if len(wantmatch) < 2 { - log.Printf("no wanted author?") - } - wanted := wantmatch[1] - var buf strings.Builder - - var htf htfilter.Filter - fmt.Fprintf(&buf, "%s\n", url) - for _, div := range divs { - twp := div.Parent.Parent.Parent - alink := linksel.MatchFirst(twp) - if alink == nil { - log.Printf("missing link") - continue - } - link := "https://twitter.com" + htfilter.GetAttr(alink, "href") - authormatch := authorregex.FindStringSubmatch(link) - if len(authormatch) < 2 { - log.Printf("no author?") - continue - } - author := authormatch[1] - if author != wanted { - continue - } - text := htf.TextOnly(div) - text = strings.Replace(text, "\n", " ", -1) - text = strings.Replace(text, "pic.twitter.com", "https://pic.twitter.com", -1) - - fmt.Fprintf(&buf, "> @%s: %s\n", author, text) - } - return buf.String() -} - var re_hoots = regexp.MustCompile(`hoot: ?https://\S+`) func hooterize(noise string) string { + seen := make(map[string]bool) + + hootfixer := func(r io.Reader, url string) string { + root, err := html.Parse(r) + if err != nil { + log.Printf("error parsing hoot: %s", err) + return url + } + divs := tweetsel.MatchAll(root) + + wantmatch := authorregex.FindStringSubmatch(url) + if len(wantmatch) < 2 { + log.Printf("no wanted author?") + } + wanted := wantmatch[1] + var buf strings.Builder + + var htf htfilter.Filter + fmt.Fprintf(&buf, "%s\n", url) + for _, div := range divs { + twp := div.Parent.Parent.Parent + alink := linksel.MatchFirst(twp) + if alink == nil { + log.Printf("missing link") + continue + } + link := "https://twitter.com" + htfilter.GetAttr(alink, "href") + authormatch := authorregex.FindStringSubmatch(link) + if len(authormatch) < 2 { + log.Printf("no author?") + continue + } + author := authormatch[1] + if author != wanted { + continue + } + text := htf.TextOnly(div) + text = strings.Replace(text, "\n", " ", -1) + text = strings.Replace(text, "pic.twitter.com", "https://pic.twitter.com", -1) + + if seen[text] { + continue + } + + fmt.Fprintf(&buf, "> @%s: %s\n", author, text) + seen[text] = true + } + return buf.String() + } + + hootfetcher := func(hoot string) string { + url := hoot[5:] + if url[0] == ' ' { + url = url[1:] + } + url = strings.Replace(url, "mobile.twitter.com", "twitter.com", -1) + log.Printf("hooterizing %s", url) + req, err := http.NewRequest("GET", url, nil) + if err != nil { + log.Printf("error: %s", err) + return hoot + } + req.Header.Set("User-Agent", "OpenBSD ftp") + req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") + req.Header.Set("Accept-Language", "en-US,en;q=0.9") + resp, err := http.DefaultClient.Do(req) + if err != nil { + log.Printf("error: %s", err) + return hoot + } + defer resp.Body.Close() + if resp.StatusCode != 200 { + log.Printf("error getting %s: %d", url, resp.StatusCode) + return hoot + } + ld, _ := os.Create("lasthoot.html") + r := io.TeeReader(resp.Body, ld) + return hootfixer(r, url) + } + return re_hoots.ReplaceAllStringFunc(noise, hootfetcher) }