From 6ada675630438759904ff08aec3a4cbd3a51769c Mon Sep 17 00:00:00 2001 From: Ted Unangst Date: Sat, 2 Jul 2022 17:57:29 -0400 Subject: [PATCH] twitter is simply incapable of leaving well enough alone. --- docs/changelog.txt | 4 ++++ hoot.go | 32 ++++++++++++++++++++++++++++++-- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/docs/changelog.txt b/docs/changelog.txt index 1d9ec65..479bbb8 100644 --- a/docs/changelog.txt +++ b/docs/changelog.txt @@ -1,5 +1,9 @@ changelog +=== next + ++ Try to fix hoot again because Twitter did a Twitter. + === 0.9.8 Tentative Tentacle + Switch database to WAL mode. diff --git a/hoot.go b/hoot.go index 14d3005..a9701e3 100644 --- a/hoot.go +++ b/hoot.go @@ -28,8 +28,8 @@ import ( "humungus.tedunangst.com/r/webs/htfilter" ) -var tweetsel = cascadia.MustCompile("p.tweet-text") -var linksel = cascadia.MustCompile("a.tweet-timestamp") +var tweetsel = cascadia.MustCompile("div[itemProp=articleBody]") +var linksel = cascadia.MustCompile("a time") var replyingto = cascadia.MustCompile(".ReplyingToContextBelowAuthor") var imgsel = cascadia.MustCompile("div.js-adaptive-photo img") var authorregex = regexp.MustCompile("twitter.com/([^/]+)") @@ -65,6 +65,34 @@ func hootextractor(r io.Reader, url string, seen map[string]bool) string { divs := tweetsel.MatchAll(root) for i, div := range divs { + { + twp := div.Parent.Parent.Parent + link := url + alink := linksel.MatchFirst(twp) + if alink == nil { + if i != 0 { + dlog.Printf("missing link") + continue + } + } else { + alink = alink.Parent + link = "https://twitter.com" + htfilter.GetAttr(alink, "href") + } + authormatch := authorregex.FindStringSubmatch(link) + if len(authormatch) < 2 { + dlog.Printf("no author?: %s", link) + continue + } + author := authormatch[1] + if author != wanted { + continue + } + text := htf.NodeText(div) + text = strings.Replace(text, "\n", " ", -1) + fmt.Fprintf(&buf, "> @%s: %s\n", author, text) + continue + } + twp := div.Parent.Parent.Parent link := url alink := linksel.MatchFirst(twp)