From 65712620c58faca45f6a3803a4fa2eaaed2109f3 Mon Sep 17 00:00:00 2001 From: Ted Unangst Date: Wed, 30 Sep 2020 15:20:40 -0400 Subject: [PATCH] dedupe blob file data --- database.go | 58 ++++++++++++++++++++++++++++++---------------- docs/changelog.txt | 2 ++ upgradedb.go | 35 +++++++++++++++++++++++++++- util.go | 7 +++++- 4 files changed, 80 insertions(+), 22 deletions(-) diff --git a/database.go b/database.go index 1155f17..d1c11dd 100644 --- a/database.go +++ b/database.go @@ -17,6 +17,7 @@ package main import ( "bytes" + "crypto/sha512" "database/sql" "encoding/json" "fmt" @@ -509,20 +510,41 @@ func savefile(name string, desc string, url string, media string, local bool, da return fileid, err } +func hashfiledata(data []byte) string { + h := sha512.New512_256() + h.Write(data) + return fmt.Sprintf("%x", h.Sum(nil)) +} + func savefileandxid(name string, desc string, url string, media string, local bool, data []byte) (int64, string, error) { - xid := xfiltrate() - switch media { - case "image/png": - xid += ".png" - case "image/jpeg": - xid += ".jpg" - case "application/pdf": - xid += ".pdf" - case "text/plain": - xid += ".txt" - } - if url == "" { - url = fmt.Sprintf("https://%s/d/%s", serverName, xid) + var xid string + if local { + hash := hashfiledata(data) + row := stmtCheckFileData.QueryRow(hash) + err := row.Scan(&xid) + if err == sql.ErrNoRows { + xid = xfiltrate() + switch media { + case "image/png": + xid += ".png" + case "image/jpeg": + xid += ".jpg" + case "application/pdf": + xid += ".pdf" + case "text/plain": + xid += ".txt" + } + _, err = stmtSaveFileData.Exec(xid, media, hash, data) + if err != nil { + return 0, "", err + } + } else if err != nil { + log.Printf("error checking file hash: %s", err) + return 0, "", err + } + if url == "" { + url = fmt.Sprintf("https://%s/d/%s", serverName, xid) + } } res, err := stmtSaveFile.Exec(xid, name, desc, url, media, local) @@ -530,12 +552,6 @@ func savefileandxid(name string, desc string, url string, media string, local bo return 0, "", err } fileid, _ := res.LastInsertId() - if local { - _, err = stmtSaveFileData.Exec(xid, media, data) - if err != nil { - return 0, "", err - } - } return fileid, xid, nil } @@ -890,6 +906,7 @@ var stmtHonksFromLongAgo *sql.Stmt var stmtHonksByHonker, stmtSaveHonk, stmtUserByName, stmtUserByNumber *sql.Stmt var stmtEventHonks, stmtOneBonk, stmtFindZonk, stmtFindXonk, stmtSaveDonk *sql.Stmt var stmtFindFile, stmtGetFileData, stmtSaveFileData, stmtSaveFile *sql.Stmt +var stmtCheckFileData *sql.Stmt var stmtAddDoover, stmtGetDoovers, stmtLoadDoover, stmtZapDoover, stmtOneHonker *sql.Stmt var stmtUntagged, stmtDeleteHonk, stmtDeleteDonks, stmtDeleteOnts, stmtSaveZonker *sql.Stmt var stmtGetZonkers, stmtRecentHonkers, stmtGetXonker, stmtSaveXonker, stmtDeleteXonker *sql.Stmt @@ -951,7 +968,8 @@ func prepareStatements(db *sql.DB) { stmtDeleteDonks = preparetodie(db, "delete from donks where honkid = ?") stmtSaveFile = preparetodie(db, "insert into filemeta (xid, name, description, url, media, local) values (?, ?, ?, ?, ?, ?)") blobdb := openblobdb() - stmtSaveFileData = preparetodie(blobdb, "insert into filedata (xid, media, content) values (?, ?, ?)") + stmtSaveFileData = preparetodie(blobdb, "insert into filedata (xid, media, hash, content) values (?, ?, ?, ?)") + stmtCheckFileData = preparetodie(blobdb, "select xid from filedata where hash = ?") stmtGetFileData = preparetodie(blobdb, "select media, content from filedata where xid = ?") stmtFindXonk = preparetodie(db, "select honkid from honks where userid = ? and xid = ?") stmtFindFile = preparetodie(db, "select fileid, xid from filemeta where url = ? and local = 1") diff --git a/docs/changelog.txt b/docs/changelog.txt index af29eae..67ddb37 100644 --- a/docs/changelog.txt +++ b/docs/changelog.txt @@ -2,6 +2,8 @@ changelog === next ++ Dedupe blob file data. + - Custom lingo for those who don't like honking. + Better support for rich text bios. diff --git a/upgradedb.go b/upgradedb.go index e5e37c5..8015b1a 100644 --- a/upgradedb.go +++ b/upgradedb.go @@ -23,7 +23,7 @@ import ( "time" ) -var myVersion = 39 +var myVersion = 40 type dbexecer interface { Exec(query string, args ...interface{}) (sql.Result, error) @@ -168,6 +168,39 @@ func upgradedb() { doordie(db, "update config set value = 39 where key = 'dbversion'") fallthrough case 39: + blobdb := openblobdb() + doordie(blobdb, "alter table filedata add column hash text") + doordie(blobdb, "create index idx_filehash on filedata(hash)") + rows, err := blobdb.Query("select xid, content from filedata") + if err != nil { + log.Fatal(err) + } + m := make(map[string]string) + for rows.Next() { + var xid string + var data sql.RawBytes + err := rows.Scan(&xid, &data) + if err != nil { + log.Fatal(err) + } + hash := hashfiledata(data) + m[xid] = hash + } + rows.Close() + tx, err := blobdb.Begin() + if err != nil { + log.Fatal(err) + } + for xid, hash := range m { + doordie(tx, "update filedata set hash = ? where xid = ?", hash, xid) + } + err = tx.Commit() + if err != nil { + log.Fatal(err) + } + doordie(db, "update config set value = 40 where key = 'dbversion'") + fallthrough + case 40: default: log.Fatalf("can't upgrade unknown version %d", dbversion) diff --git a/util.go b/util.go index 0659492..04a6880 100644 --- a/util.go +++ b/util.go @@ -172,7 +172,7 @@ func initblobdb() { log.Print(err) return } - _, err = blobdb.Exec("create table filedata (xid text, media text, content blob)") + _, err = blobdb.Exec("create table filedata (xid text, media text, hash text, content blob)") if err != nil { log.Print(err) return @@ -182,6 +182,11 @@ func initblobdb() { log.Print(err) return } + _, err = blobdb.Exec("create index idx_filehash on filedata(hash)") + if err != nil { + log.Print(err) + return + } blobdb.Close() }