1
0
Fork 0

Add bogus comment detector.

* tekuti/classifier.scm: New file.
* Makefile.am: Add new file.
* tekuti/comment.scm:
* tekuti/git.scm:
* tekuti/index.scm:
* tekuti/page.scm: Update to include bogus comment detector.
This commit is contained in:
Andy Wingo 2017-03-06 09:25:39 +01:00
parent 42b77efe6a
commit 5e603daa9e
6 changed files with 453 additions and 70 deletions

View file

@ -4,6 +4,7 @@ SOURCES = \
tekuti/base64.scm \
tekuti/boot.scm \
tekuti/cache.scm \
tekuti/classifier.scm \
tekuti/comment.scm \
tekuti/config.scm \
tekuti/filters.scm \

302
tekuti/classifier.scm Normal file
View file

@ -0,0 +1,302 @@
;; Tekuti
;; Copyright (C) 2008, 2010, 2012 Andy Wingo <wingo at pobox dot com>
;; This program is free software; you can redistribute it and/or
;; modify it under the terms of the GNU General Public License as
;; published by the Free Software Foundation; either version 3 of
;; the License, or (at your option) any later version.
;;
;; This program is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;; GNU General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with this program; if not, contact:
;;
;; Free Software Foundation Voice: +1-617-542-5942
;; 59 Temple Place - Suite 330 Fax: +1-617-542-2652
;; Boston, MA 02111-1307, USA gnu@gnu.org
;;; Commentary:
;;
;; Comments -- pulling them out of the database, and making new ones.
;;
;;; Code:
(define-module (tekuti classifier)
#:use-module (tekuti comment)
#:use-module (tekuti filters)
#:use-module (tekuti git)
#:use-module (tekuti util)
#:use-module (ice-9 match)
#:use-module (srfi srfi-11)
#:use-module (srfi srfi-9)
#:export (reindex-legit-comments
reindex-bogus-comments
reindex-classifier
comment-is-bogus?))
(define (tokenize-comment comment)
(define (decorate-tokens decorator tokens)
(map (lambda (token) (string-append decorator token)) tokens))
(define (tokenize exp)
(match exp
((or () #f) '())
((? string?) (string-tokenize exp char-set:letter+digit))
(((or 'comment 'div 'p) . body)
(tokenize body))
(((? symbol? tag) . body)
(decorate-tokens (string-append (symbol->string tag) ".")
(tokenize body)))
((head . tail) (append (tokenize head) (tokenize tail)))
(_ (pk 'what exp comment) '())))
(tokenize
`(comment
(author ,(assq-ref comment 'author))
(email ,(assq-ref comment 'author_email))
(url ,(assq-ref comment 'author_url))
,(let ((format (or (assq-ref comment 'format) 'wordpress))
(raw (assq-ref comment 'raw-content)))
(or (case format
((wordpress) (false-if-exception (wordpress->sxml raw)))
(else `(pre ,raw)))
`(pre ,raw))))))
(define (fold-features comment f seed)
(let lp ((tokens (tokenize-comment comment)) (seed seed))
(define (make-3-gram a b c)
(string-append a " " (or b "") " " (or c "")))
(define (make-2-gram a b)
(string-append a " " (or b "")))
(define (make-1-gram a)
a)
(match tokens
((a . tokens)
(lp tokens
(or (f (match tokens
((b c . _) (make-3-gram a b c))
((b) (make-3-gram a b #f))
(() (make-3-gram a #f #f)))
(or (f (match tokens
((b . _) (make-2-gram a b))
(() (make-2-gram a #f)))
(f (make-1-gram a)
seed))
seed))
(f (make-1-gram a)
seed))))
(() seed))))
(define (count-features comments)
(let ((counts (make-hash-table)))
(hash-for-each
(lambda (comment-sha1 comment-name)
(fold-features (blob->comment comment-name comment-sha1)
(lambda (feature counts)
(add-feature! counts feature)
counts)
counts))
comments)
counts))
;; A feature's bogosity is the probability that a bogus document
;; contains that feature, divided by the probability that a legit
;; document contains the feature.
(define (compute-log-bogosities legit-features bogus-features)
(define (feature-count table)
(hash-fold (lambda (feature count sum) (+ count sum)) 0 table))
(let ((total-bogus-features (feature-count bogus-features))
(total-legit-features (feature-count legit-features))
(log-bogosities (make-hash-table)))
(hash-for-each
(lambda (feature bogus-count)
(let ((legit-count (hash-ref legit-features feature 0)))
(hash-set! log-bogosities feature
(log (/ (/ (+ bogus-count 0.001) total-bogus-features)
(/ (+ legit-count 0.001) total-legit-features))))))
bogus-features)
(hash-for-each
(lambda (feature legit-count)
(let ((bogus-count (hash-ref bogus-features feature)))
(unless bogus-count
(hash-set! log-bogosities feature
(log (/ (/ 0.01 total-bogus-features)
(/ (+ legit-count 0.01) total-legit-features)))))))
legit-features)
log-bogosities))
(define (update-bogosities! log-bogosities changed-features
legit-features bogus-features)
(define (feature-count table)
(hash-fold (lambda (feature count sum) (+ count sum)) 0 table))
(let ((total-bogus-features (feature-count bogus-features))
(total-legit-features (feature-count legit-features)))
(hash-for-each
(lambda (feature _)
(let ((bogus-count (hash-ref bogus-features feature 0))
(legit-count (hash-ref legit-features feature 0)))
(hash-set! log-bogosities feature
(log (/ (/ (+ bogus-count 0.001) total-bogus-features)
(/ (+ legit-count 0.001) total-legit-features))))))
changed-features)))
(define (compute-bogus-probability comment log-bogosities bogus-prior
feature-limit)
(let ((v (make-vector feature-limit 0.0)))
(define (add-bogosity! log-bogosity)
(let ((mag (abs log-bogosity)))
(when (< (abs (vector-ref v 0)) mag)
(let lp ((idx 0))
(let ((next (1+ idx)))
(cond
((and (< next (vector-length v))
(< (abs (vector-ref v next)) mag))
(vector-set! v idx (vector-ref v next))
(lp (1+ idx)))
(else
(vector-set! v idx log-bogosity))))))))
(fold-features comment
(lambda (feature _)
(add-bogosity! (hash-ref log-bogosities feature 0.0)))
#f)
(let* ((ratio (exp (+ (log (/ bogus-prior (- 1.0 bogus-prior)))
(apply + (vector->list v))))))
(/ ratio (+ ratio 1.0)))))
(define (compute-differing-comments old-master new-master)
;; sha1 -> name
(define (compute-hash-diff old new)
(let ((removed (make-hash-table))
(added (make-hash-table)))
(hash-for-each (lambda (post-sha1 post-name)
(unless (hash-ref new post-sha1)
(hash-set! removed post-sha1 post-name)))
old)
(hash-for-each (lambda (post-sha1 post-name)
(unless (hash-ref old post-sha1)
(hash-set! added post-sha1 post-name)))
new)
(values removed added)))
(define* (git-ls-tree->hash ref kind #:optional (out (make-hash-table)))
(for-each
(match-lambda
((name sha1 (? (lambda (x) (eq? x kind))))
(hash-set! out sha1 name)))
(git-ls-tree ref #f))
out)
(let ((old-post-trees (git-ls-tree->hash old-master 'tree))
(new-post-trees (git-ls-tree->hash new-master 'tree)))
(let-values (((removed-post-trees added-post-trees)
(compute-hash-diff old-post-trees new-post-trees)))
(let ((old-comments (make-hash-table))
(new-comments (make-hash-table)))
(hash-for-each
(lambda (sha1 name)
(git-ls-tree->hash (string-append sha1 ":comments") 'blob
old-comments))
removed-post-trees)
(hash-for-each
(lambda (sha1 name)
(git-ls-tree->hash (string-append sha1 ":comments") 'blob
new-comments))
added-post-trees)
(compute-hash-diff old-comments new-comments)))))
(define (reindex-legit-comments old-index index)
(let ((old-legit-comments (assq-ref old-index 'legit-comments))
(old-master (assq-ref old-index 'master))
(new-master (assq-ref index 'master)))
(if old-legit-comments
(let-values (((removed added)
(compute-differing-comments old-master new-master)))
(hash-for-each (lambda (k v)
(hash-remove! old-legit-comments k))
removed)
(hash-for-each (lambda (k v)
(hash-set! old-legit-comments k v))
added)
old-legit-comments)
(compute-legit-comments new-master))))
(define (reindex-bogus-comments old-index index)
(let ((old-bogus-comments (assq-ref old-index 'bogus-comments))
(old-classifier (assq-ref old-index 'classifier))
(old-master (assq-ref old-index 'master))
(new-master (assq-ref index 'master)))
(if old-bogus-comments
(let-values (((removed added)
(compute-differing-comments old-master new-master)))
(hash-for-each (lambda (k v)
(hash-remove! old-bogus-comments k))
added)
(hash-for-each (lambda (k v)
(hash-set! old-bogus-comments k v))
removed)
old-bogus-comments)
(compute-bogus-comments new-master (assq-ref index 'legit-comments)))))
(define-record-type <classifier>
(make-classifier legit-prior legit-features bogus-features bogosities)
classifier?
(legit-prior classifier-legit-prior)
(legit-features classifier-legit-features)
(bogus-features classifier-bogus-features)
(bogosities classifier-bogosities))
(define (rebuild-classifier legit-comments bogus-comments)
(format #t "Rebuilding bogus comment classifier...\n")
(with-time-debugging
(let* ((legit-count (hash-count (const #t) legit-comments))
(bogus-count (hash-count (const #t) bogus-comments))
(legit-prior (/ legit-count (+ legit-count bogus-count 0.0)))
(legit-features (count-features legit-comments))
(bogus-features (count-features bogus-comments))
(bogosities (compute-log-bogosities legit-features bogus-features)))
(make-classifier legit-prior legit-features bogus-features bogosities))))
(define* (add-feature! features feature #:optional (count 1))
(let ((h (hash-create-handle! features feature 0)))
(set-cdr! h (+ (cdr h) count))))
(define* (remove-feature! features feature #:optional (count 1))
(let ((h (hash-get-handle features feature)))
(when h
(set-cdr! h (- (cdr h) count)))))
(define (reindex-classifier old-index index)
(match (assq-ref old-index 'classifier)
(#f
(rebuild-classifier (assq-ref index 'legit-comments)
(assq-ref index 'bogus-comments)))
((and classifier
($ <classifier> legit-prior legit-features bogus-features bogosities))
(let-values (((removed added)
(compute-differing-comments (assq-ref old-index 'master)
(assq-ref index 'master))))
(let ((removed-features (count-features removed))
(added-features (count-features added)))
(hash-for-each (lambda (k v)
(remove-feature! legit-features k v)
(add-feature! bogus-features k v))
removed-features)
(hash-for-each (lambda (k v)
(add-feature! legit-features k v)
(remove-feature! bogus-features k v))
added-features)
(update-bogosities! bogosities removed-features
legit-features bogus-features)
(update-bogosities! bogosities added-features
legit-features bogus-features)))
classifier)))
(define* (comment-bogus-probability classifier comment #:key (feature-limit 20))
(let* ((log-bogosities (classifier-bogosities classifier))
(bogus-prior (- 1.0 (classifier-legit-prior classifier))))
(compute-bogus-probability comment log-bogosities
bogus-prior feature-limit)))
(define* (comment-is-bogus? index comment #:key (threshold 0.5))
(let ((classifier (assq-ref index 'classifier)))
(> (comment-bogus-probability classifier comment) threshold)))

View file

@ -37,7 +37,8 @@
#:use-module (tekuti match-bind)
#:export (blob->comment comment-sxml-content comment-timestamp
comment-readable-date bad-new-comment-post?
make-new-comment delete-comment))
parse-new-comment make-new-comment delete-comment
compute-legit-comments compute-bogus-comments))
(define *comment-spec*
`((timestamp . ,string->number)))
@ -137,31 +138,38 @@
(define de-newline (s///g "[\n\r]" " "))
(define (make-new-comment key title post-data)
(define (parse-new-comment post-data)
(let ((content (assoc-ref post-data "comment"))
(author (assoc-ref post-data "author"))
(email (assoc-ref post-data "email"))
(url (assoc-ref post-data "url")))
(let ((sha1 (with-output-to-blob
(for-each
(lambda (pair)
(format #t "~a: ~a\n" (car pair) (cdr pair)))
`((timestamp . ,(time-second (current-time)))
(author . ,(de-newline author))
(author_email . ,email)
(author_url . ,url)))
(display "\n")
(display content)))
(message (format #f "comment on \"~a\" by ~a" title author)))
(git-update-ref
"refs/heads/master"
(lambda (master)
(git-commit-tree (munge-tree1 master
'create
(list key "comments")
(list sha1 sha1 'blob))
master message #f))
5))))
`((timestamp . ,(time-second (current-time)))
(author . ,(de-newline author))
(author_email . ,email)
(author_url . ,url)
(raw-content . ,content))))
(define (make-new-comment key title comment)
(let ((sha1 (with-output-to-blob
(for-each
(match-lambda
((k . v)
(unless (eq? k 'raw-content)
(format #t "~a: ~a\n" k v))))
comment)
(display "\n")
(display (assq-ref comment 'raw-content))))
(message (format #f "comment on \"~a\" by ~a" title
(assq-ref comment 'author))))
(git-update-ref
"refs/heads/master"
(lambda (master)
(git-commit-tree (munge-tree1 master
'create
(list key "comments")
(list sha1 sha1 'blob))
master message #f))
5)))
(define (delete-comment post id)
(let ((key (post-key post))
@ -174,3 +182,45 @@
`(,id))
master message #f))
5)))
(define (compute-legit-comments master-ref)
;; sha1 -> #t
(define legit (make-hash-table))
(pk 'computing-legit)
(for-each
(match-lambda
((post-name post-sha1 'tree)
(for-each
(match-lambda
((comment-name comment-sha1 'blob)
(hash-set! legit comment-sha1 comment-name)))
(git-ls-tree (string-append post-sha1 ":comments") #f))))
(git-ls-tree master-ref #f))
(pk 'done legit))
(define (compute-bogus-comments master-ref legit)
;; sha1 -> #t
(define visited-trees (make-hash-table))
(define bogus (make-hash-table))
(pk 'computing-bogus)
(fold-commits
(lambda (rev commit _)
(pk 'computing-bogus rev)
(for-each
(match-lambda
((post-name post-sha1 'tree)
(unless (hash-ref visited-trees post-sha1)
(hash-set! visited-trees post-sha1 #t)
(for-each
(match-lambda
((comment-name comment-sha1 'blob)
(unless (or (hash-ref legit comment-sha1)
(hash-ref bogus comment-sha1))
(hash-set! bogus comment-sha1 comment-name)))
(_ #f))
(git-ls-tree (string-append post-sha1 ":comments") #f))))
(_ #f))
(git-ls-tree (assq-ref commit 'tree) #f)))
(assq-ref (parse-commit master-ref) 'parent)
#f)
bogus)

View file

@ -27,6 +27,7 @@
(define-module (tekuti git)
#:use-module (ice-9 rdelim)
#:use-module (ice-9 popen)
#:use-module (ice-9 textual-ports)
#:use-module (tekuti util)
#:use-module (tekuti config)
#:use-module (tekuti match-bind)
@ -44,6 +45,7 @@
git-commit-tree git-rev-list git-revert
munge-tree munge-tree1 parse-commit commit-utc-timestamp
fold-commits
with-output-to-blob with-input-from-blob))
@ -77,7 +79,7 @@
(define (prepend-env args)
(if (null? env)
args
(cons "/usr/bin/env" (append env args))))
(cons "/run/current-system/profile/bin/env" (append env args))))
(define (redirect-input args)
(if input-file
(list "/bin/sh" "-c"
@ -86,11 +88,7 @@
args))
(let* ((real-args (trc (redirect-input (prepend-env args))))
(pipe (apply open-pipe* OPEN_READ real-args))
(output (begin
(let ((bv (get-bytevector-all pipe)))
(if (eof-object? bv)
""
(utf8->string bv)))))
(output (get-string-all pipe))
(ret (close-pipe pipe)))
(case (status:exit-val ret)
((0) output)
@ -335,6 +333,14 @@
"^([^ ]+) (.*)$" (_ k v)
(cons (string->symbol k) v))))))
(define (fold-commits f rev seed)
(let lp ((rev (git-rev-parse rev)) (seed seed))
(if rev
(let ((commit (parse-commit rev)))
(lp (assq-ref commit 'parent)
(f rev commit seed)))
seed)))
(define (commit-utc-timestamp commit)
(match-bind
"^(.*) ([0-9]+) ([+-][0-9]+)" (assq-ref (parse-commit commit) 'committer)

View file

@ -33,46 +33,48 @@
#:use-module (tekuti post)
#:use-module (tekuti tags)
#:use-module (tekuti cache)
#:use-module (tekuti classifier)
#:export (maybe-reindex read-index update-index))
;; Additionally an index has an "index" field, indicating the commit
;; that it was saved in, and a "master" field, indicating the commit
;; that it indexes.
(define index-specs
`((posts ,reindex-posts ,write-hash ,read-hash)
`((master #f ,write ,read)
(posts ,reindex-posts ,write-hash ,read-hash)
(posts-by-date ,reindex-posts-by-date ,write ,read)
(tags ,reindex-tags ,write-hash ,read-hash)
(cache ,(lambda _ (make-empty-cache)) ,(lambda (x) #f) ,(lambda () '()))))
(legit-comments ,reindex-legit-comments ,write-hash ,read-hash)
(bogus-comments ,reindex-bogus-comments ,write-hash ,read-hash)
(classifier ,reindex-classifier #f #f)
(cache ,(lambda _ (make-empty-cache)) #f #f)))
(define (reindex oldindex master)
;; Leave off "index" field.
(with-time-debugging
(fold (lambda (pair index)
(acons (car pair) ((cadr pair) oldindex index)
index))
(fold (lambda (spec index)
(match spec
((key reindex write read)
(acons key (with-time-debugging (begin (pk key) (reindex oldindex index))) index))))
(acons 'master master '())
index-specs)))
(define (assoc-list-ref alist key n default)
(let ((l (assoc key alist)))
(if l (list-ref l n) default)))
(define (index->blob key value)
(with-output-to-blob
((assoc-list-ref index-specs key 2 write) value)))
(define (blob->index name sha1)
(with-input-from-blob
sha1
((assoc-list-ref index-specs (string->symbol name) 3 read))))
;; Skip past "master" as we handle that one specially.
(match index-specs
((('master . _) . specs) specs)))))
(define (write-index index oldref)
(let ((new (git-commit-tree
(git-mktree
(let lp ((index index))
(cond
((null? index) '())
((eq? (caar index) 'index) (lp (cdr index)))
(else (cons (list (caar index)
(index->blob (caar index) (cdar index))
'blob)
(lp (cdr index)))))))
(match index
(() '())
(((k . v) . index)
(match (assq k index-specs)
((_ reindex write read)
(if write
(cons (list k (with-output-to-blob (write v)) 'blob)
(lp index))
(lp index)))
(_ (lp index)))))))
oldref "reindex\n"
(commit-utc-timestamp (assq-ref index 'master)))))
(or (false-if-git-error
@ -81,19 +83,26 @@
new))
(define (read-index)
(pk 'reading-index)
(match (false-if-git-error (git-rev-parse "refs/heads/index"))
(#f (maybe-reindex '()))
(ref
(let ((dents (git-ls-tree ref #f)))
(if (and-map (lambda (spec)
(assoc (symbol->string (car spec)) dents))
index-specs)
(acons 'index ref
(map (lambda (dent)
(cons (string->symbol (car dent))
(blob->index (car dent) (cadr dent))))
dents))
(maybe-reindex (acons 'index ref '())))))))
(fold (lambda (spec index)
(match spec
((key reindex write read)
(pk 'read-index-key key)
(acons key
(cond
((and read (assoc (symbol->string key) dents))
=> (match-lambda
((_ sha1 'blob)
(with-input-from-blob sha1 (read)))))
(else
(reindex '() index)))
index))))
`((index . ,ref))
index-specs)))))
(define (maybe-reindex old-index)
(let ((master (git-rev-parse "refs/heads/master")))

View file

@ -30,6 +30,7 @@
#:use-module (tekuti git)
#:use-module (tekuti post)
#:use-module (tekuti comment)
#:use-module (tekuti classifier)
#:use-module (web uri)
#:use-module (web request)
#:use-module (tekuti request)
@ -208,13 +209,27 @@
(cond
((post-from-key index (make-post-key year month day name))
=> (lambda (post)
(cond
((bad-new-comment-post? data)
=> (lambda (reason)
(respond `((p "Bad post data: " ,(pk reason))))))
(else
(let ((comment (make-new-comment (post-key post) (post-title post)
data)))
(let ((comment (parse-new-comment data)))
(cond
((bad-new-comment-post? data)
=> (lambda (reason)
(respond `((p "Bad post data: " ,(pk reason))))))
((comment-is-bogus? index comment)
(respond `((p "Comment appears to be bogus; ignoring.")
(p "I'm testing out a new automated bogus "
"comment detector. If you feel your comment "
"was caught unfairly, tweet it to me or send "
"it by email. Or press back and reword it.")
(p "If you are a spammer, note that I fixed "
"the comment renderer to properly add "
(tt "rel='external nofollow'") " on all "
"links in comments. Go take a look at any "
"comment with a link to see for yourself. "
"Trying to linkbomb this site probably won't "
"give you any link juice so it's not worth "
"the trouble to either one of us :)"))))
(else
(make-new-comment (post-key post) (post-title post) comment)
;; nb: at this point, `post' is out-of-date
(respond `((p "Comment posted, thanks."))
#:redirect (post-url post #:fragment "comments")