1
0
Fork 0

Fix some “Numerical overflow” errors

When starting a new blog, or trying to run on some very old data, there is no
data available on any bogus or legitimate comments. When this happens, Tekuti
tries to divide by 0, and to divide 0 by other things, in a few places, because
it assumes that there is at least _some_ data about bogus and legitimate comments.

I’m not 100% sure what these calculations are expected to produce, so I might
have chosen the wrong solution, but it seems to me that if there is no data
available, all we know is that there is nothing (0).

This fixes an issue when trying to comment on either a fresh new dataset or a
dataset that hasn’t been touched since 2014.
This commit is contained in:
Tom Willemse 2021-05-30 01:21:47 -07:00
parent 480c050275
commit 6da1ed8fd2

View file

@ -114,16 +114,22 @@
(lambda (feature bogus-count) (lambda (feature bogus-count)
(let ((legit-count (hash-ref legit-features feature 0))) (let ((legit-count (hash-ref legit-features feature 0)))
(hash-set! log-bogosities feature (hash-set! log-bogosities feature
(log (/ (/ (+ bogus-count 0.001) total-bogus-features) (if (and (> total-bogus-features 0)
(/ (+ legit-count 0.001) total-legit-features)))))) (> total-legit-features 0))
(log (/ (/ (+ bogus-count 0.001) total-bogus-features)
(/ (+ legit-count 0.001) total-legit-features)))
0))))
bogus-features) bogus-features)
(hash-for-each (hash-for-each
(lambda (feature legit-count) (lambda (feature legit-count)
(let ((bogus-count (hash-ref bogus-features feature))) (let ((bogus-count (hash-ref bogus-features feature)))
(unless bogus-count (unless bogus-count
(hash-set! log-bogosities feature (hash-set! log-bogosities feature
(log (/ (/ 0.01 total-bogus-features) (if (and (> total-bogus-features 0)
(/ (+ legit-count 0.01) total-legit-features))))))) (> total-legit-features 0))
(log (/ (/ 0.01 total-bogus-features)
(/ (+ legit-count 0.01) total-legit-features)))
0)))))
legit-features) legit-features)
log-bogosities)) log-bogosities))
@ -138,8 +144,11 @@
(let ((bogus-count (hash-ref bogus-features feature 0)) (let ((bogus-count (hash-ref bogus-features feature 0))
(legit-count (hash-ref legit-features feature 0))) (legit-count (hash-ref legit-features feature 0)))
(hash-set! log-bogosities feature (hash-set! log-bogosities feature
(log (/ (/ (+ bogus-count 0.001) total-bogus-features) (if (and (> total-bogus-features 0)
(/ (+ legit-count 0.001) total-legit-features)))))) (> total-legit-features 0))
(log (/ (/ (+ bogus-count 0.001) total-bogus-features)
(/ (+ legit-count 0.001) total-legit-features)))
0))))
changed-features))) changed-features)))
(define (compute-bogus-probability comment log-bogosities bogus-prior (define (compute-bogus-probability comment log-bogosities bogus-prior
@ -250,7 +259,9 @@
(with-time-debugging (with-time-debugging
(let* ((legit-count (hash-count (const #t) legit-comments)) (let* ((legit-count (hash-count (const #t) legit-comments))
(bogus-count (hash-count (const #t) bogus-comments)) (bogus-count (hash-count (const #t) bogus-comments))
(legit-prior (/ legit-count (+ legit-count bogus-count 0.0))) (legit-prior (if (> legit-count 0)
(/ legit-count (+ legit-count bogus-count 0.0))
0))
(legit-features (count-features legit-comments)) (legit-features (count-features legit-comments))
(bogus-features (count-features bogus-comments)) (bogus-features (count-features bogus-comments))
(bogosities (compute-log-bogosities legit-features bogus-features))) (bogosities (compute-log-bogosities legit-features bogus-features)))