From 6da1ed8fd2788a65681079453d9294e359f859d8 Mon Sep 17 00:00:00 2001 From: Tom Willemse Date: Sun, 30 May 2021 01:21:47 -0700 Subject: [PATCH] =?UTF-8?q?Fix=20some=20=E2=80=9CNumerical=20overflow?= =?UTF-8?q?=E2=80=9D=20errors?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When starting a new blog, or trying to run on some very old data, there is no data available on any bogus or legitimate comments. When this happens, Tekuti will try to divide by 0 and divide 0 by other things in a few places, assuming that there is at least _some_ data about bogus and legitimate comments. I’m not 100% sure what these calculations are expected to produce, so I might have chosen the wrong solution, but it seems to me that if there is no data available, all we know is that there is nothing (0). This fixes an issue when trying to comment on either a fresh new dataset or a dataset that hasn’t been touched since 2014. --- tekuti/classifier.scm | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/tekuti/classifier.scm b/tekuti/classifier.scm index 6c8ba9a..05417d4 100644 --- a/tekuti/classifier.scm +++ b/tekuti/classifier.scm @@ -114,16 +114,22 @@ (lambda (feature bogus-count) (let ((legit-count (hash-ref legit-features feature 0))) (hash-set! log-bogosities feature - (log (/ (/ (+ bogus-count 0.001) total-bogus-features) - (/ (+ legit-count 0.001) total-legit-features)))))) + (if (and (> total-bogus-features 0) + (> total-legit-features 0)) + (log (/ (/ (+ bogus-count 0.001) total-bogus-features) + (/ (+ legit-count 0.001) total-legit-features))) + 0)))) bogus-features) (hash-for-each (lambda (feature legit-count) (let ((bogus-count (hash-ref bogus-features feature))) (unless bogus-count (hash-set! 
log-bogosities feature - (log (/ (/ 0.01 total-bogus-features) - (/ (+ legit-count 0.01) total-legit-features))))))) + (if (and (> total-bogus-features 0) + (> total-legit-features 0)) + (log (/ (/ 0.01 total-bogus-features) + (/ (+ legit-count 0.01) total-legit-features))) + 0))))) legit-features) log-bogosities)) @@ -138,8 +144,11 @@ (let ((bogus-count (hash-ref bogus-features feature 0)) (legit-count (hash-ref legit-features feature 0))) (hash-set! log-bogosities feature - (log (/ (/ (+ bogus-count 0.001) total-bogus-features) - (/ (+ legit-count 0.001) total-legit-features)))))) + (if (and (> total-bogus-features 0) + (> total-legit-features 0)) + (log (/ (/ (+ bogus-count 0.001) total-bogus-features) + (/ (+ legit-count 0.001) total-legit-features))) + 0)))) changed-features))) (define (compute-bogus-probability comment log-bogosities bogus-prior @@ -250,7 +259,9 @@ (with-time-debugging (let* ((legit-count (hash-count (const #t) legit-comments)) (bogus-count (hash-count (const #t) bogus-comments)) - (legit-prior (/ legit-count (+ legit-count bogus-count 0.0))) + (legit-prior (if (> legit-count 0) + (/ legit-count (+ legit-count bogus-count 0.0)) + 0)) (legit-features (count-features legit-comments)) (bogus-features (count-features bogus-comments)) (bogosities (compute-log-bogosities legit-features bogus-features)))