From 6da1ed8fd2788a65681079453d9294e359f859d8 Mon Sep 17 00:00:00 2001
From: Tom Willemse <tom@ryuslash.org>
Date: Sun, 30 May 2021 01:21:47 -0700
Subject: [PATCH] =?UTF-8?q?Fix=20some=20=E2=80=9CNumerical=20overflow?=
 =?UTF-8?q?=E2=80=9D=20errors?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When starting a new blog, or when running on some very old data, there is no
data available on any bogus or legitimate comments. When this happens, Tekuti
tries to divide by 0, and to divide 0 by other things, in a few places, because
it assumes that there is at least _some_ data about bogus and legitimate
comments.

I’m not 100% sure what these calculations are expected to produce, so I might
have chosen the wrong solution, but it seems to me that if there is no data
available, all we know is that there is nothing, so the result falls back to 0.

This fixes commenting both on a fresh new data set and on a data set that
hasn’t been touched since 2014.
---
 tekuti/classifier.scm | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/tekuti/classifier.scm b/tekuti/classifier.scm
index 6c8ba9a..05417d4 100644
--- a/tekuti/classifier.scm
+++ b/tekuti/classifier.scm
@@ -114,16 +114,22 @@
      (lambda (feature bogus-count)
        (let ((legit-count (hash-ref legit-features feature 0)))
          (hash-set! log-bogosities feature
-                    (log (/ (/ (+ bogus-count 0.001) total-bogus-features)
-                            (/ (+ legit-count 0.001) total-legit-features))))))
+                    (if (and (> total-bogus-features 0)
+                             (> total-legit-features 0))
+                        (log (/ (/ (+ bogus-count 0.001) total-bogus-features)
+                                (/ (+ legit-count 0.001) total-legit-features)))
+                        0))))
      bogus-features)
     (hash-for-each
      (lambda (feature legit-count)
        (let ((bogus-count (hash-ref bogus-features feature)))
          (unless bogus-count
            (hash-set! log-bogosities feature
-                      (log (/ (/ 0.01 total-bogus-features)
-                              (/ (+ legit-count 0.01) total-legit-features)))))))
+                      (if (and (> total-bogus-features 0)
+                               (> total-legit-features 0))
+                          (log (/ (/ 0.01 total-bogus-features)
+                                  (/ (+ legit-count 0.01) total-legit-features)))
+                          0)))))
      legit-features)
     log-bogosities))
 
@@ -138,8 +144,11 @@
        (let ((bogus-count (hash-ref bogus-features feature 0))
              (legit-count (hash-ref legit-features feature 0)))
          (hash-set! log-bogosities feature
-                    (log (/ (/ (+ bogus-count 0.001) total-bogus-features)
-                            (/ (+ legit-count 0.001) total-legit-features))))))
+                    (if (and  (> total-bogus-features 0)
+                              (> total-legit-features 0))
+                        (log (/ (/ (+ bogus-count 0.001) total-bogus-features)
+                                (/ (+ legit-count 0.001) total-legit-features)))
+                        0))))
      changed-features)))
 
 (define (compute-bogus-probability comment log-bogosities bogus-prior
@@ -250,7 +259,9 @@
   (with-time-debugging
    (let* ((legit-count (hash-count (const #t) legit-comments))
           (bogus-count (hash-count (const #t) bogus-comments))
-          (legit-prior (/ legit-count (+ legit-count bogus-count 0.0)))
+          (legit-prior (if (> legit-count 0)
+                           (/ legit-count (+ legit-count bogus-count 0.0))
+                           0))
           (legit-features (count-features legit-comments))
           (bogus-features (count-features bogus-comments))
           (bogosities (compute-log-bogosities legit-features bogus-features)))