Thu Jun 16 05:58:14 UTC 2011 pix@kepibu.org * Automagical updating of last-known good if a non-spammy update was made. diff -rN -u old-claki/claki.lisp new-claki/claki.lisp --- old-claki/claki.lisp 2013-07-26 15:39:52.000000000 +0000 +++ new-claki/claki.lisp 2013-07-26 15:39:52.000000000 +0000 @@ -39,7 +39,9 @@ (defvar *spam-urls* (make-hash-table :test 'equal) "hash table of spam urls.") (defvar *okay-urls* (make-hash-table :test 'equal) "hash table of acceptable urls.") (defvar *urls-to-classify* (list) "list of (url-to-classify cliki-page-where-it-was-found page-version).") -(defvar *has-spam* (list) "list of (cliki-page . version) known to have spam.") +(defvar *has-spam* (list) "list of (cliki-page version) known to have spam.") +(defvar *updated-pages* (list) "list of pages that have been updated in the format (cliki-page version).") +(defvar *ignore-update* (list) "list of pages in the format (cliki-page version) which updates should not result in updating the last-known good version (e.g., because we did it and it was a reversion).") #+(or) (clrhash *okay-urls*) @@ -51,6 +53,8 @@ (match (#t(oh-ducks:html ("a[href^=http]" . ?a) ("#footer > b" . #t(list ?b))) page) (let ((current-version (oh-ducks.traversal:element-content b))) + (format t "; Page ~s modified, now at version ~a.~%" page-url current-version) + (pushnew (list page-url current-version) *updated-pages* :test #'equal) (dolist (link a) (let ((url (oh-ducks.traversal:element-attribute :href link))) (cond @@ -134,6 +138,29 @@ #+(or) (maybe-request-last-known-good "araneida") +(defun update-last-known-good () + ;; Remove known-spam pages from list of updates + (loop :for (page-url version) :in *has-spam* + :do (removef *updated-pages* (list page-url version) :test #'equal)) + ;; Remove updates we made from list of updates (that way, we'll continue to + ;; use the old cached known-good, saving a bit of strain on cliki) + (loop :for (page-url version) :in *ignore-update* + :do (removef *updated-pages* (list page-url version) :test #'equal)) + (setf *updated-pages* + (loop :for (page-url version) :in *updated-pages* + ;; If there are unclassified urls from this page, don't mark good + ;; (could be spam!) + :if (notany (lambda (x) (and (string= page-url (second x)) + (string= version (third x)))) + *urls-to-classify*) + :do (format t "; Updating last-known good of ~s from ~a to ~a~%" page-url (gethash page-url *last-known-good*) version) + (setf (gethash page-url *last-known-good*) version) + :else + :collect (list page-url version)))) + +(defun numstring+1 (numstring) + (write-to-string (1+ (parse-integer numstring)))) + (defun revert-page (url current-version to-version) (multiple-value-bind (page status headers) (drakma:http-request (format nil "http://cliki.net/edit/~a" url) @@ -143,12 +170,13 @@ ("E0" . ,(get-cliki-source url to-version)) ("summary" . "Spam detected, reverting to Known-Good.") ("captcha" . "lisp") - ("name" . "Claki (Revertobot Alpha)"))) + ("name" . "Claki (Revertobot Beta)"))) (declare (ignore headers)) (cond ((and (= status 200) (not (search "rejected" page :test #'char-equal))) (format t "; Reverted page ~s to version ~a.~%" url to-version) + (pushnew (list url (numstring+1 current-version)) *ignore-update* :test #'equal) page) (t nil)))) @@ -175,11 +203,13 @@ (defun attendant () (classify-unknown-urls) + (update-last-known-good) (mark-known-goods) (revert-spam)) (defun unattended-revert-new-spam () (mapcar #'parse-page (get-recent-changes)) + (update-last-known-good) (revert-spam)) #+(or) (attended-revert-new-spam) @@ -226,7 +256,9 @@ (with-rucksack-and-transaction (rs) (btree) (rs:btree-insert btree 'spam-urls *spam-urls*) (rs:btree-insert btree 'ham-urls *okay-urls*) - (rs:btree-insert btree 'known-good *last-known-good*))) + (rs:btree-insert btree 'known-good *last-known-good*) + (rs:btree-insert btree 'updated-pages *updated-pages*) + (rs:btree-insert btree 'our-updates *ignore-update*))) #+(or) (save-state) @@ -234,7 +266,9 @@ (with-rucksack-and-transaction (rs) (btree) (setf *spam-urls* (rs:btree-search btree 'spam-urls) *okay-urls* (rs:btree-search btree 'ham-urls) - *last-known-good* (rs:btree-search btree 'known-good)))) + *last-known-good* (rs:btree-search btree 'known-good) + *updated-pages* (rs:btree-search btree 'updated-pages) + *ignore-update* (rs:btree-search btree 'our-updates)))) #+(or) (restore-state) #+(or) (with-rucksack-and-transaction (rs) ()