Automagical updating of last-known good if a non-spammy update was made.
Thu Jun 16 05:58:14 UTC 2011 pix@kepibu.org
* Automagical updating of last-known good if a non-spammy update was made.
diff -rN -u old-claki/claki.lisp new-claki/claki.lisp
--- old-claki/claki.lisp 2013-07-22 03:24:21.000000000 +0000
+++ new-claki/claki.lisp 2013-07-22 03:24:21.000000000 +0000
@@ -39,7 +39,9 @@
(defvar *spam-urls* (make-hash-table :test 'equal) "hash table of spam urls.")
(defvar *okay-urls* (make-hash-table :test 'equal) "hash table of acceptable urls.")
(defvar *urls-to-classify* (list) "list of (url-to-classify cliki-page-where-it-was-found page-version).")
-(defvar *has-spam* (list) "list of (cliki-page . version) known to have spam.")
+(defvar *has-spam* (list) "list of (cliki-page version) known to have spam.")
+;; Pending page updates. An entry is added with PUSHNEW when a parsed page's
+;; current version is read; UPDATE-LAST-KNOWN-GOOD removes entries that are
+;; known spam, listed in *IGNORE-UPDATE*, or promoted to last-known good.
+(defvar *updated-pages* (list) "list of pages that have been updated in the format (cliki-page version).")
+;; Updates made by REVERT-PAGE itself. After a successful revert it records
+;; (url version+1), and UPDATE-LAST-KNOWN-GOOD drops matching entries from
+;; *UPDATED-PAGES* instead of promoting them.
+(defvar *ignore-update* (list) "list of pages in the format (cliki-page version) which updates should not result in updating the last-known good version (e.g., because we did it and it was a reversion).")
#+(or) (clrhash *okay-urls*)
@@ -51,6 +53,8 @@
(match (#t(oh-ducks:html ("a[href^=http]" . ?a)
("#footer > b" . #t(list ?b))) page)
(let ((current-version (oh-ducks.traversal:element-content b)))
+ (format t "; Page ~s modified, now at version ~a.~%" page-url current-version)
+ (pushnew (list page-url current-version) *updated-pages* :test #'equal)
(dolist (link a)
(let ((url (oh-ducks.traversal:element-attribute :href link)))
(cond
@@ -134,6 +138,29 @@
#+(or) (maybe-request-last-known-good "araneida")
+(defun update-last-known-good ()
+  "Promote pending page updates to last-known good.
+Entries of *UPDATED-PAGES* that appear in *HAS-SPAM* or *IGNORE-UPDATE* are
+dropped.  Each remaining (page-url version) entry with no matching entry in
+*URLS-TO-CLASSIFY* is written to *LAST-KNOWN-GOOD* and removed; entries that
+still have unclassified urls stay in *UPDATED-PAGES* for a later pass.
+Returns the new value of *UPDATED-PAGES*."
+  ;; Remove known-spam pages from the list of updates, and also the updates we
+  ;; made ourselves (that way, we'll continue to use the old cached known-good,
+  ;; saving a bit of strain on cliki).
+  (dolist (excluded (list *has-spam* *ignore-update*))
+    (loop :for (page-url version) :in excluded
+          :do (removef *updated-pages* (list page-url version) :test #'equal)))
+  (setf *updated-pages*
+        ;; *UPDATED-PAGES* is filled with PUSHNEW, so it is newest-first.  Walk
+        ;; it oldest-first so that, when several versions of one page are
+        ;; promoted in the same pass, the newest version is written last and
+        ;; wins.  The final NREVERSE puts the entries left pending back in
+        ;; their original newest-first order.
+        (nreverse
+         (loop :for (page-url version) :in (reverse *updated-pages*)
+               ;; If there are unclassified urls from this page, don't mark
+               ;; good (could be spam!)
+               :if (notany (lambda (pending)
+                             (and (string= page-url (second pending))
+                                  (string= version (third pending))))
+                           *urls-to-classify*)
+                 :do (format t "; Updating last-known good of ~s from ~a to ~a~%" page-url (gethash page-url *last-known-good*) version)
+                     (setf (gethash page-url *last-known-good*) version)
+               :else
+                 :collect (list page-url version)))))
+
+(defun numstring+1 (numstring)
+  "Return the decimal string for one more than the integer written in NUMSTRING.
+NUMSTRING is read with PARSE-INTEGER (base 10), which signals an error when
+the string does not hold an integer."
+  ;; ~D always prints in base 10 with no radix marker.  WRITE-TO-STRING would
+  ;; follow *PRINT-BASE* and *PRINT-RADIX* and could return something like
+  ;; 42. or #x2A instead of a plain version number.
+  (format nil "~D" (1+ (parse-integer numstring))))
+
(defun revert-page (url current-version to-version)
(multiple-value-bind (page status headers)
(drakma:http-request (format nil "http://cliki.net/edit/~a" url)
@@ -143,12 +170,13 @@
("E0" . ,(get-cliki-source url to-version))
("summary" . "Spam detected, reverting to Known-Good.")
("captcha" . "lisp")
- ("name" . "Claki (Revertobot Alpha)")))
+ ("name" . "Claki (Revertobot Beta)")))
(declare (ignore headers))
(cond
((and (= status 200)
(not (search "rejected" page :test #'char-equal)))
(format t "; Reverted page ~s to version ~a.~%" url to-version)
+ (pushnew (list url (numstring+1 current-version)) *ignore-update* :test #'equal)
page)
(t nil))))
@@ -175,11 +203,13 @@
(defun attendant ()
+  ;; Calls, in order, CLASSIFY-UNKNOWN-URLS, UPDATE-LAST-KNOWN-GOOD,
+  ;; MARK-KNOWN-GOODS and REVERT-SPAM, and returns REVERT-SPAM's value.
+  ;; NOTE(review): UPDATE-LAST-KNOWN-GOOD holds back pages that still have
+  ;; entries in *URLS-TO-CLASSIFY*, so it presumably belongs after
+  ;; CLASSIFY-UNKNOWN-URLS -- confirm before reordering.
(classify-unknown-urls)
+ (update-last-known-good)
(mark-known-goods)
(revert-spam))
(defun unattended-revert-new-spam ()
+  ;; Applies PARSE-PAGE to every element returned by GET-RECENT-CHANGES, then
+  ;; runs UPDATE-LAST-KNOWN-GOOD and REVERT-SPAM.  The list MAPCAR builds is
+  ;; not used; the function returns REVERT-SPAM's value.
(mapcar #'parse-page (get-recent-changes))
+ (update-last-known-good)
(revert-spam))
#+(or) (attended-revert-new-spam)
@@ -226,7 +256,9 @@
(with-rucksack-and-transaction (rs) (btree)
(rs:btree-insert btree 'spam-urls *spam-urls*)
(rs:btree-insert btree 'ham-urls *okay-urls*)
- (rs:btree-insert btree 'known-good *last-known-good*)))
+ (rs:btree-insert btree 'known-good *last-known-good*)
+ (rs:btree-insert btree 'updated-pages *updated-pages*)
+ (rs:btree-insert btree 'our-updates *ignore-update*)))
#+(or) (save-state)
@@ -234,7 +266,9 @@
(with-rucksack-and-transaction (rs) (btree)
(setf *spam-urls* (rs:btree-search btree 'spam-urls)
*okay-urls* (rs:btree-search btree 'ham-urls)
- *last-known-good* (rs:btree-search btree 'known-good))))
+ *last-known-good* (rs:btree-search btree 'known-good)
+ *updated-pages* (rs:btree-search btree 'updated-pages)
+ *ignore-update* (rs:btree-search btree 'our-updates))))
#+(or) (restore-state)
#+(or) (with-rucksack-and-transaction (rs) ()