Mon Jun 13 22:35:27 UTC 2011 pix@kepibu.org
* Split out an unattended and an attended portion
In preparation for the time I let it loose without me watching over it.
diff -rN -u old-claki/claki.lisp new-claki/claki.lisp
--- old-claki/claki.lisp 2013-07-22 03:23:57.000000000 +0000
+++ new-claki/claki.lisp 2013-07-22 03:23:57.000000000 +0000
@@ -38,6 +38,8 @@
(defvar *spam-urls* (make-hash-table :test 'equal) "hash table of spam urls.")
(defvar *okay-urls* (make-hash-table :test 'equal) "hash table of acceptable urls.")
+;; Queue of links whose URL and domain are in neither *okay-urls* nor
+;; *spam-urls*; drained by CLASSIFY-UNKNOWN-URLS.
+(defvar *urls-to-classify* (list) "list of (url-to-classify cliki-page-where-it-was-found page-version).")
+;; Entries are two-element proper lists, not dotted pairs: they are built with
+;; (list page-url current-version) and destructured as (page-url version).
+(defvar *has-spam* (list) "list of (cliki-page version) known to have spam.")
#+(or) (clrhash *okay-urls*)
@@ -50,20 +52,22 @@
("#footer > b" . #t(list ?b))) page)
(let ((current-version (oh-ducks.traversal:element-content b)))
(dolist (link a)
- (let ((url (oh-ducks.traversal:element-attribute :href link)))
- (tagbody
- :handle-url
- (cond
- ((or (gethash url *okay-urls*)
- (gethash (url-domain url) *okay-urls*))
- #+(or) (do-nothing))
- ((or (gethash url *spam-urls*)
- (gethash (url-domain url) *spam-urls*))
- (maybe-request-last-known-good page-url)
- (return-from parse-page (revert-page page-url current-version (gethash page-url *last-known-good*))))
- (t
- (request-classification url)
- (go :handle-url))))))))))
+ (let ((url (oh-ducks.traversal:element-attribute :href link))
+ (rel (oh-ducks.traversal:element-attribute :rel link)))
+ (cond
+ ((or (gethash url *okay-urls*)
+ (gethash (url-domain url) *okay-urls*))
+ #+(or) (do-nothing))
+ ((or (gethash url *spam-urls*)
+ (gethash (url-domain url) *spam-urls*))
+ (pushnew (list page-url current-version) *has-spam* :test #'equal))
+ ((and (stringp rel)
+ (or (string-equal "follow" rel)
+ (string-equal "dofollow" rel)))
+ (setf (gethash url *spam-urls*) t)
+ (pushnew (list page-url current-version) *has-spam* :test #'equal))
+ (t
+ (pushnew (list url page-url current-version) *urls-to-classify* :test #'equal)))))))))
#+(or) (parse-page "araneida")
@@ -81,7 +85,31 @@
(setf (gethash url *spam-urls*) t))
(mark-domain-spam ()
:report "Mark the domain as spam."
- (setf (gethash (url-domain url) *spam-urls*) t))))
+ (setf (gethash (url-domain url) *spam-urls*) t))
+ (classify-later ()
+ :report "Don't classify this URL yet."
+ nil)))
+
+(defun classify-unknown-urls ()
+  "Walk *urls-to-classify* in order.  An entry is dropped when its URL or
+its domain is found in *okay-urls* or *spam-urls*, or when
+REQUEST-CLASSIFICATION returns true for the URL; every other entry is
+kept.  The surviving entries are stored back into *urls-to-classify* and
+returned."
+  (let ((undecided '()))
+    (dolist (entry *urls-to-classify*)
+      (let ((url (first entry)))
+        ;; OR short-circuits, so REQUEST-CLASSIFICATION only runs when none
+        ;; of the four table lookups hit.
+        (unless (or (gethash url *okay-urls*)
+                    (gethash (url-domain url) *okay-urls*)
+                    (gethash url *spam-urls*)
+                    (gethash (url-domain url) *spam-urls*)
+                    (request-classification url))
+          (push (list url (second entry) (third entry)) undecided))))
+    ;; PUSH built the list backwards; restore the original queue order.
+    (setf *urls-to-classify* (nreverse undecided))))
+
+(defun mark-known-goods ()
+  "Call MAYBE-REQUEST-LAST-KNOWN-GOOD on the page of every *has-spam* entry.
+The version element of each entry is not used here.  Returns NIL."
+  (dolist (entry *has-spam*)
+    (maybe-request-last-known-good (first entry))))
+
+(defun revert-spam ()
+  "Try to revert every page listed in *has-spam*.
+An entry is dropped only when *last-known-good* has a value for the page
+and REVERT-PAGE returns true for it; all other entries are kept.  The
+remaining entries are stored back into *has-spam* and returned."
+  (let ((still-spammy '()))
+    (dolist (entry *has-spam*)
+      (let* ((page-url (first entry))
+             (version (second entry))
+             (known-good (gethash page-url *last-known-good*)))
+        ;; Without a known-good revision there is nothing to revert to, so
+        ;; REVERT-PAGE is not even attempted.
+        (unless (and known-good
+                     (revert-page page-url version known-good))
+          (push (list page-url version) still-spammy))))
+    ;; PUSH built the list backwards; restore the original order.
+    (setf *has-spam* (nreverse still-spammy))))
(defvar *last-known-good* (make-hash-table :test 'equal) "hash table of cliki pages and the last-known \"good\" revision.")
@@ -100,15 +128,21 @@
#+(or) (maybe-request-last-known-good "araneida")
-(defun revert-page (page current-version to-version)
- (drakma:http-request (format nil "http://cliki.net/edit/~a" page)
- :method :post
- :parameters `(("version" . ,current-version)
- ("T0" . "BODY")
- ("E0" . ,(get-cliki-source page to-version))
- ("summary" . "Spam detected, reverting to Known-Good.")
- ("captcha" . "lisp")
- ("name" . "Claki (Revertobot Alpha)"))))
+(defun revert-page (url current-version to-version)
+  "POST an edit for cliki page URL whose body is the source of TO-VERSION, as
+fetched by GET-CLIKI-SOURCE.  CURRENT-VERSION is sent as the \"version\"
+form field.
+
+Returns the response body when the HTTP status is 200 and the body is a
+string that does not contain \"rejected\" (compared case-insensitively);
+returns NIL otherwise."
+  ;; Only the body and status code are needed; the remaining values
+  ;; returned by DRAKMA:HTTP-REQUEST are deliberately not bound.
+  (multiple-value-bind (page status)
+      (drakma:http-request (format nil "http://cliki.net/edit/~a" url)
+                           :method :post
+                           :parameters `(("version" . ,current-version)
+                                         ("T0" . "BODY")
+                                         ("E0" . ,(get-cliki-source url to-version))
+                                         ("summary" . "Spam detected, reverting to Known-Good.")
+                                         ("captcha" . "lisp")
+                                         ("name" . "Claki (Revertobot Alpha)")))
+    ;; Drakma can hand back an octet vector for non-text content types;
+    ;; CHAR-EQUAL would signal an error on that, so treat a non-string body
+    ;; as a failed revert instead.
+    (when (and (= status 200)
+               (stringp page)
+               (not (search "rejected" page :test #'char-equal)))
+      page)))
(defun get-cliki-source (url version)
"Fetches the source text of a given version of a cliki page. That is, it
@@ -119,11 +153,19 @@
((= 200 status) page)
(t (error "crap!")))))
-(defun revert-new-spam ()
- (let ((modified-pages (get-recent-changes)))
- (loop :for page :in modified-pages
- :do (parse-page page))))
+(defun attented-revert-new-spam ()
+  "Call PARSE-PAGE on every page returned by GET-RECENT-CHANGES, then run
+ATTENDANT.  Returns whatever ATTENDANT returns."
+  ;; MAPC, not MAPCAR: PARSE-PAGE is called for effect and its results are
+  ;; discarded, so there is no reason to cons up a result list.
+  (mapc #'parse-page (get-recent-changes))
+  (attendant))
+
+;; Runs three steps in order: CLASSIFY-UNKNOWN-URLS, then MARK-KNOWN-GOODS,
+;; then REVERT-SPAM.  Returns the value of REVERT-SPAM.
+(defun attendant ()
+ (classify-unknown-urls)
+ (mark-known-goods)
+ (revert-spam))
+
+(defun unattented-revert-new-spam ()
+  "Call PARSE-PAGE on every page returned by GET-RECENT-CHANGES, then run
+REVERT-SPAM directly, skipping CLASSIFY-UNKNOWN-URLS and MARK-KNOWN-GOODS.
+Returns whatever REVERT-SPAM returns."
+  ;; MAPC, not MAPCAR: PARSE-PAGE is called for effect and its results are
+  ;; discarded, so there is no reason to cons up a result list.
+  (mapc #'parse-page (get-recent-changes))
+  (revert-spam))
-#+(or) (revert-new-spam)
+#+(or) (attented-revert-new-spam)
-#+(or) (loop (sleep (* 60 60)) (revert-new-spam))
+#+(or) (loop (sleep (* 60 60)) (unattented-revert-new-spam))