Mon Jun 13 22:35:27 UTC 2011 pix@kepibu.org * Split out an unattended and an attended portion In preparation for the time I let it loose without me watching over it. diff -rN -u old-claki/claki.lisp new-claki/claki.lisp --- old-claki/claki.lisp 2013-07-18 20:08:40.000000000 +0000 +++ new-claki/claki.lisp 2013-07-18 20:08:40.000000000 +0000 @@ -38,6 +38,8 @@ (defvar *spam-urls* (make-hash-table :test 'equal) "hash table of spam urls.") (defvar *okay-urls* (make-hash-table :test 'equal) "hash table of acceptable urls.") +(defvar *urls-to-classify* (list) "list of (url-to-classify cliki-page-where-it-was-found page-version).") +(defvar *has-spam* (list) "list of (cliki-page . version) known to have spam.") #+(or) (clrhash *okay-urls*) @@ -50,20 +52,22 @@ ("#footer > b" . #t(list ?b))) page) (let ((current-version (oh-ducks.traversal:element-content b))) (dolist (link a) - (let ((url (oh-ducks.traversal:element-attribute :href link))) - (tagbody - :handle-url - (cond - ((or (gethash url *okay-urls*) - (gethash (url-domain url) *okay-urls*)) - #+(or) (do-nothing)) - ((or (gethash url *spam-urls*) - (gethash (url-domain url) *spam-urls*)) - (maybe-request-last-known-good page-url) - (return-from parse-page (revert-page page-url current-version (gethash page-url *last-known-good*)))) - (t - (request-classification url) - (go :handle-url)))))))))) + (let ((url (oh-ducks.traversal:element-attribute :href link)) + (rel (oh-ducks.traversal:element-attribute :rel link))) + (cond + ((or (gethash url *okay-urls*) + (gethash (url-domain url) *okay-urls*)) + #+(or) (do-nothing)) + ((or (gethash url *spam-urls*) + (gethash (url-domain url) *spam-urls*)) + (pushnew (list page-url current-version) *has-spam* :test #'equal)) + ((and (stringp rel) + (or (string-equal "follow" rel) + (string-equal "dofollow" rel))) + (setf (gethash url *spam-urls*) t) + (pushnew (list page-url current-version) *has-spam* :test #'equal)) + (t + (pushnew (list url page-url current-version) 
*urls-to-classify* :test #'equal))))))))) #+(or) (parse-page "araneida") @@ -81,7 +85,31 @@ (setf (gethash url *spam-urls*) t)) (mark-domain-spam () :report "Mark the domain as spam." - (setf (gethash (url-domain url) *spam-urls*) t)))) + (setf (gethash (url-domain url) *spam-urls*) t)) + (classify-later () + :report "Don't classify this URL yet." + nil))) + +(defun classify-unknown-urls () + (setf *urls-to-classify* + (loop :for (url page version) :in *urls-to-classify* + :unless (or (gethash url *okay-urls*) + (gethash (url-domain url) *okay-urls*) + (gethash url *spam-urls*) + (gethash (url-domain url) *spam-urls*) + (request-classification url)) + :collect (list url page version)))) + +(defun mark-known-goods () + (loop :for (page-url version) :in *has-spam* + :do (maybe-request-last-known-good page-url))) + +(defun revert-spam () + (setf *has-spam* + (loop :for (page-url version) :in *has-spam* + :unless (and (gethash page-url *last-known-good*) + (revert-page page-url version (gethash page-url *last-known-good*))) + :collect (list page-url version)))) (defvar *last-known-good* (make-hash-table :test 'equal) "hash table of cliki pages and the last-known \"good\" revision.") @@ -100,15 +128,21 @@ #+(or) (maybe-request-last-known-good "araneida") -(defun revert-page (page current-version to-version) - (drakma:http-request (format nil "http://cliki.net/edit/~a" page) - :method :post - :parameters `(("version" . ,current-version) - ("T0" . "BODY") - ("E0" . ,(get-cliki-source page to-version)) - ("summary" . "Spam detected, reverting to Known-Good.") - ("captcha" . "lisp") - ("name" . "Claki (Revertobot Alpha)")))) +(defun revert-page (url current-version to-version) + (multiple-value-bind (page status headers) + (drakma:http-request (format nil "http://cliki.net/edit/~a" url) + :method :post + :parameters `(("version" . ,current-version) + ("T0" . "BODY") + ("E0" . ,(get-cliki-source url to-version)) + ("summary" . 
"Spam detected, reverting to Known-Good.") + ("captcha" . "lisp") + ("name" . "Claki (Revertobot Alpha)"))) + (cond + ((and (= status 200) + (not (search "rejected" page :test #'char-equal))) + page) + (t nil)))) (defun get-cliki-source (url version) "Fetches the source text of a given version of a cliki page. That is, it @@ -119,11 +153,19 @@ ((= 200 status) page) (t (error "crap!"))))) -(defun revert-new-spam () - (let ((modified-pages (get-recent-changes))) - (loop :for page :in modified-pages - :do (parse-page page)))) +(defun attented-revert-new-spam () + (mapcar #'parse-page (get-recent-changes)) + (attendant)) + +(defun attendant () + (classify-unknown-urls) + (mark-known-goods) + (revert-spam)) + +(defun unattented-revert-new-spam () + (mapcar #'parse-page (get-recent-changes)) + (revert-spam)) -#+(or) (revert-new-spam) +#+(or) (attented-revert-new-spam) -#+(or) (loop (sleep (* 60 60)) (revert-new-spam)) +#+(or) (loop (sleep (* 60 60)) (unattented-revert-new-spam))