Split out an unattended and an attented portion --> to head
Sun Jul 3 08:26:55 UTC 2011 pix@kepibu.org
* Fix some problems discovered on fresh-load
Fri Jul 1 22:52:58 UTC 2011 pix@kepibu.org
* Expand the timing arguments run-for-a-while takes
Wed Jun 29 03:08:25 UTC 2011 pix@kepibu.org
* Wrap drakma:http-request so we can have automatic retries on timeouts
Tue Jun 28 17:03:09 UTC 2011 pix@kepibu.org
* Make #'run-for-a-while take the number of iterations as an argument
Tue Jun 28 08:05:37 UTC 2011 pix@kepibu.org
* Padding for time numbers, so we get 01 instead of 1
Tue Jun 28 08:05:09 UTC 2011 pix@kepibu.org
* Fix state-file pathname when *load-pathname* actually exists
Tue Jun 28 08:04:46 UTC 2011 pix@kepibu.org
* Consider text-decoration:none to be a spam indicator as well
Tue Jun 28 08:04:14 UTC 2011 pix@kepibu.org
* Don't consider a page updated if the last-known good is the same as the current version
Sat Jun 18 23:16:27 UTC 2011 pix@kepibu.org
* Send cliki reverter thread output to swank's standard-io (the repl in emacs)
Sat Jun 18 23:15:57 UTC 2011 pix@kepibu.org
* Add FIXME noting a behavior of cliki I wasn't expecting
Sat Jun 18 23:15:27 UTC 2011 pix@kepibu.org
* Print when we start and stop a run.
Sat Jun 18 23:14:59 UTC 2011 pix@kepibu.org
* Bugfix: if we manually classify an URL as spam, don't then mark the page that
had that URL as known-good.
Thu Jun 16 05:58:14 UTC 2011 pix@kepibu.org
* Automagical updating of last-known good if a non-spammy update was made.
Wed Jun 15 09:22:12 UTC 2011 pix@kepibu.org
* More declarative way of saying "every x minutes, give or take"
Wed Jun 15 09:21:48 UTC 2011 pix@kepibu.org
* Cache known-good copies; simplify working with rucksack
Tue Jun 14 18:17:05 UTC 2011 pix@kepibu.org
* Add ability to save state, and some more status prints
Tue Jun 14 08:42:02 UTC 2011 pix@kepibu.org
* Silence is creepy: print something when we revert a page.
Tue Jun 14 08:41:43 UTC 2011 pix@kepibu.org
* Ignore headers var.
Tue Jun 14 08:41:24 UTC 2011 pix@kepibu.org
* Spelling.
Mon Jun 13 22:58:32 UTC 2011 pix@kepibu.org
* Easier dealing with time periods.
Mon Jun 13 22:58:13 UTC 2011 pix@kepibu.org
* Move automatic URL classification into its own function.
Mon Jun 13 22:35:27 UTC 2011 pix@kepibu.org
* Split out an unattended and an attented portion
In preparation for the time I let it loose without me watching over it.
diff -rN -u old-claki/claki.asd new-claki/claki.asd
--- old-claki/claki.asd 2013-07-22 03:26:24.000000000 +0000
+++ new-claki/claki.asd 2013-07-22 03:26:24.000000000 +0000
@@ -1,3 +1,3 @@
(defsystem :claki
- :depends-on (:oh-ducks :closure-html :cl-unification :drakma :alexandria)
+ :depends-on (:oh-ducks :closure-html :cl-unification :drakma :alexandria :rucksack :local-time)
:components ((:file "claki")))
diff -rN -u old-claki/claki.lisp new-claki/claki.lisp
--- old-claki/claki.lisp 2013-07-22 03:26:24.000000000 +0000
+++ new-claki/claki.lisp 2013-07-22 03:26:24.000000000 +0000
@@ -6,13 +6,28 @@
#+(or) (clrhash *last-modified*)
+(defun http-request (&rest drakma-args)
+ "A wrapper around drakma:http-request which automatically retries the request
+in the event of a timeout."
+ (let ((times-failed 0))
+ (tagbody
+ :fetch-page
+ (handler-case (return-from http-request (apply #'drakma:http-request drakma-args))
+ (usocket:timeout-error ()
+ (sleep (* 60 (expt 2 times-failed)))
+ (incf times-failed)
+ (go :fetch-page))))))
+
(defun get-cliki-page (url)
"Returns a page from cliki if it has been modified since we last saw it or nil
if it has not been modified. Signals an error otherwise."
(multiple-value-bind (page status headers)
- (drakma:http-request (format nil "http://cliki.net/~a" url)
- :additional-headers (when (gethash url *last-modified*)
- `((:if-modified-since (gethash url *last-modified*)))))
+ (http-request (format nil "http://cliki.net/~a" url)
+ :additional-headers (when (gethash url *last-modified*)
+ `((:if-modified-since (gethash url *last-modified*)))))
+ ;; FIXME: this doesn't work all that well: it turns out cliki uses a single
+ ;; last-modified header for the entire wiki, rather than one per page, so we
+ ;; aren't saving ourselves (or cliki) much with this.
(cond
;; If the page hasn't been modified, no need to update
((= 304 status) nil)
@@ -38,6 +53,10 @@
(defvar *spam-urls* (make-hash-table :test 'equal) "hash table of spam urls.")
(defvar *okay-urls* (make-hash-table :test 'equal) "hash table of acceptable urls.")
+(defvar *urls-to-classify* (list) "list of (url-to-classify cliki-page-where-it-was-found page-version).")
+(defvar *has-spam* (list) "list of (cliki-page version) known to have spam.")
+(defvar *updated-pages* (list) "list of pages that have been updated in the format (cliki-page version).")
+(defvar *ignore-update* (list) "list of pages in the format (cliki-page version) which updates should not result in updating the last-known good version (e.g., because we did it and it was a reversion).")
#+(or) (clrhash *okay-urls*)
@@ -49,25 +68,41 @@
(match (#t(oh-ducks:html ("a[href^=http]" . ?a)
("#footer > b" . #t(list ?b))) page)
(let ((current-version (oh-ducks.traversal:element-content b)))
+ (format t "; Page ~s modified, now at version ~a.~%" page-url current-version)
+ (when (string= (gethash page-url *last-known-good*) current-version)
+ (return-from parse-page nil))
+ (pushnew (list page-url current-version) *updated-pages* :test #'equal)
(dolist (link a)
(let ((url (oh-ducks.traversal:element-attribute :href link)))
- (tagbody
- :handle-url
- (cond
- ((or (gethash url *okay-urls*)
- (gethash (url-domain url) *okay-urls*))
- #+(or) (do-nothing))
- ((or (gethash url *spam-urls*)
- (gethash (url-domain url) *spam-urls*))
- (maybe-request-last-known-good page-url)
- (return-from parse-page (revert-page page-url current-version (gethash page-url *last-known-good*))))
- (t
- (request-classification url)
- (go :handle-url))))))))))
+ (cond
+ ((or (gethash url *okay-urls*)
+ (gethash (url-domain url) *okay-urls*))
+ #+(or) (do-nothing))
+ ((or (gethash url *spam-urls*)
+ (gethash (url-domain url) *spam-urls*)
+ (auto-classify link))
+ (pushnew (list page-url current-version) *has-spam* :test #'equal))
+ (t
+ (pushnew (list url page-url current-version) *urls-to-classify* :test #'equal)))))))))
#+(or) (parse-page "araneida")
-(defun request-classification (url)
+(defun auto-classify (link)
+ "Auto-classify URLs based upon traits common to spammers."
+ (let ((rel (oh-ducks.traversal:element-attribute :rel link))
+ (style (oh-ducks.traversal:element-attribute :style link))
+ (url (oh-ducks.traversal:element-attribute :href link)))
+ (cond
+ ((and (stringp rel)
+ (or (string-equal "follow" rel)
+ (string-equal "dofollow" rel)))
+ (setf (gethash url *spam-urls*) t))
+ ((and (stringp style)
+ (cl-ppcre:scan "text-decoration[ ]*:[ ]*none" style))
+ (setf (gethash url *spam-urls*) t))
+ (t nil))))
+
+(defun request-classification (url &optional page version)
(restart-case (error 'simple-error :format-control "Please classify the URL ~s."
:format-arguments (list url))
(mark-url-okay ()
@@ -78,10 +113,38 @@
(setf (gethash (url-domain url) *okay-urls*) t))
(mark-url-spam ()
:report "Mark this URL as spam."
+ (when (and page version)
+ (pushnew (list page version) *has-spam* :test #'equal))
(setf (gethash url *spam-urls*) t))
(mark-domain-spam ()
:report "Mark the domain as spam."
- (setf (gethash (url-domain url) *spam-urls*) t))))
+ (when (and page version)
+ (pushnew (list page version) *has-spam* :test #'equal))
+ (setf (gethash (url-domain url) *spam-urls*) t))
+ (classify-later ()
+ :report "Don't classify this URL yet."
+ nil)))
+
+(defun classify-unknown-urls ()
+ (setf *urls-to-classify*
+ (loop :for (url page version) :in *urls-to-classify*
+ :unless (or (gethash url *okay-urls*)
+ (gethash (url-domain url) *okay-urls*)
+ (gethash url *spam-urls*)
+ (gethash (url-domain url) *spam-urls*)
+ (request-classification url page version))
+ :collect (list url page version))))
+
+(defun mark-known-goods ()
+ (loop :for (page-url version) :in *has-spam*
+ :do (maybe-request-last-known-good page-url)))
+
+(defun revert-spam ()
+ (setf *has-spam*
+ (loop :for (page-url version) :in *has-spam*
+ :unless (and (gethash page-url *last-known-good*)
+ (revert-page page-url version (gethash page-url *last-known-good*)))
+ :collect (list page-url version))))
(defvar *last-known-good* (make-hash-table :test 'equal) "hash table of cliki pages and the last-known \"good\" revision.")
@@ -100,30 +163,159 @@
#+(or) (maybe-request-last-known-good "araneida")
-(defun revert-page (page current-version to-version)
- (drakma:http-request (format nil "http://cliki.net/edit/~a" page)
- :method :post
- :parameters `(("version" . ,current-version)
- ("T0" . "BODY")
- ("E0" . ,(get-cliki-source page to-version))
- ("summary" . "Spam detected, reverting to Known-Good.")
- ("captcha" . "lisp")
- ("name" . "Claki (Revertobot Alpha)"))))
+(defun update-last-known-good ()
+ ;; Remove known-spam pages from list of updates
+ (loop :for (page-url version) :in *has-spam*
+ :do (removef *updated-pages* (list page-url version) :test #'equal))
+ ;; Remove updates we made from list of updates (that way, we'll continue to
+ ;; use the old cached known-good, saving a bit of strain on cliki)
+ (loop :for (page-url version) :in *ignore-update*
+ :do (removef *updated-pages* (list page-url version) :test #'equal))
+ (setf *updated-pages*
+ (loop :for (page-url version) :in *updated-pages*
+ ;; If there are unclassified urls from this page, don't mark good
+ ;; (could be spam!)
+ :if (notany (lambda (x) (and (string= page-url (second x))
+ (string= version (third x))))
+ *urls-to-classify*)
+ :do (format t "; Updating last-known good of ~s from ~a to ~a~%" page-url (gethash page-url *last-known-good*) version)
+ (setf (gethash page-url *last-known-good*) version)
+ :else
+ :collect (list page-url version))))
-(defun get-cliki-source (url version)
- "Fetches the source text of a given version of a cliki page. That is, it
-returns the text you should POST to revert a cliki page to the given version."
+(defun numstring+1 (numstring)
+ (write-to-string (1+ (parse-integer numstring))))
+
+(defun revert-page (url current-version to-version)
(multiple-value-bind (page status headers)
- (drakma:http-request (format nil "http://cliki.net/~a?source&v=~a" url version))
+ (http-request (format nil "http://cliki.net/edit/~a" url)
+ :method :post
+ :parameters `(("version" . ,current-version)
+ ("T0" . "BODY")
+ ("E0" . ,(get-cliki-source url to-version))
+ ("summary" . "Spam detected, reverting to Known-Good.")
+ ("captcha" . "lisp")
+ ("name" . "Claki (Revertobot Beta)")))
+ (declare (ignore headers))
(cond
- ((= 200 status) page)
- (t (error "crap!")))))
-
-(defun revert-new-spam ()
- (let ((modified-pages (get-recent-changes)))
- (loop :for page :in modified-pages
- :do (parse-page page))))
+ ((and (= status 200)
+ (not (search "rejected" page :test #'char-equal)))
+ (format t "; Reverted page ~s to version ~a.~%" url to-version)
+ (pushnew (list url (numstring+1 current-version)) *ignore-update* :test #'equal)
+ page)
+ (t nil))))
-#+(or) (revert-new-spam)
+(defun get-cliki-source (url version)
+ "Fetches the source text of a given version of a cliki page. That is, it
+returns the text you should POST to revert a cliki page to the given version."
+ (or (find-in-cache url version)
+ (multiple-value-bind (page status headers)
+ (http-request (format nil "http://cliki.net/~a?source&v=~a" url version))
+ (declare (ignore headers))
+ (cond
+ ((= 200 status)
+ ;; We have to coerce to a simple-string because rucksack mistakenly
+ ;; assumes non-simple strings have a fill pointer
+ (cache-known-good url version (coerce page 'simple-string))
+ page)
+ (t (error "crap!"))))))
+
+#+(or) (get-cliki-source "araneida" "281")
+
+(defun attended-revert-new-spam ()
+ (mapcar #'parse-page (get-recent-changes))
+ (attendant))
+
+(defun attendant ()
+ (classify-unknown-urls)
+ (update-last-known-good)
+ (mark-known-goods)
+ (revert-spam))
+
+(defun unattended-revert-new-spam ()
+ (mapcar #'parse-page (get-recent-changes))
+ (update-last-known-good)
+ (revert-spam))
+
+#+(or) (attended-revert-new-spam)
+#+(or) (attendant)
+
+(defun seconds (s) s)
+(defun minutes (m) (* (seconds 60) m))
+(defun hours (h) (* (minutes 60) h))
+(defun days (d) (* (hours 24) d))
+(defun plus-or-minus (x y) (+ (- x y) (random (* 2 y))))
+
+(defconstant +simple-time+ '(:year #\- (:month 2) #\- (:day 2) #\Space (:hour 2) #\: (:min 2) #\: (:sec 2)))
+(defun now () (local-time:format-timestring nil (local-time:now) :format +simple-time+))
+
+(defun run-for-a-while (how-long how-often variance)
+ (format t "; Beginning run at ~a~%" (now))
+ (dotimes (i (floor how-long how-often))
+ (sleep (plus-or-minus how-often variance))
+ (format t "; Unattended run at ~a~%" (now))
+ (unattended-revert-new-spam)
+ (save-state))
+ (format t "; Run ended at ~a~%~%" (now)))
+
+#+(or)
+ (let ((stdout *standard-output*))
+ (sb-thread:make-thread (lambda () (let ((*standard-output* stdout)) (run-for-a-while (days 2) (minutes 30) (minutes 5))))
+ :name "cliki reverter"))
+
+(defvar *state-file* (merge-pathnames #p"state/" (directory-namestring (or #.*load-pathname* #p"/home/pixel/repos/claki/"))))
+
+(defmacro with-rucksack-and-transaction ((rucksack) (&rest root-vars) &body body)
+ (with-unique-names (rest)
+ `(rucksack:with-rucksack (,rucksack *state-file*)
+ (rucksack:with-transaction ()
+ (destructuring-bind (&optional ,@root-vars &rest ,rest) (ordered-roots rs:*rucksack*)
+ (declare (ignore ,rest) ,@(when (member '_ root-vars) `((ignore _))))
+ ,@body)))))
+
+(defun ordered-roots (sack)
+ (sort (rs:rucksack-roots sack) #'< :key #'rs:object-id))
+
+(defun initialize-rucksack ()
+ (with-rucksack-and-transaction (rs) ()
+ (let ((roots (rs:rucksack-roots rs:*rucksack*)))
+ (dotimes (i (- 2 (length roots)))
+ (rs:add-rucksack-root (make-instance 'rs:btree :key< 'string<) rs:*rucksack*)))))
+
+#+(or) (initialize-rucksack)
+
+(defun save-state ()
+ (with-rucksack-and-transaction (rs) (btree)
+ (rs:btree-insert btree 'spam-urls *spam-urls*)
+ (rs:btree-insert btree 'ham-urls *okay-urls*)
+ (rs:btree-insert btree 'known-good *last-known-good*)
+ (rs:btree-insert btree 'updated-pages *updated-pages*)
+ (rs:btree-insert btree 'our-updates *ignore-update*)))
+
+#+(or) (save-state)
+
+(defun restore-state ()
+ (with-rucksack-and-transaction (rs) (btree)
+ (setf *spam-urls* (rs:btree-search btree 'spam-urls)
+ *okay-urls* (rs:btree-search btree 'ham-urls)
+ *last-known-good* (rs:btree-search btree 'known-good)
+ *updated-pages* (rs:btree-search btree 'updated-pages)
+ *ignore-update* (rs:btree-search btree 'our-updates))))
+
+#+(or) (restore-state)
+#+(or) (with-rucksack-and-transaction (rs) ()
+ (ordered-roots rs:*rucksack*))
+
+(defun cache-known-good (cliki-page version content)
+ (with-rucksack-and-transaction (rs) (_ btree)
+ (rs:btree-insert btree cliki-page (list version content))))
+
+(defun find-in-cache (cliki-page version)
+ (with-rucksack-and-transaction (rs) (_ btree)
+ (destructuring-bind (&optional cached-version cached-content)
+ (rs:btree-search btree cliki-page :default-value nil :errorp nil)
+ (and cached-version
+ (string= cached-version version)
+ cached-content))))
-#+(or) (loop (sleep (* 60 60)) (revert-new-spam))
+#+(or) (find-in-cache "araneida" "281")