Add ability to save state, and some more status prints
Annotate for file claki.lisp
2011-06-13 pix 1 (defpackage #:claki
19:08:27 ' 2 (:use #:cl #:unify :alexandria))
' 3 (in-package #:claki)
' 4
' 5 (defvar *last-modified* (make-hash-table :test 'equal) "hash table of cliki pages and the last-modified header we last received.")
' 6
' 7 #+(or) (clrhash *last-modified*)
' 8
' 9 (defun get-cliki-page (url)
' 10 "Returns a page from cliki if it has been modified since we last saw it or nil
' 11 if it has not been modified. Signals an error otherwise."
' 12 (multiple-value-bind (page status headers)
2011-06-29 pix 13 (drakma:http-request (format nil "http://cliki.net/~a" url)
03:08:25 ' 14 :additional-headers (when (gethash url *last-modified*)
' 15 `((:if-modified-since (gethash url *last-modified*)))))
2011-06-13 pix 16 (cond
19:08:27 ' 17 ;; If the page hasn't been modified, no need to update
' 18 ((= 304 status) nil)
' 19 ;; The "Recent Changes" page doesn't return 304s, but does return a last-modified.
' 20 ((and (gethash url *last-modified*)
' 21 (cdr (assoc :last-modified headers))
' 22 (string= (cdr (assoc :last-modified headers)) (gethash url *last-modified*)))
' 23 nil)
' 24 ((= 200 status)
' 25 (setf (gethash url *last-modified*) (cdr (assoc :last-modified headers)))
' 26 page)
' 27 (t (error "crap!")))))
' 28
' 29 (defun get-recent-changes ()
' 30 (when-let ((page (get-cliki-page "Recent%20Changes")))
' 31 (match (#t(oh-ducks:html ("blockquote > b > a.internal" . ?links)) page)
' 32 (loop :for link :in (remove-duplicates links :test #'equalp
' 33 :key (lambda (x) (oh-ducks.traversal:element-attribute :href x))
' 34 :from-end t)
' 35 :collect (oh-ducks.traversal:element-attribute :href link)))))
' 36
' 37 #+(or) (get-recent-changes)
' 38
' 39 (defvar *spam-urls* (make-hash-table :test 'equal) "hash table of spam urls.")
' 40 (defvar *okay-urls* (make-hash-table :test 'equal) "hash table of acceptable urls.")
2011-06-13 pix 41 (defvar *urls-to-classify* (list) "list of (url-to-classify cliki-page-where-it-was-found page-version).")
2011-06-16 pix 42 (defvar *has-spam* (list) "list of (cliki-page . version) known to have spam.")
2011-06-13 pix 43
19:08:27 ' 44 #+(or) (clrhash *okay-urls*)
' 45
' 46 (defun url-domain (url)
' 47 (puri:uri-host (puri:parse-uri url)))
' 48
' 49 (defun parse-page (page-url)
' 50 (when-let ((page (get-cliki-page page-url)))
' 51 (match (#t(oh-ducks:html ("a[href^=http]" . ?a)
' 52 ("#footer > b" . #t(list ?b))) page)
' 53 (let ((current-version (oh-ducks.traversal:element-content b)))
' 54 (dolist (link a)
2011-06-13 pix 55 (let ((url (oh-ducks.traversal:element-attribute :href link)))
2011-06-13 pix 56 (cond
22:35:27 ' 57 ((or (gethash url *okay-urls*)
' 58 (gethash (url-domain url) *okay-urls*))
' 59 #+(or) (do-nothing))
' 60 ((or (gethash url *spam-urls*)
2011-06-13 pix 61 (gethash (url-domain url) *spam-urls*)
22:58:13 ' 62 (auto-classify link))
2011-06-13 pix 63 (pushnew (list page-url current-version) *has-spam* :test #'equal))
22:35:27 ' 64 (t
' 65 (pushnew (list url page-url current-version) *urls-to-classify* :test #'equal)))))))))
2011-06-13 pix 66
19:08:27 ' 67 #+(or) (parse-page "araneida")
' 68
2011-06-13 pix 69 (defun auto-classify (link)
22:58:13 ' 70 "Auto-classify URLs based upon traits common to spammers."
' 71 (let ((rel (oh-ducks.traversal:element-attribute :rel link))
' 72 (url (oh-ducks.traversal:element-attribute :href link)))
' 73 (cond
' 74 ((and (stringp rel)
' 75 (or (string-equal "follow" rel)
' 76 (string-equal "dofollow" rel)))
' 77 (setf (gethash url *spam-urls*) t))
' 78 (t nil))))
' 79
2011-06-18 pix 80 (defun request-classification (url)
2011-06-13 pix 81 (restart-case (error 'simple-error :format-control "Please classify the URL ~s."
19:08:27 ' 82 :format-arguments (list url))
' 83 (mark-url-okay ()
' 84 :report "Mark this URL as acceptable."
' 85 (setf (gethash url *okay-urls*) t))
' 86 (mark-domain-okay ()
' 87 :report "Mark the domain as acceptable."
' 88 (setf (gethash (url-domain url) *okay-urls*) t))
' 89 (mark-url-spam ()
' 90 :report "Mark this URL as spam."
' 91 (setf (gethash url *spam-urls*) t))
' 92 (mark-domain-spam ()
' 93 :report "Mark the domain as spam."
2011-06-13 pix 94 (setf (gethash (url-domain url) *spam-urls*) t))
22:35:27 ' 95 (classify-later ()
' 96 :report "Don't classify this URL yet."
' 97 nil)))
' 98
' 99 (defun classify-unknown-urls ()
' 100 (setf *urls-to-classify*
' 101 (loop :for (url page version) :in *urls-to-classify*
' 102 :unless (or (gethash url *okay-urls*)
' 103 (gethash (url-domain url) *okay-urls*)
' 104 (gethash url *spam-urls*)
' 105 (gethash (url-domain url) *spam-urls*)
2011-06-18 pix 106 (request-classification url))
2011-06-13 pix 107 :collect (list url page version))))
22:35:27 ' 108
' 109 (defun mark-known-goods ()
' 110 (loop :for (page-url version) :in *has-spam*
' 111 :do (maybe-request-last-known-good page-url)))
' 112
' 113 (defun revert-spam ()
' 114 (setf *has-spam*
' 115 (loop :for (page-url version) :in *has-spam*
' 116 :unless (and (gethash page-url *last-known-good*)
' 117 (revert-page page-url version (gethash page-url *last-known-good*)))
' 118 :collect (list page-url version))))
2011-06-13 pix 119
19:08:27 ' 120 (defvar *last-known-good* (make-hash-table :test 'equal) "hash table of cliki pages and the last-known \"good\" revision.")
' 121
' 122 (defun read-number ()
' 123 (format t "Enter a version: ")
' 124 (list (format nil "~a" (parse-integer (read-line)))))
' 125
' 126 (defun maybe-request-last-known-good (page)
' 127 (unless (gethash page *last-known-good*)
' 128 (restart-case (error 'simple-error :format-control "Do not know of a good version of cliki page ~s."
' 129 :format-arguments (list page))
' 130 (specify-version (version)
' 131 :interactive read-number
' 132 :report "Specify a known-good version."
' 133 (setf (gethash page *last-known-good*) version)))))
' 134
' 135 #+(or) (maybe-request-last-known-good "araneida")
' 136
2011-06-13 pix 137 (defun revert-page (url current-version to-version)
22:35:27 ' 138 (multiple-value-bind (page status headers)
2011-06-29 pix 139 (drakma:http-request (format nil "http://cliki.net/edit/~a" url)
03:08:25 ' 140 :method :post
' 141 :parameters `(("version" . ,current-version)
' 142 ("T0" . "BODY")
' 143 ("E0" . ,(get-cliki-source url to-version))
' 144 ("summary" . "Spam detected, reverting to Known-Good.")
' 145 ("captcha" . "lisp")
2011-06-16 pix 146 ("name" . "Claki (Revertobot Alpha)")))
2011-06-14 pix 147 (declare (ignore headers))
2011-06-13 pix 148 (cond
22:35:27 ' 149 ((and (= status 200)
' 150 (not (search "rejected" page :test #'char-equal)))
2011-06-14 pix 151 (format t "; Reverted page ~s to version ~a.~%" url to-version)
2011-06-13 pix 152 page)
22:35:27 ' 153 (t nil))))
2011-06-13 pix 154
2011-06-14 pix 155 ;; TODO?: persist this, so we don't have to care whether cliki remembers it
2011-06-13 pix 156 (defun get-cliki-source (url version)
19:08:27 ' 157 "Fetches the source text of a given version of a cliki page. That is, it
' 158 returns the text you should POST to revert a cliki page to the given version."
2011-06-15 pix 159 (multiple-value-bind (page status headers)
09:21:48 ' 160 (drakma:http-request (format nil "http://cliki.net/~a?source&v=~a" url version))
' 161 (cond
' 162 ((= 200 status) page)
' 163 (t (error "crap!")))))
2011-06-13 pix 164
2011-06-14 pix 165 (defun attended-revert-new-spam ()
2011-06-13 pix 166 (mapcar #'parse-page (get-recent-changes))
22:35:27 ' 167 (attendant))
' 168
' 169 (defun attendant ()
' 170 (classify-unknown-urls)
' 171 (mark-known-goods)
' 172 (revert-spam))
' 173
2011-06-14 pix 174 (defun unattended-revert-new-spam ()
2011-06-13 pix 175 (mapcar #'parse-page (get-recent-changes))
22:35:27 ' 176 (revert-spam))
2011-06-13 pix 177
2011-06-14 pix 178 #+(or) (attended-revert-new-spam)
08:41:24 ' 179 #+(or) (attendant)
2011-06-13 pix 180
2011-06-13 pix 181 (defun seconds (s) s)
22:58:32 ' 182 (defun minutes (m) (* (seconds 60) m))
' 183 (defun hours (h) (* (minutes 60) h))
' 184
2011-06-14 pix 185 (defconstant +simple-time+ '(:year #\- :month #\- :day #\Space :hour #\: :min #\: :sec))
18:17:05 ' 186
' 187 (defun run-for-a-while ()
' 188 (dotimes (i 20)
' 189 (sleep (minutes (+ 25 (random 10))))
' 190 (format t "; Unattented run at ~a~%" (local-time:format-timestring nil (local-time:now) :format +simple-time+))
' 191 (unattended-revert-new-spam)
' 192 (save-state)))
' 193
' 194 #+(or) (sb-thread:make-thread #'run-for-a-while :name "cliki reverter")
' 195 ^L
' 196 (defvar *state-file* (or #.*load-pathname* #p"/home/pixel/repos/claki/state/"))
' 197
' 198 (defun save-state ()
' 199 (rucksack:with-rucksack (rs *state-file*)
' 200 (rucksack:with-transaction ()
' 201 (unless (rs:rucksack-roots rs:*rucksack*)
' 202 (rs:add-rucksack-root (make-instance 'rs:btree :key< 'string<) rs:*rucksack*))
' 203 (let ((btree (first (rs:rucksack-roots rs:*rucksack*))))
' 204 (rs:btree-insert btree 'spam-urls *spam-urls*)
' 205 (rs:btree-insert btree 'ham-urls *okay-urls*)
' 206 (rs:btree-insert btree 'known-good *last-known-good*)))))
' 207
' 208 #+(or) (save-state)
' 209
' 210 (defun restore-state ()
' 211 (rs:with-rucksack (rs *state-file*)
' 212 (rs:with-transaction ()
' 213 (let ((btree (first (rs:rucksack-roots rs:*rucksack*))))
' 214 (setf *spam-urls* (rs:btree-search btree 'spam-urls)
' 215 *okay-urls* (rs:btree-search btree 'ham-urls)
' 216 *last-known-good* (rs:btree-search btree 'known-good))))))
' 217
' 218 #+(or) (restore-state)