Fix some problems discovered on fresh-load
Annotate for file /claki.lisp
2011-06-13 pix 1 (defpackage #:claki
19:08:27 ' 2 (:use #:cl #:unify :alexandria))
' 3 (in-package #:claki)
' 4
' 5 (defvar *last-modified* (make-hash-table :test 'equal) "hash table of cliki pages and the last-modified header we last received.")
' 6
' 7 #+(or) (clrhash *last-modified*)
' 8
2011-06-29 pix 9 (defun http-request (&rest drakma-args)
03:08:25 ' 10 "A wrapper around drakma:http-request which automatically retries the request
' 11 in the event of a timeout."
' 12 (let ((times-failed 0))
' 13 (tagbody
' 14 :fetch-page
' 15 (handler-case (return-from http-request (apply #'drakma:http-request drakma-args))
' 16 (usocket:timeout-error ()
' 17 (sleep (* 60 (expt 2 times-failed)))
' 18 (incf times-failed)
' 19 (go :fetch-page))))))
' 20
2011-06-13 pix 21 (defun get-cliki-page (url)
19:08:27 ' 22 "Returns a page from cliki if it has been modified since we last saw it or nil
' 23 if it has not been modified. Signals an error otherwise."
' 24 (multiple-value-bind (page status headers)
2011-06-29 pix 25 (http-request (format nil "http://cliki.net/~a" url)
03:08:25 ' 26 :additional-headers (when (gethash url *last-modified*)
' 27 `((:if-modified-since (gethash url *last-modified*)))))
2011-06-18 pix 28 ;; FIXME: this doesn't work all that well: it turns out cliki uses a single
23:15:57 ' 29 ;; last-modified header for the entire wiki, rather than one per page, so we
' 30 ;; aren't saving ourselves (or cliki) much with this.
2011-06-13 pix 31 (cond
19:08:27 ' 32 ;; If the page hasn't been modified, no need to update
' 33 ((= 304 status) nil)
' 34 ;; The "Recent Changes" page doesn't return 304s, but does return a last-modified.
' 35 ((and (gethash url *last-modified*)
' 36 (cdr (assoc :last-modified headers))
' 37 (string= (cdr (assoc :last-modified headers)) (gethash url *last-modified*)))
' 38 nil)
' 39 ((= 200 status)
' 40 (setf (gethash url *last-modified*) (cdr (assoc :last-modified headers)))
' 41 page)
' 42 (t (error "crap!")))))
' 43
' 44 (defun get-recent-changes ()
' 45 (when-let ((page (get-cliki-page "Recent%20Changes")))
' 46 (match (#t(oh-ducks:html ("blockquote > b > a.internal" . ?links)) page)
' 47 (loop :for link :in (remove-duplicates links :test #'equalp
' 48 :key (lambda (x) (oh-ducks.traversal:element-attribute :href x))
' 49 :from-end t)
' 50 :collect (oh-ducks.traversal:element-attribute :href link)))))
' 51
' 52 #+(or) (get-recent-changes)
' 53
' 54 (defvar *spam-urls* (make-hash-table :test 'equal) "hash table of spam urls.")
' 55 (defvar *okay-urls* (make-hash-table :test 'equal) "hash table of acceptable urls.")
2011-06-13 pix 56 (defvar *urls-to-classify* (list) "list of (url-to-classify cliki-page-where-it-was-found page-version).")
2011-06-16 pix 57 (defvar *has-spam* (list) "list of (cliki-page version) known to have spam.")
05:58:14 ' 58 (defvar *updated-pages* (list) "list of pages that have been updated in the format (cliki-page version).")
' 59 (defvar *ignore-update* (list) "list of pages in the format (cliki-page version) which updates should not result in updating the last-known good version (e.g., because we did it and it was a reversion).")
2011-06-13 pix 60
19:08:27 ' 61 #+(or) (clrhash *okay-urls*)
' 62
' 63 (defun url-domain (url)
' 64 (puri:uri-host (puri:parse-uri url)))
' 65
' 66 (defun parse-page (page-url)
' 67 (when-let ((page (get-cliki-page page-url)))
' 68 (match (#t(oh-ducks:html ("a[href^=http]" . ?a)
' 69 ("#footer > b" . #t(list ?b))) page)
' 70 (let ((current-version (oh-ducks.traversal:element-content b)))
2011-06-16 pix 71 (format t "; Page ~s modified, now at version ~a.~%" page-url current-version)
2011-06-28 pix 72 (when (string= (gethash page-url *last-known-good*) current-version)
08:04:14 ' 73 (return-from parse-page nil))
2011-06-16 pix 74 (pushnew (list page-url current-version) *updated-pages* :test #'equal)
2011-06-13 pix 75 (dolist (link a)
2011-06-13 pix 76 (let ((url (oh-ducks.traversal:element-attribute :href link)))
2011-06-13 pix 77 (cond
22:35:27 ' 78 ((or (gethash url *okay-urls*)
' 79 (gethash (url-domain url) *okay-urls*))
' 80 #+(or) (do-nothing))
' 81 ((or (gethash url *spam-urls*)
2011-06-13 pix 82 (gethash (url-domain url) *spam-urls*)
22:58:13 ' 83 (auto-classify link))
2011-06-13 pix 84 (pushnew (list page-url current-version) *has-spam* :test #'equal))
22:35:27 ' 85 (t
' 86 (pushnew (list url page-url current-version) *urls-to-classify* :test #'equal)))))))))
2011-06-13 pix 87
19:08:27 ' 88 #+(or) (parse-page "araneida")
' 89
2011-06-13 pix 90 (defun auto-classify (link)
22:58:13 ' 91 "Auto-classify URLs based upon traits common to spammers."
' 92 (let ((rel (oh-ducks.traversal:element-attribute :rel link))
2011-06-28 pix 93 (style (oh-ducks.traversal:element-attribute :style link))
2011-06-13 pix 94 (url (oh-ducks.traversal:element-attribute :href link)))
22:58:13 ' 95 (cond
' 96 ((and (stringp rel)
' 97 (or (string-equal "follow" rel)
' 98 (string-equal "dofollow" rel)))
' 99 (setf (gethash url *spam-urls*) t))
2011-06-28 pix 100 ((and (stringp style)
08:04:46 ' 101 (cl-ppcre:scan "text-decoration[ ]*:[ ]*none" style))
' 102 (setf (gethash url *spam-urls*) t))
2011-06-13 pix 103 (t nil))))
22:58:13 ' 104
2011-06-18 pix 105 (defun request-classification (url &optional page version)
2011-06-13 pix 106 (restart-case (error 'simple-error :format-control "Please classify the URL ~s."
19:08:27 ' 107 :format-arguments (list url))
' 108 (mark-url-okay ()
' 109 :report "Mark this URL as acceptable."
' 110 (setf (gethash url *okay-urls*) t))
' 111 (mark-domain-okay ()
' 112 :report "Mark the domain as acceptable."
' 113 (setf (gethash (url-domain url) *okay-urls*) t))
' 114 (mark-url-spam ()
' 115 :report "Mark this URL as spam."
2011-06-18 pix 116 (when (and page version)
23:14:59 ' 117 (pushnew (list page version) *has-spam* :test #'equal))
2011-06-13 pix 118 (setf (gethash url *spam-urls*) t))
19:08:27 ' 119 (mark-domain-spam ()
' 120 :report "Mark the domain as spam."
2011-06-18 pix 121 (when (and page version)
23:14:59 ' 122 (pushnew (list page version) *has-spam* :test #'equal))
2011-06-13 pix 123 (setf (gethash (url-domain url) *spam-urls*) t))
22:35:27 ' 124 (classify-later ()
' 125 :report "Don't classify this URL yet."
' 126 nil)))
' 127
' 128 (defun classify-unknown-urls ()
' 129 (setf *urls-to-classify*
' 130 (loop :for (url page version) :in *urls-to-classify*
' 131 :unless (or (gethash url *okay-urls*)
' 132 (gethash (url-domain url) *okay-urls*)
' 133 (gethash url *spam-urls*)
' 134 (gethash (url-domain url) *spam-urls*)
2011-06-18 pix 135 (request-classification url page version))
2011-06-13 pix 136 :collect (list url page version))))
22:35:27 ' 137
' 138 (defun mark-known-goods ()
' 139 (loop :for (page-url version) :in *has-spam*
' 140 :do (maybe-request-last-known-good page-url)))
' 141
' 142 (defun revert-spam ()
' 143 (setf *has-spam*
' 144 (loop :for (page-url version) :in *has-spam*
' 145 :unless (and (gethash page-url *last-known-good*)
' 146 (revert-page page-url version (gethash page-url *last-known-good*)))
' 147 :collect (list page-url version))))
2011-06-13 pix 148
19:08:27 ' 149 (defvar *last-known-good* (make-hash-table :test 'equal) "hash table of cliki pages and the last-known \"good\" revision.")
' 150
' 151 (defun read-number ()
' 152 (format t "Enter a version: ")
' 153 (list (format nil "~a" (parse-integer (read-line)))))
' 154
' 155 (defun maybe-request-last-known-good (page)
' 156 (unless (gethash page *last-known-good*)
' 157 (restart-case (error 'simple-error :format-control "Do not know of a good version of cliki page ~s."
' 158 :format-arguments (list page))
' 159 (specify-version (version)
' 160 :interactive read-number
' 161 :report "Specify a known-good version."
' 162 (setf (gethash page *last-known-good*) version)))))
' 163
' 164 #+(or) (maybe-request-last-known-good "araneida")
' 165
2011-06-16 pix 166 (defun update-last-known-good ()
05:58:14 ' 167 ;; Remove known-spam pages from list of updates
' 168 (loop :for (page-url version) :in *has-spam*
' 169 :do (removef *updated-pages* (list page-url version) :test #'equal))
' 170 ;; Remove updates we made from list of updates (that way, we'll continue to
' 171 ;; use the old cached known-good, saving a bit of strain on cliki)
' 172 (loop :for (page-url version) :in *ignore-update*
' 173 :do (removef *updated-pages* (list page-url version) :test #'equal))
' 174 (setf *updated-pages*
' 175 (loop :for (page-url version) :in *updated-pages*
' 176 ;; If there are unclassified urls from this page, don't mark good
' 177 ;; (could be spam!)
' 178 :if (notany (lambda (x) (and (string= page-url (second x))
' 179 (string= version (third x))))
' 180 *urls-to-classify*)
' 181 :do (format t "; Updating last-known good of ~s from ~a to ~a~%" page-url (gethash page-url *last-known-good*) version)
' 182 (setf (gethash page-url *last-known-good*) version)
' 183 :else
' 184 :collect (list page-url version))))
' 185
' 186 (defun numstring+1 (numstring)
' 187 (write-to-string (1+ (parse-integer numstring))))
' 188
2011-06-13 pix 189 (defun revert-page (url current-version to-version)
22:35:27 ' 190 (multiple-value-bind (page status headers)
2011-06-29 pix 191 (http-request (format nil "http://cliki.net/edit/~a" url)
03:08:25 ' 192 :method :post
' 193 :parameters `(("version" . ,current-version)
' 194 ("T0" . "BODY")
' 195 ("E0" . ,(get-cliki-source url to-version))
' 196 ("summary" . "Spam detected, reverting to Known-Good.")
' 197 ("captcha" . "lisp")
' 198 ("name" . "Claki (Revertobot Beta)")))
2011-06-14 pix 199 (declare (ignore headers))
2011-06-13 pix 200 (cond
22:35:27 ' 201 ((and (= status 200)
' 202 (not (search "rejected" page :test #'char-equal)))
2011-06-14 pix 203 (format t "; Reverted page ~s to version ~a.~%" url to-version)
2011-06-16 pix 204 (pushnew (list url (numstring+1 current-version)) *ignore-update* :test #'equal)
2011-06-13 pix 205 page)
22:35:27 ' 206 (t nil))))
2011-06-13 pix 207
19:08:27 ' 208 (defun get-cliki-source (url version)
' 209 "Fetches the source text of a given version of a cliki page. That is, it
' 210 returns the text you should POST to revert a cliki page to the given version."
2011-06-15 pix 211 (or (find-in-cache url version)
09:21:48 ' 212 (multiple-value-bind (page status headers)
2011-06-29 pix 213 (http-request (format nil "http://cliki.net/~a?source&v=~a" url version))
2011-06-15 pix 214 (declare (ignore headers))
09:21:48 ' 215 (cond
' 216 ((= 200 status)
' 217 ;; We have to coerce to a simple-string because rucksack mistakenly
' 218 ;; assumes non-simple strings have a fill pointer
' 219 (cache-known-good url version (coerce page 'simple-string))
' 220 page)
' 221 (t (error "crap!"))))))
' 222
' 223 #+(or) (get-cliki-source "araneida" "281")
2011-06-13 pix 224
2011-06-14 pix 225 (defun attended-revert-new-spam ()
2011-06-13 pix 226 (mapcar #'parse-page (get-recent-changes))
22:35:27 ' 227 (attendant))
' 228
' 229 (defun attendant ()
' 230 (classify-unknown-urls)
2011-06-16 pix 231 (update-last-known-good)
2011-06-13 pix 232 (mark-known-goods)
22:35:27 ' 233 (revert-spam))
' 234
2011-06-14 pix 235 (defun unattended-revert-new-spam ()
2011-06-13 pix 236 (mapcar #'parse-page (get-recent-changes))
2011-06-16 pix 237 (update-last-known-good)
2011-06-13 pix 238 (revert-spam))
2011-06-13 pix 239
2011-06-14 pix 240 #+(or) (attended-revert-new-spam)
08:41:24 ' 241 #+(or) (attendant)
2011-06-13 pix 242
2011-06-13 pix 243 (defun seconds (s) s)
22:58:32 ' 244 (defun minutes (m) (* (seconds 60) m))
' 245 (defun hours (h) (* (minutes 60) h))
2011-07-01 pix 246 (defun days (d) (* (hours 24) d))
2011-06-15 pix 247 (defun plus-or-minus (x y) (+ (- x y) (random (* 2 y))))
2011-06-13 pix 248
2011-06-28 pix 249 (defconstant +simple-time+ '(:year #\- (:month 2) #\- (:day 2) #\Space (:hour 2) #\: (:min 2) #\: (:sec 2)))
2011-06-18 pix 250 (defun now () (local-time:format-timestring nil (local-time:now) :format +simple-time+))
2011-06-14 pix 251
2011-07-01 pix 252 (defun run-for-a-while (how-long how-often variance)
2011-06-18 pix 253 (format t "; Beginning run at ~a~%" (now))
2011-07-01 pix 254 (dotimes (i (floor how-long how-often))
22:52:58 ' 255 (sleep (plus-or-minus how-often variance))
2011-06-18 pix 256 (format t "; Unattended run at ~a~%" (now))
2011-06-14 pix 257 (unattended-revert-new-spam)
2011-06-18 pix 258 (save-state))
23:15:27 ' 259 (format t "; Run ended at ~a~%~%" (now)))
2011-06-14 pix 260
2011-06-18 pix 261 #+(or)
23:16:27 ' 262 (let ((stdout *standard-output*))
2011-07-01 pix 263 (sb-thread:make-thread (lambda () (let ((*standard-output* stdout)) (run-for-a-while (days 2) (minutes 30) (minutes 5))))
2011-06-18 pix 264 :name "cliki reverter"))
2011-06-14 pix 265 ^L
2011-07-03 pix 266 (defvar *state-file* (merge-pathnames #p"state/" (directory-namestring (or #.*load-pathname* #p"/home/pixel/repos/claki/"))))
2011-06-14 pix 267
2011-06-15 pix 268 (defmacro with-rucksack-and-transaction ((rucksack) (&rest root-vars) &body body)
09:21:48 ' 269 (with-unique-names (rest)
' 270 `(rucksack:with-rucksack (,rucksack *state-file*)
' 271 (rucksack:with-transaction ()
' 272 (destructuring-bind (&optional ,@root-vars &rest ,rest) (ordered-roots rs:*rucksack*)
' 273 (declare (ignore ,rest) ,@(when (member '_ root-vars) `((ignore _))))
' 274 ,@body)))))
' 275
' 276 (defun ordered-roots (sack)
' 277 (sort (rs:rucksack-roots sack) #'< :key #'rs:object-id))
' 278
' 279 (defun initialize-rucksack ()
' 280 (with-rucksack-and-transaction (rs) ()
' 281 (let ((roots (rs:rucksack-roots rs:*rucksack*)))
' 282 (dotimes (i (- 2 (length roots)))
' 283 (rs:add-rucksack-root (make-instance 'rs:btree :key< 'string<) rs:*rucksack*)))))
' 284
' 285 #+(or) (initialize-rucksack)
' 286
2011-06-14 pix 287 (defun save-state ()
2011-06-15 pix 288 (with-rucksack-and-transaction (rs) (btree)
09:21:48 ' 289 (rs:btree-insert btree 'spam-urls *spam-urls*)
' 290 (rs:btree-insert btree 'ham-urls *okay-urls*)
2011-06-16 pix 291 (rs:btree-insert btree 'known-good *last-known-good*)
05:58:14 ' 292 (rs:btree-insert btree 'updated-pages *updated-pages*)
' 293 (rs:btree-insert btree 'our-updates *ignore-update*)))
2011-06-14 pix 294
18:17:05 ' 295 #+(or) (save-state)
' 296
' 297 (defun restore-state ()
2011-06-15 pix 298 (with-rucksack-and-transaction (rs) (btree)
09:21:48 ' 299 (setf *spam-urls* (rs:btree-search btree 'spam-urls)
' 300 *okay-urls* (rs:btree-search btree 'ham-urls)
2011-06-16 pix 301 *last-known-good* (rs:btree-search btree 'known-good)
05:58:14 ' 302 *updated-pages* (rs:btree-search btree 'updated-pages)
' 303 *ignore-update* (rs:btree-search btree 'our-updates))))
2011-06-14 pix 304
18:17:05 ' 305 #+(or) (restore-state)
2011-06-15 pix 306 #+(or) (with-rucksack-and-transaction (rs) ()
09:21:48 ' 307 (ordered-roots rs:*rucksack*))
' 308
' 309 (defun cache-known-good (cliki-page version content)
' 310 (with-rucksack-and-transaction (rs) (_ btree)
' 311 (rs:btree-insert btree cliki-page (list version content))))
' 312
' 313 (defun find-in-cache (cliki-page version)
' 314 (with-rucksack-and-transaction (rs) (_ btree)
' 315 (destructuring-bind (&optional cached-version cached-content)
' 316 (rs:btree-search btree cliki-page :default-value nil :errorp nil)
' 317 (and cached-version
' 318 (string= cached-version version)
' 319 cached-content))))
' 320
' 321 #+(or) (find-in-cache "araneida" "281")