Add FIXME noting a behavior of cliki I wasn't expecting
Annotate for file claki.lisp
2011-06-13 pix 1 (defpackage #:claki
19:08:27 ' 2 (:use #:cl #:unify :alexandria))
' 3 (in-package #:claki)
' 4
' 5 (defvar *last-modified* (make-hash-table :test 'equal) "hash table of cliki pages and the last-modified header we last received.")
' 6
' 7 #+(or) (clrhash *last-modified*)
' 8
' 9 (defun get-cliki-page (url)
' 10 "Returns a page from cliki if it has been modified since we last saw it or nil
' 11 if it has not been modified. Signals an error otherwise."
' 12 (multiple-value-bind (page status headers)
2011-06-29 pix 13 (drakma:http-request (format nil "http://cliki.net/~a" url)
03:08:25 ' 14 :additional-headers (when (gethash url *last-modified*)
' 15 `((:if-modified-since (gethash url *last-modified*)))))
2011-06-18 pix 16 ;; FIXME: this doesn't work all that well: it turns out cliki uses a single
23:15:57 ' 17 ;; last-modified header for the entire wiki, rather than one per page, so we
' 18 ;; aren't saving ourselves (or cliki) much with this.
2011-06-13 pix 19 (cond
19:08:27 ' 20 ;; If the page hasn't been modified, no need to update
' 21 ((= 304 status) nil)
' 22 ;; The "Recent Changes" page doesn't return 304s, but does return a last-modified.
' 23 ((and (gethash url *last-modified*)
' 24 (cdr (assoc :last-modified headers))
' 25 (string= (cdr (assoc :last-modified headers)) (gethash url *last-modified*)))
' 26 nil)
' 27 ((= 200 status)
' 28 (setf (gethash url *last-modified*) (cdr (assoc :last-modified headers)))
' 29 page)
' 30 (t (error "crap!")))))
' 31
' 32 (defun get-recent-changes ()
' 33 (when-let ((page (get-cliki-page "Recent%20Changes")))
' 34 (match (#t(oh-ducks:html ("blockquote > b > a.internal" . ?links)) page)
' 35 (loop :for link :in (remove-duplicates links :test #'equalp
' 36 :key (lambda (x) (oh-ducks.traversal:element-attribute :href x))
' 37 :from-end t)
' 38 :collect (oh-ducks.traversal:element-attribute :href link)))))
' 39
' 40 #+(or) (get-recent-changes)
' 41
' 42 (defvar *spam-urls* (make-hash-table :test 'equal) "hash table of spam urls.")
' 43 (defvar *okay-urls* (make-hash-table :test 'equal) "hash table of acceptable urls.")
2011-06-13 pix 44 (defvar *urls-to-classify* (list) "list of (url-to-classify cliki-page-where-it-was-found page-version).")
2011-06-16 pix 45 (defvar *has-spam* (list) "list of (cliki-page version) known to have spam.")
05:58:14 ' 46 (defvar *updated-pages* (list) "list of pages that have been updated in the format (cliki-page version).")
' 47 (defvar *ignore-update* (list) "list of pages in the format (cliki-page version) which updates should not result in updating the last-known good version (e.g., because we did it and it was a reversion).")
2011-06-13 pix 48
19:08:27 ' 49 #+(or) (clrhash *okay-urls*)
' 50
' 51 (defun url-domain (url)
' 52 (puri:uri-host (puri:parse-uri url)))
' 53
' 54 (defun parse-page (page-url)
' 55 (when-let ((page (get-cliki-page page-url)))
' 56 (match (#t(oh-ducks:html ("a[href^=http]" . ?a)
' 57 ("#footer > b" . #t(list ?b))) page)
' 58 (let ((current-version (oh-ducks.traversal:element-content b)))
2011-06-16 pix 59 (format t "; Page ~s modified, now at version ~a.~%" page-url current-version)
05:58:14 ' 60 (pushnew (list page-url current-version) *updated-pages* :test #'equal)
2011-06-13 pix 61 (dolist (link a)
2011-06-13 pix 62 (let ((url (oh-ducks.traversal:element-attribute :href link)))
2011-06-13 pix 63 (cond
22:35:27 ' 64 ((or (gethash url *okay-urls*)
' 65 (gethash (url-domain url) *okay-urls*))
' 66 #+(or) (do-nothing))
' 67 ((or (gethash url *spam-urls*)
2011-06-13 pix 68 (gethash (url-domain url) *spam-urls*)
22:58:13 ' 69 (auto-classify link))
2011-06-13 pix 70 (pushnew (list page-url current-version) *has-spam* :test #'equal))
22:35:27 ' 71 (t
' 72 (pushnew (list url page-url current-version) *urls-to-classify* :test #'equal)))))))))
2011-06-13 pix 73
19:08:27 ' 74 #+(or) (parse-page "araneida")
' 75
2011-06-13 pix 76 (defun auto-classify (link)
22:58:13 ' 77 "Auto-classify URLs based upon traits common to spammers."
' 78 (let ((rel (oh-ducks.traversal:element-attribute :rel link))
' 79 (url (oh-ducks.traversal:element-attribute :href link)))
' 80 (cond
' 81 ((and (stringp rel)
' 82 (or (string-equal "follow" rel)
' 83 (string-equal "dofollow" rel)))
' 84 (setf (gethash url *spam-urls*) t))
' 85 (t nil))))
' 86
2011-06-18 pix 87 (defun request-classification (url &optional page version)
2011-06-13 pix 88 (restart-case (error 'simple-error :format-control "Please classify the URL ~s."
19:08:27 ' 89 :format-arguments (list url))
' 90 (mark-url-okay ()
' 91 :report "Mark this URL as acceptable."
' 92 (setf (gethash url *okay-urls*) t))
' 93 (mark-domain-okay ()
' 94 :report "Mark the domain as acceptable."
' 95 (setf (gethash (url-domain url) *okay-urls*) t))
' 96 (mark-url-spam ()
' 97 :report "Mark this URL as spam."
2011-06-18 pix 98 (when (and page version)
23:14:59 ' 99 (pushnew (list page version) *has-spam* :test #'equal))
2011-06-13 pix 100 (setf (gethash url *spam-urls*) t))
19:08:27 ' 101 (mark-domain-spam ()
' 102 :report "Mark the domain as spam."
2011-06-18 pix 103 (when (and page version)
23:14:59 ' 104 (pushnew (list page version) *has-spam* :test #'equal))
2011-06-13 pix 105 (setf (gethash (url-domain url) *spam-urls*) t))
22:35:27 ' 106 (classify-later ()
' 107 :report "Don't classify this URL yet."
' 108 nil)))
' 109
' 110 (defun classify-unknown-urls ()
' 111 (setf *urls-to-classify*
' 112 (loop :for (url page version) :in *urls-to-classify*
' 113 :unless (or (gethash url *okay-urls*)
' 114 (gethash (url-domain url) *okay-urls*)
' 115 (gethash url *spam-urls*)
' 116 (gethash (url-domain url) *spam-urls*)
2011-06-18 pix 117 (request-classification url page version))
2011-06-13 pix 118 :collect (list url page version))))
22:35:27 ' 119
' 120 (defun mark-known-goods ()
' 121 (loop :for (page-url version) :in *has-spam*
' 122 :do (maybe-request-last-known-good page-url)))
' 123
' 124 (defun revert-spam ()
' 125 (setf *has-spam*
' 126 (loop :for (page-url version) :in *has-spam*
' 127 :unless (and (gethash page-url *last-known-good*)
' 128 (revert-page page-url version (gethash page-url *last-known-good*)))
' 129 :collect (list page-url version))))
2011-06-13 pix 130
19:08:27 ' 131 (defvar *last-known-good* (make-hash-table :test 'equal) "hash table of cliki pages and the last-known \"good\" revision.")
' 132
' 133 (defun read-number ()
' 134 (format t "Enter a version: ")
' 135 (list (format nil "~a" (parse-integer (read-line)))))
' 136
' 137 (defun maybe-request-last-known-good (page)
' 138 (unless (gethash page *last-known-good*)
' 139 (restart-case (error 'simple-error :format-control "Do not know of a good version of cliki page ~s."
' 140 :format-arguments (list page))
' 141 (specify-version (version)
' 142 :interactive read-number
' 143 :report "Specify a known-good version."
' 144 (setf (gethash page *last-known-good*) version)))))
' 145
' 146 #+(or) (maybe-request-last-known-good "araneida")
' 147
2011-06-16 pix 148 (defun update-last-known-good ()
05:58:14 ' 149 ;; Remove known-spam pages from list of updates
' 150 (loop :for (page-url version) :in *has-spam*
' 151 :do (removef *updated-pages* (list page-url version) :test #'equal))
' 152 ;; Remove updates we made from list of updates (that way, we'll continue to
' 153 ;; use the old cached known-good, saving a bit of strain on cliki)
' 154 (loop :for (page-url version) :in *ignore-update*
' 155 :do (removef *updated-pages* (list page-url version) :test #'equal))
' 156 (setf *updated-pages*
' 157 (loop :for (page-url version) :in *updated-pages*
' 158 ;; If there are unclassified urls from this page, don't mark good
' 159 ;; (could be spam!)
' 160 :if (notany (lambda (x) (and (string= page-url (second x))
' 161 (string= version (third x))))
' 162 *urls-to-classify*)
' 163 :do (format t "; Updating last-known good of ~s from ~a to ~a~%" page-url (gethash page-url *last-known-good*) version)
' 164 (setf (gethash page-url *last-known-good*) version)
' 165 :else
' 166 :collect (list page-url version))))
' 167
' 168 (defun numstring+1 (numstring)
' 169 (write-to-string (1+ (parse-integer numstring))))
' 170
2011-06-13 pix 171 (defun revert-page (url current-version to-version)
22:35:27 ' 172 (multiple-value-bind (page status headers)
2011-06-29 pix 173 (drakma:http-request (format nil "http://cliki.net/edit/~a" url)
03:08:25 ' 174 :method :post
' 175 :parameters `(("version" . ,current-version)
' 176 ("T0" . "BODY")
' 177 ("E0" . ,(get-cliki-source url to-version))
' 178 ("summary" . "Spam detected, reverting to Known-Good.")
' 179 ("captcha" . "lisp")
' 180 ("name" . "Claki (Revertobot Beta)")))
2011-06-14 pix 181 (declare (ignore headers))
2011-06-13 pix 182 (cond
22:35:27 ' 183 ((and (= status 200)
' 184 (not (search "rejected" page :test #'char-equal)))
2011-06-14 pix 185 (format t "; Reverted page ~s to version ~a.~%" url to-version)
2011-06-16 pix 186 (pushnew (list url (numstring+1 current-version)) *ignore-update* :test #'equal)
2011-06-13 pix 187 page)
22:35:27 ' 188 (t nil))))
2011-06-13 pix 189
19:08:27 ' 190 (defun get-cliki-source (url version)
' 191 "Fetches the source text of a given version of a cliki page. That is, it
' 192 returns the text you should POST to revert a cliki page to the given version."
2011-06-15 pix 193 (or (find-in-cache url version)
09:21:48 ' 194 (multiple-value-bind (page status headers)
2011-06-29 pix 195 (drakma:http-request (format nil "http://cliki.net/~a?source&v=~a" url version))
2011-06-15 pix 196 (declare (ignore headers))
09:21:48 ' 197 (cond
' 198 ((= 200 status)
' 199 ;; We have to coerce to a simple-string because rucksack mistakenly
' 200 ;; assumes non-simple strings have a fill pointer
' 201 (cache-known-good url version (coerce page 'simple-string))
' 202 page)
' 203 (t (error "crap!"))))))
' 204
' 205 #+(or) (get-cliki-source "araneida" "281")
2011-06-13 pix 206
2011-06-14 pix 207 (defun attended-revert-new-spam ()
2011-06-13 pix 208 (mapcar #'parse-page (get-recent-changes))
22:35:27 ' 209 (attendant))
' 210
' 211 (defun attendant ()
' 212 (classify-unknown-urls)
2011-06-16 pix 213 (update-last-known-good)
2011-06-13 pix 214 (mark-known-goods)
22:35:27 ' 215 (revert-spam))
' 216
2011-06-14 pix 217 (defun unattended-revert-new-spam ()
2011-06-13 pix 218 (mapcar #'parse-page (get-recent-changes))
2011-06-16 pix 219 (update-last-known-good)
2011-06-13 pix 220 (revert-spam))
2011-06-13 pix 221
2011-06-14 pix 222 #+(or) (attended-revert-new-spam)
08:41:24 ' 223 #+(or) (attendant)
2011-06-13 pix 224
2011-06-13 pix 225 (defun seconds (s) s)
22:58:32 ' 226 (defun minutes (m) (* (seconds 60) m))
' 227 (defun hours (h) (* (minutes 60) h))
2011-06-15 pix 228 (defun plus-or-minus (x y) (+ (- x y) (random (* 2 y))))
2011-06-13 pix 229
2011-06-28 pix 230 (defconstant +simple-time+ '(:year #\- :month #\- :day #\Space :hour #\: :min #\: :sec))
2011-06-18 pix 231 (defun now () (local-time:format-timestring nil (local-time:now) :format +simple-time+))
2011-06-14 pix 232
2011-06-28 pix 233 (defun run-for-a-while ()
2011-06-18 pix 234 (format t "; Beginning run at ~a~%" (now))
2011-06-28 pix 235 (dotimes (i (* 2 24))
2011-07-01 pix 236 (sleep (minutes (plus-or-minus 30 5)))
2011-06-18 pix 237 (format t "; Unattended run at ~a~%" (now))
2011-06-14 pix 238 (unattended-revert-new-spam)
2011-06-18 pix 239 (save-state))
23:15:27 ' 240 (format t "; Run ended at ~a~%~%" (now)))
2011-06-14 pix 241
2011-06-18 pix 242 #+(or) (sb-thread:make-thread #'run-for-a-while :name "cliki reverter")
2011-06-14 pix 243 ^L
2011-06-28 pix 244 (defvar *state-file* (or #.*load-pathname* #p"/home/pixel/repos/claki/state/"))
2011-06-14 pix 245
2011-06-15 pix 246 (defmacro with-rucksack-and-transaction ((rucksack) (&rest root-vars) &body body)
09:21:48 ' 247 (with-unique-names (rest)
' 248 `(rucksack:with-rucksack (,rucksack *state-file*)
' 249 (rucksack:with-transaction ()
' 250 (destructuring-bind (&optional ,@root-vars &rest ,rest) (ordered-roots rs:*rucksack*)
' 251 (declare (ignore ,rest) ,@(when (member '_ root-vars) `((ignore _))))
' 252 ,@body)))))
' 253
' 254 (defun ordered-roots (sack)
' 255 (sort (rs:rucksack-roots sack) #'< :key #'rs:object-id))
' 256
' 257 (defun initialize-rucksack ()
' 258 (with-rucksack-and-transaction (rs) ()
' 259 (let ((roots (rs:rucksack-roots rs:*rucksack*)))
' 260 (dotimes (i (- 2 (length roots)))
' 261 (rs:add-rucksack-root (make-instance 'rs:btree :key< 'string<) rs:*rucksack*)))))
' 262
' 263 #+(or) (initialize-rucksack)
' 264
2011-06-14 pix 265 (defun save-state ()
2011-06-15 pix 266 (with-rucksack-and-transaction (rs) (btree)
09:21:48 ' 267 (rs:btree-insert btree 'spam-urls *spam-urls*)
' 268 (rs:btree-insert btree 'ham-urls *okay-urls*)
2011-06-16 pix 269 (rs:btree-insert btree 'known-good *last-known-good*)
05:58:14 ' 270 (rs:btree-insert btree 'updated-pages *updated-pages*)
' 271 (rs:btree-insert btree 'our-updates *ignore-update*)))
2011-06-14 pix 272
18:17:05 ' 273 #+(or) (save-state)
' 274
' 275 (defun restore-state ()
2011-06-15 pix 276 (with-rucksack-and-transaction (rs) (btree)
09:21:48 ' 277 (setf *spam-urls* (rs:btree-search btree 'spam-urls)
' 278 *okay-urls* (rs:btree-search btree 'ham-urls)
2011-06-16 pix 279 *last-known-good* (rs:btree-search btree 'known-good)
05:58:14 ' 280 *updated-pages* (rs:btree-search btree 'updated-pages)
' 281 *ignore-update* (rs:btree-search btree 'our-updates))))
2011-06-14 pix 282
18:17:05 ' 283 #+(or) (restore-state)
2011-06-15 pix 284 #+(or) (with-rucksack-and-transaction (rs) ()
09:21:48 ' 285 (ordered-roots rs:*rucksack*))
' 286
' 287 (defun cache-known-good (cliki-page version content)
' 288 (with-rucksack-and-transaction (rs) (_ btree)
' 289 (rs:btree-insert btree cliki-page (list version content))))
' 290
' 291 (defun find-in-cache (cliki-page version)
' 292 (with-rucksack-and-transaction (rs) (_ btree)
' 293 (destructuring-bind (&optional cached-version cached-content)
' 294 (rs:btree-search btree cliki-page :default-value nil :errorp nil)
' 295 (and cached-version
' 296 (string= cached-version version)
' 297 cached-content))))
' 298
' 299 #+(or) (find-in-cache "araneida" "281")