Bugfix: if we manually classify an URL as spam, don't then mark the page that
Annotate for file claki.lisp
2011-06-13 pix 1 (defpackage #:claki
19:08:27 ' 2 (:use #:cl #:unify :alexandria))
' 3 (in-package #:claki)
' 4
' 5 (defvar *last-modified* (make-hash-table :test 'equal) "hash table of cliki pages and the last-modified header we last received.")
' 6
' 7 #+(or) (clrhash *last-modified*)
' 8
' 9 (defun get-cliki-page (url)
' 10 "Returns a page from cliki if it has been modified since we last saw it or nil
' 11 if it has not been modified. Signals an error otherwise."
' 12 (multiple-value-bind (page status headers)
2011-06-29 pix 13 (drakma:http-request (format nil "http://cliki.net/~a" url)
03:08:25 ' 14 :additional-headers (when (gethash url *last-modified*)
' 15 `((:if-modified-since (gethash url *last-modified*)))))
2011-06-13 pix 16 (cond
19:08:27 ' 17 ;; If the page hasn't been modified, no need to update
' 18 ((= 304 status) nil)
' 19 ;; The "Recent Changes" page doesn't return 304s, but does return a last-modified.
' 20 ((and (gethash url *last-modified*)
' 21 (cdr (assoc :last-modified headers))
' 22 (string= (cdr (assoc :last-modified headers)) (gethash url *last-modified*)))
' 23 nil)
' 24 ((= 200 status)
' 25 (setf (gethash url *last-modified*) (cdr (assoc :last-modified headers)))
' 26 page)
' 27 (t (error "crap!")))))
' 28
' 29 (defun get-recent-changes ()
' 30 (when-let ((page (get-cliki-page "Recent%20Changes")))
' 31 (match (#t(oh-ducks:html ("blockquote > b > a.internal" . ?links)) page)
' 32 (loop :for link :in (remove-duplicates links :test #'equalp
' 33 :key (lambda (x) (oh-ducks.traversal:element-attribute :href x))
' 34 :from-end t)
' 35 :collect (oh-ducks.traversal:element-attribute :href link)))))
' 36
' 37 #+(or) (get-recent-changes)
' 38
' 39 (defvar *spam-urls* (make-hash-table :test 'equal) "hash table of spam urls.")
' 40 (defvar *okay-urls* (make-hash-table :test 'equal) "hash table of acceptable urls.")
2011-06-13 pix 41 (defvar *urls-to-classify* (list) "list of (url-to-classify cliki-page-where-it-was-found page-version).")
2011-06-16 pix 42 (defvar *has-spam* (list) "list of (cliki-page version) known to have spam.")
05:58:14 ' 43 (defvar *updated-pages* (list) "list of pages that have been updated in the format (cliki-page version).")
' 44 (defvar *ignore-update* (list) "list of pages in the format (cliki-page version) which updates should not result in updating the last-known good version (e.g., because we did it and it was a reversion).")
2011-06-13 pix 45
19:08:27 ' 46 #+(or) (clrhash *okay-urls*)
' 47
' 48 (defun url-domain (url)
' 49 (puri:uri-host (puri:parse-uri url)))
' 50
' 51 (defun parse-page (page-url)
' 52 (when-let ((page (get-cliki-page page-url)))
' 53 (match (#t(oh-ducks:html ("a[href^=http]" . ?a)
' 54 ("#footer > b" . #t(list ?b))) page)
' 55 (let ((current-version (oh-ducks.traversal:element-content b)))
2011-06-16 pix 56 (format t "; Page ~s modified, now at version ~a.~%" page-url current-version)
05:58:14 ' 57 (pushnew (list page-url current-version) *updated-pages* :test #'equal)
2011-06-13 pix 58 (dolist (link a)
2011-06-13 pix 59 (let ((url (oh-ducks.traversal:element-attribute :href link)))
2011-06-13 pix 60 (cond
22:35:27 ' 61 ((or (gethash url *okay-urls*)
' 62 (gethash (url-domain url) *okay-urls*))
' 63 #+(or) (do-nothing))
' 64 ((or (gethash url *spam-urls*)
2011-06-13 pix 65 (gethash (url-domain url) *spam-urls*)
22:58:13 ' 66 (auto-classify link))
2011-06-13 pix 67 (pushnew (list page-url current-version) *has-spam* :test #'equal))
22:35:27 ' 68 (t
' 69 (pushnew (list url page-url current-version) *urls-to-classify* :test #'equal)))))))))
2011-06-13 pix 70
19:08:27 ' 71 #+(or) (parse-page "araneida")
' 72
2011-06-13 pix 73 (defun auto-classify (link)
22:58:13 ' 74 "Auto-classify URLs based upon traits common to spammers."
' 75 (let ((rel (oh-ducks.traversal:element-attribute :rel link))
' 76 (url (oh-ducks.traversal:element-attribute :href link)))
' 77 (cond
' 78 ((and (stringp rel)
' 79 (or (string-equal "follow" rel)
' 80 (string-equal "dofollow" rel)))
' 81 (setf (gethash url *spam-urls*) t))
' 82 (t nil))))
' 83
2011-06-18 pix 84 (defun request-classification (url &optional page version)
2011-06-13 pix 85 (restart-case (error 'simple-error :format-control "Please classify the URL ~s."
19:08:27 ' 86 :format-arguments (list url))
' 87 (mark-url-okay ()
' 88 :report "Mark this URL as acceptable."
' 89 (setf (gethash url *okay-urls*) t))
' 90 (mark-domain-okay ()
' 91 :report "Mark the domain as acceptable."
' 92 (setf (gethash (url-domain url) *okay-urls*) t))
' 93 (mark-url-spam ()
' 94 :report "Mark this URL as spam."
2011-06-18 pix 95 (when (and page version)
23:14:59 ' 96 (pushnew (list page version) *has-spam* :test #'equal))
2011-06-13 pix 97 (setf (gethash url *spam-urls*) t))
19:08:27 ' 98 (mark-domain-spam ()
' 99 :report "Mark the domain as spam."
2011-06-18 pix 100 (when (and page version)
23:14:59 ' 101 (pushnew (list page version) *has-spam* :test #'equal))
2011-06-13 pix 102 (setf (gethash (url-domain url) *spam-urls*) t))
22:35:27 ' 103 (classify-later ()
' 104 :report "Don't classify this URL yet."
' 105 nil)))
' 106
' 107 (defun classify-unknown-urls ()
' 108 (setf *urls-to-classify*
' 109 (loop :for (url page version) :in *urls-to-classify*
' 110 :unless (or (gethash url *okay-urls*)
' 111 (gethash (url-domain url) *okay-urls*)
' 112 (gethash url *spam-urls*)
' 113 (gethash (url-domain url) *spam-urls*)
2011-06-18 pix 114 (request-classification url page version))
2011-06-13 pix 115 :collect (list url page version))))
22:35:27 ' 116
' 117 (defun mark-known-goods ()
' 118 (loop :for (page-url version) :in *has-spam*
' 119 :do (maybe-request-last-known-good page-url)))
' 120
' 121 (defun revert-spam ()
' 122 (setf *has-spam*
' 123 (loop :for (page-url version) :in *has-spam*
' 124 :unless (and (gethash page-url *last-known-good*)
' 125 (revert-page page-url version (gethash page-url *last-known-good*)))
' 126 :collect (list page-url version))))
2011-06-13 pix 127
19:08:27 ' 128 (defvar *last-known-good* (make-hash-table :test 'equal) "hash table of cliki pages and the last-known \"good\" revision.")
' 129
' 130 (defun read-number ()
' 131 (format t "Enter a version: ")
' 132 (list (format nil "~a" (parse-integer (read-line)))))
' 133
' 134 (defun maybe-request-last-known-good (page)
' 135 (unless (gethash page *last-known-good*)
' 136 (restart-case (error 'simple-error :format-control "Do not know of a good version of cliki page ~s."
' 137 :format-arguments (list page))
' 138 (specify-version (version)
' 139 :interactive read-number
' 140 :report "Specify a known-good version."
' 141 (setf (gethash page *last-known-good*) version)))))
' 142
' 143 #+(or) (maybe-request-last-known-good "araneida")
' 144
2011-06-16 pix 145 (defun update-last-known-good ()
05:58:14 ' 146 ;; Remove known-spam pages from list of updates
' 147 (loop :for (page-url version) :in *has-spam*
' 148 :do (removef *updated-pages* (list page-url version) :test #'equal))
' 149 ;; Remove updates we made from list of updates (that way, we'll continue to
' 150 ;; use the old cached known-good, saving a bit of strain on cliki)
' 151 (loop :for (page-url version) :in *ignore-update*
' 152 :do (removef *updated-pages* (list page-url version) :test #'equal))
' 153 (setf *updated-pages*
' 154 (loop :for (page-url version) :in *updated-pages*
' 155 ;; If there are unclassified urls from this page, don't mark good
' 156 ;; (could be spam!)
' 157 :if (notany (lambda (x) (and (string= page-url (second x))
' 158 (string= version (third x))))
' 159 *urls-to-classify*)
' 160 :do (format t "; Updating last-known good of ~s from ~a to ~a~%" page-url (gethash page-url *last-known-good*) version)
' 161 (setf (gethash page-url *last-known-good*) version)
' 162 :else
' 163 :collect (list page-url version))))
' 164
' 165 (defun numstring+1 (numstring)
' 166 (write-to-string (1+ (parse-integer numstring))))
' 167
2011-06-13 pix 168 (defun revert-page (url current-version to-version)
22:35:27 ' 169 (multiple-value-bind (page status headers)
2011-06-29 pix 170 (drakma:http-request (format nil "http://cliki.net/edit/~a" url)
03:08:25 ' 171 :method :post
' 172 :parameters `(("version" . ,current-version)
' 173 ("T0" . "BODY")
' 174 ("E0" . ,(get-cliki-source url to-version))
' 175 ("summary" . "Spam detected, reverting to Known-Good.")
' 176 ("captcha" . "lisp")
' 177 ("name" . "Claki (Revertobot Beta)")))
2011-06-14 pix 178 (declare (ignore headers))
2011-06-13 pix 179 (cond
22:35:27 ' 180 ((and (= status 200)
' 181 (not (search "rejected" page :test #'char-equal)))
2011-06-14 pix 182 (format t "; Reverted page ~s to version ~a.~%" url to-version)
2011-06-16 pix 183 (pushnew (list url (numstring+1 current-version)) *ignore-update* :test #'equal)
2011-06-13 pix 184 page)
22:35:27 ' 185 (t nil))))
2011-06-13 pix 186
19:08:27 ' 187 (defun get-cliki-source (url version)
' 188 "Fetches the source text of a given version of a cliki page. That is, it
' 189 returns the text you should POST to revert a cliki page to the given version."
2011-06-15 pix 190 (or (find-in-cache url version)
09:21:48 ' 191 (multiple-value-bind (page status headers)
2011-06-29 pix 192 (drakma:http-request (format nil "http://cliki.net/~a?source&v=~a" url version))
2011-06-15 pix 193 (declare (ignore headers))
09:21:48 ' 194 (cond
' 195 ((= 200 status)
' 196 ;; We have to coerce to a simple-string because rucksack mistakenly
' 197 ;; assumes non-simple strings have a fill pointer
' 198 (cache-known-good url version (coerce page 'simple-string))
' 199 page)
' 200 (t (error "crap!"))))))
' 201
' 202 #+(or) (get-cliki-source "araneida" "281")
2011-06-13 pix 203
2011-06-14 pix 204 (defun attended-revert-new-spam ()
2011-06-13 pix 205 (mapcar #'parse-page (get-recent-changes))
22:35:27 ' 206 (attendant))
' 207
' 208 (defun attendant ()
' 209 (classify-unknown-urls)
2011-06-16 pix 210 (update-last-known-good)
2011-06-13 pix 211 (mark-known-goods)
22:35:27 ' 212 (revert-spam))
' 213
2011-06-14 pix 214 (defun unattended-revert-new-spam ()
2011-06-13 pix 215 (mapcar #'parse-page (get-recent-changes))
2011-06-16 pix 216 (update-last-known-good)
2011-06-13 pix 217 (revert-spam))
2011-06-13 pix 218
2011-06-14 pix 219 #+(or) (attended-revert-new-spam)
08:41:24 ' 220 #+(or) (attendant)
2011-06-13 pix 221
2011-06-13 pix 222 (defun seconds (s) s)
22:58:32 ' 223 (defun minutes (m) (* (seconds 60) m))
' 224 (defun hours (h) (* (minutes 60) h))
2011-06-15 pix 225 (defun plus-or-minus (x y) (+ (- x y) (random (* 2 y))))
2011-06-13 pix 226
2011-06-28 pix 227 (defconstant +simple-time+ '(:year #\- :month #\- :day #\Space :hour #\: :min #\: :sec))
2011-06-14 pix 228
2011-06-28 pix 229 (defun run-for-a-while ()
2011-06-18 pix 230 (dotimes (i 20)
2011-07-01 pix 231 (sleep (minutes (plus-or-minus 30 5)))
2011-06-18 pix 232 (format t "; Unattented run at ~a~%" (local-time:format-timestring nil (local-time:now) :format +simple-time+))
2011-06-14 pix 233 (unattended-revert-new-spam)
2011-06-18 pix 234 (save-state)))
2011-06-14 pix 235
2011-06-18 pix 236 #+(or) (sb-thread:make-thread #'run-for-a-while :name "cliki reverter")
2011-06-14 pix 237 ^L
2011-06-28 pix 238 (defvar *state-file* (or #.*load-pathname* #p"/home/pixel/repos/claki/state/"))
2011-06-14 pix 239
2011-06-15 pix 240 (defmacro with-rucksack-and-transaction ((rucksack) (&rest root-vars) &body body)
09:21:48 ' 241 (with-unique-names (rest)
' 242 `(rucksack:with-rucksack (,rucksack *state-file*)
' 243 (rucksack:with-transaction ()
' 244 (destructuring-bind (&optional ,@root-vars &rest ,rest) (ordered-roots rs:*rucksack*)
' 245 (declare (ignore ,rest) ,@(when (member '_ root-vars) `((ignore _))))
' 246 ,@body)))))
' 247
' 248 (defun ordered-roots (sack)
' 249 (sort (rs:rucksack-roots sack) #'< :key #'rs:object-id))
' 250
' 251 (defun initialize-rucksack ()
' 252 (with-rucksack-and-transaction (rs) ()
' 253 (let ((roots (rs:rucksack-roots rs:*rucksack*)))
' 254 (dotimes (i (- 2 (length roots)))
' 255 (rs:add-rucksack-root (make-instance 'rs:btree :key< 'string<) rs:*rucksack*)))))
' 256
' 257 #+(or) (initialize-rucksack)
' 258
2011-06-14 pix 259 (defun save-state ()
2011-06-15 pix 260 (with-rucksack-and-transaction (rs) (btree)
09:21:48 ' 261 (rs:btree-insert btree 'spam-urls *spam-urls*)
' 262 (rs:btree-insert btree 'ham-urls *okay-urls*)
2011-06-16 pix 263 (rs:btree-insert btree 'known-good *last-known-good*)
05:58:14 ' 264 (rs:btree-insert btree 'updated-pages *updated-pages*)
' 265 (rs:btree-insert btree 'our-updates *ignore-update*)))
2011-06-14 pix 266
18:17:05 ' 267 #+(or) (save-state)
' 268
' 269 (defun restore-state ()
2011-06-15 pix 270 (with-rucksack-and-transaction (rs) (btree)
09:21:48 ' 271 (setf *spam-urls* (rs:btree-search btree 'spam-urls)
' 272 *okay-urls* (rs:btree-search btree 'ham-urls)
2011-06-16 pix 273 *last-known-good* (rs:btree-search btree 'known-good)
05:58:14 ' 274 *updated-pages* (rs:btree-search btree 'updated-pages)
' 275 *ignore-update* (rs:btree-search btree 'our-updates))))
2011-06-14 pix 276
18:17:05 ' 277 #+(or) (restore-state)
2011-06-15 pix 278 #+(or) (with-rucksack-and-transaction (rs) ()
09:21:48 ' 279 (ordered-roots rs:*rucksack*))
' 280
' 281 (defun cache-known-good (cliki-page version content)
' 282 (with-rucksack-and-transaction (rs) (_ btree)
' 283 (rs:btree-insert btree cliki-page (list version content))))
' 284
' 285 (defun find-in-cache (cliki-page version)
' 286 (with-rucksack-and-transaction (rs) (_ btree)
' 287 (destructuring-bind (&optional cached-version cached-content)
' 288 (rs:btree-search btree cliki-page :default-value nil :errorp nil)
' 289 (and cached-version
' 290 (string= cached-version version)
' 291 cached-content))))
' 292
' 293 #+(or) (find-in-cache "araneida" "281")