(defpackage #:claki
  (:use #:cl #:unify #:alexandria))
(in-package #:claki)

(defvar *last-modified* (make-hash-table :test 'equal)
  "Hash table of cliki pages and the Last-Modified header we last received for them.")

#+(or) (clrhash *last-modified*)

(defun http-request (&rest drakma-args)
  "A wrapper around drakma:http-request which automatically retries the request
in the event of a timeout."
  (let ((times-failed 0))
    (tagbody
     :fetch-page
       (handler-case (return-from http-request (apply #'drakma:http-request drakma-args))
         (usocket:timeout-error ()
           ;; Exponential backoff: wait 1, 2, 4, ... minutes before retrying.
           (sleep (* 60 (expt 2 times-failed)))
           (incf times-failed)
           (go :fetch-page))))))

(defun get-cliki-page (url)
  "Returns a page from cliki if it has been modified since we last saw it, or
NIL if it has not been modified. Signals an error otherwise."
  (multiple-value-bind (page status headers)
      (http-request (format nil "http://cliki.net/~a" url)
                    :additional-headers (when (gethash url *last-modified*)
                                          `((:if-modified-since . ,(gethash url *last-modified*)))))
    ;; FIXME: this doesn't work all that well: it turns out cliki uses a single
    ;; last-modified header for the entire wiki, rather than one per page, so we
    ;; aren't saving ourselves (or cliki) much with this.
    (cond
      ;; If the page hasn't been modified, no need to update.
      ((= 304 status) nil)
      ;; The "Recent Changes" page doesn't return 304s, but does return a Last-Modified.
      ((and (gethash url *last-modified*)
            (cdr (assoc :last-modified headers))
            (string= (cdr (assoc :last-modified headers)) (gethash url *last-modified*)))
       nil)
      ((= 200 status)
       (setf (gethash url *last-modified*) (cdr (assoc :last-modified headers)))
       page)
      (t (error "Unexpected status ~a while fetching cliki page ~s." status url)))))

(defun get-recent-changes ()
  "Returns the hrefs of the pages listed on cliki's Recent Changes page, without duplicates."
  (when-let ((page (get-cliki-page "Recent%20Changes")))
    (match (#t(oh-ducks:html ("blockquote > b > a.internal" . ?links)) page)
      (loop :for link :in (remove-duplicates links
                                             :test #'equalp
                                             :key (lambda (x) (oh-ducks.traversal:element-attribute :href x))
                                             :from-end t)
            :collect (oh-ducks.traversal:element-attribute :href link)))))

#+(or) (get-recent-changes)

(defvar *spam-urls* (make-hash-table :test 'equal)
  "Hash table of spam URLs.")
(defvar *okay-urls* (make-hash-table :test 'equal)
  "Hash table of acceptable URLs.")
(defvar *urls-to-classify* (list)
  "List of (url-to-classify cliki-page-where-it-was-found page-version).")
(defvar *has-spam* (list)
  "List of (cliki-page version) known to have spam.")
(defvar *updated-pages* (list)
  "List of pages that have been updated, in the format (cliki-page version).")
(defvar *ignore-update* (list)
  "List of pages, in the format (cliki-page version), whose updates should not
result in updating the last-known-good version (e.g., because we made the
update ourselves and it was a reversion).")

#+(or) (clrhash *okay-urls*)

(defun url-domain (url)
  (puri:uri-host (puri:parse-uri url)))
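;;; The two classification tables are consulted both by full URL and by bare
;;; domain (as returned by URL-DOMAIN), so either kind of key works.  A small
;;; hand-seeding sketch; the entries below are made-up examples, not data from
;;; the original:
#+(or)
(progn
  (setf (gethash "common-lisp.net" *okay-urls*) t)               ; trust a whole domain
  (setf (gethash "http://example.com/buy-pills" *spam-urls*) t)) ; blacklist a single URL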
(defun parse-page (page-url)
  "Fetches PAGE-URL and, if it has been modified, records the page as updated
and sorts every external link on it into acceptable, spam, or still-to-classify."
  (when-let ((page (get-cliki-page page-url)))
    (match (#t(oh-ducks:html ("a[href^=http]" . ?a)
                             ("#footer > b" . #t(list ?b)))
            page)
      (let ((current-version (oh-ducks.traversal:element-content b)))
        (format t "; Page ~s modified, now at version ~a.~%" page-url current-version)
        (when (string= (gethash page-url *last-known-good*) current-version)
          (return-from parse-page nil))
        (pushnew (list page-url current-version) *updated-pages* :test #'equal)
        (dolist (link a)
          (let ((url (oh-ducks.traversal:element-attribute :href link)))
            (cond
              ((or (gethash url *okay-urls*)
                   (gethash (url-domain url) *okay-urls*))
               #+(or) (do-nothing))
              ((or (gethash url *spam-urls*)
                   (gethash (url-domain url) *spam-urls*)
                   (auto-classify link))
               (pushnew (list page-url current-version) *has-spam* :test #'equal))
              (t
               (pushnew (list url page-url current-version) *urls-to-classify* :test #'equal)))))))))

#+(or) (parse-page "araneida")

(defun auto-classify (link)
  "Auto-classify URLs based upon traits common to spammers."
  (let ((rel (oh-ducks.traversal:element-attribute :rel link))
        (style (oh-ducks.traversal:element-attribute :style link))
        (url (oh-ducks.traversal:element-attribute :href link)))
    (cond
      ;; Spammers like to demand that their links be followed ...
      ((and (stringp rel)
            (or (string-equal "follow" rel)
                (string-equal "dofollow" rel)))
       (setf (gethash url *spam-urls*) t))
      ;; ... and to hide the underline that would mark their text as a link.
      ((and (stringp style)
            (cl-ppcre:scan "text-decoration[ ]*:[ ]*none" style))
       (setf (gethash url *spam-urls*) t))
      (t nil))))

(defun request-classification (url &optional page version)
  "Signals an error asking the operator to classify URL, with restarts for
marking the URL or its domain as acceptable or spam, or deferring the decision.
Returns true if the URL was classified, NIL if it was deferred."
  (restart-case (error 'simple-error
                       :format-control "Please classify the URL ~s."
                       :format-arguments (list url))
    (mark-url-okay ()
      :report "Mark this URL as acceptable."
      (setf (gethash url *okay-urls*) t))
    (mark-domain-okay ()
      :report "Mark the domain as acceptable."
      (setf (gethash (url-domain url) *okay-urls*) t))
    (mark-url-spam ()
      :report "Mark this URL as spam."
      (when (and page version)
        (pushnew (list page version) *has-spam* :test #'equal))
      (setf (gethash url *spam-urls*) t))
    (mark-domain-spam ()
      :report "Mark the domain as spam."
      (when (and page version)
        (pushnew (list page version) *has-spam* :test #'equal))
      (setf (gethash (url-domain url) *spam-urls*) t))
    (classify-later ()
      :report "Don't classify this URL yet."
      nil)))

(defun classify-unknown-urls ()
  "Asks for a classification of each URL in *URLS-TO-CLASSIFY*, keeping only
the ones that remain unclassified."
  (setf *urls-to-classify*
        (loop :for (url page version) :in *urls-to-classify*
              :unless (or (gethash url *okay-urls*)
                          (gethash (url-domain url) *okay-urls*)
                          (gethash url *spam-urls*)
                          (gethash (url-domain url) *spam-urls*)
                          (request-classification url page version))
                :collect (list url page version))))
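;;; REQUEST-CLASSIFICATION is meant to be answered interactively from the
;;; debugger.  Purely as a sketch (not part of the original workflow), it can
;;; also be driven programmatically with HANDLER-BIND; here every unknown URL
;;; is simply deferred via the CLASSIFY-LATER restart:
#+(or)
(handler-bind ((simple-error (lambda (condition)
                               (declare (ignore condition))
                               (invoke-restart 'classify-later))))
  (classify-unknown-urls))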
(defun mark-known-goods ()
  "Makes sure we know a good version of every page that currently has spam."
  (loop :for (page-url) :in *has-spam*
        :do (maybe-request-last-known-good page-url)))

(defun revert-spam ()
  "Reverts every page in *HAS-SPAM* for which a last-known-good version is
recorded, keeping only the pages that could not be reverted."
  (setf *has-spam*
        (loop :for (page-url version) :in *has-spam*
              :unless (and (gethash page-url *last-known-good*)
                           (revert-page page-url version (gethash page-url *last-known-good*)))
                :collect (list page-url version))))

(defvar *last-known-good* (make-hash-table :test 'equal)
  "Hash table of cliki pages and the last-known \"good\" revision.")

(defun read-number ()
  (format t "Enter a version: ")
  (list (format nil "~a" (parse-integer (read-line)))))

(defun maybe-request-last-known-good (page)
  "If we have no known-good version of PAGE, signals an error offering a
restart to specify one."
  (unless (gethash page *last-known-good*)
    (restart-case (error 'simple-error
                         :format-control "Do not know of a good version of cliki page ~s."
                         :format-arguments (list page))
      (specify-version (version)
        :interactive read-number
        :report "Specify a known-good version."
        (setf (gethash page *last-known-good*) version)))))

#+(or) (maybe-request-last-known-good "araneida")

(defun update-last-known-good ()
  "Marks updated pages as known-good, skipping pages with spam, pages we
updated ourselves, and pages that still have unclassified URLs."
  ;; Remove known-spam pages from the list of updates.
  (loop :for (page-url version) :in *has-spam*
        :do (removef *updated-pages* (list page-url version) :test #'equal))
  ;; Remove updates we made from the list of updates (that way, we'll continue
  ;; to use the old cached known-good, saving a bit of strain on cliki).
  (loop :for (page-url version) :in *ignore-update*
        :do (removef *updated-pages* (list page-url version) :test #'equal))
  (setf *updated-pages*
        (loop :for (page-url version) :in *updated-pages*
              ;; If there are unclassified URLs from this page, don't mark it
              ;; good (it could be spam!).
              :if (notany (lambda (x) (and (string= page-url (second x))
                                           (string= version (third x))))
                          *urls-to-classify*)
                :do (format t "; Updating last-known good of ~s from ~a to ~a~%"
                            page-url (gethash page-url *last-known-good*) version)
                    (setf (gethash page-url *last-known-good*) version)
              :else
                :collect (list page-url version))))

(defun numstring+1 (numstring)
  "Returns the string representation of the integer in NUMSTRING, plus one."
  (write-to-string (1+ (parse-integer numstring))))

(defun revert-page (url current-version to-version)
  "Reverts the cliki page URL from CURRENT-VERSION back to the text of
TO-VERSION. Returns the response page on success, NIL on failure."
  (multiple-value-bind (page status headers)
      (http-request (format nil "http://cliki.net/edit/~a" url)
                    :method :post
                    :parameters `(("version" . ,current-version)
                                  ("T0" . "BODY")
                                  ("E0" . ,(get-cliki-source url to-version))
                                  ("summary" . "Spam detected, reverting to Known-Good.")
                                  ("captcha" . "lisp")
                                  ("name" . "Claki (Revertobot Beta)")))
    (declare (ignore headers))
    (cond
      ((and (= status 200)
            (not (search "rejected" page :test #'char-equal)))
       (format t "; Reverted page ~s to version ~a.~%" url to-version)
       ;; Our edit bumps the version by one; remember to ignore that update.
       (pushnew (list url (numstring+1 current-version)) *ignore-update* :test #'equal)
       page)
      (t nil))))
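;;; A hypothetical invocation, with made-up version numbers: revert "araneida"
;;; from version 282 back to the text of version 281.  On success the freshly
;;; created version (283 here) lands in *IGNORE-UPDATE* so the next scan does
;;; not treat our own edit as new content.
#+(or) (revert-page "araneida" "282" "281")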
(defun get-cliki-source (url version)
  "Fetches the source text of a given version of a cliki page. That is, it
returns the text you should POST to revert a cliki page to the given version."
  (or (find-in-cache url version)
      (multiple-value-bind (page status headers)
          (http-request (format nil "http://cliki.net/~a?source&v=~a" url version))
        (declare (ignore headers))
        (cond
          ((= 200 status)
           ;; We have to coerce to a simple-string because rucksack mistakenly
           ;; assumes non-simple strings have a fill pointer.
           (cache-known-good url version (coerce page 'simple-string))
           page)
          (t (error "Unexpected status ~a while fetching source of ~s, version ~a."
                    status url version))))))

#+(or) (get-cliki-source "araneida" "281")

(defun attended-revert-new-spam ()
  (mapcar #'parse-page (get-recent-changes))
  (attendant))

(defun attendant ()
  (classify-unknown-urls)
  (update-last-known-good)
  (mark-known-goods)
  (revert-spam))

(defun unattended-revert-new-spam ()
  (mapcar #'parse-page (get-recent-changes))
  (update-last-known-good)
  (revert-spam))

#+(or) (attended-revert-new-spam)
#+(or) (attendant)

(defun seconds (s) s)
(defun minutes (m) (* (seconds 60) m))
(defun hours (h) (* (minutes 60) h))
(defun days (d) (* (hours 24) d))
(defun plus-or-minus (x y) (+ (- x y) (random (* 2 y))))

;; DEFINE-CONSTANT (from alexandria) rather than DEFCONSTANT, since the value
;; is a list and would not be EQL across re-evaluations of this form.
(define-constant +simple-time+
    '(:year #\- (:month 2) #\- (:day 2) #\Space (:hour 2) #\: (:min 2) #\: (:sec 2))
  :test 'equal)
(defun now () (local-time:format-timestring nil (local-time:now) :format +simple-time+))

(defun run-for-a-while (how-long how-often variance)
  "Runs UNATTENDED-REVERT-NEW-SPAM roughly every HOW-OFTEN seconds (give or
take VARIANCE) for HOW-LONG seconds, saving state after each pass."
  (format t "; Beginning run at ~a~%" (now))
  (dotimes (i (floor how-long how-often))
    (sleep (plus-or-minus how-often variance))
    (format t "; Unattended run at ~a~%" (now))
    (unattended-revert-new-spam)
    (save-state))
  (format t "; Run ended at ~a~%~%" (now)))

#+(or)
(let ((stdout *standard-output*))
  (sb-thread:make-thread (lambda ()
                           (let ((*standard-output* stdout))
                             (run-for-a-while (days 2) (minutes 30) (minutes 5))))
                         :name "cliki reverter"))

(defvar *state-file*
  (merge-pathnames #p"state/"
                   (directory-namestring (or #.*load-pathname*
                                             #p"/home/pixel/repos/claki/"))))

(defmacro with-rucksack-and-transaction ((rucksack) (&rest root-vars) &body body)
  "Opens the rucksack at *STATE-FILE*, starts a transaction, and binds
ROOT-VARS to the rucksack's roots in object-id order; a root-var named _ is
declared ignored."
  (with-unique-names (rest)
    `(rucksack:with-rucksack (,rucksack *state-file*)
       (rucksack:with-transaction ()
         (destructuring-bind (&optional ,@root-vars &rest ,rest) (ordered-roots rs:*rucksack*)
           (declare (ignore ,rest) ,@(when (member '_ root-vars) `((ignore _))))
           ,@body)))))

(defun ordered-roots (sack)
  (sort (rs:rucksack-roots sack) #'< :key #'rs:object-id))
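;;; To see what the macro produces, its expansion can be inspected directly;
;;; this call mirrors the way RESTORE-STATE and FIND-IN-CACHE use it (the body
;;; form here is only an illustration):
#+(or)
(macroexpand-1 '(with-rucksack-and-transaction (rs) (_ btree)
                 (rs:btree-search btree 'spam-urls :errorp nil)))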
(defun initialize-rucksack ()
  (with-rucksack-and-transaction (rs) ()
    (let ((roots (rs:rucksack-roots rs:*rucksack*)))
      (dotimes (i (- 2 (length roots)))
        (rs:add-rucksack-root (make-instance 'rs:btree :key< 'string<) rs:*rucksack*)))))

#+(or) (initialize-rucksack)

(defun save-state ()
  (with-rucksack-and-transaction (rs) (btree)
    (rs:btree-insert btree 'spam-urls *spam-urls*)
    (rs:btree-insert btree 'ham-urls *okay-urls*)
    (rs:btree-insert btree 'known-good *last-known-good*)
    (rs:btree-insert btree 'updated-pages *updated-pages*)
    (rs:btree-insert btree 'our-updates *ignore-update*)))

#+(or) (save-state)

(defun restore-state ()
  (with-rucksack-and-transaction (rs) (btree)
    (setf *spam-urls* (rs:btree-search btree 'spam-urls)
          *okay-urls* (rs:btree-search btree 'ham-urls)
          *last-known-good* (rs:btree-search btree 'known-good)
          *updated-pages* (rs:btree-search btree 'updated-pages)
          *ignore-update* (rs:btree-search btree 'our-updates))))

#+(or) (restore-state)
#+(or) (with-rucksack-and-transaction (rs) ()
         (ordered-roots rs:*rucksack*))

(defun cache-known-good (cliki-page version content)
  (with-rucksack-and-transaction (rs) (_ btree)
    (rs:btree-insert btree cliki-page (list version content))))

(defun find-in-cache (cliki-page version)
  (with-rucksack-and-transaction (rs) (_ btree)
    (destructuring-bind (&optional cached-version cached-content)
        (rs:btree-search btree cliki-page :default-value nil :errorp nil)
      (and cached-version
           (string= cached-version version)
           cached-content))))

#+(or) (find-in-cache "araneida" "281")
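;;; A guess at a typical bring-up sequence, pieced together from the forms
;;; above rather than spelled out anywhere in this file: create the rucksack
;;; roots once, restore whatever classification state was saved last time
;;; (assumes SAVE-STATE has run before), then kick off the unattended loop
;;; (or run it in a thread, as in the SB-THREAD example earlier in the file).
#+(or)
(progn
  (initialize-rucksack)
  (restore-state)
  (run-for-a-while (days 2) (minutes 30) (minutes 5)))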