;;;; claki.lisp

(defpackage #:claki
  (:use #:cl #:unify #:alexandria))
(in-package #:claki)

(defvar *last-modified* (make-hash-table :test 'equal) "hash table of cliki pages and the last-modified header we last received.")

#+(or) (clrhash *last-modified*)

(defun http-request (&rest drakma-args)
  "A wrapper around drakma:http-request which automatically retries the request
in the event of a timeout, backing off exponentially (1, 2, 4, ... minutes)."
  (let ((times-failed 0))
    (tagbody
     :fetch-page
       (handler-case (return-from http-request (apply #'drakma:http-request drakma-args))
         (usocket:timeout-error ()
           (sleep (* 60 (expt 2 times-failed)))
           (incf times-failed)
           (go :fetch-page))))))

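;; A minimal usage sketch of the retrying wrapper; the URL is illustrative and
;; all arguments are passed through to drakma:http-request unchanged.
#+(or) (http-request "http://cliki.net/Recent%20Changes" :method :get)
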
(defun get-cliki-page (url)
  "Returns a page from cliki if it has been modified since we last saw it or nil
if it has not been modified.  Signals an error otherwise."
  (multiple-value-bind (page status headers)
      (http-request (format nil "http://cliki.net/~a" url)
                    :additional-headers (when (gethash url *last-modified*)
                                          `((:if-modified-since . ,(gethash url *last-modified*)))))
    ;; FIXME: this doesn't work all that well: it turns out cliki uses a single
    ;; last-modified header for the entire wiki, rather than one per page, so we
    ;; aren't saving ourselves (or cliki) much with this.
    (cond
      ;; If the page hasn't been modified, there is no need to update.
      ((= 304 status) nil)
      ;; The "Recent Changes" page doesn't return 304s, but does return a last-modified.
      ((and (gethash url *last-modified*)
            (cdr (assoc :last-modified headers))
            (string= (cdr (assoc :last-modified headers)) (gethash url *last-modified*)))
       nil)
      ((= 200 status)
       (setf (gethash url *last-modified*) (cdr (assoc :last-modified headers)))
       page)
      (t (error "Unexpected status ~a while fetching cliki page ~s." status url)))))

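;; Illustrative: fetch a single page by the path component of its URL; returns
;; the HTML string, or nil if cliki reports it unchanged since the last fetch.
#+(or) (get-cliki-page "araneida")
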
(defun get-recent-changes ()
  (when-let ((page (get-cliki-page "Recent%20Changes")))
    (match (#t(oh-ducks:html ("blockquote > b > a.internal" . ?links)) page)
      (loop :for link :in (remove-duplicates links :test #'equalp
                                              :key (lambda (x) (oh-ducks.traversal:element-attribute :href x))
                                              :from-end t)
            :collect (oh-ducks.traversal:element-attribute :href link)))))

#+(or) (get-recent-changes)

(defvar *spam-urls* (make-hash-table :test 'equal) "hash table of spam urls.")
(defvar *okay-urls* (make-hash-table :test 'equal) "hash table of acceptable urls.")
(defvar *last-known-good* (make-hash-table :test 'equal) "hash table of cliki pages and the last-known \"good\" revision.")
(defvar *urls-to-classify* (list) "list of (url-to-classify cliki-page-where-it-was-found page-version).")
(defvar *has-spam* (list) "list of (cliki-page version) known to have spam.")
(defvar *updated-pages* (list) "list of pages that have been updated, in the format (cliki-page version).")
(defvar *ignore-update* (list) "list of (cliki-page version) entries whose updates should not update the last-known-good version (e.g., because the update was our own reversion).")

#+(or) (clrhash *okay-urls*)

(defun url-domain (url)
  (puri:uri-host (puri:parse-uri url)))

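;; Illustrative: URL-DOMAIN strips a full URL down to its host component.
#+(or) (url-domain "http://example.com/some/page") ; => "example.com"
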
(defun parse-page (page-url)
  (when-let ((page (get-cliki-page page-url)))
    (match (#t(oh-ducks:html ("a[href^=http]" . ?a)
                             ("#footer > b" . #t(list ?b))) page)
      (let ((current-version (oh-ducks.traversal:element-content b)))
        (format t "; Page ~s modified, now at version ~a.~%" page-url current-version)
        (when (string= (gethash page-url *last-known-good*) current-version)
          (return-from parse-page nil))
        (pushnew (list page-url current-version) *updated-pages* :test #'equal)
        (dolist (link a)
          (let ((url (oh-ducks.traversal:element-attribute :href link)))
            (cond
              ;; Already classified as acceptable: nothing to do.
              ((or (gethash url *okay-urls*)
                   (gethash (url-domain url) *okay-urls*))
               #+(or) (do-nothing))
              ;; Known spam, or caught by the auto-classification heuristics.
              ((or (gethash url *spam-urls*)
                   (gethash (url-domain url) *spam-urls*)
                   (auto-classify link))
               (pushnew (list page-url current-version) *has-spam* :test #'equal))
              ;; Otherwise, queue the URL for manual classification.
              (t
               (pushnew (list url page-url current-version) *urls-to-classify* :test #'equal)))))))))

#+(or) (parse-page "araneida")

(defun auto-classify (link)
  "Auto-classify URLs based upon traits common to spammers."
  (let ((rel (oh-ducks.traversal:element-attribute :rel link))
        (style (oh-ducks.traversal:element-attribute :style link))
        (url (oh-ducks.traversal:element-attribute :href link)))
    (cond
      ;; Spam links are often tagged rel="follow" or rel="dofollow" ...
      ((and (stringp rel)
            (or (string-equal "follow" rel)
                (string-equal "dofollow" rel)))
       (setf (gethash url *spam-urls*) t))
      ;; ... or hidden with inline "text-decoration: none" styling.
      ((and (stringp style)
            (cl-ppcre:scan "text-decoration[ ]*:[ ]*none" style))
       (setf (gethash url *spam-urls*) t))
      (t nil))))

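;; Illustrative check of the style heuristic above; the CSS string is made up.
;; CL-PPCRE:SCAN returns the (non-nil) start position of the match.
#+(or) (cl-ppcre:scan "text-decoration[ ]*:[ ]*none" "color:#ccc;text-decoration : none")
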
(defun request-classification (url &optional page version)
  (restart-case (error 'simple-error :format-control "Please classify the URL ~s."
                       :format-arguments (list url))
    (mark-url-okay ()
      :report "Mark this URL as acceptable."
      (setf (gethash url *okay-urls*) t))
    (mark-domain-okay ()
      :report "Mark the domain as acceptable."
      (setf (gethash (url-domain url) *okay-urls*) t))
    (mark-url-spam ()
      :report "Mark this URL as spam."
      (when (and page version)
        (pushnew (list page version) *has-spam* :test #'equal))
      (setf (gethash url *spam-urls*) t))
    (mark-domain-spam ()
      :report "Mark the domain as spam."
      (when (and page version)
        (pushnew (list page version) *has-spam* :test #'equal))
      (setf (gethash (url-domain url) *spam-urls*) t))
    (classify-later ()
      :report "Don't classify this URL yet."
      nil)))

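;; A sketch of driving the restarts above programmatically rather than from the
;; debugger; the handler choice and the URL are purely illustrative.
#+(or)
(handler-bind ((simple-error (lambda (condition)
                               (declare (ignore condition))
                               (invoke-restart 'mark-url-okay))))
  (request-classification "http://example.com/"))
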
(defun classify-unknown-urls ()
  (setf *urls-to-classify*
        (loop :for (url page version) :in *urls-to-classify*
              :unless (or (gethash url *okay-urls*)
                          (gethash (url-domain url) *okay-urls*)
                          (gethash url *spam-urls*)
                          (gethash (url-domain url) *spam-urls*)
                          (request-classification url page version))
                :collect (list url page version))))

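;; Illustrative: walk whatever PARSE-PAGE queued in *URLS-TO-CLASSIFY* and
;; prompt (via the restarts above) for anything not yet classified.
#+(or) (classify-unknown-urls)
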
(defun mark-known-goods ()
  (loop :for (page-url) :in *has-spam*
        :do (maybe-request-last-known-good page-url)))

(defun revert-spam ()
  (setf *has-spam*
        (loop :for (page-url version) :in *has-spam*
              :unless (and (gethash page-url *last-known-good*)
                           (revert-page page-url version (gethash page-url *last-known-good*)))
                :collect (list page-url version))))

(defun read-number ()
  (format *query-io* "Enter a version: ")
  (force-output *query-io*)
  (list (format nil "~a" (parse-integer (read-line *query-io*)))))

(defun maybe-request-last-known-good (page)
  (unless (gethash page *last-known-good*)
    (restart-case (error 'simple-error :format-control "Do not know of a good version of cliki page ~s."
                         :format-arguments (list page))
      (specify-version (version)
        :interactive read-number
        :report "Specify a known-good version."
        (setf (gethash page *last-known-good*) version)))))

#+(or) (maybe-request-last-known-good "araneida")

(defun update-last-known-good ()
  ;; Remove known-spam pages from the list of updates.
  (loop :for (page-url version) :in *has-spam*
        :do (removef *updated-pages* (list page-url version) :test #'equal))
  ;; Remove updates we made from the list of updates (that way, we'll continue
  ;; to use the old cached known-good, saving a bit of strain on cliki).
  (loop :for (page-url version) :in *ignore-update*
        :do (removef *updated-pages* (list page-url version) :test #'equal))
  (setf *updated-pages*
        (loop :for (page-url version) :in *updated-pages*
              ;; If there are unclassified urls from this page, don't mark it
              ;; good (they could be spam!).
              :if (notany (lambda (x) (and (string= page-url (second x))
                                           (string= version (third x))))
                          *urls-to-classify*)
                :do (format t "; Updating last-known good of ~s from ~a to ~a~%"
                            page-url (gethash page-url *last-known-good*) version)
                    (setf (gethash page-url *last-known-good*) version)
              :else
                :collect (list page-url version))))

(defun numstring+1 (numstring)
  (write-to-string (1+ (parse-integer numstring))))

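;; Illustrative: cliki page versions are strings, so incrementing goes through
;; PARSE-INTEGER and back.
#+(or) (numstring+1 "281") ; => "282"
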
(defun revert-page (url current-version to-version)
  (multiple-value-bind (page status headers)
      (http-request (format nil "http://cliki.net/edit/~a" url)
                    :method :post
                    :parameters `(("version" . ,current-version)
                                  ("T0"      . "BODY")
                                  ("E0"      . ,(get-cliki-source url to-version))
                                  ("summary" . "Spam detected, reverting to Known-Good.")
                                  ("captcha" . "lisp")
                                  ("name"    . "Claki (Revertobot Beta)")))
    (declare (ignore headers))
    (cond
      ((and (= status 200)
            (not (search "rejected" page :test #'char-equal)))
       (format t "; Reverted page ~s to version ~a.~%" url to-version)
       ;; The reversion itself bumps the page version; remember it so we don't
       ;; treat our own edit as a fresh update.
       (pushnew (list url (numstring+1 current-version)) *ignore-update* :test #'equal)
       page)
      (t nil))))

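;; Illustrative only: calling this performs a live edit against cliki.net.
;; The page name and version numbers here are placeholders.
#+(or) (revert-page "araneida" "282" "281")
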
(defun get-cliki-source (url version)
  "Fetches the source text of a given version of a cliki page.  That is, it
returns the text you should POST to revert a cliki page to the given version."
  (or (find-in-cache url version)
      (multiple-value-bind (page status headers)
          (http-request (format nil "http://cliki.net/~a?source&v=~a" url version))
        (declare (ignore headers))
        (cond
          ((= 200 status)
           ;; We have to coerce to a simple-string because rucksack mistakenly
           ;; assumes non-simple strings have a fill pointer.
           (cache-known-good url version (coerce page 'simple-string))
           page)
          (t (error "Unexpected status ~a while fetching source of ~s, version ~a."
                    status url version))))))

#+(or) (get-cliki-source "araneida" "281")

(defun attended-revert-new-spam ()
  "Parse everything on Recent Changes, then interactively classify new URLs,
update the known-good table, and revert any detected spam."
  (mapcar #'parse-page (get-recent-changes))
  (attendant))

(defun attendant ()
  (classify-unknown-urls)
  (update-last-known-good)
  (mark-known-goods)
  (revert-spam))

(defun unattended-revert-new-spam ()
  "Like ATTENDED-REVERT-NEW-SPAM, but never prompts: only URLs already known to
be spam (or caught by AUTO-CLASSIFY) are reverted."
  (mapcar #'parse-page (get-recent-changes))
  (update-last-known-good)
  (revert-spam))

#+(or) (attended-revert-new-spam)
#+(or) (attendant)

;; Time helpers: all intervals are expressed in seconds.
(defun seconds (s) s)
(defun minutes (m) (* (seconds 60) m))
(defun hours (h) (* (minutes 60) h))
(defun days (d) (* (hours 24) d))
(defun plus-or-minus (x y) (+ (- x y) (random (* 2 y))))

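;; Illustrative: a jittered interval of 30 +/- 5 minutes, in seconds.
#+(or) (plus-or-minus (minutes 30) (minutes 5))
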
(define-constant +simple-time+ '(:year #\- (:month 2) #\- (:day 2) #\Space (:hour 2) #\: (:min 2) #\: (:sec 2))
  :test #'equal)
(defun now () (local-time:format-timestring nil (local-time:now) :format +simple-time+))

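;; Illustrative; NOW returns a string like "2010-03-14 12:00:00" per the
;; +SIMPLE-TIME+ format above (the date shown is made up).
#+(or) (now)
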
(defun run-for-a-while (how-long how-often variance)
  (format t "; Beginning run at ~a~%" (now))
  (dotimes (i (floor how-long how-often))
    (sleep (plus-or-minus how-often variance))
    (format t "; Unattended run at ~a~%" (now))
    (unattended-revert-new-spam)
    (save-state))
  (format t "; Run ended at ~a~%~%" (now)))

#+(or)
(let ((stdout *standard-output*))
  (sb-thread:make-thread (lambda () (let ((*standard-output* stdout)) (run-for-a-while (days 2) (minutes 30) (minutes 5))))
                         :name "cliki reverter"))

(defvar *state-file* (merge-pathnames #p"state/" (directory-namestring (or #.*load-pathname* #p"/home/pixel/repos/claki/"))))

(defmacro with-rucksack-and-transaction ((rucksack) (&rest root-vars) &body body)
  "Run BODY inside a transaction on the claki rucksack, with ROOT-VARS bound to
the rucksack's roots in creation order (use _ to skip a root you don't need)."
  (with-unique-names (rest)
    `(rucksack:with-rucksack (,rucksack *state-file*)
       (rucksack:with-transaction ()
         (destructuring-bind (&optional ,@root-vars &rest ,rest) (ordered-roots rs:*rucksack*)
           (declare (ignore ,rest) ,@(when (member '_ root-vars) `((ignore _))))
           ,@body)))))

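;; A minimal usage sketch: bind the first rucksack root (the state btree used
;; by SAVE-STATE below) and look up one key; key and defaults are illustrative.
#+(or)
(with-rucksack-and-transaction (rs) (state-btree)
  (rs:btree-search state-btree 'spam-urls :default-value nil :errorp nil))
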
(defun ordered-roots (sack)
  (sort (rs:rucksack-roots sack) #'< :key #'rs:object-id))

(defun initialize-rucksack ()
  ;; The state lives in two rucksack roots: a btree of claki's bookkeeping
  ;; tables (see SAVE-STATE) and a btree caching known-good page sources.
  (with-rucksack-and-transaction (rs) ()
    (let ((roots (rs:rucksack-roots rs:*rucksack*)))
      (dotimes (i (- 2 (length roots)))
        (rs:add-rucksack-root (make-instance 'rs:btree :key< 'string<) rs:*rucksack*)))))

#+(or) (initialize-rucksack)

(defun save-state ()
  (with-rucksack-and-transaction (rs) (btree)
    (rs:btree-insert btree 'spam-urls *spam-urls*)
    (rs:btree-insert btree 'ham-urls *okay-urls*)
    (rs:btree-insert btree 'known-good *last-known-good*)
    (rs:btree-insert btree 'updated-pages *updated-pages*)
    (rs:btree-insert btree 'our-updates *ignore-update*)))

#+(or) (save-state)

(defun restore-state ()
  (with-rucksack-and-transaction (rs) (btree)
    (setf *spam-urls* (rs:btree-search btree 'spam-urls)
          *okay-urls* (rs:btree-search btree 'ham-urls)
          *last-known-good* (rs:btree-search btree 'known-good)
          *updated-pages* (rs:btree-search btree 'updated-pages)
          *ignore-update* (rs:btree-search btree 'our-updates))))

#+(or) (restore-state)
#+(or) (with-rucksack-and-transaction (rs) ()
         (ordered-roots rs:*rucksack*))

(defun cache-known-good (cliki-page version content)
  ;; The second rucksack root caches page sources, keyed by cliki page name.
  (with-rucksack-and-transaction (rs) (_ btree)
    (rs:btree-insert btree cliki-page (list version content))))

(defun find-in-cache (cliki-page version)
  (with-rucksack-and-transaction (rs) (_ btree)
    (destructuring-bind (&optional cached-version cached-content)
        (rs:btree-search btree cliki-page :default-value nil :errorp nil)
      (and cached-version
           (string= cached-version version)
           cached-content))))

#+(or) (find-in-cache "araneida" "281")