Split out an unattended and an attented portion
Annotate for file claki.lisp
2011-06-13 pix 1 (defpackage #:claki
19:08:27 ' 2 (:use #:cl #:unify :alexandria))
' 3 (in-package #:claki)
' 4
' 5 (defvar *last-modified* (make-hash-table :test 'equal) "hash table of cliki pages and the last-modified header we last received.")
' 6
' 7 #+(or) (clrhash *last-modified*)
' 8
' 9 (defun get-cliki-page (url)
' 10 "Returns a page from cliki if it has been modified since we last saw it or nil
' 11 if it has not been modified. Signals an error otherwise."
' 12 (multiple-value-bind (page status headers)
2011-06-29 pix 13 (drakma:http-request (format nil "http://cliki.net/~a" url)
03:08:25 ' 14 :additional-headers (when (gethash url *last-modified*)
' 15 `((:if-modified-since (gethash url *last-modified*)))))
2011-06-13 pix 16 (cond
19:08:27 ' 17 ;; If the page hasn't been modified, no need to update
' 18 ((= 304 status) nil)
' 19 ;; The "Recent Changes" page doesn't return 304s, but does return a last-modified.
' 20 ((and (gethash url *last-modified*)
' 21 (cdr (assoc :last-modified headers))
' 22 (string= (cdr (assoc :last-modified headers)) (gethash url *last-modified*)))
' 23 nil)
' 24 ((= 200 status)
' 25 (setf (gethash url *last-modified*) (cdr (assoc :last-modified headers)))
' 26 page)
' 27 (t (error "crap!")))))
' 28
' 29 (defun get-recent-changes ()
' 30 (when-let ((page (get-cliki-page "Recent%20Changes")))
' 31 (match (#t(oh-ducks:html ("blockquote > b > a.internal" . ?links)) page)
' 32 (loop :for link :in (remove-duplicates links :test #'equalp
' 33 :key (lambda (x) (oh-ducks.traversal:element-attribute :href x))
' 34 :from-end t)
' 35 :collect (oh-ducks.traversal:element-attribute :href link)))))
' 36
' 37 #+(or) (get-recent-changes)
' 38
' 39 (defvar *spam-urls* (make-hash-table :test 'equal) "hash table of spam urls.")
' 40 (defvar *okay-urls* (make-hash-table :test 'equal) "hash table of acceptable urls.")
2011-06-13 pix 41 (defvar *urls-to-classify* (list) "list of (url-to-classify cliki-page-where-it-was-found page-version).")
22:35:27 ' 42 (defvar *has-spam* (list) "list of (cliki-page . version) known to have spam.")
2011-06-13 pix 43
19:08:27 ' 44 #+(or) (clrhash *okay-urls*)
' 45
' 46 (defun url-domain (url)
' 47 (puri:uri-host (puri:parse-uri url)))
' 48
' 49 (defun parse-page (page-url)
' 50 (when-let ((page (get-cliki-page page-url)))
' 51 (match (#t(oh-ducks:html ("a[href^=http]" . ?a)
' 52 ("#footer > b" . #t(list ?b))) page)
' 53 (let ((current-version (oh-ducks.traversal:element-content b)))
' 54 (dolist (link a)
2011-06-13 pix 55 (let ((url (oh-ducks.traversal:element-attribute :href link))
22:35:27 ' 56 (rel (oh-ducks.traversal:element-attribute :rel link)))
' 57 (cond
' 58 ((or (gethash url *okay-urls*)
' 59 (gethash (url-domain url) *okay-urls*))
' 60 #+(or) (do-nothing))
' 61 ((or (gethash url *spam-urls*)
' 62 (gethash (url-domain url) *spam-urls*))
' 63 (pushnew (list page-url current-version) *has-spam* :test #'equal))
' 64 ((and (stringp rel)
' 65 (or (string-equal "follow" rel)
' 66 (string-equal "dofollow" rel)))
' 67 (setf (gethash url *spam-urls*) t)
' 68 (pushnew (list page-url current-version) *has-spam* :test #'equal))
' 69 (t
' 70 (pushnew (list url page-url current-version) *urls-to-classify* :test #'equal)))))))))
2011-06-13 pix 71
19:08:27 ' 72 #+(or) (parse-page "araneida")
' 73
2011-06-18 pix 74 (defun request-classification (url)
2011-06-13 pix 75 (restart-case (error 'simple-error :format-control "Please classify the URL ~s."
19:08:27 ' 76 :format-arguments (list url))
' 77 (mark-url-okay ()
' 78 :report "Mark this URL as acceptable."
' 79 (setf (gethash url *okay-urls*) t))
' 80 (mark-domain-okay ()
' 81 :report "Mark the domain as acceptable."
' 82 (setf (gethash (url-domain url) *okay-urls*) t))
' 83 (mark-url-spam ()
' 84 :report "Mark this URL as spam."
' 85 (setf (gethash url *spam-urls*) t))
' 86 (mark-domain-spam ()
' 87 :report "Mark the domain as spam."
2011-06-13 pix 88 (setf (gethash (url-domain url) *spam-urls*) t))
22:35:27 ' 89 (classify-later ()
' 90 :report "Don't classify this URL yet."
' 91 nil)))
' 92
' 93 (defun classify-unknown-urls ()
' 94 (setf *urls-to-classify*
' 95 (loop :for (url page version) :in *urls-to-classify*
' 96 :unless (or (gethash url *okay-urls*)
' 97 (gethash (url-domain url) *okay-urls*)
' 98 (gethash url *spam-urls*)
' 99 (gethash (url-domain url) *spam-urls*)
' 100 (request-classification url))
' 101 :collect (list url page version))))
' 102
' 103 (defun mark-known-goods ()
' 104 (loop :for (page-url version) :in *has-spam*
' 105 :do (maybe-request-last-known-good page-url)))
' 106
' 107 (defun revert-spam ()
' 108 (setf *has-spam*
' 109 (loop :for (page-url version) :in *has-spam*
' 110 :unless (and (gethash page-url *last-known-good*)
' 111 (revert-page page-url version (gethash page-url *last-known-good*)))
' 112 :collect (list page-url version))))
2011-06-13 pix 113
19:08:27 ' 114 (defvar *last-known-good* (make-hash-table :test 'equal) "hash table of cliki pages and the last-known \"good\" revision.")
' 115
' 116 (defun read-number ()
' 117 (format t "Enter a version: ")
' 118 (list (format nil "~a" (parse-integer (read-line)))))
' 119
' 120 (defun maybe-request-last-known-good (page)
' 121 (unless (gethash page *last-known-good*)
' 122 (restart-case (error 'simple-error :format-control "Do not know of a good version of cliki page ~s."
' 123 :format-arguments (list page))
' 124 (specify-version (version)
' 125 :interactive read-number
' 126 :report "Specify a known-good version."
' 127 (setf (gethash page *last-known-good*) version)))))
' 128
' 129 #+(or) (maybe-request-last-known-good "araneida")
' 130
2011-06-13 pix 131 (defun revert-page (url current-version to-version)
22:35:27 ' 132 (multiple-value-bind (page status headers)
' 133 (drakma:http-request (format nil "http://cliki.net/edit/~a" url)
' 134 :method :post
' 135 :parameters `(("version" . ,current-version)
' 136 ("T0" . "BODY")
' 137 ("E0" . ,(get-cliki-source url to-version))
' 138 ("summary" . "Spam detected, reverting to Known-Good.")
' 139 ("captcha" . "lisp")
' 140 ("name" . "Claki (Revertobot Alpha)")))
' 141 (cond
' 142 ((and (= status 200)
' 143 (not (search "rejected" page :test #'char-equal)))
' 144 page)
' 145 (t nil))))
2011-06-13 pix 146
19:08:27 ' 147 (defun get-cliki-source (url version)
' 148 "Fetches the source text of a given version of a cliki page. That is, it
' 149 returns the text you should POST to revert a cliki page to the given version."
2011-06-15 pix 150 (multiple-value-bind (page status headers)
09:21:48 ' 151 (drakma:http-request (format nil "http://cliki.net/~a?source&v=~a" url version))
' 152 (cond
' 153 ((= 200 status) page)
' 154 (t (error "crap!")))))
2011-06-13 pix 155
2011-06-13 pix 156 (defun attented-revert-new-spam ()
22:35:27 ' 157 (mapcar #'parse-page (get-recent-changes))
' 158 (attendant))
' 159
' 160 (defun attendant ()
' 161 (classify-unknown-urls)
' 162 (mark-known-goods)
' 163 (revert-spam))
' 164
' 165 (defun unattented-revert-new-spam ()
' 166 (mapcar #'parse-page (get-recent-changes))
' 167 (revert-spam))
2011-06-13 pix 168
2011-06-13 pix 169 #+(or) (attented-revert-new-spam)
2011-06-13 pix 170
2011-06-13 pix 171 #+(or) (loop (sleep (* 60 60)) (unattented-revert-new-spam))