Initial import
Annotate for file claki.lisp
2011-06-13 pix 1 (defpackage #:claki
19:08:27 ' 2 (:use #:cl #:unify :alexandria))
' 3 (in-package #:claki)
' 4
' 5 (defvar *last-modified* (make-hash-table :test 'equal) "hash table of cliki pages and the last-modified header we last received.")
' 6
' 7 #+(or) (clrhash *last-modified*)
' 8
' 9 (defun get-cliki-page (url)
' 10 "Returns a page from cliki if it has been modified since we last saw it or nil
' 11 if it has not been modified. Signals an error otherwise."
' 12 (multiple-value-bind (page status headers)
' 13 (drakma:http-request (format nil "http://cliki.net/~a" url)
' 14 :additional-headers (when (gethash url *last-modified*)
' 15 `((:if-modified-since (gethash url *last-modified*)))))
' 16 (cond
' 17 ;; If the page hasn't been modified, no need to update
' 18 ((= 304 status) nil)
' 19 ;; The "Recent Changes" page doesn't return 304s, but does return a last-modified.
' 20 ((and (gethash url *last-modified*)
' 21 (cdr (assoc :last-modified headers))
' 22 (string= (cdr (assoc :last-modified headers)) (gethash url *last-modified*)))
' 23 nil)
' 24 ((= 200 status)
' 25 (setf (gethash url *last-modified*) (cdr (assoc :last-modified headers)))
' 26 page)
' 27 (t (error "crap!")))))
' 28
' 29 (defun get-recent-changes ()
' 30 (when-let ((page (get-cliki-page "Recent%20Changes")))
' 31 (match (#t(oh-ducks:html ("blockquote > b > a.internal" . ?links)) page)
' 32 (loop :for link :in (remove-duplicates links :test #'equalp
' 33 :key (lambda (x) (oh-ducks.traversal:element-attribute :href x))
' 34 :from-end t)
' 35 :collect (oh-ducks.traversal:element-attribute :href link)))))
' 36
' 37 #+(or) (get-recent-changes)
' 38
' 39 (defvar *spam-urls* (make-hash-table :test 'equal) "hash table of spam urls.")
' 40 (defvar *okay-urls* (make-hash-table :test 'equal) "hash table of acceptable urls.")
' 41
' 42 #+(or) (clrhash *okay-urls*)
' 43
' 44 (defun url-domain (url)
' 45 (puri:uri-host (puri:parse-uri url)))
' 46
' 47 (defun parse-page (page-url)
' 48 (when-let ((page (get-cliki-page page-url)))
' 49 (match (#t(oh-ducks:html ("a[href^=http]" . ?a)
' 50 ("#footer > b" . #t(list ?b))) page)
' 51 (let ((current-version (oh-ducks.traversal:element-content b)))
' 52 (dolist (link a)
' 53 (let ((url (oh-ducks.traversal:element-attribute :href link)))
' 54 (tagbody
' 55 :handle-url
' 56 (cond
' 57 ((or (gethash url *okay-urls*)
' 58 (gethash (url-domain url) *okay-urls*))
' 59 #+(or) (do-nothing))
' 60 ((or (gethash url *spam-urls*)
' 61 (gethash (url-domain url) *spam-urls*))
' 62 (maybe-request-last-known-good page-url)
' 63 (return-from parse-page (revert-page page-url current-version (gethash page-url *last-known-good*))))
' 64 (t
' 65 (request-classification url)
' 66 (go :handle-url))))))))))
' 67
' 68 #+(or) (parse-page "araneida")
' 69
' 70 (defun request-classification (url)
' 71 (restart-case (error 'simple-error :format-control "Please classify the URL ~s."
' 72 :format-arguments (list url))
' 73 (mark-url-okay ()
' 74 :report "Mark this URL as acceptable."
' 75 (setf (gethash url *okay-urls*) t))
' 76 (mark-domain-okay ()
' 77 :report "Mark the domain as acceptable."
' 78 (setf (gethash (url-domain url) *okay-urls*) t))
' 79 (mark-url-spam ()
' 80 :report "Mark this URL as spam."
' 81 (setf (gethash url *spam-urls*) t))
' 82 (mark-domain-spam ()
' 83 :report "Mark the domain as spam."
' 84 (setf (gethash (url-domain url) *spam-urls*) t))))
' 85
' 86 (defvar *last-known-good* (make-hash-table :test 'equal) "hash table of cliki pages and the last-known \"good\" revision.")
' 87
' 88 (defun read-number ()
' 89 (format t "Enter a version: ")
' 90 (list (format nil "~a" (parse-integer (read-line)))))
' 91
' 92 (defun maybe-request-last-known-good (page)
' 93 (unless (gethash page *last-known-good*)
' 94 (restart-case (error 'simple-error :format-control "Do not know of a good version of cliki page ~s."
' 95 :format-arguments (list page))
' 96 (specify-version (version)
' 97 :interactive read-number
' 98 :report "Specify a known-good version."
' 99 (setf (gethash page *last-known-good*) version)))))
' 100
' 101 #+(or) (maybe-request-last-known-good "araneida")
' 102
' 103 (defun revert-page (page current-version to-version)
' 104 (drakma:http-request (format nil "http://cliki.net/edit/~a" page)
' 105 :method :post
' 106 :parameters `(("version" . ,current-version)
' 107 ("T0" . "BODY")
' 108 ("E0" . ,(get-cliki-source page to-version))
' 109 ("summary" . "Spam detected, reverting to Known-Good.")
' 110 ("captcha" . "lisp")
' 111 ("name" . "Claki (Revertobot Alpha)"))))
' 112
' 113 (defun get-cliki-source (url version)
' 114 "Fetches the source text of a given version of a cliki page. That is, it
' 115 returns the text you should POST to revert a cliki page to the given version."
' 116 (multiple-value-bind (page status headers)
' 117 (drakma:http-request (format nil "http://cliki.net/~a?source&v=~a" url version))
' 118 (cond
' 119 ((= 200 status) page)
' 120 (t (error "crap!")))))
' 121
' 122 (defun revert-new-spam ()
' 123 (let ((modified-pages (get-recent-changes)))
' 124 (loop :for page :in modified-pages
' 125 :do (parse-page page))))
' 126
' 127 #+(or) (revert-new-spam)
' 128
' 129 #+(or) (loop (sleep (* 60 60)) (revert-new-spam))