Mon Jun 13 19:08:27 UTC 2011  pix@kepibu.org
  * Initial import
addfile ./claki.asd
hunk ./claki.asd 1
+;;; ASDF system definition for claki.
+(defsystem :claki
+  ;; puri is listed explicitly because claki.lisp calls puri:parse-uri and
+  ;; puri:uri-host itself rather than only reaching it through another library.
+  :depends-on (:oh-ducks :closure-html :cl-unification :drakma :puri :alexandria)
+  :components ((:file "claki")))
addfile ./claki.lisp
hunk ./claki.lisp 1
+;;; Package holding the claki code; it uses CL, UNIFY and ALEXANDRIA.
+(defpackage #:claki
+  (:use #:cl
+        #:unify
+        #:alexandria))
+(in-package #:claki)
+
+(defvar *last-modified*
+  (make-hash-table :test #'equal)
+  "Maps a cliki page name to the Last-Modified header value we last received for it.")
+
+#+(or) (clrhash *last-modified*)
+
+(defun get-cliki-page (url)
+  "Fetches the cliki page named URL (a path appended to http://cliki.net/).
+Returns the page if it has been modified since we last saw it, or nil if it
+has not been modified.  Signals an error for any other HTTP status.
+When a new version is returned, its Last-Modified header is remembered in
+*LAST-MODIFIED* so that the next request can be made conditional."
+  (let ((last-seen (gethash url *last-modified*)))
+    (multiple-value-bind (page status headers)
+        (drakma:http-request (format nil "http://cliki.net/~a" url)
+                             ;; Each header is a (name . value) pair, and the value
+                             ;; must be the remembered string itself -- hence the dot
+                             ;; and the comma inside the backquote.
+                             :additional-headers (when last-seen
+                                                   `((:if-modified-since . ,last-seen))))
+      (let ((last-modified (cdr (assoc :last-modified headers))))
+        (cond
+          ;; If the page hasn't been modified, no need to update
+          ((= 304 status) nil)
+          ;; The "Recent Changes" page doesn't return 304s, but does return a last-modified.
+          ((and last-seen
+                last-modified
+                (string= last-modified last-seen))
+           nil)
+          ((= 200 status)
+           (setf (gethash url *last-modified*) last-modified)
+           page)
+          (t (error "Unexpected HTTP status ~a while fetching cliki page ~s." status url)))))))
+
+(defun get-recent-changes ()
+  "Returns the distinct hrefs of the pages linked from cliki's \"Recent Changes\"
+page, in order of first appearance, or nil when GET-CLIKI-PAGE returns nil."
+  (when-let ((page (get-cliki-page "Recent%20Changes")))
+    (match (#t(oh-ducks:html ("blockquote > b > a.internal" . ?links)) page)
+      (flet ((href (link)
+               (oh-ducks.traversal:element-attribute :href link)))
+        ;; Taking the hrefs first and de-duplicating them afterwards keeps the
+        ;; same first occurrences as de-duplicating the links keyed on href.
+        (remove-duplicates (mapcar #'href links)
+                           :test #'equalp
+                           :from-end t)))))
+
+#+(or) (get-recent-changes)
+
+(defvar *spam-urls*
+  (make-hash-table :test #'equal)
+  "Hash table whose keys are the URLs and domains that have been marked as spam.")
+(defvar *okay-urls*
+  (make-hash-table :test #'equal)
+  "Hash table whose keys are the URLs and domains that have been marked as acceptable.")
+
+#+(or) (clrhash *okay-urls*)
+
+(defun url-domain (url)
+  "Returns the host component of URL, as parsed by PURI."
+  (let ((parsed (puri:parse-uri url)))
+    (puri:uri-host parsed)))
+
+(defun parse-page (page-url)
+  "Checks each link on cliki page PAGE-URL whose href starts with \"http\"
+against the okay and spam tables, requesting a classification for any link
+found in neither.  The first spam link causes the page to be reverted to its
+last-known-good version, and the result of REVERT-PAGE is returned; otherwise
+returns nil.  Does nothing when GET-CLIKI-PAGE returns nil."
+  (when-let ((page (get-cliki-page page-url)))
+    (match (#t(oh-ducks:html ("a[href^=http]" . ?a)
+                             ("#footer > b" . #t(list ?b))) page)
+      (flet ((listed-p (url table)
+               ;; A URL counts as listed when it, or its domain, is a key of TABLE.
+               (or (gethash url table)
+                   (gethash (url-domain url) table))))
+        (let ((current-version (oh-ducks.traversal:element-content b)))
+          (dolist (link a)
+            (let ((url (oh-ducks.traversal:element-attribute :href link)))
+              ;; Keep asking until the URL has been classified one way or the other.
+              (loop
+                (cond
+                  ((listed-p url *okay-urls*)
+                   (return))
+                  ((listed-p url *spam-urls*)
+                   (maybe-request-last-known-good page-url)
+                   (return-from parse-page
+                     (revert-page page-url
+                                  current-version
+                                  (gethash page-url *last-known-good*))))
+                  (t
+                   (request-classification url)))))))))))
+
+#+(or) (parse-page "araneida")
+
+(defun request-classification (url)
+  "Signals an error asking for URL to be classified, offering restarts that
+record either URL itself or its domain as acceptable or as spam."
+  (flet ((remember (key table)
+           (setf (gethash key table) t)))
+    (restart-case (error "Please classify the URL ~s." url)
+      (mark-url-okay ()
+        :report "Mark this URL as acceptable."
+        (remember url *okay-urls*))
+      (mark-domain-okay ()
+        :report "Mark the domain as acceptable."
+        (remember (url-domain url) *okay-urls*))
+      (mark-url-spam ()
+        :report "Mark this URL as spam."
+        (remember url *spam-urls*))
+      (mark-domain-spam ()
+        :report "Mark the domain as spam."
+        (remember (url-domain url) *spam-urls*)))))
+
+(defvar *last-known-good*
+  (make-hash-table :test #'equal)
+  "Maps a cliki page name to the last revision of that page known to be \"good\".")
+
+(defun read-number ()
+  "Prompts on *QUERY-IO* for a version number and returns a one-element list
+holding its decimal string, which is the argument-list shape expected from a
+restart's :interactive function.  Signals an error if the line read does not
+parse as an integer."
+  ;; Restart interaction belongs on *QUERY-IO*; flushing ensures the prompt is
+  ;; actually visible before we block waiting for input.
+  (format *query-io* "Enter a version: ")
+  (finish-output *query-io*)
+  (list (format nil "~a" (parse-integer (read-line *query-io*)))))
+
+(defun maybe-request-last-known-good (page)
+  "When no known-good version of cliki page PAGE is recorded, signals an error
+whose SPECIFY-VERSION restart stores the supplied version in *LAST-KNOWN-GOOD*."
+  (when (null (gethash page *last-known-good*))
+    (restart-case (error "Do not know of a good version of cliki page ~s." page)
+      (specify-version (version)
+        :report "Specify a known-good version."
+        :interactive read-number
+        (setf (gethash page *last-known-good*) version)))))
+
+#+(or) (maybe-request-last-known-good "araneida")
+
+(defun revert-page (page current-version to-version)
+  "POSTs the source of version TO-VERSION of cliki page PAGE to that page's
+edit URL, submitting CURRENT-VERSION as the version being edited.  Returns
+the values of the DRAKMA:HTTP-REQUEST call."
+  (let* ((edit-url (format nil "http://cliki.net/edit/~a" page))
+         (good-source (get-cliki-source page to-version))
+         (form-fields (list (cons "version" current-version)
+                            (cons "T0" "BODY")
+                            (cons "E0" good-source)
+                            (cons "summary" "Spam detected, reverting to Known-Good.")
+                            (cons "captcha" "lisp")
+                            (cons "name" "Claki (Revertobot Alpha)"))))
+    (drakma:http-request edit-url
+                         :method :post
+                         :parameters form-fields)))
+
+(defun get-cliki-source (url version)
+  "Fetches the source text of a given version of a cliki page.  That is, it
+returns the text you should POST to revert a cliki page to the given version.
+Signals an error if the response status is anything other than 200."
+  ;; Only the body and the status are needed, so the headers value is left unbound.
+  (multiple-value-bind (page status)
+      (drakma:http-request (format nil "http://cliki.net/~a?source&v=~a" url version))
+    (if (= 200 status)
+        page
+        (error "Unexpected HTTP status ~a while fetching version ~a of cliki page ~s."
+               status version url))))
+
+(defun revert-new-spam ()
+  "Runs PARSE-PAGE over every page returned by GET-RECENT-CHANGES.  Returns nil."
+  (dolist (page (get-recent-changes))
+    (parse-page page)))
+
+#+(or) (revert-new-spam)
+
+#+(or) (loop (sleep (* 60 60)) (revert-new-spam))