Mon Jun 13 19:08:27 UTC 2011 pix@kepibu.org * Initial import diff -rN -u old-claki/claki.asd new-claki/claki.asd --- old-claki/claki.asd 1970-01-01 00:00:00.000000000 +0000 +++ new-claki/claki.asd 2013-07-18 20:08:35.000000000 +0000 @@ -0,0 +1,3 @@ +(defsystem :claki + :depends-on (:oh-ducks :closure-html :cl-unification :drakma :alexandria) + :components ((:file "claki"))) diff -rN -u old-claki/claki.lisp new-claki/claki.lisp --- old-claki/claki.lisp 1970-01-01 00:00:00.000000000 +0000 +++ new-claki/claki.lisp 2013-07-18 20:08:35.000000000 +0000 @@ -0,0 +1,129 @@ +(defpackage #:claki + (:use #:cl #:unify :alexandria)) +(in-package #:claki) + +(defvar *last-modified* (make-hash-table :test 'equal) "hash table of cliki pages and the last-modified header we last received.") + +#+(or) (clrhash *last-modified*) + +(defun get-cliki-page (url) + "Returns a page from cliki if it has been modified since we last saw it or nil +if it has not been modified. Signals an error otherwise." + (multiple-value-bind (page status headers) + (drakma:http-request (format nil "http://cliki.net/~a" url) + :additional-headers (when (gethash url *last-modified*) + `((:if-modified-since (gethash url *last-modified*))))) + (cond + ;; If the page hasn't been modified, no need to update + ((= 304 status) nil) + ;; The "Recent Changes" page doesn't return 304s, but does return a last-modified. + ((and (gethash url *last-modified*) + (cdr (assoc :last-modified headers)) + (string= (cdr (assoc :last-modified headers)) (gethash url *last-modified*))) + nil) + ((= 200 status) + (setf (gethash url *last-modified*) (cdr (assoc :last-modified headers))) + page) + (t (error "crap!"))))) + +(defun get-recent-changes () + (when-let ((page (get-cliki-page "Recent%20Changes"))) + (match (#t(oh-ducks:html ("blockquote > b > a.internal" . ?links)) page) + (loop :for link :in (remove-duplicates links :test #'equalp + :key (lambda (x) (oh-ducks.traversal:element-attribute :href x)) + :from-end t) + :collect (oh-ducks.traversal:element-attribute :href link))))) + +#+(or) (get-recent-changes) + +(defvar *spam-urls* (make-hash-table :test 'equal) "hash table of spam urls.") +(defvar *okay-urls* (make-hash-table :test 'equal) "hash table of acceptable urls.") + +#+(or) (clrhash *okay-urls*) + +(defun url-domain (url) + (puri:uri-host (puri:parse-uri url))) + +(defun parse-page (page-url) + (when-let ((page (get-cliki-page page-url))) + (match (#t(oh-ducks:html ("a[href^=http]" . ?a) + ("#footer > b" . #t(list ?b))) page) + (let ((current-version (oh-ducks.traversal:element-content b))) + (dolist (link a) + (let ((url (oh-ducks.traversal:element-attribute :href link))) + (tagbody + :handle-url + (cond + ((or (gethash url *okay-urls*) + (gethash (url-domain url) *okay-urls*)) + #+(or) (do-nothing)) + ((or (gethash url *spam-urls*) + (gethash (url-domain url) *spam-urls*)) + (maybe-request-last-known-good page-url) + (return-from parse-page (revert-page page-url current-version (gethash page-url *last-known-good*)))) + (t + (request-classification url) + (go :handle-url)))))))))) + +#+(or) (parse-page "araneida") + +(defun request-classification (url) + (restart-case (error 'simple-error :format-control "Please classify the URL ~s." + :format-arguments (list url)) + (mark-url-okay () + :report "Mark this URL as acceptable." + (setf (gethash url *okay-urls*) t)) + (mark-domain-okay () + :report "Mark the domain as acceptable." + (setf (gethash (url-domain url) *okay-urls*) t)) + (mark-url-spam () + :report "Mark this URL as spam." + (setf (gethash url *spam-urls*) t)) + (mark-domain-spam () + :report "Mark the domain as spam." + (setf (gethash (url-domain url) *spam-urls*) t)))) + +(defvar *last-known-good* (make-hash-table :test 'equal) "hash table of cliki pages and the last-known \"good\" revision.") + +(defun read-number () + (format t "Enter a version: ") + (list (format nil "~a" (parse-integer (read-line))))) + +(defun maybe-request-last-known-good (page) + (unless (gethash page *last-known-good*) + (restart-case (error 'simple-error :format-control "Do not know of a good version of cliki page ~s." + :format-arguments (list page)) + (specify-version (version) + :interactive read-number + :report "Specify a known-good version." + (setf (gethash page *last-known-good*) version))))) + +#+(or) (maybe-request-last-known-good "araneida") + +(defun revert-page (page current-version to-version) + (drakma:http-request (format nil "http://cliki.net/edit/~a" page) + :method :post + :parameters `(("version" . ,current-version) + ("T0" . "BODY") + ("E0" . ,(get-cliki-source page to-version)) + ("summary" . "Spam detected, reverting to Known-Good.") + ("captcha" . "lisp") + ("name" . "Claki (Revertobot Alpha)")))) + +(defun get-cliki-source (url version) + "Fetches the source text of a given version of a cliki page. That is, it +returns the text you should POST to revert a cliki page to the given version." + (multiple-value-bind (page status headers) + (drakma:http-request (format nil "http://cliki.net/~a?source&v=~a" url version)) + (cond + ((= 200 status) page) + (t (error "crap!"))))) + +(defun revert-new-spam () + (let ((modified-pages (get-recent-changes))) + (loop :for page :in modified-pages + :do (parse-page page)))) + +#+(or) (revert-new-spam) + +#+(or) (loop (sleep (* 60 60)) (revert-new-spam))