(defpackage #:claki
  (:use #:cl #:unify #:alexandria))

(in-package #:claki)

(defvar *last-modified* (make-hash-table :test 'equal)
  "hash table of cliki pages and the last-modified header we last received.")

#+(or) (clrhash *last-modified*)

(defun http-request (&rest drakma-args)
  "A wrapper around drakma:http-request which automatically retries the
request in the event of a timeout."
  (let ((times-failed 0))
    (tagbody
     :fetch-page
       (handler-case
           (return-from http-request (apply #'drakma:http-request drakma-args))
         (usocket:timeout-error ()
           (sleep (* 60 (expt 2 times-failed)))
           (incf times-failed)
           (go :fetch-page))))))

(defun get-cliki-page (url)
  "Returns a page from cliki if it has been modified since we last saw it, or
nil if it has not been modified. Signals an error otherwise."
  (multiple-value-bind (page status headers)
      (http-request (format nil "http://cliki.net/~a" url)
                    :additional-headers
                    (when (gethash url *last-modified*)
                      `((:if-modified-since . ,(gethash url *last-modified*)))))
    ;; FIXME: this doesn't work all that well: it turns out cliki uses a single
    ;; last-modified header for the entire wiki, rather than one per page, so we
    ;; aren't saving ourselves (or cliki) much with this.
    (cond
      ;; If the page hasn't been modified, no need to update
      ((= 304 status) nil)
      ;; The "Recent Changes" page doesn't return 304s, but does return a
      ;; last-modified.
      ((and (gethash url *last-modified*)
            (cdr (assoc :last-modified headers))
            (string= (cdr (assoc :last-modified headers))
                     (gethash url *last-modified*)))
       nil)
      ((= 200 status)
       (setf (gethash url *last-modified*) (cdr (assoc :last-modified headers)))
       page)
      (t (error "crap!")))))

(defun get-recent-changes ()
  (when-let ((page (get-cliki-page "Recent%20Changes")))
    (match (#t(oh-ducks:html ("blockquote > b > a.internal" . ?links)) page)
      (loop :for link :in (remove-duplicates
                           links
                           :test #'equalp
                           :key (lambda (x)
                                  (oh-ducks.traversal:element-attribute :href x))
                           :from-end t)
            :collect (oh-ducks.traversal:element-attribute :href link)))))

#+(or) (get-recent-changes)

(defvar *spam-urls* (make-hash-table :test 'equal)
  "hash table of spam urls.")

(defvar *okay-urls* (make-hash-table :test 'equal)
  "hash table of acceptable urls.")

(defvar *urls-to-classify* (list)
  "list of (url-to-classify cliki-page-where-it-was-found page-version).")

(defvar *has-spam* (list)
  "list of (cliki-page version) known to have spam.")

(defvar *updated-pages* (list)
  "list of pages that have been updated, in the format (cliki-page version).")

(defvar *ignore-update* (list)
  "list of pages in the format (cliki-page version) whose updates should not
result in updating the last-known-good version (e.g., because we made the
update ourselves and it was a reversion).")

#+(or) (clrhash *okay-urls*)

(defun url-domain (url)
  (puri:uri-host (puri:parse-uri url)))
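;; Hedged usage sketch (not in the original source): GET-CLIKI-PAGE returns
;; the page text on a fresh or modified fetch and NIL when cliki reports the
;; page as unchanged; URL-DOMAIN extracts the host part that the classifiers
;; below use as a hash key. "araneida" and the example URL are illustrative only.
#+(or)
(progn
  (get-cliki-page "araneida")             ; => page text, or NIL if unchanged
  (url-domain "http://example.com/page")) ; => "example.com"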
(defun parse-page (page-url)
  (when-let ((page (get-cliki-page page-url)))
    (match (#t(oh-ducks:html ("a[href^=http]" . ?a)
                             ("#footer > b" . #t(list ?b)))
             page)
      (let ((current-version (oh-ducks.traversal:element-content b)))
        (format t "; Page ~s modified, now at version ~a.~%" page-url current-version)
        (when (string= (gethash page-url *last-known-good*) current-version)
          (return-from parse-page nil))
        (pushnew (list page-url current-version) *updated-pages* :test #'equal)
        (dolist (link a)
          (let ((url (oh-ducks.traversal:element-attribute :href link)))
            (cond ((or (gethash url *okay-urls*)
                       (gethash (url-domain url) *okay-urls*))
                   #+(or) (do-nothing))
                  ((or (gethash url *spam-urls*)
                       (gethash (url-domain url) *spam-urls*)
                       (auto-classify link))
                   (pushnew (list page-url current-version) *has-spam* :test #'equal))
                  (t
                   (pushnew (list url page-url current-version) *urls-to-classify*
                            :test #'equal)))))))))

#+(or) (parse-page "araneida")

(defun auto-classify (link)
  "Auto-classify URLs based upon traits common to spammers."
  (let ((rel (oh-ducks.traversal:element-attribute :rel link))
        (style (oh-ducks.traversal:element-attribute :style link))
        (url (oh-ducks.traversal:element-attribute :href link)))
    (cond ((and (stringp rel)
                (or (string-equal "follow" rel)
                    (string-equal "dofollow" rel)))
           (setf (gethash url *spam-urls*) t))
          ((and (stringp style)
                (cl-ppcre:scan "text-decoration[ ]*:[ ]*none" style))
           (setf (gethash url *spam-urls*) t))
          (t nil))))

(defun request-classification (url &optional page version)
  (restart-case
      (error 'simple-error
             :format-control "Please classify the URL ~s."
             :format-arguments (list url))
    (mark-url-okay ()
      :report "Mark this URL as acceptable."
      (setf (gethash url *okay-urls*) t))
    (mark-domain-okay ()
      :report "Mark the domain as acceptable."
      (setf (gethash (url-domain url) *okay-urls*) t))
    (mark-url-spam ()
      :report "Mark this URL as spam."
      (when (and page version)
        (pushnew (list page version) *has-spam* :test #'equal))
      (setf (gethash url *spam-urls*) t))
    (mark-domain-spam ()
      :report "Mark the domain as spam."
      (when (and page version)
        (pushnew (list page version) *has-spam* :test #'equal))
      (setf (gethash (url-domain url) *spam-urls*) t))
    (classify-later ()
      :report "Don't classify this URL yet."
      nil)))

(defun classify-unknown-urls ()
  (setf *urls-to-classify*
        (loop :for (url page version) :in *urls-to-classify*
              :unless (or (gethash url *okay-urls*)
                          (gethash (url-domain url) *okay-urls*)
                          (gethash url *spam-urls*)
                          (gethash (url-domain url) *spam-urls*)
                          (request-classification url page version))
              :collect (list url page version))))

(defun mark-known-goods ()
  (loop :for (page-url version) :in *has-spam*
        :do (maybe-request-last-known-good page-url)))

(defun revert-spam ()
  (setf *has-spam*
        (loop :for (page-url version) :in *has-spam*
              :unless (and (gethash page-url *last-known-good*)
                           (revert-page page-url version
                                        (gethash page-url *last-known-good*)))
              :collect (list page-url version))))

(defvar *last-known-good* (make-hash-table :test 'equal)
  "hash table of cliki pages and the last-known \"good\" revision.")

(defun read-number ()
  (format t "Enter a version: ")
  (list (format nil "~a" (parse-integer (read-line)))))

(defun maybe-request-last-known-good (page)
  (unless (gethash page *last-known-good*)
    (restart-case
        (error 'simple-error
               :format-control "Do not know of a good version of cliki page ~s."
               :format-arguments (list page))
      (specify-version (version)
        :interactive read-number
        :report "Specify a known-good version."
        (setf (gethash page *last-known-good*) version)))))

#+(or) (maybe-request-last-known-good "araneida")
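;; Hedged sketch (not in the original source): the classification restarts
;; above are meant to be chosen interactively from the debugger, but they can
;; also be selected programmatically. Here every unclassified URL is deferred
;; by always invoking the CLASSIFY-LATER restart, which keeps those URLs on
;; *urls-to-classify* for a later attended pass.
#+(or)
(handler-bind ((simple-error
                 (lambda (condition)
                   (declare (ignore condition))
                   (let ((restart (find-restart 'classify-later)))
                     (when restart
                       (invoke-restart restart))))))
  (classify-unknown-urls))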
(defun update-last-known-good ()
  ;; Remove known-spam pages from list of updates
  (loop :for (page-url version) :in *has-spam*
        :do (removef *updated-pages* (list page-url version) :test #'equal))
  ;; Remove updates we made from list of updates (that way, we'll continue to
  ;; use the old cached known-good, saving a bit of strain on cliki)
  (loop :for (page-url version) :in *ignore-update*
        :do (removef *updated-pages* (list page-url version) :test #'equal))
  (setf *updated-pages*
        (loop :for (page-url version) :in *updated-pages*
              ;; If there are unclassified urls from this page, don't mark good
              ;; (could be spam!)
              :if (notany (lambda (x)
                            (and (string= page-url (second x))
                                 (string= version (third x))))
                          *urls-to-classify*)
              :do (format t "; Updating last-known good of ~s from ~a to ~a~%"
                          page-url (gethash page-url *last-known-good*) version)
                  (setf (gethash page-url *last-known-good*) version)
              :else
              :collect (list page-url version))))

(defun numstring+1 (numstring)
  (write-to-string (1+ (parse-integer numstring))))

(defun revert-page (url current-version to-version)
  (multiple-value-bind (page status headers)
      (http-request (format nil "http://cliki.net/edit/~a" url)
                    :method :post
                    :parameters `(("version" . ,current-version)
                                  ("T0" . "BODY")
                                  ("E0" . ,(get-cliki-source url to-version))
                                  ("summary" . "Spam detected, reverting to Known-Good.")
                                  ("captcha" . "lisp")
                                  ("name" . "Claki (Revertobot Beta)")))
    (declare (ignore headers))
    (cond ((and (= status 200)
                (not (search "rejected" page :test #'char-equal)))
           (format t "; Reverted page ~s to version ~a.~%" url to-version)
           (pushnew (list url (numstring+1 current-version)) *ignore-update*
                    :test #'equal)
           page)
          (t nil))))

(defun get-cliki-source (url version)
  "Fetches the source text of a given version of a cliki page. That is, it
returns the text you should POST to revert a cliki page to the given version."
  (or (find-in-cache url version)
      (multiple-value-bind (page status headers)
          (http-request (format nil "http://cliki.net/~a?source&v=~a" url version))
        (declare (ignore headers))
        (cond ((= 200 status)
               ;; We have to coerce to a simple-string because rucksack mistakenly
               ;; assumes non-simple strings have a fill pointer
               (cache-known-good url version (coerce page 'simple-string))
               page)
              (t (error "crap!"))))))

#+(or) (get-cliki-source "araneida" "281")
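;; Hedged illustration (not in the original source): a one-off manual revert.
;; REVERT-PAGE takes the page name, the version currently live, and the
;; version to restore; versions are strings, and "282"/"281" are made-up
;; numbers. On success it records our own resulting edit in *ignore-update*
;; so UPDATE-LAST-KNOWN-GOOD does not treat the reversion as a fresh change.
#+(or)
(revert-page "araneida" "282" "281")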
(defun attended-revert-new-spam ()
  (mapcar #'parse-page (get-recent-changes))
  (attendant))

(defun attendant ()
  (classify-unknown-urls)
  (update-last-known-good)
  (mark-known-goods)
  (revert-spam))

(defun unattended-revert-new-spam ()
  (mapcar #'parse-page (get-recent-changes))
  (update-last-known-good)
  (revert-spam))

#+(or) (attended-revert-new-spam)
#+(or) (attendant)

(defun seconds (s) s)
(defun minutes (m) (* (seconds 60) m))
(defun hours (h) (* (minutes 60) h))
(defun days (d) (* (hours 24) d))

(defun plus-or-minus (x y)
  (+ (- x y) (random (* 2 y))))

(defconstant +simple-time+
  '(:year #\- (:month 2) #\- (:day 2) #\Space (:hour 2) #\: (:min 2) #\: (:sec 2)))

(defun now ()
  (local-time:format-timestring nil (local-time:now) :format +simple-time+))

(defun run-for-a-while (how-long how-often variance)
  (format t "; Beginning run at ~a~%" (now))
  (dotimes (i (floor how-long how-often))
    (sleep (plus-or-minus how-often variance))
    (format t "; Unattended run at ~a~%" (now))
    (unattended-revert-new-spam)
    (save-state))
  (format t "; Run ended at ~a~%~%" (now)))

#+(or)
(let ((stdout *standard-output*))
  (sb-thread:make-thread
   (lambda ()
     (let ((*standard-output* stdout))
       (run-for-a-while (days 2) (minutes 30) (minutes 5))))
   :name "cliki reverter"))

(defvar *state-file*
  (merge-pathnames #p"state/"
                   (directory-namestring (or #.*load-pathname*
                                             #p"/home/pixel/repos/claki/"))))

(defmacro with-rucksack-and-transaction ((rucksack) (&rest root-vars) &body body)
  (with-unique-names (rest)
    `(rucksack:with-rucksack (,rucksack *state-file*)
       (rucksack:with-transaction ()
         (destructuring-bind (&optional ,@root-vars &rest ,rest)
             (ordered-roots rs:*rucksack*)
           (declare (ignore ,rest)
                    ,@(when (member '_ root-vars) `((ignore _))))
           ,@body)))))

(defun ordered-roots (sack)
  (sort (rs:rucksack-roots sack) #'< :key #'rs:object-id))

(defun initialize-rucksack ()
  (with-rucksack-and-transaction (rs) ()
    (let ((roots (rs:rucksack-roots rs:*rucksack*)))
      (dotimes (i (- 2 (length roots)))
        (rs:add-rucksack-root (make-instance 'rs:btree :key< 'string<)
                              rs:*rucksack*)))))

#+(or) (initialize-rucksack)

(defun save-state ()
  (with-rucksack-and-transaction (rs) (btree)
    (rs:btree-insert btree 'spam-urls *spam-urls*)
    (rs:btree-insert btree 'ham-urls *okay-urls*)
    (rs:btree-insert btree 'known-good *last-known-good*)
    (rs:btree-insert btree 'updated-pages *updated-pages*)
    (rs:btree-insert btree 'our-updates *ignore-update*)))

#+(or) (save-state)

(defun restore-state ()
  (with-rucksack-and-transaction (rs) (btree)
    (setf *spam-urls* (rs:btree-search btree 'spam-urls)
          *okay-urls* (rs:btree-search btree 'ham-urls)
          *last-known-good* (rs:btree-search btree 'known-good)
          *updated-pages* (rs:btree-search btree 'updated-pages)
          *ignore-update* (rs:btree-search btree 'our-updates))))

#+(or) (restore-state)

#+(or) (with-rucksack-and-transaction (rs) () (ordered-roots rs:*rucksack*))
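;; Hedged sketch (not in the original source): first-time persistence setup.
;; INITIALIZE-RUCKSACK ensures the store under *state-file* has two btree
;; roots (the first holds the named state tables written by SAVE-STATE, the
;; second is the known-good page-source cache used below). RESTORE-STATE only
;; makes sense once SAVE-STATE has run at least once.
#+(or)
(progn
  (initialize-rucksack)
  (save-state)
  (restore-state))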
(defun cache-known-good (cliki-page version content)
  (with-rucksack-and-transaction (rs) (_ btree)
    (rs:btree-insert btree cliki-page (list version content))))

(defun find-in-cache (cliki-page version)
  (with-rucksack-and-transaction (rs) (_ btree)
    (destructuring-bind (&optional cached-version cached-content)
        (rs:btree-search btree cliki-page :default-value nil :errorp nil)
      (and cached-version
           (string= cached-version version)
           cached-content))))

#+(or) (find-in-cache "araneida" "281")
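;; Hedged end-to-end sketch (not in the original source): one way to run the
;; bot. Do an attended pass first so unknown URLs get classified via the
;; restarts in REQUEST-CLASSIFICATION, then let the unattended loop revert new
;; spam on a timer. The durations here are examples; the threaded variant
;; above uses (days 2) (minutes 30) (minutes 5).
#+(or)
(progn
  (initialize-rucksack)
  (restore-state)
  (attended-revert-new-spam)
  (run-for-a-while (days 1) (minutes 30) (minutes 5))
  (save-state))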