repos
/
claki
/ annotate_shade
summary
|
shortlog
|
log
|
tree
|
commit
|
commitdiff
|
headdiff
|
annotate
|
headblob
|
headfilediff
|
filehistory
normal
|
plain
|
shade
|
zebra
Split out an unattended and an attented portion
Annotate for file claki.lisp
2011-06-13 pix
1
(defpackage #:claki
19:08:27 '
2
(:use #:cl #:unify :alexandria))
'
3
(in-package #:claki)
'
4
'
5
(defvar *last-modified* (make-hash-table :test 'equal) "hash table of cliki pages and the last-modified header we last received.")
'
6
'
7
#+(or) (clrhash *last-modified*)
'
8
'
9
(defun get-cliki-page (url)
'
10
"Returns a page from cliki if it has been modified since we last saw it or nil
'
11
if it has not been modified. Signals an error otherwise."
'
12
(multiple-value-bind (page status headers)
2011-06-29 pix
13
(drakma:http-request (format nil "http://cliki.net/~a" url)
03:08:25 '
14
:additional-headers (when (gethash url *last-modified*)
'
15
`((:if-modified-since (gethash url *last-modified*)))))
2011-06-13 pix
16
(cond
19:08:27 '
17
;; If the page hasn't been modified, no need to update
'
18
((= 304 status) nil)
'
19
;; The "Recent Changes" page doesn't return 304s, but does return a last-modified.
'
20
((and (gethash url *last-modified*)
'
21
(cdr (assoc :last-modified headers))
'
22
(string= (cdr (assoc :last-modified headers)) (gethash url *last-modified*)))
'
23
nil)
'
24
((= 200 status)
'
25
(setf (gethash url *last-modified*) (cdr (assoc :last-modified headers)))
'
26
page)
'
27
(t (error "crap!")))))
'
28
'
29
(defun get-recent-changes ()
'
30
(when-let ((page (get-cliki-page "Recent%20Changes")))
'
31
(match (#t(oh-ducks:html ("blockquote > b > a.internal" . ?links)) page)
'
32
(loop :for link :in (remove-duplicates links :test #'equalp
'
33
:key (lambda (x) (oh-ducks.traversal:element-attribute :href x))
'
34
:from-end t)
'
35
:collect (oh-ducks.traversal:element-attribute :href link)))))
'
36
'
37
#+(or) (get-recent-changes)
'
38
'
39
(defvar *spam-urls* (make-hash-table :test 'equal) "hash table of spam urls.")
'
40
(defvar *okay-urls* (make-hash-table :test 'equal) "hash table of acceptable urls.")
2011-06-13 pix
41
(defvar *urls-to-classify* (list) "list of (url-to-classify cliki-page-where-it-was-found page-version).")
22:35:27 '
42
(defvar *has-spam* (list) "list of (cliki-page . version) known to have spam.")
2011-06-13 pix
43
19:08:27 '
44
#+(or) (clrhash *okay-urls*)
'
45
'
46
(defun url-domain (url)
'
47
(puri:uri-host (puri:parse-uri url)))
'
48
'
49
(defun parse-page (page-url)
'
50
(when-let ((page (get-cliki-page page-url)))
'
51
(match (#t(oh-ducks:html ("a[href^=http]" . ?a)
'
52
("#footer > b" . #t(list ?b))) page)
'
53
(let ((current-version (oh-ducks.traversal:element-content b)))
'
54
(dolist (link a)
2011-06-13 pix
55
(let ((url (oh-ducks.traversal:element-attribute :href link))
22:35:27 '
56
(rel (oh-ducks.traversal:element-attribute :rel link)))
'
57
(cond
'
58
((or (gethash url *okay-urls*)
'
59
(gethash (url-domain url) *okay-urls*))
'
60
#+(or) (do-nothing))
'
61
((or (gethash url *spam-urls*)
'
62
(gethash (url-domain url) *spam-urls*))
'
63
(pushnew (list page-url current-version) *has-spam* :test #'equal))
'
64
((and (stringp rel)
'
65
(or (string-equal "follow" rel)
'
66
(string-equal "dofollow" rel)))
'
67
(setf (gethash url *spam-urls*) t)
'
68
(pushnew (list page-url current-version) *has-spam* :test #'equal))
'
69
(t
'
70
(pushnew (list url page-url current-version) *urls-to-classify* :test #'equal)))))))))
2011-06-13 pix
71
19:08:27 '
72
#+(or) (parse-page "araneida")
'
73
2011-06-18 pix
74
(defun request-classification (url)
2011-06-13 pix
75
(restart-case (error 'simple-error :format-control "Please classify the URL ~s."
19:08:27 '
76
:format-arguments (list url))
'
77
(mark-url-okay ()
'
78
:report "Mark this URL as acceptable."
'
79
(setf (gethash url *okay-urls*) t))
'
80
(mark-domain-okay ()
'
81
:report "Mark the domain as acceptable."
'
82
(setf (gethash (url-domain url) *okay-urls*) t))
'
83
(mark-url-spam ()
'
84
:report "Mark this URL as spam."
'
85
(setf (gethash url *spam-urls*) t))
'
86
(mark-domain-spam ()
'
87
:report "Mark the domain as spam."
2011-06-13 pix
88
(setf (gethash (url-domain url) *spam-urls*) t))
22:35:27 '
89
(classify-later ()
'
90
:report "Don't classify this URL yet."
'
91
nil)))
'
92
'
93
(defun classify-unknown-urls ()
'
94
(setf *urls-to-classify*
'
95
(loop :for (url page version) :in *urls-to-classify*
'
96
:unless (or (gethash url *okay-urls*)
'
97
(gethash (url-domain url) *okay-urls*)
'
98
(gethash url *spam-urls*)
'
99
(gethash (url-domain url) *spam-urls*)
'
100
(request-classification url))
'
101
:collect (list url page version))))
'
102
'
103
(defun mark-known-goods ()
'
104
(loop :for (page-url version) :in *has-spam*
'
105
:do (maybe-request-last-known-good page-url)))
'
106
'
107
(defun revert-spam ()
'
108
(setf *has-spam*
'
109
(loop :for (page-url version) :in *has-spam*
'
110
:unless (and (gethash page-url *last-known-good*)
'
111
(revert-page page-url version (gethash page-url *last-known-good*)))
'
112
:collect (list page-url version))))
2011-06-13 pix
113
19:08:27 '
114
(defvar *last-known-good* (make-hash-table :test 'equal) "hash table of cliki pages and the last-known \"good\" revision.")
'
115
'
116
(defun read-number ()
'
117
(format t "Enter a version: ")
'
118
(list (format nil "~a" (parse-integer (read-line)))))
'
119
'
120
(defun maybe-request-last-known-good (page)
'
121
(unless (gethash page *last-known-good*)
'
122
(restart-case (error 'simple-error :format-control "Do not know of a good version of cliki page ~s."
'
123
:format-arguments (list page))
'
124
(specify-version (version)
'
125
:interactive read-number
'
126
:report "Specify a known-good version."
'
127
(setf (gethash page *last-known-good*) version)))))
'
128
'
129
#+(or) (maybe-request-last-known-good "araneida")
'
130
2011-06-13 pix
131
(defun revert-page (url current-version to-version)
22:35:27 '
132
(multiple-value-bind (page status headers)
'
133
(drakma:http-request (format nil "http://cliki.net/edit/~a" url)
'
134
:method :post
'
135
:parameters `(("version" . ,current-version)
'
136
("T0" . "BODY")
'
137
("E0" . ,(get-cliki-source url to-version))
'
138
("summary" . "Spam detected, reverting to Known-Good.")
'
139
("captcha" . "lisp")
'
140
("name" . "Claki (Revertobot Alpha)")))
'
141
(cond
'
142
((and (= status 200)
'
143
(not (search "rejected" page :test #'char-equal)))
'
144
page)
'
145
(t nil))))
2011-06-13 pix
146
19:08:27 '
147
(defun get-cliki-source (url version)
'
148
"Fetches the source text of a given version of a cliki page. That is, it
'
149
returns the text you should POST to revert a cliki page to the given version."
2011-06-15 pix
150
(multiple-value-bind (page status headers)
09:21:48 '
151
(drakma:http-request (format nil "http://cliki.net/~a?source&v=~a" url version))
'
152
(cond
'
153
((= 200 status) page)
'
154
(t (error "crap!")))))
2011-06-13 pix
155
2011-06-13 pix
156
(defun attented-revert-new-spam ()
22:35:27 '
157
(mapcar #'parse-page (get-recent-changes))
'
158
(attendant))
'
159
'
160
(defun attendant ()
'
161
(classify-unknown-urls)
'
162
(mark-known-goods)
'
163
(revert-spam))
'
164
'
165
(defun unattented-revert-new-spam ()
'
166
(mapcar #'parse-page (get-recent-changes))
'
167
(revert-spam))
2011-06-13 pix
168
2011-06-13 pix
169
#+(or) (attented-revert-new-spam)
2011-06-13 pix
170
2011-06-13 pix
171
#+(or) (loop (sleep (* 60 60)) (unattented-revert-new-spam))