repos
/
claki
/ annotate_shade
summary
|
shortlog
|
log
|
tree
|
commit
|
commitdiff
|
headdiff
|
annotate
|
headblob
|
headfilediff
|
filehistory
normal
|
plain
|
shade
|
zebra
More declarative way of saying "every x minutes, give or take"
Annotate for file claki.lisp
2011-06-13 pix
1
(defpackage #:claki
19:08:27 '
2
(:use #:cl #:unify :alexandria))
'
3
(in-package #:claki)
'
4
'
5
(defvar *last-modified* (make-hash-table :test 'equal) "hash table of cliki pages and the last-modified header we last received.")
'
6
'
7
#+(or) (clrhash *last-modified*)
'
8
'
9
(defun get-cliki-page (url)
'
10
"Returns a page from cliki if it has been modified since we last saw it or nil
'
11
if it has not been modified. Signals an error otherwise."
'
12
(multiple-value-bind (page status headers)
2011-06-29 pix
13
(drakma:http-request (format nil "http://cliki.net/~a" url)
03:08:25 '
14
:additional-headers (when (gethash url *last-modified*)
'
15
`((:if-modified-since (gethash url *last-modified*)))))
2011-06-13 pix
16
(cond
19:08:27 '
17
;; If the page hasn't been modified, no need to update
'
18
((= 304 status) nil)
'
19
;; The "Recent Changes" page doesn't return 304s, but does return a last-modified.
'
20
((and (gethash url *last-modified*)
'
21
(cdr (assoc :last-modified headers))
'
22
(string= (cdr (assoc :last-modified headers)) (gethash url *last-modified*)))
'
23
nil)
'
24
((= 200 status)
'
25
(setf (gethash url *last-modified*) (cdr (assoc :last-modified headers)))
'
26
page)
'
27
(t (error "crap!")))))
'
28
'
29
(defun get-recent-changes ()
'
30
(when-let ((page (get-cliki-page "Recent%20Changes")))
'
31
(match (#t(oh-ducks:html ("blockquote > b > a.internal" . ?links)) page)
'
32
(loop :for link :in (remove-duplicates links :test #'equalp
'
33
:key (lambda (x) (oh-ducks.traversal:element-attribute :href x))
'
34
:from-end t)
'
35
:collect (oh-ducks.traversal:element-attribute :href link)))))
'
36
'
37
#+(or) (get-recent-changes)
'
38
'
39
(defvar *spam-urls* (make-hash-table :test 'equal) "hash table of spam urls.")
'
40
(defvar *okay-urls* (make-hash-table :test 'equal) "hash table of acceptable urls.")
2011-06-13 pix
41
(defvar *urls-to-classify* (list) "list of (url-to-classify cliki-page-where-it-was-found page-version).")
2011-06-16 pix
42
(defvar *has-spam* (list) "list of (cliki-page . version) known to have spam.")
2011-06-13 pix
43
19:08:27 '
44
#+(or) (clrhash *okay-urls*)
'
45
'
46
(defun url-domain (url)
'
47
(puri:uri-host (puri:parse-uri url)))
'
48
'
49
(defun parse-page (page-url)
'
50
(when-let ((page (get-cliki-page page-url)))
'
51
(match (#t(oh-ducks:html ("a[href^=http]" . ?a)
'
52
("#footer > b" . #t(list ?b))) page)
'
53
(let ((current-version (oh-ducks.traversal:element-content b)))
'
54
(dolist (link a)
2011-06-13 pix
55
(let ((url (oh-ducks.traversal:element-attribute :href link)))
2011-06-13 pix
56
(cond
22:35:27 '
57
((or (gethash url *okay-urls*)
'
58
(gethash (url-domain url) *okay-urls*))
'
59
#+(or) (do-nothing))
'
60
((or (gethash url *spam-urls*)
2011-06-13 pix
61
(gethash (url-domain url) *spam-urls*)
22:58:13 '
62
(auto-classify link))
2011-06-13 pix
63
(pushnew (list page-url current-version) *has-spam* :test #'equal))
22:35:27 '
64
(t
'
65
(pushnew (list url page-url current-version) *urls-to-classify* :test #'equal)))))))))
2011-06-13 pix
66
19:08:27 '
67
#+(or) (parse-page "araneida")
'
68
2011-06-13 pix
69
(defun auto-classify (link)
22:58:13 '
70
"Auto-classify URLs based upon traits common to spammers."
'
71
(let ((rel (oh-ducks.traversal:element-attribute :rel link))
'
72
(url (oh-ducks.traversal:element-attribute :href link)))
'
73
(cond
'
74
((and (stringp rel)
'
75
(or (string-equal "follow" rel)
'
76
(string-equal "dofollow" rel)))
'
77
(setf (gethash url *spam-urls*) t))
'
78
(t nil))))
'
79
2011-06-18 pix
80
(defun request-classification (url)
2011-06-13 pix
81
(restart-case (error 'simple-error :format-control "Please classify the URL ~s."
19:08:27 '
82
:format-arguments (list url))
'
83
(mark-url-okay ()
'
84
:report "Mark this URL as acceptable."
'
85
(setf (gethash url *okay-urls*) t))
'
86
(mark-domain-okay ()
'
87
:report "Mark the domain as acceptable."
'
88
(setf (gethash (url-domain url) *okay-urls*) t))
'
89
(mark-url-spam ()
'
90
:report "Mark this URL as spam."
'
91
(setf (gethash url *spam-urls*) t))
'
92
(mark-domain-spam ()
'
93
:report "Mark the domain as spam."
2011-06-13 pix
94
(setf (gethash (url-domain url) *spam-urls*) t))
22:35:27 '
95
(classify-later ()
'
96
:report "Don't classify this URL yet."
'
97
nil)))
'
98
'
99
(defun classify-unknown-urls ()
'
100
(setf *urls-to-classify*
'
101
(loop :for (url page version) :in *urls-to-classify*
'
102
:unless (or (gethash url *okay-urls*)
'
103
(gethash (url-domain url) *okay-urls*)
'
104
(gethash url *spam-urls*)
'
105
(gethash (url-domain url) *spam-urls*)
2011-06-18 pix
106
(request-classification url))
2011-06-13 pix
107
:collect (list url page version))))
22:35:27 '
108
'
109
(defun mark-known-goods ()
'
110
(loop :for (page-url version) :in *has-spam*
'
111
:do (maybe-request-last-known-good page-url)))
'
112
'
113
(defun revert-spam ()
'
114
(setf *has-spam*
'
115
(loop :for (page-url version) :in *has-spam*
'
116
:unless (and (gethash page-url *last-known-good*)
'
117
(revert-page page-url version (gethash page-url *last-known-good*)))
'
118
:collect (list page-url version))))
2011-06-13 pix
119
19:08:27 '
120
(defvar *last-known-good* (make-hash-table :test 'equal) "hash table of cliki pages and the last-known \"good\" revision.")
'
121
'
122
(defun read-number ()
'
123
(format t "Enter a version: ")
'
124
(list (format nil "~a" (parse-integer (read-line)))))
'
125
'
126
(defun maybe-request-last-known-good (page)
'
127
(unless (gethash page *last-known-good*)
'
128
(restart-case (error 'simple-error :format-control "Do not know of a good version of cliki page ~s."
'
129
:format-arguments (list page))
'
130
(specify-version (version)
'
131
:interactive read-number
'
132
:report "Specify a known-good version."
'
133
(setf (gethash page *last-known-good*) version)))))
'
134
'
135
#+(or) (maybe-request-last-known-good "araneida")
'
136
2011-06-13 pix
137
(defun revert-page (url current-version to-version)
22:35:27 '
138
(multiple-value-bind (page status headers)
2011-06-29 pix
139
(drakma:http-request (format nil "http://cliki.net/edit/~a" url)
03:08:25 '
140
:method :post
'
141
:parameters `(("version" . ,current-version)
'
142
("T0" . "BODY")
'
143
("E0" . ,(get-cliki-source url to-version))
'
144
("summary" . "Spam detected, reverting to Known-Good.")
'
145
("captcha" . "lisp")
2011-06-16 pix
146
("name" . "Claki (Revertobot Alpha)")))
2011-06-14 pix
147
(declare (ignore headers))
2011-06-13 pix
148
(cond
22:35:27 '
149
((and (= status 200)
'
150
(not (search "rejected" page :test #'char-equal)))
2011-06-14 pix
151
(format t "; Reverted page ~s to version ~a.~%" url to-version)
2011-06-13 pix
152
page)
22:35:27 '
153
(t nil))))
2011-06-13 pix
154
19:08:27 '
155
(defun get-cliki-source (url version)
'
156
"Fetches the source text of a given version of a cliki page. That is, it
'
157
returns the text you should POST to revert a cliki page to the given version."
2011-06-15 pix
158
(or (find-in-cache url version)
09:21:48 '
159
(multiple-value-bind (page status headers)
2011-06-29 pix
160
(drakma:http-request (format nil "http://cliki.net/~a?source&v=~a" url version))
2011-06-15 pix
161
(declare (ignore headers))
09:21:48 '
162
(cond
'
163
((= 200 status)
'
164
;; We have to coerce to a simple-string because rucksack mistakenly
'
165
;; assumes non-simple strings have a fill pointer
'
166
(cache-known-good url version (coerce page 'simple-string))
'
167
page)
'
168
(t (error "crap!"))))))
'
169
'
170
#+(or) (get-cliki-source "araneida" "281")
2011-06-13 pix
171
2011-06-14 pix
172
(defun attended-revert-new-spam ()
2011-06-13 pix
173
(mapcar #'parse-page (get-recent-changes))
22:35:27 '
174
(attendant))
'
175
'
176
(defun attendant ()
'
177
(classify-unknown-urls)
'
178
(mark-known-goods)
'
179
(revert-spam))
'
180
2011-06-14 pix
181
(defun unattended-revert-new-spam ()
2011-06-13 pix
182
(mapcar #'parse-page (get-recent-changes))
22:35:27 '
183
(revert-spam))
2011-06-13 pix
184
2011-06-14 pix
185
#+(or) (attended-revert-new-spam)
08:41:24 '
186
#+(or) (attendant)
2011-06-13 pix
187
2011-06-13 pix
188
(defun seconds (s) s)
22:58:32 '
189
(defun minutes (m) (* (seconds 60) m))
'
190
(defun hours (h) (* (minutes 60) h))
2011-06-15 pix
191
(defun plus-or-minus (x y) (+ (- x y) (random (* 2 y))))
2011-06-13 pix
192
2011-06-28 pix
193
(defconstant +simple-time+ '(:year #\- :month #\- :day #\Space :hour #\: :min #\: :sec))
2011-06-14 pix
194
2011-06-28 pix
195
(defun run-for-a-while ()
2011-06-18 pix
196
(dotimes (i 20)
2011-06-15 pix
197
(sleep (minutes (plus-or-minus 30 5)))
2011-06-18 pix
198
(format t "; Unattented run at ~a~%" (local-time:format-timestring nil (local-time:now) :format +simple-time+))
2011-06-14 pix
199
(unattended-revert-new-spam)
2011-06-18 pix
200
(save-state)))
2011-06-14 pix
201
2011-06-18 pix
202
#+(or) (sb-thread:make-thread #'run-for-a-while :name "cliki reverter")
2011-06-14 pix
203
^L
2011-06-28 pix
204
(defvar *state-file* (or #.*load-pathname* #p"/home/pixel/repos/claki/state/"))
2011-06-14 pix
205
2011-06-15 pix
206
(defmacro with-rucksack-and-transaction ((rucksack) (&rest root-vars) &body body)
09:21:48 '
207
(with-unique-names (rest)
'
208
`(rucksack:with-rucksack (,rucksack *state-file*)
'
209
(rucksack:with-transaction ()
'
210
(destructuring-bind (&optional ,@root-vars &rest ,rest) (ordered-roots rs:*rucksack*)
'
211
(declare (ignore ,rest) ,@(when (member '_ root-vars) `((ignore _))))
'
212
,@body)))))
'
213
'
214
(defun ordered-roots (sack)
'
215
(sort (rs:rucksack-roots sack) #'< :key #'rs:object-id))
'
216
'
217
(defun initialize-rucksack ()
'
218
(with-rucksack-and-transaction (rs) ()
'
219
(let ((roots (rs:rucksack-roots rs:*rucksack*)))
'
220
(dotimes (i (- 2 (length roots)))
'
221
(rs:add-rucksack-root (make-instance 'rs:btree :key< 'string<) rs:*rucksack*)))))
'
222
'
223
#+(or) (initialize-rucksack)
'
224
2011-06-14 pix
225
(defun save-state ()
2011-06-15 pix
226
(with-rucksack-and-transaction (rs) (btree)
09:21:48 '
227
(rs:btree-insert btree 'spam-urls *spam-urls*)
'
228
(rs:btree-insert btree 'ham-urls *okay-urls*)
2011-06-16 pix
229
(rs:btree-insert btree 'known-good *last-known-good*)))
2011-06-14 pix
230
18:17:05 '
231
#+(or) (save-state)
'
232
'
233
(defun restore-state ()
2011-06-15 pix
234
(with-rucksack-and-transaction (rs) (btree)
09:21:48 '
235
(setf *spam-urls* (rs:btree-search btree 'spam-urls)
'
236
*okay-urls* (rs:btree-search btree 'ham-urls)
2011-06-16 pix
237
*last-known-good* (rs:btree-search btree 'known-good))))
2011-06-14 pix
238
18:17:05 '
239
#+(or) (restore-state)
2011-06-15 pix
240
#+(or) (with-rucksack-and-transaction (rs) ()
09:21:48 '
241
(ordered-roots rs:*rucksack*))
'
242
'
243
(defun cache-known-good (cliki-page version content)
'
244
(with-rucksack-and-transaction (rs) (_ btree)
'
245
(rs:btree-insert btree cliki-page (list version content))))
'
246
'
247
(defun find-in-cache (cliki-page version)
'
248
(with-rucksack-and-transaction (rs) (_ btree)
'
249
(destructuring-bind (&optional cached-version cached-content)
'
250
(rs:btree-search btree cliki-page :default-value nil :errorp nil)
'
251
(and cached-version
'
252
(string= cached-version version)
'
253
cached-content))))
'
254
'
255
#+(or) (find-in-cache "araneida" "281")