repos
/
claki
/ annotate_shade
summary
|
shortlog
|
log
|
tree
|
commit
|
commitdiff
|
headdiff
|
annotate
|
headblob
|
headfilediff
|
filehistory
normal
|
plain
|
shade
|
zebra
Fix some problems discovered on fresh-load
Annotate for file /claki.lisp
2011-06-13 pix
1
(defpackage #:claki
19:08:27 '
2
(:use #:cl #:unify :alexandria))
'
3
(in-package #:claki)
'
4
'
5
(defvar *last-modified* (make-hash-table :test 'equal) "hash table of cliki pages and the last-modified header we last received.")
'
6
'
7
#+(or) (clrhash *last-modified*)
'
8
2011-06-29 pix
9
(defun http-request (&rest drakma-args)
03:08:25 '
10
"A wrapper around drakma:http-request which automatically retries the request
'
11
in the event of a timeout."
'
12
(let ((times-failed 0))
'
13
(tagbody
'
14
:fetch-page
'
15
(handler-case (return-from http-request (apply #'drakma:http-request drakma-args))
'
16
(usocket:timeout-error ()
'
17
(sleep (* 60 (expt 2 times-failed)))
'
18
(incf times-failed)
'
19
(go :fetch-page))))))
'
20
2011-06-13 pix
21
(defun get-cliki-page (url)
19:08:27 '
22
"Returns a page from cliki if it has been modified since we last saw it or nil
'
23
if it has not been modified. Signals an error otherwise."
'
24
(multiple-value-bind (page status headers)
2011-06-29 pix
25
(http-request (format nil "http://cliki.net/~a" url)
03:08:25 '
26
:additional-headers (when (gethash url *last-modified*)
'
27
`((:if-modified-since (gethash url *last-modified*)))))
2011-06-18 pix
28
;; FIXME: this doesn't work all that well: it turns out cliki uses a single
23:15:57 '
29
;; last-modified header for the entire wiki, rather than one per page, so we
'
30
;; aren't saving ourselves (or cliki) much with this.
2011-06-13 pix
31
(cond
19:08:27 '
32
;; If the page hasn't been modified, no need to update
'
33
((= 304 status) nil)
'
34
;; The "Recent Changes" page doesn't return 304s, but does return a last-modified.
'
35
((and (gethash url *last-modified*)
'
36
(cdr (assoc :last-modified headers))
'
37
(string= (cdr (assoc :last-modified headers)) (gethash url *last-modified*)))
'
38
nil)
'
39
((= 200 status)
'
40
(setf (gethash url *last-modified*) (cdr (assoc :last-modified headers)))
'
41
page)
'
42
(t (error "crap!")))))
'
43
'
44
(defun get-recent-changes ()
'
45
(when-let ((page (get-cliki-page "Recent%20Changes")))
'
46
(match (#t(oh-ducks:html ("blockquote > b > a.internal" . ?links)) page)
'
47
(loop :for link :in (remove-duplicates links :test #'equalp
'
48
:key (lambda (x) (oh-ducks.traversal:element-attribute :href x))
'
49
:from-end t)
'
50
:collect (oh-ducks.traversal:element-attribute :href link)))))
'
51
'
52
#+(or) (get-recent-changes)
'
53
'
54
(defvar *spam-urls* (make-hash-table :test 'equal) "hash table of spam urls.")
'
55
(defvar *okay-urls* (make-hash-table :test 'equal) "hash table of acceptable urls.")
2011-06-13 pix
56
(defvar *urls-to-classify* (list) "list of (url-to-classify cliki-page-where-it-was-found page-version).")
2011-06-16 pix
57
(defvar *has-spam* (list) "list of (cliki-page version) known to have spam.")
05:58:14 '
58
(defvar *updated-pages* (list) "list of pages that have been updated in the format (cliki-page version).")
'
59
(defvar *ignore-update* (list) "list of pages in the format (cliki-page version) which updates should not result in updating the last-known good version (e.g., because we did it and it was a reversion).")
2011-06-13 pix
60
19:08:27 '
61
#+(or) (clrhash *okay-urls*)
'
62
'
63
(defun url-domain (url)
'
64
(puri:uri-host (puri:parse-uri url)))
'
65
'
66
(defun parse-page (page-url)
'
67
(when-let ((page (get-cliki-page page-url)))
'
68
(match (#t(oh-ducks:html ("a[href^=http]" . ?a)
'
69
("#footer > b" . #t(list ?b))) page)
'
70
(let ((current-version (oh-ducks.traversal:element-content b)))
2011-06-16 pix
71
(format t "; Page ~s modified, now at version ~a.~%" page-url current-version)
2011-06-28 pix
72
(when (string= (gethash page-url *last-known-good*) current-version)
08:04:14 '
73
(return-from parse-page nil))
2011-06-16 pix
74
(pushnew (list page-url current-version) *updated-pages* :test #'equal)
2011-06-13 pix
75
(dolist (link a)
2011-06-13 pix
76
(let ((url (oh-ducks.traversal:element-attribute :href link)))
2011-06-13 pix
77
(cond
22:35:27 '
78
((or (gethash url *okay-urls*)
'
79
(gethash (url-domain url) *okay-urls*))
'
80
#+(or) (do-nothing))
'
81
((or (gethash url *spam-urls*)
2011-06-13 pix
82
(gethash (url-domain url) *spam-urls*)
22:58:13 '
83
(auto-classify link))
2011-06-13 pix
84
(pushnew (list page-url current-version) *has-spam* :test #'equal))
22:35:27 '
85
(t
'
86
(pushnew (list url page-url current-version) *urls-to-classify* :test #'equal)))))))))
2011-06-13 pix
87
19:08:27 '
88
#+(or) (parse-page "araneida")
'
89
2011-06-13 pix
90
(defun auto-classify (link)
22:58:13 '
91
"Auto-classify URLs based upon traits common to spammers."
'
92
(let ((rel (oh-ducks.traversal:element-attribute :rel link))
2011-06-28 pix
93
(style (oh-ducks.traversal:element-attribute :style link))
2011-06-13 pix
94
(url (oh-ducks.traversal:element-attribute :href link)))
22:58:13 '
95
(cond
'
96
((and (stringp rel)
'
97
(or (string-equal "follow" rel)
'
98
(string-equal "dofollow" rel)))
'
99
(setf (gethash url *spam-urls*) t))
2011-06-28 pix
100
((and (stringp style)
08:04:46 '
101
(cl-ppcre:scan "text-decoration[ ]*:[ ]*none" style))
'
102
(setf (gethash url *spam-urls*) t))
2011-06-13 pix
103
(t nil))))
22:58:13 '
104
2011-06-18 pix
105
(defun request-classification (url &optional page version)
2011-06-13 pix
106
(restart-case (error 'simple-error :format-control "Please classify the URL ~s."
19:08:27 '
107
:format-arguments (list url))
'
108
(mark-url-okay ()
'
109
:report "Mark this URL as acceptable."
'
110
(setf (gethash url *okay-urls*) t))
'
111
(mark-domain-okay ()
'
112
:report "Mark the domain as acceptable."
'
113
(setf (gethash (url-domain url) *okay-urls*) t))
'
114
(mark-url-spam ()
'
115
:report "Mark this URL as spam."
2011-06-18 pix
116
(when (and page version)
23:14:59 '
117
(pushnew (list page version) *has-spam* :test #'equal))
2011-06-13 pix
118
(setf (gethash url *spam-urls*) t))
19:08:27 '
119
(mark-domain-spam ()
'
120
:report "Mark the domain as spam."
2011-06-18 pix
121
(when (and page version)
23:14:59 '
122
(pushnew (list page version) *has-spam* :test #'equal))
2011-06-13 pix
123
(setf (gethash (url-domain url) *spam-urls*) t))
22:35:27 '
124
(classify-later ()
'
125
:report "Don't classify this URL yet."
'
126
nil)))
'
127
'
128
(defun classify-unknown-urls ()
'
129
(setf *urls-to-classify*
'
130
(loop :for (url page version) :in *urls-to-classify*
'
131
:unless (or (gethash url *okay-urls*)
'
132
(gethash (url-domain url) *okay-urls*)
'
133
(gethash url *spam-urls*)
'
134
(gethash (url-domain url) *spam-urls*)
2011-06-18 pix
135
(request-classification url page version))
2011-06-13 pix
136
:collect (list url page version))))
22:35:27 '
137
'
138
(defun mark-known-goods ()
'
139
(loop :for (page-url version) :in *has-spam*
'
140
:do (maybe-request-last-known-good page-url)))
'
141
'
142
(defun revert-spam ()
'
143
(setf *has-spam*
'
144
(loop :for (page-url version) :in *has-spam*
'
145
:unless (and (gethash page-url *last-known-good*)
'
146
(revert-page page-url version (gethash page-url *last-known-good*)))
'
147
:collect (list page-url version))))
2011-06-13 pix
148
19:08:27 '
149
(defvar *last-known-good* (make-hash-table :test 'equal) "hash table of cliki pages and the last-known \"good\" revision.")
'
150
'
151
(defun read-number ()
'
152
(format t "Enter a version: ")
'
153
(list (format nil "~a" (parse-integer (read-line)))))
'
154
'
155
(defun maybe-request-last-known-good (page)
'
156
(unless (gethash page *last-known-good*)
'
157
(restart-case (error 'simple-error :format-control "Do not know of a good version of cliki page ~s."
'
158
:format-arguments (list page))
'
159
(specify-version (version)
'
160
:interactive read-number
'
161
:report "Specify a known-good version."
'
162
(setf (gethash page *last-known-good*) version)))))
'
163
'
164
#+(or) (maybe-request-last-known-good "araneida")
'
165
2011-06-16 pix
166
(defun update-last-known-good ()
05:58:14 '
167
;; Remove known-spam pages from list of updates
'
168
(loop :for (page-url version) :in *has-spam*
'
169
:do (removef *updated-pages* (list page-url version) :test #'equal))
'
170
;; Remove updates we made from list of updates (that way, we'll continue to
'
171
;; use the old cached known-good, saving a bit of strain on cliki)
'
172
(loop :for (page-url version) :in *ignore-update*
'
173
:do (removef *updated-pages* (list page-url version) :test #'equal))
'
174
(setf *updated-pages*
'
175
(loop :for (page-url version) :in *updated-pages*
'
176
;; If there are unclassified urls from this page, don't mark good
'
177
;; (could be spam!)
'
178
:if (notany (lambda (x) (and (string= page-url (second x))
'
179
(string= version (third x))))
'
180
*urls-to-classify*)
'
181
:do (format t "; Updating last-known good of ~s from ~a to ~a~%" page-url (gethash page-url *last-known-good*) version)
'
182
(setf (gethash page-url *last-known-good*) version)
'
183
:else
'
184
:collect (list page-url version))))
'
185
'
186
(defun numstring+1 (numstring)
'
187
(write-to-string (1+ (parse-integer numstring))))
'
188
2011-06-13 pix
189
(defun revert-page (url current-version to-version)
22:35:27 '
190
(multiple-value-bind (page status headers)
2011-06-29 pix
191
(http-request (format nil "http://cliki.net/edit/~a" url)
03:08:25 '
192
:method :post
'
193
:parameters `(("version" . ,current-version)
'
194
("T0" . "BODY")
'
195
("E0" . ,(get-cliki-source url to-version))
'
196
("summary" . "Spam detected, reverting to Known-Good.")
'
197
("captcha" . "lisp")
'
198
("name" . "Claki (Revertobot Beta)")))
2011-06-14 pix
199
(declare (ignore headers))
2011-06-13 pix
200
(cond
22:35:27 '
201
((and (= status 200)
'
202
(not (search "rejected" page :test #'char-equal)))
2011-06-14 pix
203
(format t "; Reverted page ~s to version ~a.~%" url to-version)
2011-06-16 pix
204
(pushnew (list url (numstring+1 current-version)) *ignore-update* :test #'equal)
2011-06-13 pix
205
page)
22:35:27 '
206
(t nil))))
2011-06-13 pix
207
19:08:27 '
208
(defun get-cliki-source (url version)
'
209
"Fetches the source text of a given version of a cliki page. That is, it
'
210
returns the text you should POST to revert a cliki page to the given version."
2011-06-15 pix
211
(or (find-in-cache url version)
09:21:48 '
212
(multiple-value-bind (page status headers)
2011-06-29 pix
213
(http-request (format nil "http://cliki.net/~a?source&v=~a" url version))
2011-06-15 pix
214
(declare (ignore headers))
09:21:48 '
215
(cond
'
216
((= 200 status)
'
217
;; We have to coerce to a simple-string because rucksack mistakenly
'
218
;; assumes non-simple strings have a fill pointer
'
219
(cache-known-good url version (coerce page 'simple-string))
'
220
page)
'
221
(t (error "crap!"))))))
'
222
'
223
#+(or) (get-cliki-source "araneida" "281")
2011-06-13 pix
224
2011-06-14 pix
225
(defun attended-revert-new-spam ()
2011-06-13 pix
226
(mapcar #'parse-page (get-recent-changes))
22:35:27 '
227
(attendant))
'
228
'
229
(defun attendant ()
'
230
(classify-unknown-urls)
2011-06-16 pix
231
(update-last-known-good)
2011-06-13 pix
232
(mark-known-goods)
22:35:27 '
233
(revert-spam))
'
234
2011-06-14 pix
235
(defun unattended-revert-new-spam ()
2011-06-13 pix
236
(mapcar #'parse-page (get-recent-changes))
2011-06-16 pix
237
(update-last-known-good)
2011-06-13 pix
238
(revert-spam))
2011-06-13 pix
239
2011-06-14 pix
240
#+(or) (attended-revert-new-spam)
08:41:24 '
241
#+(or) (attendant)
2011-06-13 pix
242
2011-06-13 pix
243
(defun seconds (s) s)
22:58:32 '
244
(defun minutes (m) (* (seconds 60) m))
'
245
(defun hours (h) (* (minutes 60) h))
2011-07-01 pix
246
(defun days (d) (* (hours 24) d))
2011-06-15 pix
247
(defun plus-or-minus (x y) (+ (- x y) (random (* 2 y))))
2011-06-13 pix
248
2011-06-28 pix
249
(defconstant +simple-time+ '(:year #\- (:month 2) #\- (:day 2) #\Space (:hour 2) #\: (:min 2) #\: (:sec 2)))
2011-06-18 pix
250
(defun now () (local-time:format-timestring nil (local-time:now) :format +simple-time+))
2011-06-14 pix
251
2011-07-01 pix
252
(defun run-for-a-while (how-long how-often variance)
2011-06-18 pix
253
(format t "; Beginning run at ~a~%" (now))
2011-07-01 pix
254
(dotimes (i (floor how-long how-often))
22:52:58 '
255
(sleep (plus-or-minus how-often variance))
2011-06-18 pix
256
(format t "; Unattended run at ~a~%" (now))
2011-06-14 pix
257
(unattended-revert-new-spam)
2011-06-18 pix
258
(save-state))
23:15:27 '
259
(format t "; Run ended at ~a~%~%" (now)))
2011-06-14 pix
260
2011-06-18 pix
261
#+(or)
23:16:27 '
262
(let ((stdout *standard-output*))
2011-07-01 pix
263
(sb-thread:make-thread (lambda () (let ((*standard-output* stdout)) (run-for-a-while (days 2) (minutes 30) (minutes 5))))
2011-06-18 pix
264
:name "cliki reverter"))
2011-06-14 pix
265
^L
2011-07-03 pix
266
(defvar *state-file* (merge-pathnames #p"state/" (directory-namestring (or #.*load-pathname* #p"/home/pixel/repos/claki/"))))
2011-06-14 pix
267
2011-06-15 pix
268
(defmacro with-rucksack-and-transaction ((rucksack) (&rest root-vars) &body body)
09:21:48 '
269
(with-unique-names (rest)
'
270
`(rucksack:with-rucksack (,rucksack *state-file*)
'
271
(rucksack:with-transaction ()
'
272
(destructuring-bind (&optional ,@root-vars &rest ,rest) (ordered-roots rs:*rucksack*)
'
273
(declare (ignore ,rest) ,@(when (member '_ root-vars) `((ignore _))))
'
274
,@body)))))
'
275
'
276
(defun ordered-roots (sack)
'
277
(sort (rs:rucksack-roots sack) #'< :key #'rs:object-id))
'
278
'
279
(defun initialize-rucksack ()
'
280
(with-rucksack-and-transaction (rs) ()
'
281
(let ((roots (rs:rucksack-roots rs:*rucksack*)))
'
282
(dotimes (i (- 2 (length roots)))
'
283
(rs:add-rucksack-root (make-instance 'rs:btree :key< 'string<) rs:*rucksack*)))))
'
284
'
285
#+(or) (initialize-rucksack)
'
286
2011-06-14 pix
287
(defun save-state ()
2011-06-15 pix
288
(with-rucksack-and-transaction (rs) (btree)
09:21:48 '
289
(rs:btree-insert btree 'spam-urls *spam-urls*)
'
290
(rs:btree-insert btree 'ham-urls *okay-urls*)
2011-06-16 pix
291
(rs:btree-insert btree 'known-good *last-known-good*)
05:58:14 '
292
(rs:btree-insert btree 'updated-pages *updated-pages*)
'
293
(rs:btree-insert btree 'our-updates *ignore-update*)))
2011-06-14 pix
294
18:17:05 '
295
#+(or) (save-state)
'
296
'
297
(defun restore-state ()
2011-06-15 pix
298
(with-rucksack-and-transaction (rs) (btree)
09:21:48 '
299
(setf *spam-urls* (rs:btree-search btree 'spam-urls)
'
300
*okay-urls* (rs:btree-search btree 'ham-urls)
2011-06-16 pix
301
*last-known-good* (rs:btree-search btree 'known-good)
05:58:14 '
302
*updated-pages* (rs:btree-search btree 'updated-pages)
'
303
*ignore-update* (rs:btree-search btree 'our-updates))))
2011-06-14 pix
304
18:17:05 '
305
#+(or) (restore-state)
2011-06-15 pix
306
#+(or) (with-rucksack-and-transaction (rs) ()
09:21:48 '
307
(ordered-roots rs:*rucksack*))
'
308
'
309
(defun cache-known-good (cliki-page version content)
'
310
(with-rucksack-and-transaction (rs) (_ btree)
'
311
(rs:btree-insert btree cliki-page (list version content))))
'
312
'
313
(defun find-in-cache (cliki-page version)
'
314
(with-rucksack-and-transaction (rs) (_ btree)
'
315
(destructuring-bind (&optional cached-version cached-content)
'
316
(rs:btree-search btree cliki-page :default-value nil :errorp nil)
'
317
(and cached-version
'
318
(string= cached-version version)
'
319
cached-content))))
'
320
'
321
#+(or) (find-in-cache "araneida" "281")