repos
/
claki
/ annotate_shade
summary
|
shortlog
|
log
|
tree
|
commit
|
commitdiff
|
headdiff
|
annotate
|
headblob
|
headfilediff
|
filehistory
normal
|
plain
|
shade
|
zebra
Add FIXME noting a behavior of cliki I wasn't expecting
Annotate for file claki.lisp
2011-06-13 pix
1
(defpackage #:claki
19:08:27 '
2
(:use #:cl #:unify :alexandria))
'
3
(in-package #:claki)
'
4
'
5
(defvar *last-modified* (make-hash-table :test 'equal) "hash table of cliki pages and the last-modified header we last received.")
'
6
'
7
#+(or) (clrhash *last-modified*)
'
8
'
9
(defun get-cliki-page (url)
'
10
"Returns a page from cliki if it has been modified since we last saw it or nil
'
11
if it has not been modified. Signals an error otherwise."
'
12
(multiple-value-bind (page status headers)
2011-06-29 pix
13
(drakma:http-request (format nil "http://cliki.net/~a" url)
03:08:25 '
14
:additional-headers (when (gethash url *last-modified*)
'
15
`((:if-modified-since (gethash url *last-modified*)))))
2011-06-18 pix
16
;; FIXME: this doesn't work all that well: it turns out cliki uses a single
23:15:57 '
17
;; last-modified header for the entire wiki, rather than one per page, so we
'
18
;; aren't saving ourselves (or cliki) much with this.
2011-06-13 pix
19
(cond
19:08:27 '
20
;; If the page hasn't been modified, no need to update
'
21
((= 304 status) nil)
'
22
;; The "Recent Changes" page doesn't return 304s, but does return a last-modified.
'
23
((and (gethash url *last-modified*)
'
24
(cdr (assoc :last-modified headers))
'
25
(string= (cdr (assoc :last-modified headers)) (gethash url *last-modified*)))
'
26
nil)
'
27
((= 200 status)
'
28
(setf (gethash url *last-modified*) (cdr (assoc :last-modified headers)))
'
29
page)
'
30
(t (error "crap!")))))
'
31
'
32
(defun get-recent-changes ()
'
33
(when-let ((page (get-cliki-page "Recent%20Changes")))
'
34
(match (#t(oh-ducks:html ("blockquote > b > a.internal" . ?links)) page)
'
35
(loop :for link :in (remove-duplicates links :test #'equalp
'
36
:key (lambda (x) (oh-ducks.traversal:element-attribute :href x))
'
37
:from-end t)
'
38
:collect (oh-ducks.traversal:element-attribute :href link)))))
'
39
'
40
#+(or) (get-recent-changes)
'
41
'
42
(defvar *spam-urls* (make-hash-table :test 'equal) "hash table of spam urls.")
'
43
(defvar *okay-urls* (make-hash-table :test 'equal) "hash table of acceptable urls.")
2011-06-13 pix
44
(defvar *urls-to-classify* (list) "list of (url-to-classify cliki-page-where-it-was-found page-version).")
2011-06-16 pix
45
(defvar *has-spam* (list) "list of (cliki-page version) known to have spam.")
05:58:14 '
46
(defvar *updated-pages* (list) "list of pages that have been updated in the format (cliki-page version).")
'
47
(defvar *ignore-update* (list) "list of pages in the format (cliki-page version) which updates should not result in updating the last-known good version (e.g., because we did it and it was a reversion).")
2011-06-13 pix
48
19:08:27 '
49
#+(or) (clrhash *okay-urls*)
'
50
'
51
(defun url-domain (url)
'
52
(puri:uri-host (puri:parse-uri url)))
'
53
'
54
(defun parse-page (page-url)
'
55
(when-let ((page (get-cliki-page page-url)))
'
56
(match (#t(oh-ducks:html ("a[href^=http]" . ?a)
'
57
("#footer > b" . #t(list ?b))) page)
'
58
(let ((current-version (oh-ducks.traversal:element-content b)))
2011-06-16 pix
59
(format t "; Page ~s modified, now at version ~a.~%" page-url current-version)
05:58:14 '
60
(pushnew (list page-url current-version) *updated-pages* :test #'equal)
2011-06-13 pix
61
(dolist (link a)
2011-06-13 pix
62
(let ((url (oh-ducks.traversal:element-attribute :href link)))
2011-06-13 pix
63
(cond
22:35:27 '
64
((or (gethash url *okay-urls*)
'
65
(gethash (url-domain url) *okay-urls*))
'
66
#+(or) (do-nothing))
'
67
((or (gethash url *spam-urls*)
2011-06-13 pix
68
(gethash (url-domain url) *spam-urls*)
22:58:13 '
69
(auto-classify link))
2011-06-13 pix
70
(pushnew (list page-url current-version) *has-spam* :test #'equal))
22:35:27 '
71
(t
'
72
(pushnew (list url page-url current-version) *urls-to-classify* :test #'equal)))))))))
2011-06-13 pix
73
19:08:27 '
74
#+(or) (parse-page "araneida")
'
75
2011-06-13 pix
76
(defun auto-classify (link)
22:58:13 '
77
"Auto-classify URLs based upon traits common to spammers."
'
78
(let ((rel (oh-ducks.traversal:element-attribute :rel link))
'
79
(url (oh-ducks.traversal:element-attribute :href link)))
'
80
(cond
'
81
((and (stringp rel)
'
82
(or (string-equal "follow" rel)
'
83
(string-equal "dofollow" rel)))
'
84
(setf (gethash url *spam-urls*) t))
'
85
(t nil))))
'
86
2011-06-18 pix
87
(defun request-classification (url &optional page version)
2011-06-13 pix
88
(restart-case (error 'simple-error :format-control "Please classify the URL ~s."
19:08:27 '
89
:format-arguments (list url))
'
90
(mark-url-okay ()
'
91
:report "Mark this URL as acceptable."
'
92
(setf (gethash url *okay-urls*) t))
'
93
(mark-domain-okay ()
'
94
:report "Mark the domain as acceptable."
'
95
(setf (gethash (url-domain url) *okay-urls*) t))
'
96
(mark-url-spam ()
'
97
:report "Mark this URL as spam."
2011-06-18 pix
98
(when (and page version)
23:14:59 '
99
(pushnew (list page version) *has-spam* :test #'equal))
2011-06-13 pix
100
(setf (gethash url *spam-urls*) t))
19:08:27 '
101
(mark-domain-spam ()
'
102
:report "Mark the domain as spam."
2011-06-18 pix
103
(when (and page version)
23:14:59 '
104
(pushnew (list page version) *has-spam* :test #'equal))
2011-06-13 pix
105
(setf (gethash (url-domain url) *spam-urls*) t))
22:35:27 '
106
(classify-later ()
'
107
:report "Don't classify this URL yet."
'
108
nil)))
'
109
'
110
(defun classify-unknown-urls ()
'
111
(setf *urls-to-classify*
'
112
(loop :for (url page version) :in *urls-to-classify*
'
113
:unless (or (gethash url *okay-urls*)
'
114
(gethash (url-domain url) *okay-urls*)
'
115
(gethash url *spam-urls*)
'
116
(gethash (url-domain url) *spam-urls*)
2011-06-18 pix
117
(request-classification url page version))
2011-06-13 pix
118
:collect (list url page version))))
22:35:27 '
119
'
120
(defun mark-known-goods ()
'
121
(loop :for (page-url version) :in *has-spam*
'
122
:do (maybe-request-last-known-good page-url)))
'
123
'
124
(defun revert-spam ()
'
125
(setf *has-spam*
'
126
(loop :for (page-url version) :in *has-spam*
'
127
:unless (and (gethash page-url *last-known-good*)
'
128
(revert-page page-url version (gethash page-url *last-known-good*)))
'
129
:collect (list page-url version))))
2011-06-13 pix
130
19:08:27 '
131
(defvar *last-known-good* (make-hash-table :test 'equal) "hash table of cliki pages and the last-known \"good\" revision.")
'
132
'
133
(defun read-number ()
'
134
(format t "Enter a version: ")
'
135
(list (format nil "~a" (parse-integer (read-line)))))
'
136
'
137
(defun maybe-request-last-known-good (page)
'
138
(unless (gethash page *last-known-good*)
'
139
(restart-case (error 'simple-error :format-control "Do not know of a good version of cliki page ~s."
'
140
:format-arguments (list page))
'
141
(specify-version (version)
'
142
:interactive read-number
'
143
:report "Specify a known-good version."
'
144
(setf (gethash page *last-known-good*) version)))))
'
145
'
146
#+(or) (maybe-request-last-known-good "araneida")
'
147
2011-06-16 pix
148
(defun update-last-known-good ()
05:58:14 '
149
;; Remove known-spam pages from list of updates
'
150
(loop :for (page-url version) :in *has-spam*
'
151
:do (removef *updated-pages* (list page-url version) :test #'equal))
'
152
;; Remove updates we made from list of updates (that way, we'll continue to
'
153
;; use the old cached known-good, saving a bit of strain on cliki)
'
154
(loop :for (page-url version) :in *ignore-update*
'
155
:do (removef *updated-pages* (list page-url version) :test #'equal))
'
156
(setf *updated-pages*
'
157
(loop :for (page-url version) :in *updated-pages*
'
158
;; If there are unclassified urls from this page, don't mark good
'
159
;; (could be spam!)
'
160
:if (notany (lambda (x) (and (string= page-url (second x))
'
161
(string= version (third x))))
'
162
*urls-to-classify*)
'
163
:do (format t "; Updating last-known good of ~s from ~a to ~a~%" page-url (gethash page-url *last-known-good*) version)
'
164
(setf (gethash page-url *last-known-good*) version)
'
165
:else
'
166
:collect (list page-url version))))
'
167
'
168
(defun numstring+1 (numstring)
'
169
(write-to-string (1+ (parse-integer numstring))))
'
170
2011-06-13 pix
171
(defun revert-page (url current-version to-version)
22:35:27 '
172
(multiple-value-bind (page status headers)
2011-06-29 pix
173
(drakma:http-request (format nil "http://cliki.net/edit/~a" url)
03:08:25 '
174
:method :post
'
175
:parameters `(("version" . ,current-version)
'
176
("T0" . "BODY")
'
177
("E0" . ,(get-cliki-source url to-version))
'
178
("summary" . "Spam detected, reverting to Known-Good.")
'
179
("captcha" . "lisp")
'
180
("name" . "Claki (Revertobot Beta)")))
2011-06-14 pix
181
(declare (ignore headers))
2011-06-13 pix
182
(cond
22:35:27 '
183
((and (= status 200)
'
184
(not (search "rejected" page :test #'char-equal)))
2011-06-14 pix
185
(format t "; Reverted page ~s to version ~a.~%" url to-version)
2011-06-16 pix
186
(pushnew (list url (numstring+1 current-version)) *ignore-update* :test #'equal)
2011-06-13 pix
187
page)
22:35:27 '
188
(t nil))))
2011-06-13 pix
189
19:08:27 '
190
(defun get-cliki-source (url version)
'
191
"Fetches the source text of a given version of a cliki page. That is, it
'
192
returns the text you should POST to revert a cliki page to the given version."
2011-06-15 pix
193
(or (find-in-cache url version)
09:21:48 '
194
(multiple-value-bind (page status headers)
2011-06-29 pix
195
(drakma:http-request (format nil "http://cliki.net/~a?source&v=~a" url version))
2011-06-15 pix
196
(declare (ignore headers))
09:21:48 '
197
(cond
'
198
((= 200 status)
'
199
;; We have to coerce to a simple-string because rucksack mistakenly
'
200
;; assumes non-simple strings have a fill pointer
'
201
(cache-known-good url version (coerce page 'simple-string))
'
202
page)
'
203
(t (error "crap!"))))))
'
204
'
205
#+(or) (get-cliki-source "araneida" "281")
2011-06-13 pix
206
2011-06-14 pix
207
(defun attended-revert-new-spam ()
2011-06-13 pix
208
(mapcar #'parse-page (get-recent-changes))
22:35:27 '
209
(attendant))
'
210
'
211
(defun attendant ()
'
212
(classify-unknown-urls)
2011-06-16 pix
213
(update-last-known-good)
2011-06-13 pix
214
(mark-known-goods)
22:35:27 '
215
(revert-spam))
'
216
2011-06-14 pix
217
(defun unattended-revert-new-spam ()
2011-06-13 pix
218
(mapcar #'parse-page (get-recent-changes))
2011-06-16 pix
219
(update-last-known-good)
2011-06-13 pix
220
(revert-spam))
2011-06-13 pix
221
2011-06-14 pix
222
#+(or) (attended-revert-new-spam)
08:41:24 '
223
#+(or) (attendant)
2011-06-13 pix
224
2011-06-13 pix
225
(defun seconds (s) s)
22:58:32 '
226
(defun minutes (m) (* (seconds 60) m))
'
227
(defun hours (h) (* (minutes 60) h))
2011-06-15 pix
228
(defun plus-or-minus (x y) (+ (- x y) (random (* 2 y))))
2011-06-13 pix
229
2011-06-28 pix
230
(defconstant +simple-time+ '(:year #\- :month #\- :day #\Space :hour #\: :min #\: :sec))
2011-06-18 pix
231
(defun now () (local-time:format-timestring nil (local-time:now) :format +simple-time+))
2011-06-14 pix
232
2011-06-28 pix
233
(defun run-for-a-while ()
2011-06-18 pix
234
(format t "; Beginning run at ~a~%" (now))
2011-06-28 pix
235
(dotimes (i (* 2 24))
2011-07-01 pix
236
(sleep (minutes (plus-or-minus 30 5)))
2011-06-18 pix
237
(format t "; Unattended run at ~a~%" (now))
2011-06-14 pix
238
(unattended-revert-new-spam)
2011-06-18 pix
239
(save-state))
23:15:27 '
240
(format t "; Run ended at ~a~%~%" (now)))
2011-06-14 pix
241
2011-06-18 pix
242
#+(or) (sb-thread:make-thread #'run-for-a-while :name "cliki reverter")
2011-06-14 pix
243
^L
2011-06-28 pix
244
(defvar *state-file* (or #.*load-pathname* #p"/home/pixel/repos/claki/state/"))
2011-06-14 pix
245
2011-06-15 pix
246
(defmacro with-rucksack-and-transaction ((rucksack) (&rest root-vars) &body body)
09:21:48 '
247
(with-unique-names (rest)
'
248
`(rucksack:with-rucksack (,rucksack *state-file*)
'
249
(rucksack:with-transaction ()
'
250
(destructuring-bind (&optional ,@root-vars &rest ,rest) (ordered-roots rs:*rucksack*)
'
251
(declare (ignore ,rest) ,@(when (member '_ root-vars) `((ignore _))))
'
252
,@body)))))
'
253
'
254
(defun ordered-roots (sack)
'
255
(sort (rs:rucksack-roots sack) #'< :key #'rs:object-id))
'
256
'
257
(defun initialize-rucksack ()
'
258
(with-rucksack-and-transaction (rs) ()
'
259
(let ((roots (rs:rucksack-roots rs:*rucksack*)))
'
260
(dotimes (i (- 2 (length roots)))
'
261
(rs:add-rucksack-root (make-instance 'rs:btree :key< 'string<) rs:*rucksack*)))))
'
262
'
263
#+(or) (initialize-rucksack)
'
264
2011-06-14 pix
265
(defun save-state ()
2011-06-15 pix
266
(with-rucksack-and-transaction (rs) (btree)
09:21:48 '
267
(rs:btree-insert btree 'spam-urls *spam-urls*)
'
268
(rs:btree-insert btree 'ham-urls *okay-urls*)
2011-06-16 pix
269
(rs:btree-insert btree 'known-good *last-known-good*)
05:58:14 '
270
(rs:btree-insert btree 'updated-pages *updated-pages*)
'
271
(rs:btree-insert btree 'our-updates *ignore-update*)))
2011-06-14 pix
272
18:17:05 '
273
#+(or) (save-state)
'
274
'
275
(defun restore-state ()
2011-06-15 pix
276
(with-rucksack-and-transaction (rs) (btree)
09:21:48 '
277
(setf *spam-urls* (rs:btree-search btree 'spam-urls)
'
278
*okay-urls* (rs:btree-search btree 'ham-urls)
2011-06-16 pix
279
*last-known-good* (rs:btree-search btree 'known-good)
05:58:14 '
280
*updated-pages* (rs:btree-search btree 'updated-pages)
'
281
*ignore-update* (rs:btree-search btree 'our-updates))))
2011-06-14 pix
282
18:17:05 '
283
#+(or) (restore-state)
2011-06-15 pix
284
#+(or) (with-rucksack-and-transaction (rs) ()
09:21:48 '
285
(ordered-roots rs:*rucksack*))
'
286
'
287
(defun cache-known-good (cliki-page version content)
'
288
(with-rucksack-and-transaction (rs) (_ btree)
'
289
(rs:btree-insert btree cliki-page (list version content))))
'
290
'
291
(defun find-in-cache (cliki-page version)
'
292
(with-rucksack-and-transaction (rs) (_ btree)
'
293
(destructuring-bind (&optional cached-version cached-content)
'
294
(rs:btree-search btree cliki-page :default-value nil :errorp nil)
'
295
(and cached-version
'
296
(string= cached-version version)
'
297
cached-content))))
'
298
'
299
#+(or) (find-in-cache "araneida" "281")