repos
/
claki
/ annotate_shade
summary
|
shortlog
|
log
|
tree
|
commit
|
commitdiff
|
headdiff
|
annotate
|
headblob
|
headfilediff
|
filehistory
normal
|
plain
|
shade
|
zebra
Bugfix: if we manually classify an URL as spam, don't then mark the page that
Annotate for file claki.lisp
2011-06-13 pix
1
(defpackage #:claki
19:08:27 '
2
(:use #:cl #:unify :alexandria))
'
3
(in-package #:claki)
'
4
'
5
(defvar *last-modified* (make-hash-table :test 'equal) "hash table of cliki pages and the last-modified header we last received.")
'
6
'
7
#+(or) (clrhash *last-modified*)
'
8
'
9
(defun get-cliki-page (url)
'
10
"Returns a page from cliki if it has been modified since we last saw it or nil
'
11
if it has not been modified. Signals an error otherwise."
'
12
(multiple-value-bind (page status headers)
2011-06-29 pix
13
(drakma:http-request (format nil "http://cliki.net/~a" url)
03:08:25 '
14
:additional-headers (when (gethash url *last-modified*)
'
15
`((:if-modified-since (gethash url *last-modified*)))))
2011-06-13 pix
16
(cond
19:08:27 '
17
;; If the page hasn't been modified, no need to update
'
18
((= 304 status) nil)
'
19
;; The "Recent Changes" page doesn't return 304s, but does return a last-modified.
'
20
((and (gethash url *last-modified*)
'
21
(cdr (assoc :last-modified headers))
'
22
(string= (cdr (assoc :last-modified headers)) (gethash url *last-modified*)))
'
23
nil)
'
24
((= 200 status)
'
25
(setf (gethash url *last-modified*) (cdr (assoc :last-modified headers)))
'
26
page)
'
27
(t (error "crap!")))))
'
28
'
29
(defun get-recent-changes ()
'
30
(when-let ((page (get-cliki-page "Recent%20Changes")))
'
31
(match (#t(oh-ducks:html ("blockquote > b > a.internal" . ?links)) page)
'
32
(loop :for link :in (remove-duplicates links :test #'equalp
'
33
:key (lambda (x) (oh-ducks.traversal:element-attribute :href x))
'
34
:from-end t)
'
35
:collect (oh-ducks.traversal:element-attribute :href link)))))
'
36
'
37
#+(or) (get-recent-changes)
'
38
'
39
(defvar *spam-urls* (make-hash-table :test 'equal) "hash table of spam urls.")
'
40
(defvar *okay-urls* (make-hash-table :test 'equal) "hash table of acceptable urls.")
2011-06-13 pix
41
(defvar *urls-to-classify* (list) "list of (url-to-classify cliki-page-where-it-was-found page-version).")
2011-06-16 pix
42
(defvar *has-spam* (list) "list of (cliki-page version) known to have spam.")
05:58:14 '
43
(defvar *updated-pages* (list) "list of pages that have been updated in the format (cliki-page version).")
'
44
(defvar *ignore-update* (list) "list of pages in the format (cliki-page version) which updates should not result in updating the last-known good version (e.g., because we did it and it was a reversion).")
2011-06-13 pix
45
19:08:27 '
46
#+(or) (clrhash *okay-urls*)
'
47
'
48
(defun url-domain (url)
'
49
(puri:uri-host (puri:parse-uri url)))
'
50
'
51
(defun parse-page (page-url)
'
52
(when-let ((page (get-cliki-page page-url)))
'
53
(match (#t(oh-ducks:html ("a[href^=http]" . ?a)
'
54
("#footer > b" . #t(list ?b))) page)
'
55
(let ((current-version (oh-ducks.traversal:element-content b)))
2011-06-16 pix
56
(format t "; Page ~s modified, now at version ~a.~%" page-url current-version)
05:58:14 '
57
(pushnew (list page-url current-version) *updated-pages* :test #'equal)
2011-06-13 pix
58
(dolist (link a)
2011-06-13 pix
59
(let ((url (oh-ducks.traversal:element-attribute :href link)))
2011-06-13 pix
60
(cond
22:35:27 '
61
((or (gethash url *okay-urls*)
'
62
(gethash (url-domain url) *okay-urls*))
'
63
#+(or) (do-nothing))
'
64
((or (gethash url *spam-urls*)
2011-06-13 pix
65
(gethash (url-domain url) *spam-urls*)
22:58:13 '
66
(auto-classify link))
2011-06-13 pix
67
(pushnew (list page-url current-version) *has-spam* :test #'equal))
22:35:27 '
68
(t
'
69
(pushnew (list url page-url current-version) *urls-to-classify* :test #'equal)))))))))
2011-06-13 pix
70
19:08:27 '
71
#+(or) (parse-page "araneida")
'
72
2011-06-13 pix
73
(defun auto-classify (link)
22:58:13 '
74
"Auto-classify URLs based upon traits common to spammers."
'
75
(let ((rel (oh-ducks.traversal:element-attribute :rel link))
'
76
(url (oh-ducks.traversal:element-attribute :href link)))
'
77
(cond
'
78
((and (stringp rel)
'
79
(or (string-equal "follow" rel)
'
80
(string-equal "dofollow" rel)))
'
81
(setf (gethash url *spam-urls*) t))
'
82
(t nil))))
'
83
2011-06-18 pix
84
(defun request-classification (url &optional page version)
2011-06-13 pix
85
(restart-case (error 'simple-error :format-control "Please classify the URL ~s."
19:08:27 '
86
:format-arguments (list url))
'
87
(mark-url-okay ()
'
88
:report "Mark this URL as acceptable."
'
89
(setf (gethash url *okay-urls*) t))
'
90
(mark-domain-okay ()
'
91
:report "Mark the domain as acceptable."
'
92
(setf (gethash (url-domain url) *okay-urls*) t))
'
93
(mark-url-spam ()
'
94
:report "Mark this URL as spam."
2011-06-18 pix
95
(when (and page version)
23:14:59 '
96
(pushnew (list page version) *has-spam* :test #'equal))
2011-06-13 pix
97
(setf (gethash url *spam-urls*) t))
19:08:27 '
98
(mark-domain-spam ()
'
99
:report "Mark the domain as spam."
2011-06-18 pix
100
(when (and page version)
23:14:59 '
101
(pushnew (list page version) *has-spam* :test #'equal))
2011-06-13 pix
102
(setf (gethash (url-domain url) *spam-urls*) t))
22:35:27 '
103
(classify-later ()
'
104
:report "Don't classify this URL yet."
'
105
nil)))
'
106
'
107
(defun classify-unknown-urls ()
'
108
(setf *urls-to-classify*
'
109
(loop :for (url page version) :in *urls-to-classify*
'
110
:unless (or (gethash url *okay-urls*)
'
111
(gethash (url-domain url) *okay-urls*)
'
112
(gethash url *spam-urls*)
'
113
(gethash (url-domain url) *spam-urls*)
2011-06-18 pix
114
(request-classification url page version))
2011-06-13 pix
115
:collect (list url page version))))
22:35:27 '
116
'
117
(defun mark-known-goods ()
'
118
(loop :for (page-url version) :in *has-spam*
'
119
:do (maybe-request-last-known-good page-url)))
'
120
'
121
(defun revert-spam ()
'
122
(setf *has-spam*
'
123
(loop :for (page-url version) :in *has-spam*
'
124
:unless (and (gethash page-url *last-known-good*)
'
125
(revert-page page-url version (gethash page-url *last-known-good*)))
'
126
:collect (list page-url version))))
2011-06-13 pix
127
19:08:27 '
128
(defvar *last-known-good* (make-hash-table :test 'equal) "hash table of cliki pages and the last-known \"good\" revision.")
'
129
'
130
(defun read-number ()
'
131
(format t "Enter a version: ")
'
132
(list (format nil "~a" (parse-integer (read-line)))))
'
133
'
134
(defun maybe-request-last-known-good (page)
'
135
(unless (gethash page *last-known-good*)
'
136
(restart-case (error 'simple-error :format-control "Do not know of a good version of cliki page ~s."
'
137
:format-arguments (list page))
'
138
(specify-version (version)
'
139
:interactive read-number
'
140
:report "Specify a known-good version."
'
141
(setf (gethash page *last-known-good*) version)))))
'
142
'
143
#+(or) (maybe-request-last-known-good "araneida")
'
144
2011-06-16 pix
145
(defun update-last-known-good ()
05:58:14 '
146
;; Remove known-spam pages from list of updates
'
147
(loop :for (page-url version) :in *has-spam*
'
148
:do (removef *updated-pages* (list page-url version) :test #'equal))
'
149
;; Remove updates we made from list of updates (that way, we'll continue to
'
150
;; use the old cached known-good, saving a bit of strain on cliki)
'
151
(loop :for (page-url version) :in *ignore-update*
'
152
:do (removef *updated-pages* (list page-url version) :test #'equal))
'
153
(setf *updated-pages*
'
154
(loop :for (page-url version) :in *updated-pages*
'
155
;; If there are unclassified urls from this page, don't mark good
'
156
;; (could be spam!)
'
157
:if (notany (lambda (x) (and (string= page-url (second x))
'
158
(string= version (third x))))
'
159
*urls-to-classify*)
'
160
:do (format t "; Updating last-known good of ~s from ~a to ~a~%" page-url (gethash page-url *last-known-good*) version)
'
161
(setf (gethash page-url *last-known-good*) version)
'
162
:else
'
163
:collect (list page-url version))))
'
164
'
165
(defun numstring+1 (numstring)
'
166
(write-to-string (1+ (parse-integer numstring))))
'
167
2011-06-13 pix
168
(defun revert-page (url current-version to-version)
22:35:27 '
169
(multiple-value-bind (page status headers)
2011-06-29 pix
170
(drakma:http-request (format nil "http://cliki.net/edit/~a" url)
03:08:25 '
171
:method :post
'
172
:parameters `(("version" . ,current-version)
'
173
("T0" . "BODY")
'
174
("E0" . ,(get-cliki-source url to-version))
'
175
("summary" . "Spam detected, reverting to Known-Good.")
'
176
("captcha" . "lisp")
'
177
("name" . "Claki (Revertobot Beta)")))
2011-06-14 pix
178
(declare (ignore headers))
2011-06-13 pix
179
(cond
22:35:27 '
180
((and (= status 200)
'
181
(not (search "rejected" page :test #'char-equal)))
2011-06-14 pix
182
(format t "; Reverted page ~s to version ~a.~%" url to-version)
2011-06-16 pix
183
(pushnew (list url (numstring+1 current-version)) *ignore-update* :test #'equal)
2011-06-13 pix
184
page)
22:35:27 '
185
(t nil))))
2011-06-13 pix
186
19:08:27 '
187
(defun get-cliki-source (url version)
'
188
"Fetches the source text of a given version of a cliki page. That is, it
'
189
returns the text you should POST to revert a cliki page to the given version."
2011-06-15 pix
190
(or (find-in-cache url version)
09:21:48 '
191
(multiple-value-bind (page status headers)
2011-06-29 pix
192
(drakma:http-request (format nil "http://cliki.net/~a?source&v=~a" url version))
2011-06-15 pix
193
(declare (ignore headers))
09:21:48 '
194
(cond
'
195
((= 200 status)
'
196
;; We have to coerce to a simple-string because rucksack mistakenly
'
197
;; assumes non-simple strings have a fill pointer
'
198
(cache-known-good url version (coerce page 'simple-string))
'
199
page)
'
200
(t (error "crap!"))))))
'
201
'
202
#+(or) (get-cliki-source "araneida" "281")
2011-06-13 pix
203
2011-06-14 pix
204
(defun attended-revert-new-spam ()
2011-06-13 pix
205
(mapcar #'parse-page (get-recent-changes))
22:35:27 '
206
(attendant))
'
207
'
208
(defun attendant ()
'
209
(classify-unknown-urls)
2011-06-16 pix
210
(update-last-known-good)
2011-06-13 pix
211
(mark-known-goods)
22:35:27 '
212
(revert-spam))
'
213
2011-06-14 pix
214
(defun unattended-revert-new-spam ()
2011-06-13 pix
215
(mapcar #'parse-page (get-recent-changes))
2011-06-16 pix
216
(update-last-known-good)
2011-06-13 pix
217
(revert-spam))
2011-06-13 pix
218
2011-06-14 pix
219
#+(or) (attended-revert-new-spam)
08:41:24 '
220
#+(or) (attendant)
2011-06-13 pix
221
2011-06-13 pix
222
(defun seconds (s) s)
22:58:32 '
223
(defun minutes (m) (* (seconds 60) m))
'
224
(defun hours (h) (* (minutes 60) h))
2011-06-15 pix
225
(defun plus-or-minus (x y) (+ (- x y) (random (* 2 y))))
2011-06-13 pix
226
2011-06-28 pix
227
(defconstant +simple-time+ '(:year #\- :month #\- :day #\Space :hour #\: :min #\: :sec))
2011-06-14 pix
228
2011-06-28 pix
229
(defun run-for-a-while ()
2011-06-18 pix
230
(dotimes (i 20)
2011-07-01 pix
231
(sleep (minutes (plus-or-minus 30 5)))
2011-06-18 pix
232
(format t "; Unattented run at ~a~%" (local-time:format-timestring nil (local-time:now) :format +simple-time+))
2011-06-14 pix
233
(unattended-revert-new-spam)
2011-06-18 pix
234
(save-state)))
2011-06-14 pix
235
2011-06-18 pix
236
#+(or) (sb-thread:make-thread #'run-for-a-while :name "cliki reverter")
2011-06-14 pix
237
^L
2011-06-28 pix
238
(defvar *state-file* (or #.*load-pathname* #p"/home/pixel/repos/claki/state/"))
2011-06-14 pix
239
2011-06-15 pix
240
(defmacro with-rucksack-and-transaction ((rucksack) (&rest root-vars) &body body)
09:21:48 '
241
(with-unique-names (rest)
'
242
`(rucksack:with-rucksack (,rucksack *state-file*)
'
243
(rucksack:with-transaction ()
'
244
(destructuring-bind (&optional ,@root-vars &rest ,rest) (ordered-roots rs:*rucksack*)
'
245
(declare (ignore ,rest) ,@(when (member '_ root-vars) `((ignore _))))
'
246
,@body)))))
'
247
'
248
(defun ordered-roots (sack)
'
249
(sort (rs:rucksack-roots sack) #'< :key #'rs:object-id))
'
250
'
251
(defun initialize-rucksack ()
'
252
(with-rucksack-and-transaction (rs) ()
'
253
(let ((roots (rs:rucksack-roots rs:*rucksack*)))
'
254
(dotimes (i (- 2 (length roots)))
'
255
(rs:add-rucksack-root (make-instance 'rs:btree :key< 'string<) rs:*rucksack*)))))
'
256
'
257
#+(or) (initialize-rucksack)
'
258
2011-06-14 pix
259
(defun save-state ()
2011-06-15 pix
260
(with-rucksack-and-transaction (rs) (btree)
09:21:48 '
261
(rs:btree-insert btree 'spam-urls *spam-urls*)
'
262
(rs:btree-insert btree 'ham-urls *okay-urls*)
2011-06-16 pix
263
(rs:btree-insert btree 'known-good *last-known-good*)
05:58:14 '
264
(rs:btree-insert btree 'updated-pages *updated-pages*)
'
265
(rs:btree-insert btree 'our-updates *ignore-update*)))
2011-06-14 pix
266
18:17:05 '
267
#+(or) (save-state)
'
268
'
269
(defun restore-state ()
2011-06-15 pix
270
(with-rucksack-and-transaction (rs) (btree)
09:21:48 '
271
(setf *spam-urls* (rs:btree-search btree 'spam-urls)
'
272
*okay-urls* (rs:btree-search btree 'ham-urls)
2011-06-16 pix
273
*last-known-good* (rs:btree-search btree 'known-good)
05:58:14 '
274
*updated-pages* (rs:btree-search btree 'updated-pages)
'
275
*ignore-update* (rs:btree-search btree 'our-updates))))
2011-06-14 pix
276
18:17:05 '
277
#+(or) (restore-state)
2011-06-15 pix
278
#+(or) (with-rucksack-and-transaction (rs) ()
09:21:48 '
279
(ordered-roots rs:*rucksack*))
'
280
'
281
(defun cache-known-good (cliki-page version content)
'
282
(with-rucksack-and-transaction (rs) (_ btree)
'
283
(rs:btree-insert btree cliki-page (list version content))))
'
284
'
285
(defun find-in-cache (cliki-page version)
'
286
(with-rucksack-and-transaction (rs) (_ btree)
'
287
(destructuring-bind (&optional cached-version cached-content)
'
288
(rs:btree-search btree cliki-page :default-value nil :errorp nil)
'
289
(and cached-version
'
290
(string= cached-version version)
'
291
cached-content))))
'
292
'
293
#+(or) (find-in-cache "araneida" "281")