Move automatic URL classification into its own function.
Mon Jun 13 22:58:13 UTC 2011 pix@kepibu.org
* Move automatic URL classification into its own function.
diff -rN -u old-claki/claki.lisp new-claki/claki.lisp
--- old-claki/claki.lisp 2013-07-22 03:24:02.000000000 +0000
+++ new-claki/claki.lisp 2013-07-22 03:24:02.000000000 +0000
@@ -52,25 +52,31 @@
("#footer > b" . #t(list ?b))) page)
(let ((current-version (oh-ducks.traversal:element-content b)))
(dolist (link a)
- (let ((url (oh-ducks.traversal:element-attribute :href link))
- (rel (oh-ducks.traversal:element-attribute :rel link)))
+ (let ((url (oh-ducks.traversal:element-attribute :href link)))
(cond
((or (gethash url *okay-urls*)
(gethash (url-domain url) *okay-urls*))
#+(or) (do-nothing))
((or (gethash url *spam-urls*)
- (gethash (url-domain url) *spam-urls*))
- (pushnew (list page-url current-version) *has-spam* :test #'equal))
- ((and (stringp rel)
- (or (string-equal "follow" rel)
- (string-equal "dofollow" rel)))
- (setf (gethash url *spam-urls*) t)
+ (gethash (url-domain url) *spam-urls*)
+ (auto-classify link))
(pushnew (list page-url current-version) *has-spam* :test #'equal))
(t
(pushnew (list url page-url current-version) *urls-to-classify* :test #'equal)))))))))
#+(or) (parse-page "araneida")
+(defun auto-classify (link)
+ "Auto-classify LINK's URL based upon traits common to spammers. Returns T after recording the URL in *spam-urls*, or NIL when no trait matched."
+ (let ((rel (oh-ducks.traversal:element-attribute :rel link))
+ (url (oh-ducks.traversal:element-attribute :href link)))
+ (cond
+ ((and (stringp rel) ; guard: only compare when rel is actually a string
+ (or (string-equal "follow" rel) ; string-equal ignores case
+ (string-equal "dofollow" rel)))
+ (setf (gethash url *spam-urls*) t)) ; remember the URL as spam; the stored T is the return value
+ (t nil)))) ; no spam trait matched
+
(defun request-classification (url)
(restart-case (error 'simple-error :format-control "Please classify the URL ~s."
:format-arguments (list url))