Mon Jun 13 22:58:13 UTC 2011 pix@kepibu.org * Move automatic URL classification into its own function. diff -rN -u old-claki/claki.lisp new-claki/claki.lisp --- old-claki/claki.lisp 2013-07-18 20:08:44.000000000 +0000 +++ new-claki/claki.lisp 2013-07-18 20:08:44.000000000 +0000 @@ -52,25 +52,31 @@ ("#footer > b" . #t(list ?b))) page) (let ((current-version (oh-ducks.traversal:element-content b))) (dolist (link a) - (let ((url (oh-ducks.traversal:element-attribute :href link)) - (rel (oh-ducks.traversal:element-attribute :rel link))) + (let ((url (oh-ducks.traversal:element-attribute :href link))) (cond ((or (gethash url *okay-urls*) (gethash (url-domain url) *okay-urls*)) #+(or) (do-nothing)) ((or (gethash url *spam-urls*) - (gethash (url-domain url) *spam-urls*)) - (pushnew (list page-url current-version) *has-spam* :test #'equal)) - ((and (stringp rel) - (or (string-equal "follow" rel) - (string-equal "dofollow" rel))) - (setf (gethash url *spam-urls*) t) + (gethash (url-domain url) *spam-urls*) + (auto-classify link)) (pushnew (list page-url current-version) *has-spam* :test #'equal)) (t (pushnew (list url page-url current-version) *urls-to-classify* :test #'equal))))))))) #+(or) (parse-page "araneida") +(defun auto-classify (link) + "Auto-classify URLs based upon traits common to spammers." + (let ((rel (oh-ducks.traversal:element-attribute :rel link)) + (url (oh-ducks.traversal:element-attribute :href link))) + (cond + ((and (stringp rel) + (or (string-equal "follow" rel) + (string-equal "dofollow" rel))) + (setf (gethash url *spam-urls*) t)) + (t nil)))) + (defun request-classification (url) (restart-case (error 'simple-error :format-control "Please classify the URL ~s." :format-arguments (list url))