Edgewall Software

source: plugins/1.0/spam-filter/tracspamfilter/filters/url_blacklist.py

Last change on this file was 14824, checked in by Ryan J Ollos, 7 years ago

1.0.9dev: Conform to PEP8

File size: 4.4 KB
Line 
1# -*- coding: utf-8 -*-
2#
3# Copyright (C) 2015 Edgewall Software
4# Copyright (C) 2015 Dirk Stöcker <trac@dstoecker.de>
5# All rights reserved.
6#
7# This software is licensed as described in the file COPYING, which
8# you should have received as part of this distribution. The terms
9# are also available at http://trac.edgewall.com/license.html.
10#
11# This software consists of voluntary contributions made by many
12# individuals. For the exact contribution history, see the revision
13# history and logs, available at http://projects.edgewall.com/trac/.
14#
15# Author: Dirk Stöcker <trac@dstoecker.de>
16
17import re
18from dns.name import from_text
19from dns.resolver import NXDOMAIN, NoAnswer, NoNameservers, Timeout, query
20
21from trac.config import IntOption, ListOption
22from trac.core import Component, implements
23
24from tracspamfilter.api import IFilterStrategy, N_
25
26
class URLBlacklistFilterStrategy(Component):
    """Spam filter based on URL blacklistings.

    Requires the dnspython module from http://www.dnspython.org/.
    """
    implements(IFilterStrategy)

    karma_points = IntOption('spam-filter', 'url_blacklist_karma', '3',
        """By how many points blacklisting by a single bad URL impacts the
        overall karma of a submission.""", doc_domain='tracspamfilter')

    servers_default = 'urired.spameatingmonkey.net, multi.surbl.org, ' \
                      'dbl.spamhaus.org'
    servers = ListOption('spam-filter', 'url_blacklist_servers',
        servers_default, doc="Servers used for URL blacklisting.",
        doc_domain='tracspamfilter')

    # IFilterStrategy implementation

    def is_external(self):
        # Performs DNS queries against remote blacklist servers.
        return True

    def test(self, req, author, content, ip):
        """Check every host name found in `author` and `content` against
        each configured DNSBL server.

        Returns a `(points, message, data)` tuple when at least one URL is
        blacklisted, or `None` when nothing matched or the filter is
        disabled.
        """
        if not self._check_preconditions(req, author, content, ip):
            return

        urls = self._geturls(author + "\n" + content)

        if not urls:
            return

        self.log.debug('Checking for URL blacklisting on "%s"',
                       ", ".join(urls))

        points = 0
        servers = []

        for server in self.servers:
            for url in sorted(urls.keys()):
                self.log.debug("Checking blacklist %s for %s", server, url)
                try:
                    servers.append(self._query(url, server))
                    points -= abs(self.karma_points)
                except NXDOMAIN:  # not blacklisted on this server
                    continue
                # `except X, e` was Python-2-only syntax; the `as` form
                # works on Python 2.6+ and Python 3.
                except (Timeout, NoAnswer, NoNameservers) as e:
                    self.log.warning('Error checking URL blacklist server '
                                     '"%s" for URL "%s": %s', server, url, e)

        if points != 0:
            return points, N_("URL's blacklisted by %s"), ', '.join(servers)

    def train(self, req, author, content, ip, spam=True):
        # DNS blacklists cannot be trained locally.
        return 0

    # Internal methods

    def _query(self, url, server):
        """Query DNSBL `server` for `url` and return a human-readable
        description of the (positive) answer.

        Raises NXDOMAIN when the URL is not listed on that server; other
        DNS errors (Timeout, NoAnswer, NoNameservers) propagate to the
        caller.
        """
        res = query(from_text(url + '.' + server.encode('utf-8')))[0].to_text()
        if res == '127.0.0.1':
            return '%s (%s)' % (server, url)
        # strip the common part of responses
        if res.startswith('127.0.0.'):
            res = res[8:]
        elif res.startswith('127.'):
            res = res[4:]
        return '%s (%s[%s])' % (server, url, res)

    def _check_preconditions(self, req, author, content, ip):
        # Skip entirely when scoring is disabled or no servers configured.
        if self.karma_points == 0 or not self.servers:
            return False

        return True

    def _geturls(self, content):
        """Extract candidate host names (lowercased) from `content`.

        Returns a dict keyed by host name; the values are always 0 and
        unused (the dict serves as an ordered-by-key set).
        """
        urls = {}
        content = content.lower()
        # no IDN domains, only punycode
        urlstr = re.compile("^([a-z0-9][a-z0-9.-]+[a-z0-9])(.?)")

        while 1:
            pos = content.find('//')
            if pos < 0:
                break
            content = content[pos + 2:]
            res = urlstr.search(content)
            if res:
                u = res.group(1)
                urls[u] = urls.get(u, 0)
                if res.group(2) not in ('"', '\'', '/', '\n', '.', '!',
                                        '?', ',', ';', ''):
                    # use warning(): warn() is a deprecated alias
                    self.log.warning("Strange URL '%s' found.", u)
        return urls
Note: See TracBrowser for help on using the repository browser.