| 1 | # -*- coding: utf-8 -*-
|
|---|
| 2 | #
|
|---|
| 3 | # Copyright (C) 2015 Edgewall Software
|
|---|
| 4 | # Copyright (C) 2015 Dirk Stöcker <trac@dstoecker.de>
|
|---|
| 5 | # All rights reserved.
|
|---|
| 6 | #
|
|---|
| 7 | # This software is licensed as described in the file COPYING, which
|
|---|
| 8 | # you should have received as part of this distribution. The terms
|
|---|
| 9 | # are also available at http://trac.edgewall.com/license.html.
|
|---|
| 10 | #
|
|---|
| 11 | # This software consists of voluntary contributions made by many
|
|---|
| 12 | # individuals. For the exact contribution history, see the revision
|
|---|
| 13 | # history and logs, available at http://projects.edgewall.com/trac/.
|
|---|
| 14 | #
|
|---|
| 15 | # Author: Dirk Stöcker <trac@dstoecker.de>
|
|---|
| 16 |
|
|---|
| 17 | import re
|
|---|
| 18 | from dns.name import from_text
|
|---|
| 19 | from dns.resolver import NXDOMAIN, NoAnswer, NoNameservers, Timeout, query
|
|---|
| 20 |
|
|---|
| 21 | from trac.config import IntOption, ListOption
|
|---|
| 22 | from trac.core import Component, implements
|
|---|
| 23 |
|
|---|
| 24 | from tracspamfilter.api import IFilterStrategy, N_
|
|---|
| 25 |
|
|---|
| 26 |
|
|---|
class URLBlacklistFilterStrategy(Component):
    """Spam filter based on URL blacklistings.

    Requires the dnspython module from http://www.dnspython.org/.
    """
    implements(IFilterStrategy)

    karma_points = IntOption('spam-filter', 'url_blacklist_karma', '3',
        """By how many points blacklisting by a single bad URL impacts the
        overall karma of a submission.""", doc_domain='tracspamfilter')

    servers_default = 'urired.spameatingmonkey.net, multi.surbl.org, ' \
                      'dbl.spamhaus.org'
    servers = ListOption('spam-filter', 'url_blacklist_servers',
        servers_default, doc="Servers used for URL blacklisting.",
        doc_domain='tracspamfilter')

    # IFilterStrategy implementation

    def is_external(self):
        # The filter queries remote DNSBL servers, so it counts as external.
        return True

    def test(self, req, author, content, ip):
        """Check every URL found in the submission against all configured
        DNS blacklist servers.

        Returns ``(points, message, server_list)`` with negative karma
        points when at least one URL is blacklisted, or ``None`` when
        nothing was found (or the filter is not configured).
        """
        if not self._check_preconditions(req, author, content, ip):
            return

        urls = self._geturls(author + "\n" + content)
        if not urls:
            return

        self.log.debug('Checking for URL blacklisting on "%s"',
                       ", ".join(urls))

        points = 0
        servers = []

        for server in self.servers:
            for url in sorted(urls):
                self.log.debug("Checking blacklist %s for %s", server, url)
                try:
                    servers.append(self._query(url, server))
                    points -= abs(self.karma_points)
                except NXDOMAIN:
                    # Not blacklisted on this server.
                    continue
                except (Timeout, NoAnswer, NoNameservers) as e:
                    # Best-effort: a broken blacklist server must not
                    # block the submission check, only log a warning.
                    self.log.warning('Error checking URL blacklist server '
                                     '"%s" for URL "%s": %s', server, url, e)

        if points != 0:
            return points, N_("URL's blacklisted by %s"), ', '.join(servers)

    def train(self, req, author, content, ip, spam=True):
        # DNS blacklists are maintained remotely and cannot be trained.
        return 0

    # Internal methods

    def _query(self, url, server):
        """Look up `url` on the DNSBL `server`.

        Raises `NXDOMAIN` when the URL is not listed.  On a hit, returns
        a human-readable description of the listing, including the
        response code when it differs from the plain 127.0.0.1 answer.
        """
        res = query(from_text(url + '.' + server.encode('utf-8')))[0].to_text()
        if res == '127.0.0.1':
            return '%s (%s)' % (server, url)
        # strip the common part of responses
        if res.startswith('127.0.0.'):
            res = res[8:]
        elif res.startswith('127.'):
            res = res[4:]
        return '%s (%s[%s])' % (server, url, res)

    def _check_preconditions(self, req, author, content, ip):
        # Nothing to do when the filter is disabled (zero karma) or no
        # blacklist servers are configured.
        if self.karma_points == 0 or not self.servers:
            return False

        return True

    def _geturls(self, content):
        """Extract the host part of every ``//``-style URL in `content`.

        Returns a dict mapping each lower-cased host name to 0 — the dict
        is only used as a set of unique hosts.
        """
        urls = {}
        content = content.lower()
        # no IDN domains, only punycode
        urlstr = re.compile(r"^([a-z0-9][a-z0-9.-]+[a-z0-9])(.?)")

        while True:
            pos = content.find('//')
            if pos < 0:
                break
            content = content[pos + 2:]
            res = urlstr.search(content)
            if res:
                u = res.group(1)
                urls[u] = urls.get(u, 0)
                # An unexpected character right after the host usually
                # means the URL was oddly formatted — worth a warning.
                if res.group(2) not in ('"', '\'', '/', '\n', '.', '!',
                                        '?', ',', ';', ''):
                    self.log.warning("Strange URL '%s' found.", u)
        return urls
|
|---|