#!/usr/bin/env python
# -*- coding: utf-8 -*-
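
# What this script does: fetch each start URL, follow links that stay on the
# same registered domain, and run every discovered URL through SURBL,
# Spamhaus and the local Google Safe Browsing database. Hits are printed and
# appended to assets.txt.
#
# Rough usage (one host or URL per line in the input file):
#   python <this-script> urls.txt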
import re
import sys
import time
import codecs
import gevent
import logging
import urlnorm
import datetime
import urllib
import urlparse
import requests
import tldextract

from gsb import client
from pprint import pprint
from gsb import datastore
from bs4 import BeautifulSoup
from spam.surbl import SurblChecker
from spam.spamhaus import SpamHausChecker

# Unicode fixup
UTF8Writer = codecs.getwriter('utf8')
sys.stdout = UTF8Writer(sys.stdout)

urlsseen = set()       # every URL we have already fetched or queued
urlschecked = dict()   # per-domain lookup cache, keyed 'surbl-', 'sh-' or 'sb-' + domain
cookiejar = None       # cookies carried over between requests
ds = None              # Safe Browsing datastore
sbc = None             # Safe Browsing client

safebrowse_apikey = 'YourAPIKeyHere'
debug = False
want_safebrowse = True
want_spamhaus = False

def RateLimited(maxPerSecond):
    """
    Decorator that limits the wrapped function to maxPerSecond calls per second.
    """
    minInterval = 1.0 / float(maxPerSecond)

    def decorate(func):
        lastTimeCalled = [0.0]

        def rateLimitedFunction(*args, **kwargs):
            # time.time() measures wall-clock time; time.clock() would measure
            # CPU time on Unix and never throttle properly.
            elapsed = time.time() - lastTimeCalled[0]
            leftToWait = minInterval - elapsed
            if leftToWait > 0:
                time.sleep(leftToWait)
            ret = func(*args, **kwargs)
            lastTimeCalled[0] = time.time()
            return ret
        return rateLimitedFunction
    return decorate

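
# Example (illustrative only; the decorator is defined above but not applied
# anywhere in this script): throttle a fetch helper to two requests/second.
#
#   @RateLimited(2)
#   def polite_get(url):
#       return requests.get(url)
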
def safebrowse_init(apikey, storename):
    """Open (or create) the local Safe Browsing datastore and client."""
    global ds, sbc

    ds = datastore.DataStore(storename)

    sbc = client.Client(ds,
                        apikey=apikey,
                        use_mac=True)

def find_url(txt):
    """Return every line of txt that consists entirely of an http(s) URL."""
    urlfinder = re.compile(  # stolen from django
        r'^https?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE | re.MULTILINE)
    # The pattern only contains non-capturing groups, so findall() already
    # returns the full matches; indexing each match would only have returned
    # its first character.
    return urlfinder.findall(txt)

def fix_urls(urls, hostinfo):
    """Resolve relative and protocol-relative links against the page they appeared on."""
    ret = []
    for url in urls:
        if not url:
            continue

        if not urlparse.urlparse(url).scheme:
            url = url.encode('utf8', 'ignore')
            # urljoin() handles path-relative ("foo/bar"), root-relative
            # ("/foo/bar") and protocol-relative ("//host/foo") references.
            url = urlnorm.norm(urlparse.urljoin(hostinfo['fullurl'], url))

        # drop empty fragment markers
        if url.endswith('#'):
            url = url[:-1]

        # javascript: pseudo-links cannot be fetched
        if url.startswith('javascript:'):
            continue

        #print "fixed up url on %s: %s" % (hostinfo['hostname'], url)
        ret.append(url)
    return ret

def get_domain(url):
    """Return the registered domain (domain + public suffix) of a URL."""
    domain = tldextract.extract(url)
    result = domain.domain + "." + domain.tld
    return result

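
# Illustrative example: get_domain('http://www.example.co.uk/index.html')
# returns 'example.co.uk', since tldextract separates the registered domain
# from its public suffix. (Newer tldextract releases name the .tld attribute
# .suffix; this script assumes the older API.)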
def check_surbl(url):
    global urlschecked

    domain = get_domain(url)

    # skip links we cannot handle
    if url.startswith('http://') or url.startswith('https://'):
        # short circuit (caching is good!)
        if "surbl-" + domain in urlschecked:
            return urlschecked["surbl-" + domain]
        checker = SurblChecker()
        try:
            ret = checker.is_spam(url)
        except IndexError as e:
            print "Whoops, trying again later: %s" % e
            return False
        urlschecked["surbl-" + domain] = ret
        return ret
    else:
        return False

def check_spamhaus(url):
    global urlschecked, want_spamhaus

    if not want_spamhaus:
        return False

    domain = get_domain(url)

    # skip links we cannot handle
    if url.startswith('http://') or url.startswith('https://'):
        # short circuit (caching is good!)
        if "sh-" + domain in urlschecked:
            return urlschecked["sh-" + domain]
        checker = SpamHausChecker()
        try:
            ret = checker.is_spam(url)
        except Exception as e:
            print "Whoops, trying again later: %s" % e
            return False
        urlschecked["sh-" + domain] = ret
        return ret
    else:
        return False

def check_safebrowse(url):
    global urlschecked, want_safebrowse, cookiejar, sbc

    ret = False

    if not want_safebrowse:
        return False

    if url.startswith('javascript:'):
        return False

    try:
        # percent-encode anything the lookup cannot digest; encode to UTF-8
        # first so quote() never chokes on non-ASCII characters
        url = urllib.quote(url.encode('utf-8'), safe="%/:=&?~#+!$,;'@()*[]")

        # only the registered domain is looked up (and cached)
        url = get_domain(url)

        if 'sb-' + url in urlschecked:
            return urlschecked['sb-' + url]

        ## Lookup API (slow!)
        # checkurl = "https://sb-ssl.google.com/safebrowsing/api/lookup?client=firefox&apikey=%s&appver=1.5.2&pver=3.0" % safebrowse_apikey
        # payload = {'1': url}
        # ret = requests.post(checkurl, data=payload)

        matches = sbc.CheckUrl(url, debug_info=True)
        if len(matches) == 0:
            ret = False
        else:
            for listname, match, addchunknum in matches:
                if ret:
                    ret += '%s: addchunk number: %d: %s\n' % (listname, addchunknum, match)
                else:
                    ret = '%s: addchunk number: %d: %s\n' % (listname, addchunknum, match)

    except Exception as ex:
        print "SBC: Skipped this url: %s\nReason: %s" % (url, ex)
        ret = False

    urlschecked['sb-' + url] = ret

    return ret

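
# Note on the return value: check_safebrowse() yields False for clean URLs and
# otherwise a newline-separated string of '<listname>: addchunk number: <n>:
# <match>' entries, which extract_urls() prints and logs verbatim.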
def extract_urls(r, hostinfo):
    global urlsseen

    # Make sure r actually contains something, otherwise
    # we throw exceptions
    if r is None:
        return

    urls = []

    # check mime type and act accordingly
    content_type = r.headers.get('content-type', '')
    if content_type.startswith('text/html'):
        soup = BeautifulSoup(r.content)

        urls = [link.get('src') for link in soup.find_all('script')]
        urls += [link.get('href') for link in soup.find_all('a')]
        urls += [link.get('src') for link in soup.find_all('iframe')]
        urls += [link.get('href') for link in soup.find_all('link')]
        urls += [link.get('url') for link in soup.find_all('applet')]
        urls += [link.get('data') for link in soup.find_all('object')]
        print "Found %d references in markup" % len(urls)
    elif content_type.startswith('application/javascript'):
        # just look for stuff that looks like a URI
        urls = find_url(r.text)
        pprint(urls)
    elif content_type.startswith('text/plain'):
        # just look for stuff that looks like a URI
        urls = find_url(r.text)
        pprint(urls)
    else:
        # anything else?
        return []

    if urls:
        # fix up b0rked urls (e.g. relative links)
        urls = fix_urls(urls, hostinfo)

        # preventively strip out urls we have already seen
        # (build a new list instead of removing while iterating)
        urls = [url for url in urls if url not in urlsseen]

        for url in urls:
            if check_surbl(url):
                print "Malicious domain found on %s:\n\t %s" % (hostinfo['fullurl'], url)
                with open('assets.txt', 'a') as f:
                    f.write('SURBL :' + str(hostinfo['fullurl']) + '\t=>\t' + url + '\n')
            if check_spamhaus(url):
                print "Spamhaus domain found on %s:\n\t %s" % (hostinfo['fullurl'], url)
                with open('assets.txt', 'a') as f:
                    f.write('SPAMHAUS:' + str(hostinfo['fullurl']) + '\t=>\t' + url + '\n')
            ret = check_safebrowse(url)
            if ret:
                print "SAFEBROWSE: %s -> %s" % (hostinfo['fullurl'], ret)
                with open('assets.txt', 'a') as f:
                    f.write('SAFEBROWSE: %s -> %s\n' % (hostinfo['fullurl'], ret))

        print "Saw %d new links on this page." % len(urls)
        return urls
    else:
        return []

def print_url(r, *args, **kwargs):
    """Response hook: remember every URL we actually fetched (after redirects)."""
    global urlsseen

    if r is None:
        return

    urlsseen.add(r.url)

def recurse_url(urls, domain):
    global urlsseen, cookiejar

    domain = get_domain(domain)

    while True:
        # prune empty entries and links we have already visited
        # (build a new list instead of removing while iterating)
        urls = [url for url in urls if url and url not in urlsseen]

        if len(urls) == 0:
            return

        print "urls contains %d elements" % len(urls)

        hooks = {'response': print_url}

        for url in urls:

            # don't investigate a link if we have already seen it
            if url in urlsseen:
                #print "Not fetching %s. (%d in cache, %d pending)" % (url, len(urlsseen), len(urls))
                continue
            urlsseen.add(url)

            # stay on the domain we started from
            if get_domain(url) != domain:
                #print "%s != %s, not fetching" % (get_domain(url), domain)
                continue

            if url.startswith('javascript:'):
                continue

            if url.startswith('mailto:'):
                continue

            print "Fetching %s. (%d in cache, %d pending)" % (url, len(urlsseen), len(urls))
            headers = {  # Let's pretend we're Internet Explorer, because we can
                'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0',
            }
            try:
                response = requests.get(url, hooks=hooks, headers=headers, cookies=cookiejar)
            except Exception as ex:
                print "Whoops... %s" % ex
                continue

            # carry the session cookies over to the next request
            cookiejar = response.cookies
            pprint(cookiejar.get_dict())

            hostinfo = {'hostname': urlparse.urlparse(url).hostname.encode('utf8'),
                        'scheme': urlparse.urlparse(url).scheme.encode('utf8'),
                        'fullurl': url.encode('utf8')}

            # queue whatever new links this page yields; the next pass of the
            # while loop prunes the ones we have already visited
            items = extract_urls(response, hostinfo)
            if items:
                urls = urls + [item for item in items if item is not None]

def main():
    global debug, safebrowse_apikey

    if debug:
        logging.basicConfig(level=logging.DEBUG)

    if want_safebrowse:
        print "Checking datastore for SBC"
        safebrowse_init(safebrowse_apikey, 'sbcstore')

    if len(sys.argv) < 2:
        sys.exit('Need list of urls to crawl')

    urllist = []
    for line in open(sys.argv[1]):
        url = line.strip()
        if not url:
            continue
        if not url.startswith('http'):
            url = 'http://' + url
        print "added %s" % url
        urllist.append(url)

    for url in urllist:
        recurse_url([url], url)

if __name__ == '__main__':
    sys.exit(main())