Added bs4 dependency.

Removed selenium stuff (not used anyway)
master
Emiel Kollof 2014-07-07 14:46:57 +02:00
parent 9e70b87b7d
commit 897589846f
2 changed files with 1 addition and 24 deletions

@@ -9,6 +9,7 @@ Prerequisites
 - Google Safe Browsing: https://code.google.com/p/google-safe-browsing/
 - Spam Blocklist: https://pypi.python.org/pypi/spam-blocklists/0.9.3 (pip fetchable)
 - TLD extract: https://pypi.python.org/pypi/tldextract (pip fetchable)
+- BeautifulSoup
 TODO:
 ======

@@ -18,7 +18,6 @@ from gsb import client
 from pprint import pprint
 from gsb import datastore
 from bs4 import BeautifulSoup
-from selenium import webdriver
 from spam.surbl import SurblChecker
 from spam.spamhaus import SpamHausChecker
@@ -201,28 +200,6 @@ def check_safebrowse(url):
     return ret
-def js_click(url, urllist, urlindex):
-    # crap, a javascript redirect. Whip out selenium
-    print "JS: %s" % url
-    driver = webdriver.PhantomJS()
-    prevurl = urls[urlindex - 1]
-    print "JS: Loading %s first for context." % prevurl
-    # get previous page for context
-    driver.get(prevurl)
-    # wait for page load
-    while prevurl == driver.current_url:
-        time.sleep(2)
-    # get js link
-    driver.get(url)
-    while url == driver.current_url:
-        time.sleep(2)
-    url = driver.current_url
-    print "Javascript url resolved into %s" % url
 def extract_urls(r, hostinfo):
     global urlsseen
@@ -339,7 +316,6 @@ def recurse_url(urls, domain):
             continue
         if url.startswith('javascript:'):
-            js_click(url, urls, urlindex)
             continue
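
For context, the bs4 import this commit depends on typically drives link extraction of the kind extract_urls suggests. Below is a minimal illustrative sketch of that pattern, not code from this repository; the function name extract_links, the use of requests, and the example URL are all assumptions.

# Minimal sketch of BeautifulSoup-based link extraction (illustrative only;
# the helper name and the requests dependency are assumptions, not this repo's code).
import requests
from bs4 import BeautifulSoup

def extract_links(url):
    # Fetch the page and parse it with the stdlib HTML parser.
    r = requests.get(url, timeout=10)
    soup = BeautifulSoup(r.text, "html.parser")
    # Keep only anchors that actually carry an href attribute.
    return [a["href"] for a in soup.find_all("a", href=True)]

if __name__ == "__main__":
    print(extract_links("http://example.com"))

Unlike the deleted js_click helper, a parser-only approach like this cannot follow javascript: redirects, which is consistent with the commit simply skipping those URLs.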