#!/usr/bin/env python3
import sys
import time
import pymongo
import requests
from urllib.parse import urljoin, urlparse
from lxml import html
from lxml.etree import ParserError
from lxml.etree import XMLSyntaxError
from fake_useragent import UserAgent
from requests.exceptions import Timeout
from requests.exceptions import InvalidSchema
from requests.exceptions import MissingSchema
from requests.exceptions import ConnectionError
from requests.exceptions import ChunkedEncodingError
from requests.exceptions import TooManyRedirects
from pymongo.errors import WriteError
from pymongo.errors import DuplicateKeyError

def get_connection(database_url):
    # Connect to MongoDB and enforce a unique index so the same URL is never stored twice.
    client = pymongo.MongoClient(database_url)
    db = client['certs']
    db.links.create_index('url', unique=True)
    return db

def get_data(db, skip, limit):
    # Return a slice of previously stored links; pymongo maps the slice to skip/limit.
    data = db.links.find({}, {'_id': False})[skip:limit]
    return data

def add_data(db, data):
    for link in data:
        timestamp = int(time.time())
        post = {'url': link, 'created': timestamp}
        try:
            post_id = db.links.insert_one(post).inserted_id
        except (WriteError, DuplicateKeyError):
            # The unique index rejects URLs that are already in the collection.
            continue
        try:
            # Only log URLs that are plain ASCII.
            link.encode('ascii')
        except UnicodeEncodeError:
            continue
        print('INFO: The url {} was successfully saved with the id {}'.format(link, post_id))

def get_links(ua, url):
    url_list = []
    try:
        # Fetch the page with a randomized Chrome User-Agent.
        headers = {'User-Agent': ua.chrome}
        res = requests.get(url, timeout=1, headers=headers)
        content = res.text
    except (Timeout, InvalidSchema, MissingSchema, ConnectionError,
            TooManyRedirects, ChunkedEncodingError):
        return None
    try:
        doc = html.document_fromstring(content)
    except (ValueError, ParserError, XMLSyntaxError):
        return None
    links = doc.xpath('//a/@href')
    for link in links:
        link = link.strip()
        # Skip fragments, phone numbers, javascript: and mailto: pseudo-links.
        if link.startswith(('#', '+', 'javascript:', 'mailto:')):
            continue
        elif link.startswith('/'):
            link = urljoin(url, link)
        elif link.startswith('..'):
            link = urljoin(url, link.replace('..', ''))
        if link.startswith('http'):
            try:
                # Keep only scheme, host and path; query strings and fragments are dropped.
                url_list.append(urljoin(link, urlparse(link).path))
            except ValueError:
                continue
        else:
            url_list.append(urljoin(url, urlparse(link).path))
    return url_list

def main():
    db = get_connection('localhost:27017')
    ua = UserAgent()
    try:
        # With a URL on the command line, crawl that page and store its links.
        data = get_links(ua, sys.argv[1])
        if data is not None:
            add_data(db, data)
    except IndexError:
        # Without an argument, re-crawl a batch of URLs already stored in the database.
        skip = 0
        limit = 4000
        known_urls = get_data(db, skip, limit)
        for url in known_urls:
            data = get_links(ua, url['url'])
            if data is not None:
                add_data(db, data)

if __name__ == '__main__':
    main()
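
To try the script (a sketch of typical usage, assuming a MongoDB instance on the default localhost:27017 and the pymongo, requests, lxml and fake_useragent packages installed): seed the database by passing a start URL on the first run, for example `python3 crawler.py https://example.com` (the file name crawler.py is only an assumption; the gist does not name it), then run it again without arguments to follow links already stored in the certs.links collection. A quick way to inspect what has been collected so far:

# Minimal check of the crawler's output, assuming the same local MongoDB the script writes to.
import pymongo

db = pymongo.MongoClient('localhost:27017')['certs']
for doc in db.links.find({}, {'_id': False}).limit(10):
    print(doc['url'], doc['created'])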