# Gist p3t3r67x0/47137a63a91892ec6273dbbdd0c0e55f, created January 28, 2017 20:54.
# NOTE: GitHub flagged this file as containing hidden or bidirectional Unicode
# text that may be interpreted or compiled differently than it appears. Review
# it in an editor that reveals hidden Unicode characters.
#!/usr/bin/env python | |
import os | |
import sys | |
import time | |
import pymongo | |
import requests | |
import urlparse | |
import datetime | |
from lxml import html | |
from urlparse import urljoin | |
from fake_useragent import UserAgent | |
from lxml.etree import ParserError | |
from lxml.etree import XMLSyntaxError | |
from requests.exceptions import Timeout | |
from requests.exceptions import InvalidSchema | |
from requests.exceptions import MissingSchema | |
from requests.exceptions import ConnectionError | |
from requests.exceptions import ChunkedEncodingError | |
from requests.exceptions import TooManyRedirects | |
from pymongo.errors import WriteError | |
def get_connection(database_url):
    """Open the ``certs`` database and index its ``links`` collection.

    Args:
        database_url: Address passed straight to ``pymongo.MongoClient``.

    Returns:
        The ``certs`` database handle, with a unique index on the ``url``
        field of its ``links`` collection.
    """
    certs_db = pymongo.MongoClient(database_url)['certs']
    # Unique index: a second document with the same url is rejected on insert.
    certs_db.links.create_index('url', unique=True)
    return certs_db
def get_data(db, skip, limit):
    """Return a slice of the stored link documents, without their ``_id``.

    Args:
        db: Database handle exposing a ``links`` collection.
        skip: Start index of the slice.
        limit: End index of the slice (exclusive). Because this is a slice
            bound and not a count, ``limit - skip`` documents are selected.

    Returns:
        The result of slicing ``db.links.find(...)`` with ``[skip:limit]``.
    """
    projection = {'_id': False}
    all_links = db.links.find({}, projection)
    return all_links[skip:limit]
def add_data(db, data):
    """Store every URL in *data* as a new document in ``db.links``.

    Each stored document has the shape
    ``{'url': <link>, 'created': <unix timestamp in seconds>}``. Links whose
    insert is rejected with a ``WriteError`` are skipped silently; successful
    inserts of pure-ASCII links are reported on stdout.

    Args:
        db: Database handle exposing a ``links`` collection.
        data: Iterable of URL strings.
    """
    for link in data:
        post = {'url': link, 'created': int(time.time())}

        try:
            # insert_one is the supported replacement for the deprecated
            # Collection.insert; the new id is exposed as ``inserted_id``.
            post_id = db.links.insert_one(post).inserted_id
        except WriteError:
            # DuplicateKeyError is a WriteError subclass, so URLs that are
            # already stored end up here; no separate handler is needed.
            continue

        try:
            # Non-ASCII links are stored but not reported -- presumably to
            # keep the print below from failing on the console encoding.
            link.encode('ascii')
        except UnicodeEncodeError:
            continue

        print('INFO: The url {}, was succefully saved with the id {}'.format(link, post_id))
# Link prefixes that never lead to a crawlable page.
_SKIPPED_LINK_PREFIXES = ('#', '+', 'javascript:', 'mailto:')


def _resolve_link(base_url, href):
    """Return *href* as an absolute http(s) URL without query/fragment, or None."""
    href = href.strip()
    if href.startswith(_SKIPPED_LINK_PREFIXES):
        return None

    try:
        # urljoin already understands '/', '//' and '../' references, so no
        # manual rewriting of the link is needed (stripping '..' by hand
        # turned parent-relative links into root-relative ones).
        parts = urlparse.urlparse(urljoin(base_url, href))
    except ValueError:
        # Raised for malformed links such as a broken IPv6 host.
        return None

    # Anything that is not a web page (tel:, ftp:, data:, ...) is dropped
    # instead of being re-attached to the base host as a bogus path.
    if parts.scheme not in ('http', 'https'):
        return None

    # Rebuild from the parsed pieces rather than urljoin(link, path): a path
    # that starts with '//' would otherwise be re-read as a host name.
    return urlparse.urlunparse((parts.scheme, parts.netloc, parts.path, '', '', ''))


def get_links(ua, url, timeout=1):
    """Fetch *url* and return the absolute URLs of the links on that page.

    Args:
        ua: Object whose ``chrome`` attribute supplies the User-Agent header.
        url: Page to download.
        timeout: Seconds passed to ``requests.get`` as its timeout
            (defaults to 1, the previously hard-coded value).

    Returns:
        A list of http(s) URLs with params, query string and fragment
        removed, in document order (duplicates are kept), or ``None`` when
        the page cannot be fetched or parsed.
    """
    try:
        response = requests.get(url, timeout=timeout, headers={'User-Agent': ua.chrome})
        content = response.text
    except (Timeout, InvalidSchema, MissingSchema, ConnectionError,
            TooManyRedirects, ChunkedEncodingError):
        return None

    try:
        doc = html.document_fromstring(content)
    except (ValueError, ParserError, XMLSyntaxError):
        return None

    url_list = []
    for href in doc.xpath('//a/@href'):
        resolved = _resolve_link(url, href)
        if resolved is not None:
            url_list.append(resolved)
    return url_list
def main():
    """Crawl one URL from the command line, or re-crawl the stored URLs.

    With an argument, the links found on ``sys.argv[1]`` are stored. Without
    one, the stored links in the slice ``[0:4000]`` are each fetched and the
    links found on them are stored.
    """
    db = get_connection('localhost:27017')
    ua = UserAgent()

    # Decide the mode from argv explicitly. Catching IndexError around the
    # whole crawl would also swallow an IndexError raised while fetching or
    # storing and wrongly fall back to the bulk re-crawl.
    if len(sys.argv) > 1:
        start_urls = [sys.argv[1]]
    else:
        start_urls = (entry['url'] for entry in get_data(db, 0, 4000))

    for url in start_urls:
        found = get_links(ua, url)
        if found is not None:
            add_data(db, found)
# Start the crawler only when this file is run as a script, not on import.
if __name__ == "__main__":
    main()
# (GitHub page footer: "Sign up for free to join this conversation on GitHub.
# Already have an account? Sign in to comment.")