p3t3r67x0 · January 28, 2017 20:54
diff --git a/link_crawler.py b/link_crawler.py
 #!/usr/bin/env python

 import os
 import sys
 import time
 import pymongo
 import requests
 import urlparse
 import datetime
 from lxml import html
 from urlparse import urljoin
 from fake_useragent import UserAgent
 from lxml.etree import ParserError
 from lxml.etree import XMLSyntaxError
 from requests.exceptions import Timeout
 from requests.exceptions import InvalidSchema
 from requests.exceptions import MissingSchema
 from requests.exceptions import ConnectionError
 from requests.exceptions import ChunkedEncodingError
 from requests.exceptions import TooManyRedirects
 from pymongo.errors import WriteError


 def get_connection(database_url):
    """Open the `certs` database on the MongoDB server at `database_url`.

    Also requests a unique index on the `url` field of the `links`
    collection before the handle is handed back.

    :param database_url: address passed straight to `pymongo.MongoClient`.
    :returns: the `certs` database handle.
    """
    database = pymongo.MongoClient(database_url)['certs']

    # The unique index is what lets the database reject URLs it already holds.
    database.links.create_index('url', unique=True)
    return database


 def get_data(db, skip, limit):
    """Return a slice of the documents stored in the `links` collection.

    The `_id` field is excluded from every returned document.

    :param db: database handle exposing a `links` collection.
    :param skip: start index of the slice.
    :param limit: stop index of the slice (exclusive) - note that it is
        applied as `[skip:limit]`, i.e. as an end position, not as a count.
    :returns: whatever slicing the `find` result yields.
    """
    everything = {}
    without_id = {'_id': False}
    window = slice(skip, limit)
    return db.links.find(everything, without_id)[window]


 def add_data(db, data):
    """Store every URL in `data` in the `links` collection and report each save.

    Each URL is inserted as ``{'url': <link>, 'created': <unix timestamp>}``.
    URLs the database rejects with a write error (for example duplicates,
    when a unique index exists on `url`) are skipped, so a single rejected
    link never aborts the rest of the batch.

    :param db: database handle exposing a `links` collection.
    :param data: iterable of URL strings.
    """
    for link in data:
        post = {'url': link, 'created': int(time.time())}

        try:
            post_id = db.links.insert_one(post).inserted_id
        except (WriteError, pymongo.errors.DuplicateKeyError):
            # Rejected by the database (typically an already known URL).
            continue

        try:
            # Only plain-ASCII URLs are reported; the document itself has
            # already been stored at this point either way.
            link = link.encode('ascii')
        except UnicodeError:
            continue

        print('INFO: The url {}, was succefully saved with the id {}'.format(link, post_id))


 def get_links(ua, url):
    """Download `url` and return the targets of its `<a href>` attributes.

    Hrefs starting with `#`, `+`, `javascript:` or `mailto:` are ignored.
    Every other href is reduced to its path component and joined onto either
    the link itself (when it is absolute, i.e. starts with `http`) or `url`.
    Hrefs that cannot be parsed as a URL are skipped.

    :param ua: object whose `chrome` attribute supplies the User-Agent header.
    :param url: address of the page to download (1 second timeout).
    :returns: list of URL strings, or None when the download fails or the
        response cannot be parsed as HTML.
    """
    try:
        headers = {'User-Agent': ua.chrome}
        res = requests.get(url, timeout=1, headers=headers)
        content = res.text
    except (Timeout, InvalidSchema, MissingSchema, ConnectionError, TooManyRedirects, ChunkedEncodingError):
        return None

    try:
        doc = html.document_fromstring(content)
    except (ValueError, ParserError, XMLSyntaxError):
        return None

    url_list = []

    for link in doc.xpath('//a/@href'):
        link = link.strip()

        if link.startswith(('#', '+', 'javascript:', 'mailto:')):
            continue

        try:
            if link.startswith(('/', '..')):
                # urljoin resolves root-relative and parent-relative ('..')
                # references itself; stripping the dots by hand would point
                # the link at the wrong directory or even the wrong host.
                link = urljoin(url, link)

            base = link if link.startswith('http') else url
            url_list.append(urljoin(base, urlparse.urlparse(link).path))
        except ValueError:
            # Malformed href (urlparse/urljoin refused it): skip this link
            # instead of aborting the whole page.
            continue

    return url_list


 def main():
    """Crawl one URL from the command line, or re-crawl stored URLs.

    Connects to the MongoDB server at ``localhost:27017``. When a URL is
    given as the first command-line argument only that page is crawled;
    otherwise the stored URLs in the slice 0..4000 are crawled. The links
    found on each page are handed to `add_data`.
    """
    db = get_connection('localhost:27017')
    ua = UserAgent()

    # Decide explicitly whether an argument was given. Relying on an
    # IndexError around the whole crawl would also swallow unrelated
    # IndexErrors raised while fetching or storing links.
    if len(sys.argv) > 1:
        urls = [sys.argv[1]]
    else:
        skip = 0
        limit = 4000
        urls = (entry['url'] for entry in get_data(db, skip, limit))

    for url in urls:
        data = get_links(ua, url)

        if data is not None:
            add_data(db, data)

 # Start the crawler only when this file is executed as a script.
 if __name__ == '__main__':
    main()
	#!/usr/bin/env python

	import os
	import sys
	import time
	import pymongo
	import requests
	import urlparse
	import datetime
	from lxml import html
	from urlparse import urljoin
	from fake_useragent import UserAgent
	from lxml.etree import ParserError
	from lxml.etree import XMLSyntaxError
	from requests.exceptions import Timeout
	from requests.exceptions import InvalidSchema
	from requests.exceptions import MissingSchema
	from requests.exceptions import ConnectionError
	from requests.exceptions import ChunkedEncodingError
	from requests.exceptions import TooManyRedirects
	from pymongo.errors import WriteError


	def get_connection(database_url):
	    """Open the `certs` database on the MongoDB server at `database_url`.

	    Also requests a unique index on the `url` field of the `links`
	    collection before the handle is handed back.

	    :param database_url: address passed straight to `pymongo.MongoClient`.
	    :returns: the `certs` database handle.
	    """
	    client = pymongo.MongoClient(database_url)
	    db = client['certs']

	    # The unique index is what lets the database reject URLs it already holds.
	    db.links.create_index('url', unique=True)
	    return db


	def get_data(db, skip, limit):
	    """Return a slice of the documents stored in the `links` collection.

	    The `_id` field is excluded from every returned document.

	    :param db: database handle exposing a `links` collection.
	    :param skip: start index of the slice.
	    :param limit: stop index of the slice (exclusive) - note that it is
	        applied as `[skip:limit]`, i.e. as an end position, not as a count.
	    :returns: whatever slicing the `find` result yields.
	    """
	    data = db.links.find({}, {'_id': False})[skip:limit]
	    return data


	def add_data(db, data):
	    """Store every URL in `data` in the `links` collection and report each save.

	    Each URL is inserted as ``{'url': <link>, 'created': <unix timestamp>}``.
	    URLs the database rejects with a write error (for example duplicates,
	    when a unique index exists on `url`) are skipped, so a single rejected
	    link never aborts the rest of the batch.

	    :param db: database handle exposing a `links` collection.
	    :param data: iterable of URL strings.
	    """
	    for link in data:
	        post = {'url': link, 'created': int(time.time())}

	        try:
	            post_id = db.links.insert_one(post).inserted_id
	        except (WriteError, pymongo.errors.DuplicateKeyError):
	            # Rejected by the database (typically an already known URL).
	            continue

	        try:
	            # Only plain-ASCII URLs are reported; the document itself has
	            # already been stored at this point either way.
	            link = link.encode('ascii')
	        except UnicodeError:
	            continue

	        print('INFO: The url {}, was succefully saved with the id {}'.format(link, post_id))


	def get_links(ua, url):
	    """Download `url` and return the targets of its `<a href>` attributes.

	    Hrefs starting with `#`, `+`, `javascript:` or `mailto:` are ignored.
	    Every other href is reduced to its path component and joined onto either
	    the link itself (when it is absolute, i.e. starts with `http`) or `url`.
	    Hrefs that cannot be parsed as a URL are skipped.

	    :param ua: object whose `chrome` attribute supplies the User-Agent header.
	    :param url: address of the page to download (1 second timeout).
	    :returns: list of URL strings, or None when the download fails or the
	        response cannot be parsed as HTML.
	    """
	    try:
	        headers = {'User-Agent': ua.chrome}
	        res = requests.get(url, timeout=1, headers=headers)
	        content = res.text
	    except (Timeout, InvalidSchema, MissingSchema, ConnectionError, TooManyRedirects, ChunkedEncodingError):
	        return None

	    try:
	        doc = html.document_fromstring(content)
	    except (ValueError, ParserError, XMLSyntaxError):
	        return None

	    url_list = []

	    for link in doc.xpath('//a/@href'):
	        link = link.strip()

	        if link.startswith(('#', '+', 'javascript:', 'mailto:')):
	            continue

	        try:
	            if link.startswith(('/', '..')):
	                # urljoin resolves root-relative and parent-relative ('..')
	                # references itself; stripping the dots by hand would point
	                # the link at the wrong directory or even the wrong host.
	                link = urljoin(url, link)

	            base = link if link.startswith('http') else url
	            url_list.append(urljoin(base, urlparse.urlparse(link).path))
	        except ValueError:
	            # Malformed href (urlparse/urljoin refused it): skip this link
	            # instead of aborting the whole page.
	            continue

	    return url_list


	def main():
	    """Crawl one URL from the command line, or re-crawl stored URLs.

	    Connects to the MongoDB server at ``localhost:27017``. When a URL is
	    given as the first command-line argument only that page is crawled;
	    otherwise the stored URLs in the slice 0..4000 are crawled. The links
	    found on each page are handed to `add_data`.
	    """
	    db = get_connection('localhost:27017')
	    ua = UserAgent()

	    # Decide explicitly whether an argument was given. Relying on an
	    # IndexError around the whole crawl would also swallow unrelated
	    # IndexErrors raised while fetching or storing links.
	    if len(sys.argv) > 1:
	        urls = [sys.argv[1]]
	    else:
	        skip = 0
	        limit = 4000
	        urls = (entry['url'] for entry in get_data(db, skip, limit))

	    for url in urls:
	        data = get_links(ua, url)

	        if data is not None:
	            add_data(db, data)

	# Start the crawler only when this file is executed as a script.
	if __name__ == '__main__':
	    main()