Created for the blog AlreadyCoded. Check out the thread.
License: Do What The Fuck You Want To Public License
- BeautifulSoup4
- CouchDB
- Logging
- Dumping
- Static Page Parsing
A Simple Crawling and Scraping Algorithm Using Python
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
Version 2, December 2004

Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>

Everyone is permitted to copy and distribute verbatim or modified
copies of this license document, and changing it is allowed as long
as the name is changed.

DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION

0. You just DO WHAT THE FUCK YOU WANT TO.
```python
import re, requests, logging, os, time, uuid, sys
from bs4 import BeautifulSoup as bs
from couchdb import ResourceConflict
#from selenium import webdriver
#from selenium.webdriver.common.keys import Keys


class Crawler(object):
    # Class-level logger: DEBUG and above goes to a timestamped file, INFO and above to the terminal.
    logger = logging.getLogger("crawler_logger")
    logger.setLevel(logging.DEBUG)
    os.makedirs(os.path.join(os.getcwd(), "logs"), exist_ok=True)  # the file handler needs logs/ to exist
    file_handler = logging.FileHandler(os.getcwd() + "/logs/crawler_{}.log".format(str(int(round(time.time() * 1000)))))
    file_handler.setLevel(logging.DEBUG)
    terminal_handler = logging.StreamHandler()
    terminal_handler.setLevel(logging.INFO)
    log_format = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    file_handler.setFormatter(log_format)
    terminal_handler.setFormatter(log_format)
    logger.addHandler(terminal_handler)
    logger.addHandler(file_handler)

    def __init__(self, idn, couch, rule, *args):
        self.logger.info("Initializing Crawler object on '{}' domain with {} urls.".format(args[0], len(args)))
        self.idn = idn + "_" + str(uuid.uuid1())
        self.couch = couch
        # Default rule: follow every relative link, i.e. every href starting with "/"
        if rule is None:
            self.rule = "^/"
        else:
            self.rule = rule
        self.domain = args[0]
        self.urls = list(args)
        self.crawled = []
        self.limit = [0, 0]  # [maximum pages to crawl (0 = unlimited), pages crawled so far]
        self.sleep = 1       # seconds to wait between requests

    def crawl(self, proxies=None):
        # Keep going while there are urls left in the queue
        while self.urls:
            if self.limit[0] != 0:
                if self.limit[0] == self.limit[1]:
                    self.logger.info("Limit reached, returning.")
                    break
            try:
                self.logger.info("Connecting to {}...".format(self.urls[0]))
                response = requests.get(self.urls[0], proxies=proxies)
                time.sleep(self.sleep)
                response.encoding = "utf-8"
                self.logger.debug("Analyzing structures...")
                soup = bs(response.text, "html.parser")
                links = soup.find_all("a", {"href": re.compile(self.rule)})
                hrefs = [x.attrs["href"] for x in links]
                self.logger.info("Checking whether links were already crawled...")
                for href in hrefs:
                    if self.domain[0:-1] + href in self.crawled:
                        self.logger.warning("{} already crawled.".format(str(href)))
                    else:
                        self.urls.append(self.domain[0:-1] + href)
                self.crawled.append(self.urls[0])
                # Remove the url that was just crawled from the front of the queue
                del self.urls[0]
                self.limit[1] += 1
                self.logger.debug("Updating database {}".format(self.idn))
                try:
                    self.couch.database[self.idn] = {
                        "idn": self.idn,
                        "rule": self.rule,
                        "domain": self.domain,
                        "urls": self.urls,
                    }
                except ResourceConflict:
                    # The document already exists: fetch its revision and overwrite it
                    revno = self.couch.database[self.idn]["_rev"]
                    self.couch.database[self.idn] = {
                        "_rev": revno,
                        "idn": self.idn,
                        "rule": self.rule,
                        "domain": self.domain,
                        "urls": self.urls,
                    }
                self.logger.info("Reached count {}/{}".format(self.limit[1], self.limit[0]))
            except Exception as e:
                exc_type, exc_obj, exc_tb = sys.exc_info()
                filename = exc_tb.tb_frame.f_code.co_filename
                lineno = exc_tb.tb_lineno
                self.logger.error("Crawling function raised an error in line {} @ {}, passing: {}".format(lineno, filename, str(e)))
        if len(self.urls) == 0:
            self.logger.info("No url left to crawl, returning.")
            return
```
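For orientation, here is a minimal usage sketch of the Crawler. The credentials, database name, start url and limit are placeholders, not part of the original code, and it assumes a CouchDB server is reachable through the `Couch` helper defined further below.

```python
# Hypothetical usage sketch: credentials, database name and url are placeholders.
couch = Couch("admin", "secret")        # Couch helper class is defined further below
couch.select_database("crawl_demo")

# rule=None falls back to "^/", i.e. follow every relative link on the page
crawler = Crawler("demo", couch, None, "https://example.com/")
crawler.limit = [10, 0]                 # stop after 10 pages (0 means no limit)
crawler.sleep = 2                       # wait 2 seconds between requests
crawler.crawl()

print(len(crawler.crawled), "pages visited")
```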
```python
class Scraper(object):
    # Class-level logger, set up the same way as the Crawler's: DEBUG to file, INFO to terminal.
    logger = logging.getLogger("scraper_logger")
    logger.setLevel(logging.DEBUG)
    os.makedirs(os.path.join(os.getcwd(), "logs"), exist_ok=True)  # the file handler needs logs/ to exist
    file_handler = logging.FileHandler(os.getcwd() + "/logs/scraper_{}.log".format(str(int(round(time.time() * 1000)))))
    file_handler.setLevel(logging.DEBUG)
    terminal_handler = logging.StreamHandler()
    terminal_handler.setLevel(logging.INFO)
    log_format = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    file_handler.setFormatter(log_format)
    terminal_handler.setFormatter(log_format)
    logger.addHandler(terminal_handler)
    logger.addHandler(file_handler)

    def __init__(self, idn, couch):
        self.idn = idn + "_" + str(uuid.uuid1())
        self.rules = []   # list of (title, tag, attrs) tuples
        self.data = {}    # url -> list of (rule, extracted texts) pairs
        self.couch = couch
        self.sleep = 1    # seconds to wait between requests

    def add_rule(self, title, tag, attrs):
        # A rule is a (title, tag, attrs) tuple; tag and attrs are passed straight to find_all
        self.rules.append((title, tag, attrs))
        self.logger.info("{} rule added.".format(str((title, tag, attrs))))

    def select_rules(self, **where):
        # Return every rule whose title, tag or attrs matches the given keyword arguments
        selected = []
        for item in self.rules:
            try:
                if item[0] in where["title"]:
                    selected.append(item)
                elif item[1] in where["tag"]:
                    selected.append(item)
                elif item[2] in where["attrs"]:
                    selected.append(item)
            except Exception:
                continue
        return selected

    def remove_rules(self, *rules):
        for item in rules:
            self.rules.remove(item)
        self.logger.info("{} rules removed.".format(str(rules)))

    def scrap(self, *urls, proxies=None):
        for url in urls:
            self.logger.info("Connecting to {}...".format(url))
            response = requests.get(url, proxies=proxies)
            time.sleep(self.sleep)
            response.encoding = "utf-8"
            self.logger.debug("Parsing the structure...")
            soup = bs(response.text, "html.parser")
            self.data[url] = []
            for rule in self.rules:
                self.logger.debug("Scraping rule {}".format(str(rule)))
                rule_search = soup.find_all(rule[1], rule[2])
                if len(rule_search) == 0:
                    continue  # nothing on this page matched the rule
                result = [x.get_text() for x in rule_search]
                self.data[url].append((rule, result))
            self.logger.info("Updating database...")
            try:
                self.couch.database[self.idn] = {
                    "idn": self.idn,
                    "data": self.data,
                    "rules": self.rules,
                    "urls": urls,
                }
            except ResourceConflict:
                # The document already exists: fetch its revision and overwrite it
                revno = self.couch.database[self.idn]["_rev"]
                self.couch.database[self.idn] = {
                    "_rev": revno,
                    "idn": self.idn,
                    "data": self.data,
                    "rules": self.rules,
                    "urls": urls,
                }
```
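The Scraper is driven in much the same way: each rule is a `(title, tag, attrs)` tuple handed to BeautifulSoup's `find_all`, and the text of every match is collected per url. A minimal sketch with placeholder rules and urls, reusing the `couch` handle from the sketch above:

```python
# Hypothetical usage sketch: the rules and urls are placeholders.
scraper = Scraper("demo", couch)
scraper.add_rule("headlines", "h1", {})                   # every <h1> on the page
scraper.add_rule("prices", "span", {"class": "price"})    # <span class="price"> elements
scraper.scrap("https://example.com/", "https://example.com/products")

for url, results in scraper.data.items():
    for rule, texts in results:
        print(url, rule[0], len(texts), "matches")
```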
```python
import couchdb


class Couch(object):
    def __init__(self, username, password):
        self.username = username
        self.password = password
        # Connect to the local CouchDB server (http://localhost:5984 by default)
        self.server = couchdb.Server()
        self.server.resource.credentials = (username, password)
        self.database = None

    def select_database(self, dbname):
        # Create the database if it does not exist yet, otherwise open the existing one
        try:
            self.database = self.server.create(dbname)
        except Exception:
            self.database = self.server[dbname]
```
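A note on `select_database`: `Server.create` raises an exception when the database already exists, and the broad `except` simply falls back to opening the existing database, so the same call works on the first run and on every run after it. A minimal sketch, assuming a local CouchDB instance and placeholder credentials:

```python
# Hypothetical: assumes CouchDB at http://localhost:5984 with an "admin"/"secret" account.
couch = Couch("admin", "secret")
couch.select_database("crawl_demo")   # created on the first run, reopened afterwards
print("Using database:", couch.database)
```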