Created for the blog AlreadyCoded. Check out the thread.
License: Do What The Fuck You Want To Public License
- BeautifulSoup4
- CouchDB
- Logging
- Dumping
- Static Page Parsing
A Simple Crawling and Scraping Algorithm Using Python
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
Version 2, December 2004

Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>

Everyone is permitted to copy and distribute verbatim or modified
copies of this license document, and changing it is allowed as long
as the name is changed.

DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION

0. You just DO WHAT THE FUCK YOU WANT TO.
```python
import re, requests, logging, os, time, uuid, sys
from bs4 import BeautifulSoup as bs
from couchdb import ResourceConflict
#from selenium import webdriver
#from selenium.webdriver.common.keys import Keys


class Crawler(object):
    # Class-level logger: DEBUG and above goes to a timestamped file, INFO and above to the terminal.
    logger = logging.getLogger("crawler_logger")
    logger.setLevel(logging.DEBUG)
    os.makedirs(os.path.join(os.getcwd(), "logs"), exist_ok=True)  # the file handler needs logs/ to exist
    file_handler = logging.FileHandler(os.getcwd() + "/logs/crawler_{}.log".format(str(int(round(time.time() * 1000)))))
    file_handler.setLevel(logging.DEBUG)
    terminal_handler = logging.StreamHandler()
    terminal_handler.setLevel(logging.INFO)
    log_format = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    file_handler.setFormatter(log_format)
    terminal_handler.setFormatter(log_format)
    logger.addHandler(terminal_handler)
    logger.addHandler(file_handler)

    def __init__(self, idn, couch, rule, *args):
        self.logger.info("Initializing Crawler object on '{}' domain with {} urls.".format(args[0], len(args)))
        self.idn = idn + "_" + str(uuid.uuid1())
        self.couch = couch
        # Default rule: follow every relative link, i.e. every href starting with "/"
        if rule is None:
            self.rule = "^/"
        else:
            self.rule = rule
        self.domain = args[0]
        self.urls = list(args)
        self.crawled = []
        self.limit = [0, 0]  # [maximum pages to crawl (0 = unlimited), pages crawled so far]
        self.sleep = 1       # seconds to wait between requests

    def crawl(self, proxies=None):
        # Keep going while there are urls left in the queue
        while self.urls:
            if self.limit[0] != 0:
                if self.limit[0] == self.limit[1]:
                    self.logger.info("Limit reached, returning.")
                    break
            try:
                self.logger.info("Connecting to {}...".format(self.urls[0]))
                response = requests.get(self.urls[0], proxies=proxies)
                time.sleep(self.sleep)
                response.encoding = "utf-8"
                self.logger.debug("Analyzing structures...")
                soup = bs(response.text, "html.parser")
                links = soup.find_all("a", {"href": re.compile(self.rule)})
                hrefs = [x.attrs["href"] for x in links]
                self.logger.info("Checking whether links were already crawled...")
                for href in hrefs:
                    if self.domain[0:-1] + href in self.crawled:
                        self.logger.warning("{} already crawled.".format(str(href)))
                    else:
                        self.urls.append(self.domain[0:-1] + href)
                self.crawled.append(self.urls[0])
                # Remove the url that was just crawled from the front of the queue
                del self.urls[0]
                self.limit[1] += 1
                self.logger.debug("Updating database {}".format(self.idn))
                try:
                    self.couch.database[self.idn] = {
                        "idn": self.idn,
                        "rule": self.rule,
                        "domain": self.domain,
                        "urls": self.urls,
                    }
                except ResourceConflict:
                    # The document already exists: fetch its revision and overwrite it
                    revno = self.couch.database[self.idn]["_rev"]
                    self.couch.database[self.idn] = {
                        "_rev": revno,
                        "idn": self.idn,
                        "rule": self.rule,
                        "domain": self.domain,
                        "urls": self.urls,
                    }
                self.logger.info("Reached count {}/{}".format(self.limit[1], self.limit[0]))
            except Exception as e:
                exc_type, exc_obj, exc_tb = sys.exc_info()
                filename = exc_tb.tb_frame.f_code.co_filename
                lineno = exc_tb.tb_lineno
                self.logger.error("Crawling function raised an error in line {} @ {}, passing: {}".format(lineno, filename, str(e)))
        if len(self.urls) == 0:
            self.logger.info("No url left to crawl, returning.")
            return
```
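For orientation, here is a minimal usage sketch of the Crawler. The credentials, database name, start url and limit are placeholders, not part of the original code, and it assumes a CouchDB server is reachable through the `Couch` helper defined further below.

```python
# Hypothetical usage sketch: credentials, database name and url are placeholders.
couch = Couch("admin", "secret")        # Couch helper class is defined further below
couch.select_database("crawl_demo")

# rule=None falls back to "^/", i.e. follow every relative link on the page
crawler = Crawler("demo", couch, None, "https://example.com/")
crawler.limit = [10, 0]                 # stop after 10 pages (0 means no limit)
crawler.sleep = 2                       # wait 2 seconds between requests
crawler.crawl()

print(len(crawler.crawled), "pages visited")
```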
```python
class Scraper(object):
    # Class-level logger, set up the same way as the Crawler's: DEBUG to file, INFO to terminal.
    logger = logging.getLogger("scraper_logger")
    logger.setLevel(logging.DEBUG)
    os.makedirs(os.path.join(os.getcwd(), "logs"), exist_ok=True)  # the file handler needs logs/ to exist
    file_handler = logging.FileHandler(os.getcwd() + "/logs/scraper_{}.log".format(str(int(round(time.time() * 1000)))))
    file_handler.setLevel(logging.DEBUG)
    terminal_handler = logging.StreamHandler()
    terminal_handler.setLevel(logging.INFO)
    log_format = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    file_handler.setFormatter(log_format)
    terminal_handler.setFormatter(log_format)
    logger.addHandler(terminal_handler)
    logger.addHandler(file_handler)

    def __init__(self, idn, couch):
        self.idn = idn + "_" + str(uuid.uuid1())
        self.rules = []   # list of (title, tag, attrs) tuples
        self.data = {}    # url -> list of (rule, extracted texts) pairs
        self.couch = couch
        self.sleep = 1    # seconds to wait between requests

    def add_rule(self, title, tag, attrs):
        # A rule is a (title, tag, attrs) tuple; tag and attrs are passed straight to find_all
        self.rules.append((title, tag, attrs))
        self.logger.info("{} rule added.".format(str((title, tag, attrs))))

    def select_rules(self, **where):
        # Return every rule whose title, tag or attrs matches the given keyword arguments
        selected = []
        for item in self.rules:
            try:
                if item[0] in where["title"]:
                    selected.append(item)
                elif item[1] in where["tag"]:
                    selected.append(item)
                elif item[2] in where["attrs"]:
                    selected.append(item)
            except Exception:
                continue
        return selected

    def remove_rules(self, *rules):
        for item in rules:
            self.rules.remove(item)
        self.logger.info("{} rules removed.".format(str(rules)))

    def scrap(self, *urls, proxies=None):
        for url in urls:
            self.logger.info("Connecting to {}...".format(url))
            response = requests.get(url, proxies=proxies)
            time.sleep(self.sleep)
            response.encoding = "utf-8"
            self.logger.debug("Parsing the structure...")
            soup = bs(response.text, "html.parser")
            self.data[url] = []
            for rule in self.rules:
                self.logger.debug("Scraping rule {}".format(str(rule)))
                rule_search = soup.find_all(rule[1], rule[2])
                if len(rule_search) == 0:
                    continue  # nothing on this page matched the rule
                result = [x.get_text() for x in rule_search]
                self.data[url].append((rule, result))
            self.logger.info("Updating database...")
            try:
                self.couch.database[self.idn] = {
                    "idn": self.idn,
                    "data": self.data,
                    "rules": self.rules,
                    "urls": urls,
                }
            except ResourceConflict:
                # The document already exists: fetch its revision and overwrite it
                revno = self.couch.database[self.idn]["_rev"]
                self.couch.database[self.idn] = {
                    "_rev": revno,
                    "idn": self.idn,
                    "data": self.data,
                    "rules": self.rules,
                    "urls": urls,
                }
```
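The Scraper is driven in much the same way: each rule is a `(title, tag, attrs)` tuple handed to BeautifulSoup's `find_all`, and the text of every match is collected per url. A minimal sketch with placeholder rules and urls, reusing the `couch` handle from the sketch above:

```python
# Hypothetical usage sketch: the rules and urls are placeholders.
scraper = Scraper("demo", couch)
scraper.add_rule("headlines", "h1", {})                   # every <h1> on the page
scraper.add_rule("prices", "span", {"class": "price"})    # <span class="price"> elements
scraper.scrap("https://example.com/", "https://example.com/products")

for url, results in scraper.data.items():
    for rule, texts in results:
        print(url, rule[0], len(texts), "matches")
```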
```python
import couchdb


class Couch(object):
    def __init__(self, username, password):
        self.username = username
        self.password = password
        # Connect to the local CouchDB server (http://localhost:5984 by default)
        self.server = couchdb.Server()
        self.server.resource.credentials = (username, password)
        self.database = None

    def select_database(self, dbname):
        # Create the database if it does not exist yet, otherwise open the existing one
        try:
            self.database = self.server.create(dbname)
        except Exception:
            self.database = self.server[dbname]
```
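A note on `select_database`: `Server.create` raises an exception when the database already exists, and the broad `except` simply falls back to opening the existing database, so the same call works on the first run and on every run after it. A minimal sketch, assuming a local CouchDB instance and placeholder credentials:

```python
# Hypothetical: assumes CouchDB at http://localhost:5984 with an "admin"/"secret" account.
couch = Couch("admin", "secret")
couch.select_database("crawl_demo")   # created on the first run, reopened afterwards
print("Using database:", couch.database)
```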