Scrapy & BeautifulSoup Based Python Spider
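# Usage sketch (an assumption, not spelled out in the original gist): save this
# file as e.g. spider.py next to a download.txt that lists one seed URL per
# line; lines starting with '#' are skipped as comments, e.g.:
#
#   # seed URLs for the spider
#   https://example.com/
#
# then run it directly:  python spider.py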
from bs4 import BeautifulSoup
from urllib import request
import urllib.request
import requests
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import lxml.html
import os
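# setupGlobals(): module-wide switches -- spiderDebug turns on verbose output,
# logFile decides whether results are also written to spiderLog.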
def setupGlobals():
    global footer, spiderLog, spiderDebug, logFile, fileLocation
    spiderDebug = True
    logFile = True
    spiderLog = "spiderLog.txt"
    footer = "\n==========================\n"
    fileLocation = "./"
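# Data: simple shared container for the seed URLs read from download.txt.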
class Data:
    urlsList = []
class SpiderCrawler(CrawlSpider):
    name = "CrawlSpider"
    rules = (
        # Extract links matching 'category.php' (but not matching 'subsection.php')
        # and follow links from them (since no callback means follow=True by default).
        Rule(LinkExtractor(allow=(r'category\.php',), deny=(r'subsection\.php',))),
        # Extract links matching 'item.php' and parse them with the spider's method parse_item
        Rule(LinkExtractor(allow=(r'item\.php',)), callback='parse_item'),
    )

    def __init__(self, category=None, *args, **kwargs):
        super(SpiderCrawler, self).__init__(*args, **kwargs)
        self.start_urls = Data.urlsList
        print("Initiating SpiderCrawler...")
        self.headers = {
            'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.52 Safari/537.36",
            'From': 'www.google.com/',
            'Accept-Encoding': ', '.join(('gzip', 'deflate')),
            'Accept': '*/*',
            'Connection': 'keep-alive',
        }
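    # readFile(): load seed URLs from a plain-text file into Data.urlsList,
    # skipping blank lines and lines that start with '#'.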
    def readFile(self, file_name):
        fileHeader = "\n=====FileData=====\n"
        print(fileHeader)
        with open(file_name, "r") as f:
            for line in f:
                print(line)
                stripped = line.strip()
                # Skip blank lines and comment lines.
                if not stripped or stripped.startswith("#"):
                    continue
                if spiderDebug:
                    print("\nFound URL: " + stripped + "\n")
                Data.urlsList.append(stripped)
        urlsHeader = "\n========UrlsList==========\n"
        print(urlsHeader)
        for url in Data.urlsList:
            print(url)
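    # parse_item(): callback for the 'item.php' Rule above; pulls the item's
    # id, name, and description out of the page via XPath.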
    def parse_item(self, response):
        self.logger.info('Hi, this is an item page! %s', response.url)
        # A plain dict is used here because a bare scrapy.Item has no declared fields.
        item = {}
        item['id'] = response.xpath('//td[@id="item_id"]/text()').re(r'ID: (\d+)')
        item['name'] = response.xpath('//td[@id="item_name"]/text()').extract()
        item['description'] = response.xpath('//td[@id="item_description"]/text()').extract()
        return item
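    # start_requests(): turn every URL in start_urls into a scrapy.Request,
    # attaching the custom headers defined in __init__.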
    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, headers=self.headers)
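    # printLinks(): fetch every seed URL with requests, parse it with
    # BeautifulSoup, and print (and optionally log) each anchor's href.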
    def printLinks(self):
        linksHeader = "\n===========Links==========\n"
        if logFile:
            file.write(linksHeader)
        for url in Data.urlsList:
            r = requests.get(url)
            data = r.text
            soup = BeautifulSoup(data, "lxml")
            for link in soup.find_all('a'):
                href = link.get('href')
                print(href)
                if logFile and href:
                    file.write(href + "\n")
        print("\n=====================\n")
    def getLinks(self):
        links = []
        for url in Data.urlsList:
            connection = request.urlopen(url)
            dom = lxml.html.fromstring(connection.read())
            hrefs = dom.xpath('//a/@href')  # select the url in href for all a tags (links)
            if spiderDebug:
                print("=====Links======\n")
            for link in hrefs:
                print(link)
                if logFile:
                    file.write(link + "\n")
            links.extend(hrefs)
        return links
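    # getURL(), SpiderSite() and make_soup(): small fetch helpers -- the first
    # returns a requests response, the other two return a parsed BeautifulSoup
    # document.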
    def getURL(self, url):
        session = requests.Session()
        session.headers.update(self.headers)
        response = session.get(url)
        return response
    def SpiderSite(self, url):
        r = requests.get(url)
        data = r.text
        soup = BeautifulSoup(data, "lxml")
        return soup

    def make_soup(self, url):
        html = request.urlopen(url).read()
        return BeautifulSoup(html, "lxml")
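    # get_category_links(): collect every href on a section page and prefix it
    # with the section URL to build absolute category links.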
    def get_category_links(self, section_url):
        html = request.urlopen(section_url).read()
        soup = BeautifulSoup(html, "lxml")
        anchors = soup.find_all("a", href=True)
        category_links = [section_url + a["href"] for a in anchors]
        if spiderDebug:
            print("\nCategoryLinks:\n" + str(category_links))
        return category_links
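    # parse_page() and parse_start_url(): XPath-based response callbacks that
    # pull title/link pairs out of a result page.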
    def parse_page(self, response):
        self.log("\n\n\n We got data! \n\n\n")
        sites = response.xpath("//ol[@id='result-set']/li")
        for site in sites:
            item = {}
            item['title'] = site.xpath('./h2/a/text()').extract()
            item['link'] = site.xpath('./h2/a/@href').extract()
            yield item
    # CrawlSpider reserves parse() for its own rule handling, so the generic
    # page callback is exposed as parse_start_url() instead.
    def parse_start_url(self, response):
        titles = response.xpath("//span[@class='html']")
        for title_node in titles:
            title = title_node.xpath("a/text()").extract()
            link = title_node.xpath("a/@href").extract()
            print(title, link)
            if logFile:
                file.write(dataHeader)
                file.write("{0} | {1}\n".format(title, link))
        return titles
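    # Download helpers: get() returns raw bytes via urllib, downloadFile() and
    # download() write those bytes to disk, and getData() demonstrates the same
    # with urlretrieve() and a streamed requests download.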
    def get(self, url):
        with request.urlopen(url) as r:
            return r.read()

    def downloadFile(self, url, file=None):
        if not file:
            file = url.split('/')[-1]
        with open(file, 'wb') as f:
            f.write(self.get(url))
    def download(self, url, file_name):
        # open in binary mode
        with open(file_name, "wb") as f:
            # self.get() already returns raw bytes, so write them directly
            f.write(self.get(url))
    def getData(self, url):
        file_name = os.path.expanduser('~/Downloads/web/data.txt')
        fileName = os.path.expanduser('~/Downloads/web/data_download.txt')
        response = urllib.request.urlopen(url)
        data = response.read()        # a `bytes` object
        text = data.decode('utf-8')   # a `str`; this step can't be used if data is binary
        # Download the file from `url` and save it locally under `file_name`:
        urllib.request.urlretrieve(url, file_name)
        # Download the file from `url`, save it in a temporary directory and get the
        # path to it (e.g. '/tmp/tmpb48zma.txt') in the `file_name` variable:
        file_name, headers = urllib.request.urlretrieve(url)
        req = requests.get(url, stream=True)
        with open(fileName, 'wb') as f:
            for chunk in req.iter_content(100000):
                f.write(chunk)
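    # extractURLS(): fetch a page with requests and return every anchor tag
    # BeautifulSoup finds in it.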
    def extractURLS(self, url):
        r = requests.get(url)
        data = r.text
        soup = BeautifulSoup(data, "lxml")
        links = soup.find_all("a")
        if spiderDebug:
            print("\n===========URLS=================\n")
            for item in links:
                print(item)
            print("\n================================\n")
        return links
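# Script entry point: open the log file, build the crawler, read the seed URLs
# from download.txt, then fetch each one and log every link found on it.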
if __name__ == "__main__":
    setupGlobals()
    file = open(spiderLog, 'w')
    head = "\n=========SpiderInfo===========\n"
    print(head)
    print("Setting Up The Spider...\n")
    crawler = SpiderCrawler()
    print("Adding URLS From download.txt To The Spider Crawl List...\n")
    try:
        crawler.readFile('download.txt')
    except Exception as e:
        print("Error:\n" + str(e))
    dataHeader = "\n===========SpiderData==============\n"
    print(dataHeader)
    if logFile:
        file.write(dataHeader)
    for uri in Data.urlsList:
        urlHeader = "\n=============Base URL=================\n" + uri + "\n========================================\n"
        print(urlHeader)
        if logFile:
            file.write(urlHeader)
        links = crawler.SpiderSite(uri)
        for link in links.find_all('a'):
            href = link.get('href')
            print(href)
            if logFile and href:
                file.write(href + "\n")
    if logFile:
        file.write(footer)
    print("===========================")
    print("{0} Saved To {1}".format(file.name, fileLocation))
    print("===========================")
    file.close()