Skip to content

Instantly share code, notes, and snippets.

@harveyslash
Created November 25, 2016 04:31
Show Gist options
  • Save harveyslash/423395a0914b78386452183eb1281591 to your computer and use it in GitHub Desktop.
Save harveyslash/423395a0914b78386452183eb1281591 to your computer and use it in GitHub Desktop.
import scrapy
import os
import urllib
import thread
from scrapy.selector import Selector
import cookie
class GoodReadsSpider(scrapy.Spider):
    """Spider that downloads Goodreads book-cover images grouped by shelf.

    For every shelf name in ``categories`` the spider requests the paginated
    shelf listing, follows each book link found there, and saves the book's
    cover image into a local directory named after the shelf.
    """

    baseURL = "https://www.goodreads.com"
    # NOTE(review): not referenced by any method visible in this class.
    meta = {'dont_redirect': True, "handle_httpstatus_list": [301, 302, 303]}
    name = "datasetGoodReads"
    # Page numbers are produced with range(), so pageCountEnd is exclusive.
    pageCountStart = 1
    pageCountEnd = 500
    categories = [
        "horror",
        "art",
        "business",
        "science-fiction"
    ]
    # Filled in with (shelf name, page number).
    skeletonURL = baseURL + "/shelf/show/{}?page={}"

    def start_requests(self):
        """Yield one request per (shelf, page) pair.

        Also creates the output directory of each shelf, relative to the
        current working directory, if it does not exist yet.
        """
        print("started")
        for category in self.categories:
            print(category)
            if not os.path.exists(category):
                os.makedirs(category)
            for page in range(self.pageCountStart, self.pageCountEnd):
                finalURL = self.skeletonURL.format(category, page)
                print(finalURL)
                yield scrapy.Request(
                    finalURL,
                    self.crawlBasePage,
                    meta={"category": category},
                    cookies=cookie.cookiesData,
                )

    def downloadImage(self, url, location):
        """Download ``url`` to the file path ``location``.

        This is started on background threads by the callbacks, so a failed
        download is reported here instead of being left as an unhandled
        exception in the thread.
        """
        print("downlading image", url, " ", location)
        try:
            print(urllib.urlretrieve(url, location))
        except IOError as error:
            print("image download failed", url, error)

    def doCrawl(self, response):
        """Debug helper: print the extracted ``html`` element(s) of the response."""
        print(response.css("html").extract())

    def crawlInfoPage(self, response):
        """Callback for a book page: download its cover image.

        The target file name is the last two path segments of the image URL
        joined together, placed inside the directory of the shelf stored in
        ``response.meta['category']``.
        """
        print("I HAVE REACHED")
        link = response.xpath('//img[@id="coverImage"]/@src').extract_first()
        print(link)
        if link is None:
            # extract_first() returns None when the page has no cover image;
            # there is nothing to download in that case.
            return
        urlParts = link.split("/")
        contatenatedName = os.path.join(
            response.meta['category'], urlParts[-2] + urlParts[-1]
        )
        thread.start_new_thread(self.downloadImage, (link, contatenatedName,))

    def crawlBasePage(self, response):
        """Callback for a shelf listing page: follow every book link on it."""
        links = response.css('a.bookTitle::attr(href)').extract()
        print("!!!!!!!!!!!!!", len(links))
        if not links:
            # A listing page without book links (for example a page number
            # past the end of the shelf) yields no further requests.
            return
        print(links[0])
        # Forward only the shelf name. Copying the whole response.meta would
        # also copy Scrapy's per-request bookkeeping keys onto the new
        # requests.
        childMeta = {"category": response.meta["category"]}
        for href in links:
            bookURL = self.baseURL + href
            print(bookURL)
            yield scrapy.Request(
                bookURL,
                self.crawlInfoPage,
                meta=dict(childMeta),
                cookies=cookie.cookiesData,
            )

    def parse(self, response):
        """Default callback: download every ``img.s-access-image`` on the page.

        None of the requests built in this class use this callback; they all
        name their callback explicitly. It expects ``response.meta`` to carry
        a ``category`` key.
        """
        print(response.css("html").extract())
        listOfBooks = response.css('img.s-access-image::attr(src)').extract()
        for imageURL in listOfBooks:
            target = os.path.join(
                response.meta['category'], imageURL.split("/")[-1]
            )
            thread.start_new_thread(self.downloadImage, (imageURL, target,))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment