# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import unicodedata
import re

import scrapy
from scrapy.loader.processors import TakeFirst, MapCompose


def slugify(value, allow_unicode=False):
    """
    Convert to ASCII if 'allow_unicode' is False. Convert spaces to hyphens.
    Remove characters that aren't alphanumerics, underscores, or hyphens.
    Convert to lowercase. Also strip leading and trailing whitespace.
    """
    value = str(value)
    if allow_unicode:
        value = unicodedata.normalize('NFKC', value)
    else:
        value = unicodedata.normalize('NFKD', value).encode(
            'ascii', 'ignore').decode('ascii')
    value = re.sub(r'[^\w\s-]', '', value.lower()).strip()
    return re.sub(r'[-\s]+', '-', value)


class ScrapeimagesItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()
    book_name = scrapy.Field(
        input_processor=MapCompose(slugify),
        output_processor=TakeFirst()
    )
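The item's processors only run when the item is populated through an ItemLoader. Below is a minimal sketch of a spider that does so; the spider name, start URL, and CSS selectors are placeholders for illustration and are not part of the gist.

from scrapy import Spider
from scrapy.loader import ItemLoader

from ..items import ScrapeimagesItem  # relative import, assuming the standard Scrapy project layout


class BooksSpider(Spider):
    name = 'books'  # hypothetical spider name
    start_urls = ['http://books.toscrape.com/']  # placeholder start URL

    def parse(self, response):
        for book in response.css('article.product_pod'):  # placeholder selector
            loader = ItemLoader(item=ScrapeimagesItem(), selector=book)
            # book_name passes through MapCompose(slugify), then TakeFirst()
            loader.add_css('book_name', 'h3 a::attr(title)')
            # ImagesPipeline expects image_urls to be a list of absolute URLs;
            # the default output processor keeps the collected values as a list
            loader.add_value(
                'image_urls',
                response.urljoin(book.css('img::attr(src)').get()))
            yield loader.load_item()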
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline


class ScrapeimagesPipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        # Pass the slugified book name along with each image request so that
        # file_path() can use it as the filename.
        return [Request(x, meta={'BookName': item.get('book_name')})
                for x in item.get(self.images_urls_field, [])]

    def file_path(self, request, response=None, info=None):
        # start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('ImagesPipeline.image_key(url) and file_key(url) methods are deprecated, '
                          'please use file_path(request, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from image_key or file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() or image_key() methods have been overridden
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        elif not hasattr(self.image_key, '_base'):
            _warn()
            return self.image_key(url)
        # end of deprecation warning block

        # Name the file after the book rather than the default URL hash;
        # ':' is stripped because it is not allowed in Windows filenames.
        filename = request.meta['BookName'].replace(':', '')
        return 'full/%s.jpg' % filename
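The middle of file_path() is a compatibility shim copied from Scrapy 1.x. On a Scrapy version where the deprecated file_key()/image_key() methods have been removed, the shim is not only unnecessary but breaks (self.file_key no longer exists), so the pipeline can be reduced to the sketch below; same behaviour, assuming the same item fields as above.

from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline


class ScrapeimagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        return [Request(x, meta={'BookName': item.get('book_name')})
                for x in item.get(self.images_urls_field, [])]

    def file_path(self, request, response=None, info=None):
        filename = request.meta['BookName'].replace(':', '')
        return 'full/%s.jpg' % filename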
ITEM_PIPELINES = {
    '<project_name>.pipelines.ScrapeimagesPipeline': 1,  # please replace <project_name> with your project name
}
IMAGES_STORE = 'C:\\Users\\username\\Desktop'  # please replace username with your username
DOWNLOAD_TIMEOUT = 1200
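With these settings and the file_path() override above, each downloaded image is stored under IMAGES_STORE as full/<slugified-book-name>.jpg, and the pipeline records the stored path and checksum of each download in the item's images field. DOWNLOAD_TIMEOUT raises the per-request timeout from Scrapy's default of 180 seconds to 20 minutes, which gives large image files more time to finish downloading.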