Last active
December 6, 2017 04:40
-
-
Save iwatake2222/872467b79c8799db69dbd7b0755e4490 to your computer and use it in GitHub Desktop.
Python + Scrapyで画像を巡回取得する ref: https://qiita.com/take-iwiw/items/be292150316a3ca033eb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Create a new Scrapy project named test_scrapy and switch into its directory.
scrapy startproject test_scrapy
cd test_scrapy
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Generate a spider skeleton named save_yahoo_image for the news.yahoo.co.jp domain.
scrapy genspider save_yahoo_image news.yahoo.co.jp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Run the save_yahoo_image spider.
scrapy crawl save_yahoo_image
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import scrapy | |
from scrapy.item import Item, Field | |
class ImageItem(scrapy.Item):
    """Item carrying the image URLs found on one crawled page."""

    # Name of the sub-directory the downloaded images are stored under.
    image_directory_name = scrapy.Field()
    # URLs of the images to download for this item.
    image_urls = scrapy.Field()
    # Not assigned by the code in this project; presumably filled in by
    # Scrapy's images pipeline with the download results -- TODO confirm.
    images = scrapy.Field()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import scrapy | |
from scrapy.pipelines.images import ImagesPipeline | |
from scrapy.utils.misc import md5sum | |
# refer: https://stackoverflow.com/questions/31779995/how-to-give-custom-name-to-images-when-downloading-through-scrapy | |
class MyImagesPipeline(ImagesPipeline):
    """Images pipeline that keeps the original file name of each image.

    Full-size images are stored as
    ``full/<image_directory_name>/<file name taken from the image URL>``.
    Two different URLs that end in the same file name map to the same path,
    so the later download replaces the earlier one.
    """

    def get_media_requests(self, item, info):
        """Yield one download request per URL in ``item['image_urls']``.

        The item's ``image_directory_name`` is attached to each request's
        ``meta`` so that ``image_downloaded`` can build the storage path.
        """
        for image_url in item['image_urls']:
            yield scrapy.Request(
                image_url,
                meta={'image_directory_name': item["image_directory_name"]},
            )

    def image_downloaded(self, response, request, info, item=None):
        """Persist the downloaded image(s) and return the MD5 checksum.

        Args:
            response: Download response for the image.
            request: The request created in ``get_media_requests``; its URL
                and ``meta['image_directory_name']`` determine the path.
            info: Pipeline bookkeeping object, forwarded to the store.
            item: Accepted (and ignored) because newer Scrapy releases pass
                the originating item as a keyword argument.

        Returns:
            The checksum of the first image yielded by ``get_images``, or
            ``None`` if nothing was yielded.
        """
        import posixpath
        from urllib.parse import urlparse

        # Use only the path component of the URL so that a query string or
        # fragment ("a.jpg?w=100") does not end up in the file name.
        filename = posixpath.basename(urlparse(request.url).path)
        directory_name = request.meta['image_directory_name']

        checksum = None
        for path, image, buf in self.get_images(response, request, info):
            if checksum is None:
                buf.seek(0)
                checksum = md5sum(buf)
            width, height = image.size
            # Only the full-size image is relocated; thumbnail paths are left
            # alone so that thumbnails do not overwrite the full image.
            if path.startswith('full/'):
                # URLs ending in "/" have no file name: fall back to the
                # name generated by the base pipeline.
                path = 'full/%s/%s' % (
                    directory_name,
                    filename or posixpath.basename(path),
                )
            # NOTE(review): the stock pipeline may re-encode images, so the
            # stored bytes need not match the extension kept from the URL --
            # confirm against the Scrapy version in use.
            self.store.persist_file(
                path, buf, info,
                meta={'width': width, 'height': height})
        return checksum
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import scrapy | |
from scrapy.spiders import CrawlSpider, Rule | |
from scrapy.linkextractors import LinkExtractor | |
from test_scrapy.items import ImageItem | |
class SaveYahooImageSpider(CrawlSpider):
    """Crawl news.yahoo.co.jp and emit one ``ImageItem`` per followed page."""

    name = 'save_yahoo_image'
    allowed_domains = ["news.yahoo.co.jp"]
    start_urls = ["https://news.yahoo.co.jp"]
    rules = (
        # An empty ``allow`` puts no pattern restriction on extracted links;
        # every followed page is handed to ``parse_page``.
        Rule(LinkExtractor(allow=()), callback="parse_page", follow=True),
    )

    def parse_page(self, response):
        """Collect the ``<img src>`` URLs of a page into an ``ImageItem``.

        Args:
            response: The downloaded page.

        Returns:
            An ``ImageItem`` whose ``image_urls`` are absolute URLs and whose
            ``image_directory_name`` is the last segment of the first start
            URL (``news.yahoo.co.jp`` for the configured start URL).
        """
        print("\n>>> Parse " + response.url + " <<<")
        item = ImageItem()
        # Strip a trailing "/" first so a start URL such as
        # "https://host/" does not yield an empty directory name.
        item["image_directory_name"] = (
            self.start_urls[0].rstrip("/").rsplit("/", 1)[1]
        )
        # urljoin resolves relative, root-relative ("/a.png") and
        # protocol-relative ("//host/a.png") sources against the page URL
        # and leaves absolute URLs untouched. Blank src attributes are skipped.
        item["image_urls"] = [
            response.urljoin(src.strip())
            for src in response.xpath("//img/@src").extract()
            if src.strip()
        ]
        return item
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Enable the project's custom images pipeline; the integer is the pipeline's
# order value.
# To use Scrapy's stock pipeline instead, replace the key with
# 'scrapy.pipelines.images.ImagesPipeline'.
ITEM_PIPELINES = {'test_scrapy.pipelines.MyImagesPipeline': 1}

# Directory that downloaded images are written to.
IMAGES_STORE = './savedImages'
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment