Skip to content

Instantly share code, notes, and snippets.

@xhiroga
Created April 1, 2017 15:21
Show Gist options
  • Save xhiroga/850b3d6d324dbbae5610fb6f94e4860f to your computer and use it in GitHub Desktop.
Save xhiroga/850b3d6d324dbbae5610fb6f94e4860f to your computer and use it in GitHub Desktop.
scrapyでポプテピピックセカンドシーズンの更新をスクレイピングしたいときのためのコードです。
# -*- coding: utf-8 -*-
import scrapy
import re
import boto3
from pptp.items import PptpItem
class Popute2Spider(scrapy.Spider):
    """Scrape the "popute2" serial index page and record its newest entry.

    Every ``<td>`` inside a ``div.bookR`` element is emitted as a
    ``PptpItem`` carrying the link's ``title`` and ``url``.  While crawling,
    the entry with the greatest ``month-day`` found in its title is tracked,
    and after the crawl that entry is written to the ``pptp`` DynamoDB table
    if it is newer than the record already stored there.

    NOTE(review): titles only expose month and day (no year), so an update
    published in January will compare as older than one stored in December.
    """

    name = "popute2"
    # Scrapy expects bare domain names here, not URLs or paths; an entry
    # with a path never matches a request host in the offsite filter.
    allowed_domains = ["mangalifewin.takeshobo.co.jp"]
    start_urls = ['http://mangalifewin.takeshobo.co.jp/rensai/popute2//']
    # Runs once, when the class body is executed (i.e. at import time).
    print("this is 1st start url -> " + start_urls[0])

    # Compiled once instead of on every table cell.
    _DATE_RE = re.compile(r'\d+-\d+')
    _SERIAL_RE = re.compile(r'/(\d+)/$')
    _TABLE_NAME = 'pptp'

    def parse(self, response):
        """Yield one ``PptpItem`` per table cell, then sync the newest entry.

        Args:
            response: The Scrapy response for the index page.

        Yields:
            PptpItem: An item with ``title`` and ``url`` set (either may be
            ``None`` when the cell has no matching link attribute).
        """
        # Crawl part: remember the newest (month, day, serial) seen so far.
        latest = None
        for book in response.css("div.bookR"):
            for cell in book.css("td"):
                pptp = PptpItem()
                pptp['title'] = cell.css("a::attr('title')").extract_first()
                pptp['url'] = cell.css("a::attr('href')").extract_first()

                entry = self._parse_entry(pptp['title'], pptp['url'])
                # Tuples compare lexicographically, so 4-1 correctly beats
                # 3-31.  ``>=`` keeps the later cell on the page when two
                # entries share the same date.
                if entry is not None and (
                        latest is None or entry[:2] >= latest[:2]):
                    latest = entry

                yield pptp

        # Update DynamoDB only when at least one dated entry was found.
        if latest is not None:
            self._store_if_newer(*latest)

    def _parse_entry(self, title, url):
        """Return ``(month, day, serial)`` for one link, or ``None``.

        ``None`` is returned when the title or URL is missing, when the
        title holds no ``<digits>-<digits>`` date, or when the URL does not
        end in ``/<digits>/``.
        """
        if not title or not url:
            return None
        date_match = self._DATE_RE.search(title)
        serial_match = self._SERIAL_RE.search(url)
        if date_match is None or serial_match is None:
            return None
        month, day = date_match.group().split("-")
        return int(month), int(day), int(serial_match.group(1))

    def _store_if_newer(self, month, day, serial):
        """Write the entry to DynamoDB if it is newer than the stored one.

        An empty table is treated as "nothing stored yet", so the entry is
        written unconditionally in that case.
        """
        # Prepare the DynamoDB connection.
        client = boto3.client('dynamodb')
        scan = client.scan(TableName=self._TABLE_NAME)
        items = scan.get('Items') or []

        if items:
            # Only the first scanned item is consulted.  The original author
            # called this a rough trick that works because the table's
            # order is "latest".
            # NOTE(review): confirm that Scan really returns the latest
            # record first for this table's key schema.
            stored = items[0]
            stored_date = (int(stored['month']['N']), int(stored['day']['N']))
            if (month, day) <= stored_date:
                return

        client.put_item(
            TableName=self._TABLE_NAME,
            Item={
                'serial': {'N': str(serial)},
                'month': {'N': str(month)},
                'day': {'N': str(day)},
            },
        )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment