# -*- coding: utf-8 -*-
"""
Created on Sun May 2 12:01:41 2021

@author: Haya Halimeh
"""
import json

import scrapy
from scrapy.selector import Selector

NUMBER_OF_DEBATES = 1


class CrawlerDebatesSpider(scrapy.Spider):
    name = 'debate_crawler'

    def start_requests(self):
        # start from the opinions index, sorted by popularity
        urls = ['https://www.debate.org/opinions/?sort=popular']
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse_url)

    def parse_url(self, response):
        debates = response.css('#opinions-list .a-image-contain')
        # follow the first NUMBER_OF_DEBATES debates from the popular list
        for i in range(NUMBER_OF_DEBATES):
            relative_url = debates[i].css('a::attr(href)').get()
            debate_url = f'https://www.debate.org{relative_url}'
            # follow each debate URL and scrape the page in parse()
            yield response.follow(debate_url, callback=self.parse)

    def parse_more(self, response, topic, categorie):
        # the argument service wraps an HTML fragment in a JSON envelope
        # under the key "d"; parse the fragment with a fresh Selector
        fragment = Selector(text=json.loads(response.text)["d"])
        # the fragment is assumed to reuse the page's argument containers;
        # verify these selectors against a live response before relying on them
        pro_list = self.extract_arguments(fragment.css('div#yes-arguments li'))
        con_list = self.extract_arguments(fragment.css('div#no-arguments li'))
        yield {"topic": topic, "categorie": categorie,
               "pro_arguments": pro_list, "con_arguments": con_list}

    def parse(self, response):
        # retrieve the debate topic and category using CSS selectors
        topic = response.css('div.r-contain h1.qh-debate span.q-title ::text').get()
        categorie = response.css('div#breadcrumb a::text')[2].get()
        # the page renders only the first few arguments; the rest are loaded
        # through an AJAX POST, which is replicated here to fetch up to 50
        # arguments per side in one request (handled in parse_more)
        load_more_id = response.css('.debate-more-btn').attrib["onclick"].strip().split("'")[1]
        load_more_url = 'https://www.debate.org/opinions/~services/opinions.asmx/GetDebateArgumentPage'
        params = {
            "debateId": load_more_id,
            "pageNumber": 1,
            "itemsPerPage": 50,
            "ysort": 5,
            "nsort": 5,
        }
        yield response.follow(
            url=load_more_url,
            callback=self.parse_more,
            method="POST",
            body=json.dumps(params),
            headers={'Content-Type': 'application/json'},
            # pass the page-level fields through to the callback
            cb_kwargs={"topic": topic, "categorie": categorie},
        )

    def extract_arguments(self, items):
        # collect {title, body} dicts from argument <li> nodes; titles sit in
        # '.hasData h2' (sometimes wrapped in a link), bodies in <p> tags
        arguments = []
        # the last <li> is skipped: it is assumed to be the "load more"
        # placeholder rather than an argument
        for item in items[:-1]:
            title = item.css('.hasData h2::text').getall() or item.css('.hasData h2 a::text').getall()
            body = item.css('p::text').getall()
            arguments.append({'title': title, 'body': body})
        return arguments