Skip to content

Instantly share code, notes, and snippets.

@rafikahmed
Last active November 24, 2019 17:36
Show Gist options
  • Select an option

  • Save rafikahmed/d965ad10d4ec6124589b2455c6efdae0 to your computer and use it in GitHub Desktop.

Select an option

Save rafikahmed/d965ad10d4ec6124589b2455c6efdae0 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
import scrapy
class ChannelSpider(scrapy.Spider):
    """Spider that scrapes channel listings from channelcrawler.com.

    Starts from a fixed results URL, yields one dict per matched channel
    ``div`` and follows the pagination "next" link until none is found.
    Every request is sent through the proxy in ``_PROXY`` with the
    browser-like User-Agent in ``_USER_AGENT``.
    """

    name = 'channel'
    allowed_domains = ['channelcrawler.com']

    # Request settings shared by the initial and the follow-up requests,
    # kept in one place so the two call sites cannot drift apart.
    # NOTE(review): the address looks like a local docker-machine endpoint
    # -- confirm it is reachable where the spider runs.
    _PROXY = "192.168.99.100:32770"
    _USER_AGENT = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    )

    @staticmethod
    def _stat_text(row, index):
        """Return the whitespace-normalised ``index``-th text node of the
        row's first ``<p>/<small>`` element ('' when the node is absent)."""
        query = 'normalize-space(.//p[1]/small/text()[%d])' % index
        return row.xpath(query).get() or ''

    @staticmethod
    def _first_token(text):
        """Return the part of ``text`` before the first space."""
        return text.split(' ')[0]

    @staticmethod
    def _after_colon(text):
        """Return the segment following the first ':' in ``text``.

        Returns ``None`` when ``text`` contains no colon. Indexing the
        split result unconditionally would raise ``IndexError`` there, and
        an exception inside ``parse`` aborts the generator, losing the
        remaining rows of the page and the next-page request.
        """
        pieces = text.split(':')
        return pieces[1] if len(pieces) > 1 else None

    def start_requests(self):
        """Yield the initial request for the results listing."""
        yield scrapy.Request(
            url="https://channelcrawler.com/eng/results/136614",
            # ``dont_redirect`` is set on this first request only; the
            # follow-up page requests in ``parse`` do not set it.
            meta={"proxy": self._PROXY, "dont_redirect": True},
            callback=self.parse,
            headers={'User-Agent': self._USER_AGENT},
        )

    def parse(self, response):
        """Yield one item per channel row, then a request for the next page.

        Args:
            response: The Scrapy response for a results page.

        Yields:
            dict: Channel fields keyed by 'Channel Name', 'Category',
                'Subscriber', 'Total videos', 'Total Views', 'Join Date'
                and 'Country'. 'Join Date' is ``None`` when the source
                text has no ':' separator.
            scrapy.Request: Request for the next results page, when the
                pagination block contains a "next" link.
        """
        rows = response.xpath("//div[contains(@class,'channel')]")
        for each_row in rows:
            yield {
                'Channel Name': each_row.xpath('.//h4/a/text()').get(),
                'Category': each_row.xpath('.//small/b/text()').get(),
                'Subscriber': self._first_token(self._stat_text(each_row, 1)),
                'Total videos': self._first_token(self._stat_text(each_row, 2)),
                'Total Views': self._first_token(self._stat_text(each_row, 3)),
                'Join Date': self._after_colon(self._stat_text(each_row, 4)),
                'Country': each_row.xpath('.//h4/img/@title').get(),
            }

        next_page = response.xpath(
            '//ul[@class="pagination"]/li[@class="next"]/a/@href').get()
        if next_page:
            # The href may be relative; resolve it against the current page.
            next_page_link = response.urljoin(next_page)
            yield scrapy.Request(
                next_page_link,
                callback=self.parse,
                meta={"proxy": self._PROXY},
                headers={"User-Agent": self._USER_AGENT},
            )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment