Scrapy boilerplate
import re
import scrapy
from scrapy.http import Request, Response


class MyscraperSpider(scrapy.Spider):
    """
    > scrapy runspider MyScraper.py -o quotes.jsonl
    """
    name = "MyScraper"  # spider name
    allowed_domains = [
        "kljdevelopers.com",
    ]
    start_urls = [
        "https://kljdevelopers.com/",
    ]

    def start_requests(self):
        """
        Initial requests to start the spider.
        It's used to define where the spider should begin crawling.
        """
        for url in self.start_urls:
            yield Request(url, dont_filter=True, callback=self.parse)

    def parse(self, response):
        """
        > scrapy crawl MyScraper
        Default callback for processing downloaded pages;
        define how to extract data from each page here.
        """
        # Extract visible text from the page (use "body ::text" so descendant
        # text nodes are included, not just text directly under <body>)
        text = ' '.join(response.css('body ::text').getall())

        # Search for patterns that resemble street addresses.
        # Note (illustration, not from the original gist): this regex matches
        # strings such as "12 Baker Street" (a number followed by two words),
        # so it is only a rough heuristic.
        address_pattern = r'\d+\s+\w+\s+\w+'
        addresses = re.findall(address_pattern, text)

        # Yield the addresses as items
        for address in addresses:
            yield {
                'address': address,
                'url': response.url,
            }

    def extract_text_without_css(self, response):
        """
        Extract visible text while ignoring CSS and JavaScript.
        """
        # Select text nodes from elements that are not <style> or <script>,
        # so stylesheet and script contents are skipped
        text_content = " ".join(
            response.css('body *:not(style):not(script)::text').getall()
        )
        # Further cleaning or normalisation can be applied here if needed
        return text_content.strip()
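
    # Example usage (a sketch, not part of the original gist): the helper above
    # can replace the inline text extraction in parse(), assuming it is called
    # on the same response object:
    #
    #   text = self.extract_text_without_css(response)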

    def parse_item(self, response):
        # Optional additional callback for item pages (placeholder)
        ...

    def closed(self, reason):
        """
        Called when the spider is closed,
        allowing you to perform cleanup or finalise data storage.
        """
        ...

    def make_requests_from_url(self, url):
        """
        Generates a Request object for a given URL.
        (Deprecated in recent Scrapy releases; prefer start_requests().)
        """
        ...

    def process_request(self, request, spider):
        """
        Downloader-middleware hook that processes outgoing requests,
        e.g. adding headers or handling cookies.
        (Normally defined on a middleware class; see the sketch after this spider.)
        """
        ...

    def process_response(self, request, response, spider):
        """
        Downloader-middleware hook that processes incoming responses,
        allowing you to modify or filter them.
        """
        ...

    def item_scraped(self, item, response, spider):
        """
        Signal handler called after an item has been scraped,
        allowing you to perform additional actions per item.
        """
        ...

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = cls(*args, **kwargs)
        spider._set_crawler(crawler)
        return spider
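
    # A minimal sketch (an assumption, not in the original gist) of wiring the
    # item_scraped handler above to Scrapy's signal bus inside from_crawler;
    # scrapy.signals and crawler.signals.connect are standard Scrapy APIs:
    #
    #   from scrapy import signals
    #
    #   @classmethod
    #   def from_crawler(cls, crawler, *args, **kwargs):
    #       spider = super().from_crawler(crawler, *args, **kwargs)
    #       crawler.signals.connect(spider.item_scraped, signal=signals.item_scraped)
    #       return spider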

    def close_spider(self, spider):
        """
        Pipeline-style hook called at the end of the spider's lifecycle;
        override it to perform actions when the spider is closed.
        (On the spider itself, use closed() instead.)
        """
        ...
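

# The process_request / process_response and close_spider placeholders above are,
# in a real Scrapy project, implemented on downloader-middleware and item-pipeline
# classes rather than on the spider itself. A minimal sketch under that assumption
# (the class names and module paths below are illustrative; DOWNLOADER_MIDDLEWARES
# and ITEM_PIPELINES are standard Scrapy settings):
class MyScraperDownloaderMiddleware:
    """
    Enable via settings, e.g.:
        DOWNLOADER_MIDDLEWARES = {"myproject.middlewares.MyScraperDownloaderMiddleware": 543}
    """

    def process_request(self, request, spider):
        # Example: add a default header to every outgoing request
        request.headers.setdefault("User-Agent", "MyScraper/1.0")
        return None  # returning None lets Scrapy continue processing the request

    def process_response(self, request, response, spider):
        # Inspect or modify the response before it reaches the spider
        return response


class MyScraperPipeline:
    """
    Enable via settings, e.g.:
        ITEM_PIPELINES = {"myproject.pipelines.MyScraperPipeline": 300}
    """

    def open_spider(self, spider):
        self.items = []

    def process_item(self, item, spider):
        self.items.append(item)
        return item

    def close_spider(self, spider):
        spider.logger.info("Collected %d items", len(self.items))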