Scrapy boilerplate
import re

import scrapy
from scrapy.http import Request, Response
class MyscraperSpider(scrapy.Spider):
    """
    Run directly from this file:
    > scrapy runspider MyScraper.py -o quotes.jsonl
    """
    name = "MyScraper"  # spider name used by `scrapy crawl`
    allowed_domains = [
        "kljdevelopers.com",
    ]
    start_urls = [
        "https://kljdevelopers.com/",
    ]

    def start_requests(self):
        """
        Generate the initial Requests that start the spider.
        This is where you define where the crawl should begin.
        """
        for url in self.start_urls:
            yield Request(url, dont_filter=True, callback=self.parse)
    def parse(self, response):
        """
        Run from a Scrapy project (uses the `name` attribute above):
        > scrapy crawl MyScraper

        Default callback for processing downloaded pages;
        define here how to extract data from a response.
        """
        # Extract all text nodes under <body>.
        # Note the space: 'body ::text' selects descendant text nodes,
        # while 'body::text' only selects text that is a direct child of <body>.
        text = ' '.join(response.css('body ::text').getall())

        # Search for patterns that loosely resemble street addresses
        address_pattern = r'\d+\s+\w+\s+\w+'
        addresses = re.findall(address_pattern, text)

        # Yield each match as an item
        for address in addresses:
            yield {
                'address': address,
                'url': response.url,
            }
    def extract_text_without_css(self, response):
        """
        Extract page text while ignoring the contents of <style> and <script> tags.
        """
        # Select text nodes whose parent element is neither <style> nor <script>,
        # so CSS rules and inline JavaScript do not end up in the extracted text.
        text_content = ' '.join(
            response.css('body *:not(style):not(script)::text').getall()
        )
        # Further clean or process the text content here if needed
        return text_content
    def parse_item(self, response):
        """
        Optional extra callback; point a Request at it with
        callback=self.parse_item (commonly used with CrawlSpider rules).
        """
        ...

    def closed(self, reason):
        """
        Called when the spider is closed, allowing you to perform
        cleanup operations or finalize data storage.
        """
        ...

    def make_requests_from_url(self, url):
        """
        Generates a Request object for a given URL.
        (Deprecated in recent Scrapy versions; prefer start_requests().)
        """
        ...

    # NOTE: the hooks below are kept here only as a reference; Scrapy does not
    # call them on a Spider. process_request/process_response belong on a
    # downloader middleware, item_scraped is a signal handler, and
    # close_spider belongs on an item pipeline (spiders use closed() instead).

    def process_request(self, request, spider):
        """
        Downloader-middleware hook that processes outgoing requests,
        e.g. adding headers or handling cookies.
        """
        ...

    def process_response(self, request, response, spider):
        """
        Downloader-middleware hook that processes incoming responses,
        allowing you to modify or filter them.
        """
        ...

    def item_scraped(self, item, response, spider):
        """
        Signal handler called after an item has been scraped,
        allowing you to perform additional per-item actions.
        """
        ...

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        # Default factory used by Scrapy: build the spider and attach the
        # crawler, which gives access to settings, signals and stats.
        spider = cls(*args, **kwargs)
        spider._set_crawler(crawler)
        return spider

    def close_spider(self, spider):
        """
        Item-pipeline hook called at the end of the spider's lifecycle;
        override it on a pipeline to flush buffers or close files.
        """
        ...
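
# A minimal downloader-middleware sketch (an assumption, not part of the
# original gist) showing where process_request/process_response are actually
# invoked. The class name and the settings snippet are illustrative only.
class CustomHeadersMiddleware:

    def process_request(self, request, spider):
        # Add a default header to every outgoing request;
        # returning None lets Scrapy continue processing the request.
        request.headers.setdefault("User-Agent", "MyScraper/1.0")
        return None

    def process_response(self, request, response, spider):
        # Inspect or modify the response; must return a Response (or a Request).
        spider.logger.debug("Got %s for %s", response.status, response.url)
        return response

# Enable it in settings.py, e.g.:
# DOWNLOADER_MIDDLEWARES = {"myproject.middlewares.CustomHeadersMiddleware": 543}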
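
# A minimal item-pipeline sketch (an assumption, not part of the original gist)
# showing where close_spider belongs; the class name and the address
# deduplication are made up for illustration.
class AddressDedupPipeline:

    def open_spider(self, spider):
        # Called when the spider starts; set up resources here.
        self.seen = set()

    def process_item(self, item, spider):
        # Called for every item yielded by the spider.
        self.seen.add(item.get("address"))
        return item

    def close_spider(self, spider):
        # Called at the end of the spider's lifecycle; finalize storage here.
        spider.logger.info("Collected %d unique addresses", len(self.seen))

# Enable it in settings.py, e.g.:
# ITEM_PIPELINES = {"myproject.pipelines.AddressDedupPipeline": 300}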
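
# A minimal usage sketch (an assumption, not part of the original gist) for
# running the spider with `python MyScraper.py` instead of `scrapy runspider`;
# assumes Scrapy 2.1+ for the FEEDS setting.
if __name__ == "__main__":
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(settings={
        # Write items to quotes.jsonl, mirroring `-o quotes.jsonl`
        "FEEDS": {"quotes.jsonl": {"format": "jsonlines"}},
    })
    process.crawl(MyscraperSpider)
    process.start()  # blocks until the crawl finishes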