Scrapy boilerplate
import re
import scrapy
from scrapy.http import Request, Response


class MyscraperSpider(scrapy.Spider):
    """
    > scrapy runspider MyScraper.py -o quotes.jsonl
    """
    name = "MyScraper"  # spider name
    allowed_domains = [
        "kljdevelopers.com",
    ]
    start_urls = [
        "https://kljdevelopers.com/",
    ]

    def start_requests(self):
        """
        Initial requests to start the spider.
        It's used to define where the spider should begin crawling.
        """
        for url in self.start_urls:
            yield Request(url, dont_filter=True, callback=self.parse)

    def parse(self, response):
        """
        > scrapy crawl MyScraper
        Default callback for processing downloaded pages;
        define how to extract data from each page here.
        """
        # Extract visible text from the page (use "body ::text" so descendant
        # text nodes are included, not just text directly under <body>)
        text = ' '.join(response.css('body ::text').getall())

        # Search for patterns that resemble street addresses.
        # Note (illustration, not from the original gist): this regex matches
        # strings such as "12 Baker Street" (a number followed by two words),
        # so it is only a rough heuristic.
        address_pattern = r'\d+\s+\w+\s+\w+'
        addresses = re.findall(address_pattern, text)

        # Yield the addresses as items
        for address in addresses:
            yield {
                'address': address,
                'url': response.url,
            }

    def extract_text_without_css(self, response):
        """
        Extract visible text while ignoring CSS and JavaScript.
        """
        # Select text nodes from elements that are not <style> or <script>,
        # so stylesheet and script contents are skipped
        text_content = " ".join(
            response.css('body *:not(style):not(script)::text').getall()
        )
        # Further cleaning or normalisation can be applied here if needed
        return text_content.strip()
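
    # Example usage (a sketch, not part of the original gist): the helper above
    # can replace the inline text extraction in parse(), assuming it is called
    # on the same response object:
    #
    #   text = self.extract_text_without_css(response)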

    def parse_item(self, response):
        # Optional additional callback for item pages (placeholder)
        ...

    def closed(self, reason):
        """
        Called when the spider is closed,
        allowing you to perform cleanup or finalise data storage.
        """
        ...

    def make_requests_from_url(self, url):
        """
        Generates a Request object for a given URL.
        (Deprecated in recent Scrapy releases; prefer start_requests().)
        """
        ...

    def process_request(self, request, spider):
        """
        Downloader-middleware hook that processes outgoing requests,
        e.g. adding headers or handling cookies.
        (Normally defined on a middleware class; see the sketch after this spider.)
        """
        ...

    def process_response(self, request, response, spider):
        """
        Downloader-middleware hook that processes incoming responses,
        allowing you to modify or filter them.
        """
        ...

    def item_scraped(self, item, response, spider):
        """
        Signal handler called after an item has been scraped,
        allowing you to perform additional actions per item.
        """
        ...

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = cls(*args, **kwargs)
        spider._set_crawler(crawler)
        return spider
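
    # A minimal sketch (an assumption, not in the original gist) of wiring the
    # item_scraped handler above to Scrapy's signal bus inside from_crawler;
    # scrapy.signals and crawler.signals.connect are standard Scrapy APIs:
    #
    #   from scrapy import signals
    #
    #   @classmethod
    #   def from_crawler(cls, crawler, *args, **kwargs):
    #       spider = super().from_crawler(crawler, *args, **kwargs)
    #       crawler.signals.connect(spider.item_scraped, signal=signals.item_scraped)
    #       return spider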

    def close_spider(self, spider):
        """
        Pipeline-style hook called at the end of the spider's lifecycle;
        override it to perform actions when the spider is closed.
        (On the spider itself, use closed() instead.)
        """
        ...
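

# The process_request / process_response and close_spider placeholders above are,
# in a real Scrapy project, implemented on downloader-middleware and item-pipeline
# classes rather than on the spider itself. A minimal sketch under that assumption
# (the class names and module paths below are illustrative; DOWNLOADER_MIDDLEWARES
# and ITEM_PIPELINES are standard Scrapy settings):
class MyScraperDownloaderMiddleware:
    """
    Enable via settings, e.g.:
        DOWNLOADER_MIDDLEWARES = {"myproject.middlewares.MyScraperDownloaderMiddleware": 543}
    """

    def process_request(self, request, spider):
        # Example: add a default header to every outgoing request
        request.headers.setdefault("User-Agent", "MyScraper/1.0")
        return None  # returning None lets Scrapy continue processing the request

    def process_response(self, request, response, spider):
        # Inspect or modify the response before it reaches the spider
        return response


class MyScraperPipeline:
    """
    Enable via settings, e.g.:
        ITEM_PIPELINES = {"myproject.pipelines.MyScraperPipeline": 300}
    """

    def open_spider(self, spider):
        self.items = []

    def process_item(self, item, spider):
        self.items.append(item)
        return item

    def close_spider(self, spider):
        spider.logger.info("Collected %d items", len(self.items))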