wiljdaws · November 6, 2023 07:55
diff --git a/endpoint.py b/endpoint.py
 import scrapy
 from tabulate import tabulate
 import os

 class EndpointSpider(scrapy.Spider):
    """Crawl a site, record the links it exposes, and scrape its HTML tables.

    Every distinct ``href`` encountered is appended to
    ``<base>/endpoints.txt``. Links whose resolved URL contains ``base`` are
    crawled in turn, and each table data row is yielded as a dict item.
    """

    name = 'endpoint-spider'
    base = 'wikipedia' # <-------- base of url you want to crawl
    start_urls = [f'https://www.{base}.com']
    # CSS selectors for the elements whose <tr> rows are scraped.
    TABLE_SELECTORS = ('table', 'div.data-table')

    def __init__(self, *args, **kwargs):
        """Set up the set of seen links and the path of the output file."""
        super().__init__(*args, **kwargs)
        self.visited_links = set()
        self.links_file = os.path.join(self.base, 'endpoints.txt')

    def save_link_to_file(self, link):
        """Append ``link`` to the links file, one link per line."""
        # Explicit UTF-8: hrefs may contain non-ASCII characters that the
        # platform default encoding cannot represent.
        with open(self.links_file, 'a', encoding='utf-8') as f:
            f.write(link + '\n')

    def start_requests(self):
        """Create the output directory and file, then request each start URL."""
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(self.base, exist_ok=True)
        # Append mode creates 'endpoints.txt' when it is missing and leaves
        # the contents of an existing file untouched.
        with open(self.links_file, 'a', encoding='utf-8'):
            pass

        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        """Record the page's links, follow on-site ones, and yield table rows.

        Yields:
            Requests for the links to crawl next, and one dict per table
            data row found on the page.
        """
        for link in response.css('a::attr(href)').getall():
            if link not in self.visited_links:
                self.visited_links.add(link)
                self.save_link_to_file(link)

            # Resolve against the page URL first so that relative and
            # protocol-relative hrefs are crawled too.
            try:
                url = response.urljoin(link)
            except ValueError:
                # Malformed href (e.g. a broken IPv6 host); nothing to follow.
                continue
            # Skips non-HTTP schemes such as mailto: and javascript:.
            if url.startswith(('http://', 'https://')) and self.base in url:
                yield response.follow(url, self.parse)

        for selector in self.TABLE_SELECTORS:
            for table in response.css(selector):
                yield from self._table_rows(table)

    def _table_rows(self, table):
        """Yield one dict per data row of ``table``.

        Keys come from the most recent row made only of ``<th>`` cells. A
        column falls back to ``column<N>`` when it has no header text or
        when its header text is already used by an earlier column of the
        same row. Empty cells map to ``None``.
        """
        headers = []
        for row in table.css('tr'):
            cells = row.css('td, th')
            if not cells:
                continue
            # normalize-space(.) takes the whole text of the cell, including
            # text nested in child elements, trimmed of surrounding blanks.
            texts = [cell.xpath('normalize-space(.)').get() for cell in cells]
            if not row.css('td'):
                # A row without any <td> labels the columns that follow it.
                headers = texts
                continue

            data = {}
            for index, text in enumerate(texts):
                header = headers[index] if index < len(headers) else None
                if not header or header in data:
                    key = f'column{index + 1}'
                else:
                    key = header
                data[key] = text or None
            yield data

 '''
  If you keep the file name endpoint.py, run the following command to start the spider:
  scrapy runspider endpoint.py
 '''
	import scrapy
	from tabulate import tabulate
	import os

	class EndpointSpider(scrapy.Spider):
	    """Crawl a site, record the links it exposes, and scrape its HTML tables.

	    Every distinct ``href`` encountered is appended to
	    ``<base>/endpoints.txt``. Links whose resolved URL contains ``base`` are
	    crawled in turn, and each table data row is yielded as a dict item.
	    """

	    name = 'endpoint-spider'
	    base = 'wikipedia' # <-------- base of url you want to crawl
	    start_urls = [f'https://www.{base}.com']
	    # CSS selectors for the elements whose <tr> rows are scraped.
	    TABLE_SELECTORS = ('table', 'div.data-table')

	    def __init__(self, *args, **kwargs):
	        """Set up the set of seen links and the path of the output file."""
	        super().__init__(*args, **kwargs)
	        self.visited_links = set()
	        self.links_file = os.path.join(self.base, 'endpoints.txt')

	    def save_link_to_file(self, link):
	        """Append ``link`` to the links file, one link per line."""
	        # Explicit UTF-8: hrefs may contain non-ASCII characters that the
	        # platform default encoding cannot represent.
	        with open(self.links_file, 'a', encoding='utf-8') as f:
	            f.write(link + '\n')

	    def start_requests(self):
	        """Create the output directory and file, then request each start URL."""
	        # exist_ok avoids the check-then-create race of exists() + mkdir().
	        os.makedirs(self.base, exist_ok=True)
	        # Append mode creates 'endpoints.txt' when it is missing and leaves
	        # the contents of an existing file untouched.
	        with open(self.links_file, 'a', encoding='utf-8'):
	            pass

	        for url in self.start_urls:
	            yield scrapy.Request(url, callback=self.parse)

	    def parse(self, response):
	        """Record the page's links, follow on-site ones, and yield table rows.

	        Yields:
	            Requests for the links to crawl next, and one dict per table
	            data row found on the page.
	        """
	        for link in response.css('a::attr(href)').getall():
	            if link not in self.visited_links:
	                self.visited_links.add(link)
	                self.save_link_to_file(link)

	            # Resolve against the page URL first so that relative and
	            # protocol-relative hrefs are crawled too.
	            try:
	                url = response.urljoin(link)
	            except ValueError:
	                # Malformed href (e.g. a broken IPv6 host); nothing to follow.
	                continue
	            # Skips non-HTTP schemes such as mailto: and javascript:.
	            if url.startswith(('http://', 'https://')) and self.base in url:
	                yield response.follow(url, self.parse)

	        for selector in self.TABLE_SELECTORS:
	            for table in response.css(selector):
	                yield from self._table_rows(table)

	    def _table_rows(self, table):
	        """Yield one dict per data row of ``table``.

	        Keys come from the most recent row made only of ``<th>`` cells. A
	        column falls back to ``column<N>`` when it has no header text or
	        when its header text is already used by an earlier column of the
	        same row. Empty cells map to ``None``.
	        """
	        headers = []
	        for row in table.css('tr'):
	            cells = row.css('td, th')
	            if not cells:
	                continue
	            # normalize-space(.) takes the whole text of the cell, including
	            # text nested in child elements, trimmed of surrounding blanks.
	            texts = [cell.xpath('normalize-space(.)').get() for cell in cells]
	            if not row.css('td'):
	                # A row without any <td> labels the columns that follow it.
	                headers = texts
	                continue

	            data = {}
	            for index, text in enumerate(texts):
	                header = headers[index] if index < len(headers) else None
	                if not header or header in data:
	                    key = f'column{index + 1}'
	                else:
	                    key = header
	                data[key] = text or None
	            yield data

	'''
	If you keep the file name endpoint.py, run the following command to start the spider:
	scrapy runspider endpoint.py
	'''