darookee · April 27, 2023 15:34
diff --git a/README.md b/README.md
diff --git a/crawler.py b/crawler.py
 import sys
 import requests
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse

 ALL_LINK = set()

 def get_links(url, timeout=10):
     """Fetch *url* and collect its same-domain links and its headings.

     Args:
         url: Absolute URL of the page to fetch.
         timeout: Seconds to wait for the server before giving up.

     Returns:
         A ``(links, headers)`` tuple.

         ``links`` holds the ``href`` values of ``<a>`` tags that use the
         http or https scheme, point at the same host as *url* and are not
         yet in ``ALL_LINK``. Each returned link is added to ``ALL_LINK`` as
         a side effect, so no link is ever returned twice.

         ``headers`` holds ``(level, text)`` tuples for every ``<h1>`` to
         ``<h6>`` tag, grouped by level (all h1 first, then all h2, ...).

         When the page cannot be fetched, is not HTML or cannot be parsed,
         the sentinel ``([], [(None, None)])`` is returned instead.
     """
     try:
         # Without a timeout a single unresponsive server stalls the crawl.
         response = requests.get(url, timeout=timeout)
     except requests.RequestException:
         # Connection errors, timeouts, invalid URLs: same result as a page
         # that has nothing usable on it.
         return [], [(None, None)]

     content_type = response.headers.get('Content-Type')
     if not content_type or 'html' not in content_type:
         return [], [(None, None)]

     try:
         soup = BeautifulSoup(response.content, 'html.parser')
     except Exception:
         # Best effort: an unparsable page is skipped. Unlike a bare
         # ``except:`` this still lets Ctrl-C and SystemExit through.
         return [], [(None, None)]

     # The host of the current page is the same for every anchor on it.
     page_host = urlparse(url).netloc

     links = []
     for a_tag in soup.find_all('a'):
         link = a_tag.get('href')

         # Anchors without an href attribute carry no target.
         if link is None:
             continue

         try:
             parsed_link = urlparse(link)
         except ValueError:
             # urlparse rejects some malformed hrefs (e.g. a broken IPv6
             # host); skip them instead of aborting the whole crawl.
             continue

         # Only absolute http(s) links are followed. Relative hrefs have an
         # empty scheme and are therefore skipped here as well.
         if parsed_link.scheme not in ('http', 'https'):
             continue

         # Stay on the host of the page being crawled.
         if parsed_link.netloc != page_host:
             continue

         # Already handed out by an earlier call.
         if link in ALL_LINK:
             continue

         links.append(link)
         ALL_LINK.add(link)

     headers = []
     for header_level in range(1, 7):
         for header_tag in soup.find_all('h{}'.format(header_level)):
             headers.append((header_level, header_tag.text.strip()))

     return links, headers

 def print_links(url, indent=0):
     """Print the pages reachable from *url* as an indented outline.

     For the page at *url* its headings are printed (as ``#`` markers, one
     per heading level), followed by each not-yet-seen same-domain link. Every
     such link is then crawled recursively, two columns further to the right.

     Args:
         url: Absolute URL of the page to print.
         indent: Number of leading spaces for this level. ``0`` marks the
             start page, whose own URL is printed as the first line.

     Note:
         The recursion depth equals the depth of the link chain being
         followed, and the global ``ALL_LINK`` set is updated while crawling.
     """
     if indent == 0:
         # Mark the start page as seen; otherwise any page that links back
         # to it would make the crawler fetch and print it a second time.
         ALL_LINK.add(url)

     links, headers = get_links(url)

     if indent == 0:
         print('- ' + url)

     pad = ' ' * indent

     for header_level, header_text in headers:
         # (None, None) is the "page had nothing usable" sentinel.
         if header_level is None:
             continue

         # Two extra spaces line the heading up with the text that follows
         # the '- ' bullet of this page's links.
         print(pad + '  ' + '#' * header_level + ' ' + header_text)

     for link in links:
         print(pad + '- ' + link)

         # Descend into the linked page before moving on to the next link.
         print_links(link, indent + 2)



 # Script entry point: expects exactly one argument, the domain to crawl.
 if len(sys.argv) != 2:
     # sys.exit() with a string writes it to stderr and exits with status 1,
     # so callers can detect the misuse. The bare exit() helper comes from
     # the site module and reported success (status 0).
     sys.exit('Usage: crawler.py <domain>')

 # The argument is a bare host name; the https scheme is prepended below.
 domain = sys.argv[1]

 # Crawl the site starting at its home page.
 print_links('https://' + domain)
diff --git a/Dockerfile b/Dockerfile
 # Image that runs crawler.py; the domain to crawl is passed as the
 # container argument, e.g. `docker run <image> example.com`.

 # Use an official Python runtime as a parent image
 FROM python:3.9-slim-buster

 # Set the working directory to /app
 WORKDIR /app

 # Copy the requirements file first so the dependency layer is rebuilt only
 # when requirements.txt changes, not on every edit of the script
 COPY requirements.txt .

 # Install the required Python packages
 RUN pip install --no-cache-dir -r requirements.txt

 # Copy the Python script into the container
 COPY crawler.py .

 # No EXPOSE: the crawler only makes outbound HTTP requests and does not
 # listen on any port.

 # Run the Python script when the container starts; arguments given to
 # `docker run` are appended to this command
 ENTRYPOINT ["python", "crawler.py"]
diff --git a/requirements.txt b/requirements.txt
 # Pinned dependencies for crawler.py.
 # crawler.py imports only `bs4` and `requests` directly; the remaining pins
 # are presumably their transitive dependencies, frozen for reproducible
 # installs -- verify before removing any of them.
 beautifulsoup4==4.12.2
 bs4==0.0.1
 certifi==2022.12.7
 charset-normalizer==3.1.0
 idna==3.4
 requests==2.29.0
 soupsieve==2.4.1
 urllib3==1.26.15
diff --git a/run.sh b/run.sh
 #!/bin/bash
 # Create a local virtualenv, install the pinned dependencies and run the
 # crawler.
 # Usage: ./run.sh <domain>

 # Stop at the first failing step instead of running the crawler against a
 # missing or half-installed environment.
 set -e

 python3 -m venv venv

 ./venv/bin/pip install -r requirements.txt

 # Forward the caller's arguments: crawler.py requires exactly one (<domain>)
 # and only prints its usage message when started without it.
 ./venv/bin/python3 crawler.py "$@"
	import sys
	import requests
	from bs4 import BeautifulSoup
	from urllib.parse import urlparse

	ALL_LINK = set()

	def get_links(url, timeout=10):
	    """Fetch *url* and collect its same-domain links and its headings.

	    Args:
	        url: Absolute URL of the page to fetch.
	        timeout: Seconds to wait for the server before giving up.

	    Returns:
	        A ``(links, headers)`` tuple.

	        ``links`` holds the ``href`` values of ``<a>`` tags that use the
	        http or https scheme, point at the same host as *url* and are not
	        yet in ``ALL_LINK``. Each returned link is added to ``ALL_LINK`` as
	        a side effect, so no link is ever returned twice.

	        ``headers`` holds ``(level, text)`` tuples for every ``<h1>`` to
	        ``<h6>`` tag, grouped by level (all h1 first, then all h2, ...).

	        When the page cannot be fetched, is not HTML or cannot be parsed,
	        the sentinel ``([], [(None, None)])`` is returned instead.
	    """
	    try:
	        # Without a timeout a single unresponsive server stalls the crawl.
	        response = requests.get(url, timeout=timeout)
	    except requests.RequestException:
	        # Connection errors, timeouts, invalid URLs: same result as a page
	        # that has nothing usable on it.
	        return [], [(None, None)]

	    content_type = response.headers.get('Content-Type')
	    if not content_type or 'html' not in content_type:
	        return [], [(None, None)]

	    try:
	        soup = BeautifulSoup(response.content, 'html.parser')
	    except Exception:
	        # Best effort: an unparsable page is skipped. Unlike a bare
	        # ``except:`` this still lets Ctrl-C and SystemExit through.
	        return [], [(None, None)]

	    # The host of the current page is the same for every anchor on it.
	    page_host = urlparse(url).netloc

	    links = []
	    for a_tag in soup.find_all('a'):
	        link = a_tag.get('href')

	        # Anchors without an href attribute carry no target.
	        if link is None:
	            continue

	        try:
	            parsed_link = urlparse(link)
	        except ValueError:
	            # urlparse rejects some malformed hrefs (e.g. a broken IPv6
	            # host); skip them instead of aborting the whole crawl.
	            continue

	        # Only absolute http(s) links are followed. Relative hrefs have an
	        # empty scheme and are therefore skipped here as well.
	        if parsed_link.scheme not in ('http', 'https'):
	            continue

	        # Stay on the host of the page being crawled.
	        if parsed_link.netloc != page_host:
	            continue

	        # Already handed out by an earlier call.
	        if link in ALL_LINK:
	            continue

	        links.append(link)
	        ALL_LINK.add(link)

	    headers = []
	    for header_level in range(1, 7):
	        for header_tag in soup.find_all('h{}'.format(header_level)):
	            headers.append((header_level, header_tag.text.strip()))

	    return links, headers

	def print_links(url, indent=0):
	    """Print the pages reachable from *url* as an indented outline.

	    For the page at *url* its headings are printed (as ``#`` markers, one
	    per heading level), followed by each not-yet-seen same-domain link. Every
	    such link is then crawled recursively, two columns further to the right.

	    Args:
	        url: Absolute URL of the page to print.
	        indent: Number of leading spaces for this level. ``0`` marks the
	            start page, whose own URL is printed as the first line.

	    Note:
	        The recursion depth equals the depth of the link chain being
	        followed, and the global ``ALL_LINK`` set is updated while crawling.
	    """
	    if indent == 0:
	        # Mark the start page as seen; otherwise any page that links back
	        # to it would make the crawler fetch and print it a second time.
	        ALL_LINK.add(url)

	    links, headers = get_links(url)

	    if indent == 0:
	        print('- ' + url)

	    pad = ' ' * indent

	    for header_level, header_text in headers:
	        # (None, None) is the "page had nothing usable" sentinel.
	        if header_level is None:
	            continue

	        # Two extra spaces line the heading up with the text that follows
	        # the '- ' bullet of this page's links.
	        print(pad + '  ' + '#' * header_level + ' ' + header_text)

	    for link in links:
	        print(pad + '- ' + link)

	        # Descend into the linked page before moving on to the next link.
	        print_links(link, indent + 2)



	# Script entry point: expects exactly one argument, the domain to crawl.
	if len(sys.argv) != 2:
	    # sys.exit() with a string writes it to stderr and exits with status 1,
	    # so callers can detect the misuse. The bare exit() helper comes from
	    # the site module and reported success (status 0).
	    sys.exit('Usage: crawler.py <domain>')

	# The argument is a bare host name; the https scheme is prepended below.
	domain = sys.argv[1]

	# Crawl the site starting at its home page.
	print_links('https://' + domain)
	# Image that runs crawler.py; the domain to crawl is passed as the
	# container argument, e.g. `docker run <image> example.com`.

	# Use an official Python runtime as a parent image
	FROM python:3.9-slim-buster

	# Set the working directory to /app
	WORKDIR /app

	# Copy the requirements file first so the dependency layer is rebuilt only
	# when requirements.txt changes, not on every edit of the script
	COPY requirements.txt .

	# Install the required Python packages
	RUN pip install --no-cache-dir -r requirements.txt

	# Copy the Python script into the container
	COPY crawler.py .

	# No EXPOSE: the crawler only makes outbound HTTP requests and does not
	# listen on any port.

	# Run the Python script when the container starts; arguments given to
	# `docker run` are appended to this command
	ENTRYPOINT ["python", "crawler.py"]
	# Pinned dependencies for crawler.py.
	# crawler.py imports only `bs4` and `requests` directly; the remaining pins
	# are presumably their transitive dependencies, frozen for reproducible
	# installs -- verify before removing any of them.
	beautifulsoup4==4.12.2
	bs4==0.0.1
	certifi==2022.12.7
	charset-normalizer==3.1.0
	idna==3.4
	requests==2.29.0
	soupsieve==2.4.1
	urllib3==1.26.15
	#!/bin/bash
	# Create a local virtualenv, install the pinned dependencies and run the
	# crawler.
	# Usage: ./run.sh <domain>

	# Stop at the first failing step instead of running the crawler against a
	# missing or half-installed environment.
	set -e

	python3 -m venv venv

	./venv/bin/pip install -r requirements.txt

	# Forward the caller's arguments: crawler.py requires exactly one (<domain>)
	# and only prints its usage message when started without it.
	./venv/bin/python3 crawler.py "$@"