Covenant Nation Web Scraper to AWS S3
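This script crawls paginated search results from the Insights for Living e-library for messages by Pastor Poju Oyemade, follows each message card's inline download link to the final media URL, and uploads the resulting MP3 files to an Amazon S3 bucket.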
import os
import re
import time
from urllib.parse import unquote

import boto3
import requests
from botocore.exceptions import NoCredentialsError
from bs4 import BeautifulSoup

s3 = boto3.client('s3')
media_objects_bucket_name = 's3 bucket name'  # Replace with your S3 bucket name
BASE_URL = "https://elibrary.insightsforliving.org/categories/4f8b4b73-0e68-46c2-a7fb-0828a50cc6ad?page={page_num}&q=Pastor%20Poju%20Oyemade"
page_nums = range(2, 10)


def extract_message_url(base_string):
    """Extract the first single-quoted substring from an inline onclick
    handler string; this is the relative download URI of the message."""
    match = re.search(r"'([^']+)'", base_string)
    if match:
        return match.group(1)
    print("No match found.")
    return None


def upload_message_to_s3(file_content, filename):
    """Upload the downloaded bytes to the target S3 bucket under the given key."""
    s3_key = filename
    try:
        s3.put_object(Body=file_content, Bucket=media_objects_bucket_name, Key=s3_key)
        print(f"File uploaded to S3 successfully: s3://{media_objects_bucket_name}/{s3_key}")
    except NoCredentialsError:
        print("AWS credentials not found. Make sure you have configured your AWS credentials.")


def clean_string(string):
    """Strip any character that is not alphanumeric, '.', '_', or '-'."""
    return re.sub(r'[^0-9a-zA-Z._-]', '', string)


def download_message_from_url(message_url):
    """Resolve redirects to the final media URL, download the MP3,
    and upload it to S3 under a sanitized filename."""
    response = requests.head(message_url, allow_redirects=True)
    final_url = response.url
    # Derive a filename from the final URL, dropping any query string
    filename = unquote(os.path.basename(final_url).split('?')[0]).replace(' ', '-')
    mp3_response = requests.get(final_url)
    upload_message_to_s3(mp3_response.content, clean_string(filename))
    print(f"Downloaded {filename} successfully!")


for i in page_nums:
    URL = BASE_URL.format(page_num=i)
    # Send a GET request to the listing page
    response = requests.get(URL)
    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')
    # Each message card is rendered as a Bootstrap column div
    messages = soup.find_all('div', class_='col-lg-3 col-6 mb-4')
    for message in messages:
        link_tag = message.find('div', onclick=True)
        if link_tag is None:
            continue
        download_uri = extract_message_url(link_tag.get('onclick'))
        if download_uri is None:
            continue
        full_download_url = f'https://elibrary.insightsforliving.org{download_uri}'
        download_message_from_url(full_download_url)
        # Be polite to the server between downloads
        time.sleep(2)
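As a quick sanity check after a run, you can list what landed in the bucket with the same boto3 client. This is a minimal sketch, assuming the placeholder bucket name above and AWS credentials already configured:

# Sanity check (sketch): list the uploaded objects after a run.
import boto3

s3 = boto3.client('s3')
listing = s3.list_objects_v2(Bucket='s3 bucket name')  # replace with your bucket name
for obj in listing.get('Contents', []):
    print(obj['Key'], obj['Size'])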