# This version is just a cleanup of the script I used while developing. It uses functions
# and has an error-handling system. Thought you might find it useful.
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import csv
import time
import os
import sys
import re
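# NOTE: element lookups below use driver.find_element(By.XPATH, ...), which is available
# in Selenium 3.141+ and 4.x; a reasonably recent selenium package is assumed to be installed.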
# Search for SALE UNDER POWER and fetch all result URLs
def Get_URLs():
    try:
        time.sleep(2)
        search = driver.find_element(By.XPATH, """//*[@id="ctl00_ContentPlaceHolder1_as1_txtSearch"]""")
        search.send_keys("SALE UNDER POWER")
        exact_phrase = driver.find_element(By.XPATH, """//*[@id="ctl00_ContentPlaceHolder1_as1_rdoType"]/li[3]/label""")
        exact_phrase.click()
        time.sleep(5)
        plus = driver.find_element(By.XPATH, """//*[@id="ctl00_ContentPlaceHolder1_as1_divDateRange"]/label/a""")
        plus.click()
        date_range = driver.find_element(By.XPATH, """//*[@id="ctl00_ContentPlaceHolder1_as1_rbLastNumWeeks"]""")
        date_range.click()
        last_weeks = driver.find_element(By.XPATH, """//*[@id="ctl00_ContentPlaceHolder1_as1_txtLastNumWeeks"]""")
        last_weeks.clear()
        last_weeks.send_keys(no_last_weeks)
        submit = driver.find_element(By.XPATH, """//*[@id="ctl00_ContentPlaceHolder1_as1_btnGo"]""")
        submit.click()
        time.sleep(5)
        set_50 = driver.find_element(By.XPATH, """//*[@id="ctl00_ContentPlaceHolder1_WSExtendedGridNP1_GridView1_ctl01_ddlPerPage"]/option[7]""")
        set_50.click()
        time.sleep(5)
        total_pages = driver.find_element(By.XPATH, """//*[@id="ctl00_ContentPlaceHolder1_WSExtendedGridNP1_GridView1_ctl01_lblTotalPages"]""").text
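        # The pager label is assumed to read something like "of 12 Pages";
        # stripping the surrounding words leaves just the page count, e.g. "of 12 Pages" -> 12.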
        total_pages = int(total_pages.replace("of ", "").replace(" Pages", "").strip())
        urls = []
        data_urls(urls)
        for _ in range(total_pages - 1):
            next_page = driver.find_element(By.XPATH, """//*[@id="ctl00_ContentPlaceHolder1_WSExtendedGridNP1_GridView1_ctl01_btnNext"]""")
            next_page.click()
            time.sleep(5)
            data_urls(urls)
        return urls
    except Exception as error:
        ErrorHandler(error)
# Scrape each article from the urls list and write it to the .csv file
def Write_Article_Data(file_name: str, urls: list):
    try:
        for url in urls:
            driver.get(url)
            time.sleep(3)
            soup_x = BeautifulSoup(driver.page_source, 'html.parser')
            publication_name = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lblPubName").text).strip()
            publication_url = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lnkPubURL").text).strip()
            publication_city = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lblCity").text).strip()
            publication_state = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lblState").text).strip()
            publication_county = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lblCounty").text).strip()
            notice_keywords = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lblKeywords").text).strip()
            notice_auth_no = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lblNoticeAuthenticationNo").text).strip()
            notice_url = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lnkNoticeURL").text).strip()
            notice_publish_date = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_lblPublicationDAte").text).strip()
            content_text = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_lblContentText").text).replace('\n', '')
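            # The pattern below tries to pull a street address that follows "known as" or "... to as"
            # in the notice body. For a hypothetical notice containing
            # "property known as: 123 Main Street, Atlanta, GA 30303", the third capture group
            # would be "123 Main Street, Atlanta, GA 30303"; real notices vary, hence the "Null" fallback below.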
            address = re.findall(r"(known|to) as(:?) ([+ [\w ,\.\-\/\#']+ [\w \.\-']+ [-\dA-Z]+[0-9])", content_text)
            # If no address was found, fall back to "Null"; otherwise take the third capture group of the first match
            if address == []:
                address = "Null"
            else:
                address = address[0][2]
            with open(f'{file_name}.csv', 'a', newline='', encoding="utf-8") as new_file:
                csv_writer = csv.writer(new_file)
                csv_writer.writerow([publication_name, publication_url, publication_city, publication_state, publication_county, notice_keywords, notice_auth_no, notice_url, notice_publish_date, content_text, address])
        return True
    except Exception as error:
        ErrorHandler(error)
        return False
# Collect the detail-page URL for every record on the current results page
def data_urls(urls):
    try:
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        items = soup.find_all("input", {"title": "Click to open the record."})
        for item in items:
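            # Each record's onclick handler is expected to look like
            #   javascript:location.href='<relative details URL>';return false;
            # so stripping that wrapper leaves a relative path that is joined onto the site root below.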
            url = "https://www.georgiapublicnotice.com/" + item["onclick"].replace("javascript:location.href='", "").replace("';return false;", "")
            urls.append(url)
    except Exception as error:
        ErrorHandler(error)
# Handles any error caught in a try/except block and prints it in a readable way
def ErrorHandler(err):
    # sys.exc_info() returns (type, value, traceback) for the exception currently being handled
    err_type, err_obj, traceback = sys.exc_info()
    print("\n---------------------------------------------------------")
    print("An error occurred! Here is more detail\n")
    print("---------------------------------------------------------")
    print("\nERROR:", err)
    print("Traceback:", traceback, "-- type:", err_type)
    print("Error occurred on line:", traceback.tb_lineno)
    print("---------------------------------------------------------")
    driver.quit()
if __name__ == "__main__":
    # Receive user input
    no_last_weeks = str(input("Number of last weeks: ")).strip()
    file_name = str(input("Name of the output file to be saved: ")).strip()
    # Remove the output file if it already exists
    try:
        os.remove(f"{file_name}.csv")
    except FileNotFoundError:
        pass
    except Exception as error:
        ErrorHandler(error)
    # Start browser
    url = "https://www.georgiapublicnotice.com/"
    driver = webdriver.Chrome()
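    # webdriver.Chrome() assumes a working Chrome/chromedriver setup: with older Selenium releases
    # a matching chromedriver binary must be on PATH, while Selenium 4.6+ can resolve a suitable
    # driver automatically via Selenium Manager.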
    driver.get(url)
    # Create the .csv file (or append to it) and write the header row
    with open(f'{file_name}.csv', 'a', newline='', encoding="utf-8") as new_file:
        csv_writer = csv.writer(new_file)
        csv_writer.writerow(["Publication Name", "Publication Url", "Publication City", "Publication State", "Publication County", "Notice Keywords", "Notice Auth No", "Notice Url", "Notice Publish Date", "Content Text", "Address"])
    # Scrape and write data to the .csv file
    urls = Get_URLs()
    successfulScrape = Write_Article_Data(file_name, urls)
    if successfulScrape:
        print("Scrape was successful!")
    else:
        print("An unexpected error occurred. Check the logs.")
    driver.quit()