@MCMXCIII
Last active February 7, 2021 19:16
Scraping script
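# Scrapes "SALE UNDER POWER" (foreclosure) public notices from
# https://www.georgiapublicnotice.com/ for the last N weeks and writes the
# details of each notice to <file_name>.csv.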
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import csv
import time
import os
import re
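# Ask how many weeks back to search and what to name the output CSV.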
no_last_weeks = input("Number of last weeks: ").strip()
file_name = input("Name of the output file to be saved: ").strip()

# Delete any previous output file so new rows are not appended to stale data.
try:
    os.remove(f"{file_name}.csv")
except FileNotFoundError:
    pass
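
# Collect the detail-page URL of every notice listed on the current results
# page; each result row stores its target inside an onclick handler.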
def data_urls():
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    items = soup.find_all("input", {"title": "Click to open the record."})
    for item in items:
        url = "https://www.georgiapublicnotice.com/" + item["onclick"].replace("javascript:location.href='", "").replace("';return false;", "")
        urls.append(url)
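
# Drive the search UI, page through the results, then scrape each notice.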
try:
url = "https://www.georgiapublicnotice.com/"
driver = webdriver.Chrome()
driver.get(url)
time.sleep(2)
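    # Enter the search phrase and select the "exact phrase" search option.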
    search = driver.find_element_by_xpath("""//*[@id="ctl00_ContentPlaceHolder1_as1_txtSearch"]""")
    search.send_keys("SALE UNDER POWER")
    exact_phrase = driver.find_element_by_xpath("""//*[@id="ctl00_ContentPlaceHolder1_as1_rdoType"]/li[3]/label""")
    exact_phrase.click()
    time.sleep(5)
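    # Expand the date-range panel and restrict results to the last N weeks.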
    plus = driver.find_element_by_xpath("""//*[@id="ctl00_ContentPlaceHolder1_as1_divDateRange"]/label/a""")
    plus.click()
    data_range = driver.find_element_by_xpath("""//*[@id="ctl00_ContentPlaceHolder1_as1_rbLastNumWeeks"]""")
    data_range.click()
    last_weeks = driver.find_element_by_xpath("""//*[@id="ctl00_ContentPlaceHolder1_as1_txtLastNumWeeks"]""")
    last_weeks.clear()
    last_weeks.send_keys(no_last_weeks)
    submit = driver.find_element_by_xpath("""//*[@id="ctl00_ContentPlaceHolder1_as1_btnGo"]""")
    submit.click()
    time.sleep(5)
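    # Show 50 results per page, then read how many result pages there are.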
    set_50 = driver.find_element_by_xpath("""//*[@id="ctl00_ContentPlaceHolder1_WSExtendedGridNP1_GridView1_ctl01_ddlPerPage"]/option[7]""")
    set_50.click()
    time.sleep(5)
    total_pages = driver.find_element_by_xpath("""//*[@id="ctl00_ContentPlaceHolder1_WSExtendedGridNP1_GridView1_ctl01_lblTotalPages"]""").text
    total_pages = int(total_pages.replace("of ", "").replace(" Pages", "").strip())
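    # Gather notice detail URLs from the first page, then from every next page.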
    urls = []
    data_urls()
    for _ in range(total_pages - 1):
        next_page = driver.find_element_by_xpath("""//*[@id="ctl00_ContentPlaceHolder1_WSExtendedGridNP1_GridView1_ctl01_btnNext"]""")
        next_page.click()
        time.sleep(5)
        data_urls()
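    # Write the CSV header once; one row per notice is appended below.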
    with open(f'{file_name}.csv', 'a', newline='') as new_file:
        csv_writer = csv.writer(new_file)
        csv_writer.writerow(["Publication Name", "Publication Url", "Publication City", "Publication State", "Publication County", "Notice Keywords", "Notice Auth No", "Notice Url", "Notice Publish Date", "Content Text", "Address"])
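    # Visit each notice's detail page and extract its fields.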
    for url in urls:
        driver.get(url)
        time.sleep(3)
        soup_x = BeautifulSoup(driver.page_source, 'html.parser')
        publication_name = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lblPubName").text).strip()
        publication_url = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lnkPubURL").text).strip()
        publication_city = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lblCity").text).strip()
        publication_state = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lblState").text).strip()
        publication_county = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lblCounty").text).strip()
        notice_keywords = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lblKeywords").text).strip()
        notice_auth_no = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lblNoticeAuthenticationNo").text).strip()
        notice_url = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lnkNoticeURL").text).strip()
        notice_publish_date = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_lblPublicationDAte").text).strip()
        content_text = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_lblContentText").text).replace('\n', '')
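        # The property address in the notice body follows the phrase
        # "is known as <street>, <city>, <ST> <zip>".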
        match = re.search(r"is known as (\d+ [\w ,\.\-\/\#']+), ([\w \.\-']+), ([A-Z]{2}) ([-\dA-Z]+)", content_text)
        address = ", ".join(match.groups()) if match else ""
        with open(f'{file_name}.csv', 'a', newline='', encoding="utf-8") as new_file:
            csv_writer = csv.writer(new_file)
            csv_writer.writerow([publication_name, publication_url, publication_city, publication_state, publication_county, notice_keywords, notice_auth_no, notice_url, notice_publish_date, content_text, address])
    driver.quit()
except Exception:
    driver.quit()
    print("please try again...")