Scraping script for georgiapublicnotice.com: searches for the exact phrase "SALE UNDER POWER" over the last N weeks, collects every matching notice, and writes the details to a CSV file.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import csv
import time
import os
import re

no_last_weeks = str(input("Number of last weeks: ")).strip()
file_name = str(input("Name of the output file to be saved: ")).strip()

# Remove any previous output file with the same name.
try:
    os.remove(f"{file_name}.csv")
except FileNotFoundError:
    pass

def data_urls():
    """Collect the detail-page URL of every notice listed on the current results page."""
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    items = soup.find_all("input", {"title": "Click to open the record."})
    for item in items:
        url = "https://www.georgiapublicnotice.com/" + item["onclick"].replace("javascript:location.href='", "").replace("';return false;", "")
        urls.append(url)

try:
    url = "https://www.georgiapublicnotice.com/"
    driver = webdriver.Chrome()
    driver.get(url)
    time.sleep(2)

    # Search for the exact phrase "SALE UNDER POWER".
    search = driver.find_element_by_xpath("""//*[@id="ctl00_ContentPlaceHolder1_as1_txtSearch"]""")
    search.send_keys("SALE UNDER POWER")
    exact_phrase = driver.find_element_by_xpath("""//*[@id="ctl00_ContentPlaceHolder1_as1_rdoType"]/li[3]/label""")
    exact_phrase.click()
    time.sleep(5)

    # Expand the date-range panel and restrict results to the last N weeks.
    plus = driver.find_element_by_xpath("""//*[@id="ctl00_ContentPlaceHolder1_as1_divDateRange"]/label/a""")
    plus.click()
    date_range = driver.find_element_by_xpath("""//*[@id="ctl00_ContentPlaceHolder1_as1_rbLastNumWeeks"]""")
    date_range.click()
    last_weeks = driver.find_element_by_xpath("""//*[@id="ctl00_ContentPlaceHolder1_as1_txtLastNumWeeks"]""")
    last_weeks.clear()
    last_weeks.send_keys(no_last_weeks)
    submit = driver.find_element_by_xpath("""//*[@id="ctl00_ContentPlaceHolder1_as1_btnGo"]""")
    submit.click()
    time.sleep(5)

    # Show 50 results per page, then read the total number of result pages.
    set_50 = driver.find_element_by_xpath("""//*[@id="ctl00_ContentPlaceHolder1_WSExtendedGridNP1_GridView1_ctl01_ddlPerPage"]/option[7]""")
    set_50.click()
    time.sleep(5)
    total_pages = driver.find_element_by_xpath("""//*[@id="ctl00_ContentPlaceHolder1_WSExtendedGridNP1_GridView1_ctl01_lblTotalPages"]""").text
    total_pages = int((total_pages.replace("of ", "").replace(" Pages", "")).strip())

    # Collect the notice URLs from every results page.
    urls = []
    data_urls()
    for _ in range(total_pages - 1):
        next_page = driver.find_element_by_xpath("""//*[@id="ctl00_ContentPlaceHolder1_WSExtendedGridNP1_GridView1_ctl01_btnNext"]""")
        next_page.click()
        time.sleep(5)
        data_urls()

    # Write the CSV header row.
    with open(f'{file_name}.csv', 'a', newline='') as new_file:
        csv_writer = csv.writer(new_file)
        csv_writer.writerow(["Publication Name", "Publication Url", "Publication City", "Publication State", "Publication County", "Notice Keywords", "Notice Auth No", "Notice Url", "Notice Publish Date", "Content Text", "Address"])

    # Visit each notice detail page and append one CSV row per notice.
    for url in urls:
        driver.get(url)
        time.sleep(3)
        soup_x = BeautifulSoup(driver.page_source, 'html.parser')
        publication_name = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lblPubName").text).strip()
        publication_url = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lnkPubURL").text).strip()
        publication_city = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lblCity").text).strip()
        publication_state = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lblState").text).strip()
        publication_county = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lblCounty").text).strip()
        notice_keywords = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lblKeywords").text).strip()
        notice_auth_no = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lblNoticeAuthenticationNo").text).strip()
        notice_url = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lnkNoticeURL").text).strip()
        notice_publish_date = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_lblPublicationDAte").text).strip()
        content_text = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_lblContentText").text).replace('\n', '')

        # Pull the property address out of the notice text
        # (e.g. "... is known as 123 Example St, Some City, GA 30303 ...").
        address_match = re.findall(r"is known as (\d+ [\w ,\.\-\/\#']+), ([\w \.\-']+), ([A-Z]{2}) ([-\dA-Z]+)", content_text)
        if address_match:
            street, city, state, zip_code = address_match[0]
            address = f"{street}, {city}, {state} {zip_code}"
        else:
            address = ""

        with open(f'{file_name}.csv', 'a', newline='', encoding="utf-8") as new_file:
            csv_writer = csv.writer(new_file)
            csv_writer.writerow([publication_name, publication_url, publication_city, publication_state, publication_county, notice_keywords, notice_auth_no, notice_url, notice_publish_date, content_text, address])

    driver.quit()
except Exception:
    driver.quit()
    print("please try again...")