Scraping the Bihar e-Gazette (egazette.bih.nic.in) with Selenium + ChromeDriver in Python
#!/usr/bin/python3

from selenium import webdriver

import time
import random
import os
import csv

url = 'http://egazette.bih.nic.in/SearchAdvanceGazette.aspx'

# Launching Chrome and loading the search page
driver = webdriver.Chrome()
driver.get(url)
time.sleep(0.4)  # Short pause to make sure the page is finished loading

# Clicking the 'Search' button with all fields empty, which returns all records
search_button = driver.find_element_by_xpath('//input[@value="Search"]')
search_button.click()
time.sleep(0.4)

# At this point we're on the first page of search results.

master_metadata_table = []
errors = []
page_number = 1

for i in range(650):  # There should be 615 pages of results; overshooting just in case.
    try:
        table_body = driver.find_element_by_xpath('//*[@id="ctl00_ContentPlaceHolder1_DetailView"]/tbody')  # Finding the relevant table with an XPath query
        for row_number in range(3, 18):  # Looping through the 15 rows that contain PDF links
            row = table_body.find_element_by_xpath("tr[{}]".format(row_number))
            print(row.text)
            row_cells = []
            for cell_number in range(1, 8):  # Looping through 7 columns, beginning with index 1
                try:  # Try/except here is a precaution. If something goes wrong, we'll record the error and continue the scrape.
                    row_cells.append(row.find_element_by_xpath('td[{}]'.format(cell_number)).text)  # Adding cell text to a temporary row list
                except Exception as e:
                    print(row.text)
                    print(e)
                    errors.append([e, row.text, page_number])  # Recording the error so we can deal with it later
            master_metadata_table.append(row_cells)  # Adding the current row (as a list) to our running metadata table (a list of lists)
            row.find_element_by_xpath('td[1]/a').click()  # Clicking the PDF link in cell 1, which downloads the file to ~/Downloads (see the download-directory note after this listing)
            time.sleep(1.5 + random.random())  # Waiting between 1.5 and 2.5 seconds before proceeding to the next row (as a courtesy, to avoid triggering a potential rate limit, and to avoid hitting Chrome's cap on simultaneous downloads)
        time.sleep(11 + random.random())  # Waiting between 11 and 12 seconds before we load the next page. No rush.
        page_number += 1  # Incrementing the page number variable
        # The JS call below navigates to a given page number. Note that this only works for pages linked from the current page.
        # (We're doing this at the end of the loop because you can't navigate to page 1 from page 1.)
        driver.execute_script("javascript:__doPostBack('ctl00$ContentPlaceHolder1$DetailView','Page${}')".format(page_number))
        time.sleep(0.4)  # Short pause to make sure the page is finished loading
    except Exception as e:
        errors.append(e)
        print(e)
        print('Stopped on page: ' + str(page_number))  # In case the browser crashes, we can restart the scrape from here. This requires starting again at page 1 and paging through 2, 3, 4, etc. (without downloading PDFs) until we reach the new start page; see the resume sketch after this listing.
        break  # Breaking the loop when we reach the last page (or the browser crashes)

print(errors)

# Creating a new metadata table with a column that reflects the downloaded PDFs' filenames
filename_prefix = 'D__Websites_eGazette_GazettePublished_'
output_metadata_table = [row + [filename_prefix + row[-1]] for row in master_metadata_table]

# Writing the metadata table (a list of lists) to a CSV file in ~/Downloads
os.chdir(os.path.expanduser('~/Downloads'))
header = ['Gazette No.', 'Published Date', 'Type', 'Page Start', 'Page End', 'Year', 'Filename (listed)', 'Filename (download)']
with open('egazette.bih.nic.in_metadata.csv', 'w', newline='') as file_out:
    csv_writer = csv.writer(file_out)
    csv_writer.writerow(header)
    csv_writer.writerows(output_metadata_table)
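
Note on download location: the PDF-link clicks above rely on Chrome's default download folder (~/Downloads). If you would rather send the files somewhere specific, one option is to pass download preferences to ChromeDriver when launching the browser. This is a minimal sketch, not part of the original script: the target directory is a placeholder, and download-preference behavior can vary by Chrome version.

from selenium import webdriver

# '/path/to/gazette_pdfs' is a hypothetical placeholder; replace it with an existing absolute path.
options = webdriver.ChromeOptions()
options.add_experimental_option('prefs', {
    'download.default_directory': '/path/to/gazette_pdfs',  # Where Chrome should save clicked PDFs
    'download.prompt_for_download': False,                   # Don't prompt before each download
    'plugins.always_open_pdf_externally': True                # Download PDFs instead of opening them in the built-in viewer
})
driver = webdriver.Chrome(options=options)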
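
Resuming after a crash: the 'Stopped on page' message tells you where the scrape broke off. Because the search always opens on page 1, and __doPostBack can only reach page numbers linked from the current page, resuming means paging forward one page at a time, without downloading anything, until you reach the crash page. A rough sketch of that restart step, reusing the same locators as the script (resume_page is whatever page number was printed when the scrape stopped; 42 below is just an example):

resume_page = 42  # Hypothetical example value

driver = webdriver.Chrome()
driver.get(url)
time.sleep(0.4)
driver.find_element_by_xpath('//input[@value="Search"]').click()
time.sleep(0.4)

# Step through the pager until we reach the page where the scrape stopped,
# without clicking any PDF links along the way.
for page_number in range(2, resume_page + 1):
    driver.execute_script("javascript:__doPostBack('ctl00$ContentPlaceHolder1$DetailView','Page${}')".format(page_number))
    time.sleep(0.4)

# From here the main row-scraping loop above can take over, with page_number initialized to resume_page.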