Scraping the Bihar e-Gazette (egazette.bih.nic.in) with Selenium + ChromeDriver in Python
#!/usr/bin/python3
from selenium import webdriver
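# Note: this script was written against the Selenium 3 API. Selenium 4 deprecated and later
# removed the find_element_by_xpath() helpers used below in favor of find_element(By.XPATH, ...).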
import time
import random
import os
import csv
url = 'http://egazette.bih.nic.in/SearchAdvanceGazette.aspx'
# Launching Chrome and loading the search page
driver = webdriver.Chrome()
driver.get(url)
time.sleep(0.4) # Short pause to make sure the page is finished loading
# Clicking 'Search' button with all fields empty, which returns all records
search_button = driver.find_element_by_xpath('//input[@value="Search"]')
search_button.click()
time.sleep(0.4)
# At this point we're on the first page of search results.
master_metadata_table = []
errors = []
page_number = 1
for i in range(650): # There should be 615 pages of results; overshooting just in case.
    try:
        table_body = driver.find_element_by_xpath('//*[@id="ctl00_ContentPlaceHolder1_DetailView"]/tbody') # Finding the relevant table with an XPath query
        for row_number in range(3, 18): # Looping through the 15 rows that contain PDF links
            row = table_body.find_element_by_xpath("tr[{}]".format(row_number))
            print(row.text)
            row_cells = []
            for cell_number in range(1, 8): # Looping through 7 columns, beginning with index 1
                try: # Try/except here is a precaution. If something goes wrong, we'll record the error and continue the scrape.
                    row_cells.append(row.find_element_by_xpath('td[{}]'.format(cell_number)).text) # Adding cell text to temporary row list
                except Exception as e:
                    print(row.text)
                    print(e)
                    errors.append([e, row.text, page_number]) # Recording the error so we can deal with it later
            master_metadata_table.append(row_cells) # Adding the current row (as a list) to our running metadata table (a list of lists)
            row.find_element_by_xpath('td[1]/a').click() # Clicking the PDF link in cell 1, which will download to ~/Downloads
            time.sleep(1.5 + random.random()) # Waiting between 1.5 and 2.5 seconds before proceeding to the next row (as a courtesy & to avoid triggering a potential rate limit & to avoid hitting Chrome's cap on simultaneous downloads; see the wait_for_downloads() sketch at the end of this script for another approach)
        time.sleep(11 + random.random()) # Waiting between 11 and 12 seconds before we load the next page. No rush.
        page_number += 1 # Incrementing page number variable
        # The JS call below navigates to a given page number. Note that this only works for pages linked from the current page.
        # (We're doing this at the end of the loop because you can't navigate to page 1 from page 1.)
        driver.execute_script("javascript:__doPostBack('ctl00$ContentPlaceHolder1$DetailView','Page${}')".format(page_number))
        time.sleep(0.4) # Short pause to make sure the page is finished loading
    except Exception as e:
        errors.append(e)
        print(e)
        print('Stopped on page: ' + str(page_number)) # In case the browser crashes, we can restart the scrape from here. This would require amending the code to start at page 1 and navigate through pages 2, 3, 4, etc. (without downloading PDFs) until we get to the new start page; see the resume_to_page() sketch just after this loop.
        break # Breaking the loop when we reach the last page (or the browser crashes)
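# The except block above describes restarting a crashed scrape by paging forward, without
# downloading anything, until we reach the page where it stopped. Below is a minimal sketch of
# that idea. The helper name resume_to_page is hypothetical and it is not called anywhere in
# this script; it assumes the driver is sitting on page 1 of fresh search results and that the
# pager postback behaves the same way it does in the main loop.

def resume_to_page(driver, target_page, pause=2):
    """Page forward from page 1 to target_page without downloading any PDFs."""
    for page in range(2, target_page + 1):
        # Same postback call used in the main loop; each page only links to its neighbors,
        # so we step through the intervening pages one at a time.
        driver.execute_script("javascript:__doPostBack('ctl00$ContentPlaceHolder1$DetailView','Page${}')".format(page))
        time.sleep(pause) # Giving each page a moment to load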
print(errors)
# Creating new metadata table with a column that reflects downloaded PDFs' filenames
filename_prefix = 'D__Websites_eGazette_GazettePublished_'
output_metadata_table = [row + [filename_prefix + row[-1]] for row in master_metadata_table]
# Writing metadata table (a list of lists) to CSV file in ~/Downloads
os.chdir(os.path.expanduser('~/Downloads'))
header = ['Gazette No.', 'Published Date', 'Type', 'Page Start', 'Page End', 'Year', 'Filename (listed)', 'Filename (download)']
with open('egazette.bih.nic.in_metadata.csv', 'w') as file_out:
    csv_writer = csv.writer(file_out)
    csv_writer.writerow(header)
    csv_writer.writerows(output_metadata_table)
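# The fixed sleeps in the main loop are a guess at how long each PDF takes to download.
# An alternative (a sketch, not used above) is to poll ~/Downloads for Chrome's temporary
# .crdownload files and wait until none remain before moving on. The function name, glob
# pattern, and timeout below are assumptions for illustration, not anything the site requires.

import glob

def wait_for_downloads(download_dir=os.path.expanduser('~/Downloads'), timeout=60):
    """Block until Chrome has no partial (.crdownload) files left in download_dir."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        if not glob.glob(os.path.join(download_dir, '*.crdownload')):
            return True # No partial downloads left
        time.sleep(0.5)
    return False # Timed out with downloads still in progress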