Created
March 10, 2021 13:54
-
-
Save charanjit-singh/44b792fd92c4ab3f709422589a7f08a9 to your computer and use it in GitHub Desktop.
Script to scrape RERA information from the Punjab RERA website (using multithreading)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup | |
import codecs | |
import threading | |
# Load the locally saved listing page. The context manager guarantees the
# file handle is closed even if reading fails.
with codecs.open("RERA_PUNJAB_LIST.html", "r") as f:
    page = f.read()
soup = BeautifulSoup(page, 'html.parser')

# One dict per agent row of the listing table; consumed by the CSV export
# and by the detail-scraping threads further down.
BROKERS = []
for tr in soup.find_all("tr"):
    tds = tr.find_all("td")
    anchors = tr.find_all("a")
    inputs = tr.find_all("input")
    # Rows without five data cells, an anchor and an input cannot be agent
    # rows (e.g. a header row built from <th> cells). Skip them instead of
    # crashing with IndexError.
    if len(tds) < 5 or not anchors or not inputs:
        continue
    # The id of the first anchor tells whether the agent was registered
    # offline; the first input's value carries the agent id.
    type_ = anchors[0].get('id')
    agent_id = inputs[0].get("value")
    BROKERS.append({
        "Sr. No.": tds[0].get_text().strip(),
        "Name": tds[1].get_text().strip(),
        "District": tds[2].get_text().strip(),
        "RERA No.": tds[3].get_text().strip(),
        # Stripped like the other text columns so the CSV has no stray
        # whitespace around the date.
        "Registration Valid Upto": tds[4].get_text().strip(),
        "Offline": 1 if type_ == 'modalOpenerOfflineRegisteredButton' else 0,
        "Agent ID": agent_id,
    })
import csv | |
# Dump the basic listing (one row per broker) to CSV.
csv_file = "PUNJAB_RERA_LIST.csv"
if BROKERS:
    try:
        # newline='' is required by the csv module to avoid blank rows on
        # Windows; an explicit UTF-8 encoding keeps non-ASCII names from
        # raising UnicodeEncodeError on platforms with a narrow default.
        with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=list(BROKERS[0].keys()))
            writer.writeheader()
            writer.writerows(BROKERS)
    except IOError:
        print("I/O error")
else:
    # Without at least one broker there is no header to derive; the original
    # code would have failed with IndexError on BROKERS[0].
    print("No brokers parsed; skipping", csv_file)
import re | |
import time | |
class OnlineThread(threading.Thread):
    """Thread that enriches every online-registered broker with its detail page.

    ``run`` takes the entries of the module-level ``BROKERS`` list whose
    ``"Offline"`` flag is 0, downloads each agent's detail page, copies the
    label/value pairs found there into the broker dict (in place) and finally
    writes all successfully processed brokers to
    ``ONLINE_PUNJAB_RERA_LIST.csv``. Brokers whose page could not be fetched
    are left out of the CSV and printed at the end.
    """

    DETAILS_URL = "https://rera.punjab.gov.in/reraindex/PublicView/AgentViewDetails?inAgent_ID="
    # Seconds to wait for the server; without a timeout a stalled connection
    # would block the thread forever.
    REQUEST_TIMEOUT = 30
    # Seconds to pause after a failed request before moving on.
    FAILURE_PAUSE = 5

    def __init__(self, name,):
        threading.Thread.__init__(self)
        self.name = name

    @staticmethod
    def _normalize_text(text):
        """Return *text* stripped, with line breaks and space runs collapsed."""
        return re.sub(' +', ' ', text.strip().replace("\r", " ").replace("\n", " "))

    @classmethod
    def _detail_pairs(cls, tds):
        """Return ``(label, value)`` tuples from a flat list of ``<td>`` tags.

        A cell carrying the CSS class ``single-detail`` is a label; the cell
        immediately after it holds the value.
        """
        pairs = []
        for current_td, next_td in zip(tds, tds[1:]):
            if "single-detail" in (current_td.get("class") or []):
                label = current_td.get_text().strip()
                pairs.append((label, cls._normalize_text(next_td.get_text())))
        return pairs

    def run(self):
        """Scrape all online-registered brokers and write them to CSV."""
        if not BROKERS:
            # BROKERS[0] below would raise IndexError on an empty listing.
            print("No brokers parsed; nothing to do for online brokers")
            return

        ONLINE_BROKERS = []
        # Start from the listing columns; detail labels are appended as seen.
        ONLINE_KEYS = list(BROKERS[0].keys())
        ONLINE_BROKERS_RAW = [broker for broker in BROKERS if broker['Offline'] == 0]
        ONLINE_BROKERS_RAW_CNT = len(ONLINE_BROKERS_RAW)
        FAILED_ONLINES = []

        for counter, broker in enumerate(ONLINE_BROKERS_RAW, start=1):
            print("Processing Online Broker:", counter, '/', ONLINE_BROKERS_RAW_CNT)
            AGENT_ID = broker['Agent ID']
            if not AGENT_ID:
                # Concatenating None into the URL would raise TypeError and
                # kill the thread before anything is written.
                FAILED_ONLINES.append(broker)
                print("Failed Online Broker", broker, "Reason:", "missing Agent ID")
                continue
            URL = self.DETAILS_URL + AGENT_ID
            try:
                broker_page = requests.get(URL, timeout=self.REQUEST_TIMEOUT)
            except Exception as e:
                # Deliberately broad: one bad request must not abort the run
                # and lose everything scraped so far.
                FAILED_ONLINES.append(broker)
                print("Failed Online Broker", broker, "Reason:", str(e))
                time.sleep(self.FAILURE_PAUSE)
                continue

            broker_soup = BeautifulSoup(broker_page.content, "html.parser")
            for key, value in self._detail_pairs(broker_soup.find_all("td")):
                if key not in ONLINE_KEYS:
                    ONLINE_KEYS.append(key)
                broker[key] = value
            ONLINE_BROKERS.append(broker)

        csv_file = "ONLINE_PUNJAB_RERA_LIST.csv"
        try:
            # newline='' as required by the csv module; UTF-8 so non-ASCII
            # values cannot raise UnicodeEncodeError.
            with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=ONLINE_KEYS)
                writer.writeheader()
                writer.writerows(ONLINE_BROKERS)
        except IOError:
            print("I/O error")
        print("FAILED ONLINE BROKERS", FAILED_ONLINES)
class OfflineThread(threading.Thread):
    """Thread that enriches every offline-registered broker with its detail page.

    ``run`` takes the entries of the module-level ``BROKERS`` list whose
    ``"Offline"`` flag is 1, downloads each agent's offline-registration
    detail page, copies the label/value pairs found there into the broker
    dict (in place) and finally writes all successfully processed brokers to
    ``OFFLINE_PUNJAB_RERA_LIST.csv``. Brokers whose page could not be fetched
    are left out of the CSV and printed at the end.
    """

    DETAILS_URL = "https://rera.punjab.gov.in/reraindex/PublicView/AgentViewOfflineRegisteredDetails?inAgent_ID="
    # Seconds to wait for the server; without a timeout a stalled connection
    # would block the thread forever.
    REQUEST_TIMEOUT = 30
    # Seconds to pause after a failed request before moving on.
    FAILURE_PAUSE = 5

    def __init__(self, name,):
        threading.Thread.__init__(self)
        self.name = name

    @staticmethod
    def _normalize_text(text):
        """Return *text* stripped, with line breaks and space runs collapsed."""
        return re.sub(' +', ' ', text.strip().replace("\r", " ").replace("\n", " "))

    @classmethod
    def _detail_pairs(cls, tds):
        """Return ``(label, value)`` tuples from a flat list of ``<td>`` tags.

        A cell carrying the CSS class ``single-detail`` is a label; the cell
        immediately after it holds the value.
        """
        pairs = []
        for current_td, next_td in zip(tds, tds[1:]):
            if "single-detail" in (current_td.get("class") or []):
                label = current_td.get_text().strip()
                pairs.append((label, cls._normalize_text(next_td.get_text())))
        return pairs

    def run(self):
        """Scrape all offline-registered brokers and write them to CSV."""
        if not BROKERS:
            # BROKERS[0] below would raise IndexError on an empty listing.
            print("No brokers parsed; nothing to do for offline brokers")
            return

        OFFLINE_BROKERS = []
        # Start from the listing columns; detail labels are appended as seen.
        OFFLINE_KEYS = list(BROKERS[0].keys())
        OFFLINE_BROKERS_RAW = [broker for broker in BROKERS if broker['Offline'] == 1]
        OFFLINE_BROKERS_RAW_CNT = len(OFFLINE_BROKERS_RAW)
        FAILED_OFFLINES = []

        for counter, broker in enumerate(OFFLINE_BROKERS_RAW, start=1):
            AGENT_ID = broker['Agent ID']
            if not AGENT_ID:
                # Concatenating None into the URL would raise TypeError and
                # kill the thread before anything is written.
                print("Failed offline Broker", broker, "Reason:", "missing Agent ID")
                FAILED_OFFLINES.append(broker)
                continue
            URL = self.DETAILS_URL + AGENT_ID
            print("Processing Offline Broker:", counter, '/', OFFLINE_BROKERS_RAW_CNT, "URL:", URL)
            try:
                broker_page = requests.get(URL, timeout=self.REQUEST_TIMEOUT)
            except Exception as e:
                # Deliberately broad: one bad request must not abort the run
                # and lose everything scraped so far.
                print("Failed offline Broker", broker, "Reason:", str(e))
                FAILED_OFFLINES.append(broker)
                # Back off briefly before the next request, matching the
                # online scraper's behaviour.
                time.sleep(self.FAILURE_PAUSE)
                continue

            broker_soup = BeautifulSoup(broker_page.content, "html.parser")
            for key, value in self._detail_pairs(broker_soup.find_all("td")):
                if key not in OFFLINE_KEYS:
                    OFFLINE_KEYS.append(key)
                broker[key] = value
            OFFLINE_BROKERS.append(broker)

        csv_file = "OFFLINE_PUNJAB_RERA_LIST.csv"
        try:
            # newline='' as required by the csv module; UTF-8 so non-ASCII
            # values cannot raise UnicodeEncodeError.
            with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=OFFLINE_KEYS)
                writer.writeheader()
                writer.writerows(OFFLINE_BROKERS)
        except IOError:
            print("I/O error")
        print("FAILED OFFLINE BROKERS", FAILED_OFFLINES)
# Build both scraper threads, then launch them so the online and offline
# detail pages are fetched concurrently.
onlineThread, offlineThread = OnlineThread("Online"), OfflineThread("Offline")
onlineThread.start()
offlineThread.start()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment