import csv
import json
import os

import scrapy
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.firefox import GeckoDriverManager


class FloorSheetSpider(scrapy.Spider):
    name = "nepse"

    # custom_settings = {
    #     'DOWNLOADER_MIDDLEWARES': {
    #         'first_scrapy.middlewares.SeleniumMiddleware': 543,
    #         # 'projects_name.path.to.your.pipeline': 543
    #     }
    # }

    def start_requests(self):
        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " \
                     "Chrome/85.0.4183.83 Safari/537.36 "

        # Translation preferences carried over from a Chrome setup; they are
        # never passed to FirefoxOptions below, so they have no effect here.
        prefs = {
            "translate_whitelists": {"np": "en"},
            "translate": {"enabled": "true"}
        }

        # Headless Firefox configured to look like a regular desktop browser.
        options = webdriver.FirefoxOptions()
        options.headless = True
        options.add_argument(f'user-agent={user_agent}')
        options.add_argument("--window-size=1920,1080")
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--allow-running-insecure-content')
        options.add_argument("--disable-extensions")
        options.add_argument("--proxy-server='direct://'")
        options.add_argument("--proxy-bypass-list=*")
        options.add_argument("--start-maximized")
        options.add_argument('--disable-gpu')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--no-sandbox')
        driver = webdriver.Firefox(executable_path=GeckoDriverManager().install(), options=options)
        # driver.get("https://merolagani.com/Floorsheet.aspx")

        # Trading dates can be read from the exported NEPSE price CSV; a single
        # hard-coded date is used further down instead.
        BASE_DIR = os.path.dirname(os.path.abspath(__file__))
        csv_data_path = os.path.join(BASE_DIR,
                                     'nepsealpha_export_price_NEPSE_2016-01-01_2022-01-07.csv')
        file = open(csv_data_path, 'r')
        csvreader = csv.reader(file)
        next(csvreader)  # skip the header row
        # floorsheet_dates = []
        # for row in csvreader:
        #     """
        #     To restrict the dates (e.g. only 2016), work out the number of rows
        #     from the bottom of the CSV and loop over those rows only.
        #     """
        #     new_date = (row[1].replace('-', '/'))
        #     floorsheet_dates.append(new_date)
        # file.close()
        final_floor_sheet = []
        floorsheet_dates = ['2022/01/06']
        for date in floorsheet_dates[::-1]:
            date = date.split('/')
            new_date = f'{date[1]}/{date[2]}/{date[0]}'  # YYYY/MM/DD -> MM/DD/YYYY, the format the site expects
            if date[0] == '2022' and date[1] == '01':
                driver.get("https://merolagani.com/Floorsheet.aspx")
                # Fill in the date filter and trigger the search.
                driver.find_element(By.XPATH, "//input[@name='ctl00$ContentPlaceHolder1$txtFloorsheetDateFilter']"
                                    ).send_keys(new_date)
                driver.find_element(By.XPATH, "(//a[@title='Search'])[3]").click()
                # The last token of the pager label (with its trailing ']' stripped) is the page count.
                total_length = driver.find_element(By.XPATH,
                                                   "//span[@id='ctl00_ContentPlaceHolder1_PagerControl2_litRecords']").text
                z = int((total_length.split()[-1]).replace(']', ''))
                for data in range(1, z):
                    # Click through each result page and hand the rendered HTML to Scrapy selectors.
                    driver.find_element(By.XPATH, "(//a[@title='Page {}'])[1]".format(data)).click()
                    self.body = driver.page_source
                    response = HtmlResponse(url=driver.current_url, body=self.body, encoding='utf-8')
                    if response is not None:
                        for value in response.xpath('//tbody/tr'):
                            floor_sheet = [value.css('td::text').extract()[1], value.css('td a::text').extract()[0],
                                           value.css('td a::text').extract()[1], value.css('td a::text').extract()[2],
                                           value.css('td::text').extract()[9], value.css('td::text').extract()[10]]
                            final_floor_sheet.append(floor_sheet)
        with open('new_floor_sheet.json', 'w', encoding='utf-8') as f:
            json.dump(final_floor_sheet, f, ensure_ascii=False, indent=4)
        driver.quit()
        return []  # all scraping happens above, so no requests are handed to Scrapy's scheduler
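

# Minimal sketch of running this spider as a standalone script (an assumption:
# the file is used outside a full Scrapy project). CrawlerProcess is Scrapy's
# standard entry point for driving a crawl from plain Python.
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
    process.crawl(FloorSheetSpider)
    process.start()  # blocks until the crawl finishes; rows end up in new_floor_sheet.json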