Last active
January 14, 2022 06:21
-
-
Save nawarazpokhrel/5626eb9998dba7951bad5e2a739036e8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import os | |
import re | |
import time | |
from decimal import Decimal | |
from scrapy import Request, FormRequest | |
from scrapy.http import HtmlResponse | |
from scrapy_selenium import SeleniumRequest | |
from selenium import webdriver | |
import scrapy | |
from selenium.webdriver.common.by import By | |
from sqlalchemy import create_engine | |
from webdriver_manager.firefox import GeckoDriverManager | |
class FloorSheetSpider(scrapy.Spider):
    """Spider that drives a headless Firefox over merolagani.com's floorsheet.

    All of the work happens inside ``start_requests``: the page is loaded
    with Selenium, filtered by date, paged through, and every table row is
    collected into ``new_floor_sheet.json``. No Scrapy requests are issued.
    """

    name = "nepse"

    def start_requests(self):
        """Scrape the floorsheet for each configured date and dump it to JSON.

        Side effects:
            * launches (and always quits) a headless Firefox instance;
            * sets ``self.body`` to the HTML of the last page visited;
            * writes the collected rows to ``new_floor_sheet.json`` in the
              current working directory.

        Returns:
            An empty list. Scrapy requires ``start_requests`` to return an
            iterable of requests; this spider schedules none.

        Raises:
            FileNotFoundError: if the NEPSE export CSV is not located next
                to this module.
        """
        import csv

        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " \
                     "Chrome/85.0.4183.83 Safari/537.36 "
        options = webdriver.FirefoxOptions()
        options.headless = True
        options.add_argument(f'user-agent={user_agent}')
        options.add_argument("--window-size=1920,1080")
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--allow-running-insecure-content')
        options.add_argument("--disable-extensions")
        options.add_argument("--proxy-server='direct://'")
        options.add_argument("--proxy-bypass-list=*")
        options.add_argument("--start-maximized")
        options.add_argument('--disable-gpu')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--no-sandbox')

        # NOTE(review): `executable_path` / `firefox_options` are legacy
        # Selenium keyword arguments -- confirm the pinned Selenium version
        # still accepts them.
        driver = webdriver.Firefox(executable_path=GeckoDriverManager().install(), firefox_options=options)

        final_floor_sheet = []
        try:
            base_dir = os.path.dirname(os.path.abspath(__file__))
            csv_data_path = os.path.join(
                base_dir,
                'nepsealpha_export_price_NEPSE_2016-01-01_2022-01-07.csv',
            )
            # The export is opened and its header skipped, but the rows are
            # not consumed yet: the dates to scrape are hard-coded below.
            # TODO: derive `floorsheet_dates` from column 1 of this CSV.
            with open(csv_data_path, 'r') as export_file:
                next(csv.reader(export_file))

            floorsheet_dates = ['2022/01/06']
            for date in floorsheet_dates[::-1]:
                year, month, day = date.split('/')
                # The date filter is fed the date as MM/DD/YYYY.
                new_date = f'{month}/{day}/{year}'
                if not (year == '2022' and month == '01'):
                    continue

                driver.get("https://merolagani.com/Floorsheet.aspx")
                driver.find_element(
                    By.XPATH,
                    "//input[@name='ctl00$ContentPlaceHolder1$txtFloorsheetDateFilter']",
                ).send_keys(new_date)
                driver.find_element(By.XPATH, "(//a[@title='Search'])[3]").click()

                total_length = driver.find_element(
                    By.XPATH,
                    "//span[@id='ctl00_ContentPlaceHolder1_PagerControl2_litRecords']",
                ).text
                # The last whitespace-separated token of the pager text
                # (minus its closing bracket) is used as the page count.
                total_pages = int(total_length.split()[-1].replace(']', ''))

                # Inclusive upper bound: range(1, total_pages) would skip the
                # final page.
                for page_number in range(1, total_pages + 1):
                    driver.find_element(
                        By.XPATH, "(//a[@title='Page {}'])[1]".format(page_number)
                    ).click()
                    self.body = driver.page_source
                    response = HtmlResponse(url=driver.current_url, body=self.body, encoding='utf-8')

                    for row in response.xpath('//tbody/tr'):
                        # Extract once per row instead of once per field.
                        cell_texts = row.css('td::text').extract()
                        link_texts = row.css('td a::text').extract()
                        if len(cell_texts) < 11 or len(link_texts) < 3:
                            # '//tbody/tr' can match rows that are not
                            # floorsheet records; skip them rather than
                            # abort the whole scrape with an IndexError.
                            self.logger.debug(
                                "Skipping row with %d text cells and %d links",
                                len(cell_texts), len(link_texts),
                            )
                            continue
                        floor_sheet = [
                            cell_texts[1],
                            link_texts[0],
                            link_texts[1],
                            link_texts[2],
                            cell_texts[9],
                            cell_texts[10],
                        ]
                        # Previously the row embedded `final_floor_sheet`
                        # and was never stored, so the dump was always [].
                        final_floor_sheet.append(floor_sheet)
        finally:
            # Always release the browser process, even if scraping fails.
            driver.quit()

        with open('new_floor_sheet.json', 'w', encoding='utf-8') as f:
            json.dump(final_floor_sheet, f, ensure_ascii=False, indent=4)

        # Scrapy iterates the return value; None would raise TypeError.
        return []
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment