A weather data scraper I made for Wunderground.com, using XPaths and dealing with asynchronously loading page elements.
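The crux of the script is the explicit-wait pattern: Wunderground builds its observation table client-side, so the scraper has to block until the element actually exists before reading it. Here is a minimal standalone sketch of that pattern, assuming chromedriver is discoverable on your PATH (the URL is just an illustrative example, not the script's exact target):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Chrome()  # assumes chromedriver is on PATH
try:
    driver.get("https://www.wunderground.com/history/daily/ZSSS/date/2018-01-01")
    # Poll the DOM (up to 60 s) until at least one <table> is attached,
    # instead of sleeping a fixed interval and hoping the JS has finished.
    WebDriverWait(driver, 60).until(
        EC.presence_of_element_located((By.TAG_NAME, "table"))
    )
    print(driver.find_element(By.TAG_NAME, "table").text)
finally:
    driver.quit()

The full gist follows the same wait-then-read flow, adding date iteration, resume logic, and CSV output.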
import csv
import datetime
import os

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
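# ZSSS is the ICAO code for Shanghai Hongqiao International Airport;
# Wunderground serves per-station daily history pages under this path.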
BASE_URL = "https://www.wunderground.com/history/daily/ZSSS/date/"
CITY = "Shanghai"

options = Options()
options.add_argument("--window-size=3200,1800")
options.add_argument("--disable-gpu")
options.add_argument("--start-maximized")
options.add_argument("--headless")
chrome_path = os.path.join(os.getcwd(), "path_to_chromedriver\\chromedriver.exe")
header = [
    'Date',
    'Time',
    'Temperature',
    'Dew Point',
    'Humidity',
    'Wind',
    'Wind Speed',
    'Wind Gust',
    'Pressure',
    'Precip.',
    'Precip Accum',
    'Condition'
]
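# 'Date' is prepended to the columns of Wunderground's observation table;
# the <td> cells of each scraped row map onto header[1:] below.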
def daterange(start_date, end_date):
    '''Yields each date from start_date up to, but not including, end_date.'''
    for n in range(int((end_date - start_date).days)):
        yield start_date + datetime.timedelta(n)
def scrape_weather_data(csv_writer):
    '''Scrapes the daily observation table for each date between the
    module-level START and END, skipping dates already present in the
    output CSV, and writes each table row through csv_writer.'''
    table_xpath = r'//*[@id="inner-content"]/div[2]/div[3]/div/div[1]/div/div/city-history-observation'
    timeout = 60
    # Build the set of dates already scraped so a rerun can resume.
    df = pd.read_csv('Shanghai_weather.csv', usecols=['Date'])
    dates_acquired = pd.to_datetime(df.Date).dt.date.unique()
    del df  # not needed once the acquired dates are extracted
    for d in daterange(START, END):
        try:
            if d in dates_acquired:
                continue
            d = d.strftime('%Y-%m-%d')
            city_url = BASE_URL + f"{d}/req_city={CITY}&req_statename=China"
            driver.get(city_url)
            # The observation table loads asynchronously; block until it exists.
            element_present = EC.presence_of_element_located((By.TAG_NAME, 'table'))
            WebDriverWait(driver, timeout).until(element_present)
            table = driver.find_element_by_xpath(table_xpath)
            for i, row in enumerate(table.find_elements_by_tag_name("tr")):
                if i == 0:
                    continue  # skip the table's header row
                observations = {'Date': d}
                for j, datum in enumerate(row.find_elements_by_tag_name("td")):
                    observations[header[j + 1]] = datum.text  # header[0] is 'Date'
                csv_writer.writerow(observations)
        except TimeoutException:
            print(f"Timed out waiting for page to load on day {d}")
            missed_dates.append(d)
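# Note: the script both appends to Shanghai_weather.csv and reads it back
# inside scrape_weather_data, so the file is assumed to already exist with
# a header row that includes a 'Date' column.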
with open('Shanghai_weather.csv', 'a', newline='') as f:
    driver = webdriver.Chrome(executable_path=chrome_path, options=options)
    driver.maximize_window()
    START = datetime.date(2017, 1, 1)
    END = datetime.date(2018, 12, 31)
    # missed_dates is appended to by the scrape_weather_data function;
    # here it is simply declared just in time for the function call
    missed_dates = []
    writer = csv.DictWriter(f, fieldnames=header)
    scrape_weather_data(writer)
driver.quit()
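One loose end: missed_dates is collected but never consumed. A hedged sketch of a retry pass (my addition, not part of the original gist) that would slot in just before driver.quit() — since scrape_weather_data() skips any date already present in the CSV, a second call retries exactly the days that timed out:

# Hypothetical retry pass (not in the original gist); place before driver.quit().
if missed_dates:
    print(f"Retrying {len(missed_dates)} dates that timed out...")
    missed_dates = []  # reset so the list ends up holding only repeat failures
    with open('Shanghai_weather.csv', 'a', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=header)
        scrape_weather_data(writer)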