-
-
Save pohzipohzi/ad7942fc5545675022c1f31123e64c0c to your computer and use it in GitHub Desktop.
from bs4 import BeautifulSoup | |
import requests | |
import datetime | |
import logging | |
import csv | |
def setLogger(): | |
logging.basicConfig(level=logging.INFO, | |
format='%(asctime)s - %(levelname)s - %(message)s', | |
filename='logs_file', | |
filemode='w') | |
console = logging.StreamHandler() | |
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') | |
console.setFormatter(formatter) | |
logging.getLogger('').addHandler(console) | |
def getEconomicCalendar(startlink, endlink):
    """Scrape ForexFactory calendar weeks from startlink to endlink, inclusive.

    Prints one CSV line per event to stdout; rows that fail to parse are
    appended to errors.csv. Follows the "next week" pagination link
    iteratively — the original recursed once per week, which risks
    RecursionError over a multi-year range (~52 frames per year).
    """
    baseURL = "https://www.forexfactory.com/"
    fields = ["date", "time", "currency", "impact", "event", "actual", "forecast", "previous"]
    while True:
        logging.info("Scraping data for link: {}".format(startlink))
        # get the page and make the soup
        r = requests.get(baseURL + startlink)
        soup = BeautifulSoup(r.text, "lxml")
        # get and parse table data, ignoring details and graph
        table = soup.find("table", class_="calendar__table")
        if table is None:
            # Happens when the site blocks the client / serves a JS challenge.
            logging.warning("Table not found on the page. Exiting...")
            return
        # do not use the ".calendar__row--grey" css selector (reserved for historical data)
        trs = table.select("tr.calendar__row.calendar_row")
        curr_year = startlink[-4:]  # links end in e.g. "jan7.2007"
        # some rows do not have a date/time (cells merged) — carry the last seen ones
        curr_date = ""
        curr_time = ""
        for tr in trs:
            # fields may mess up sometimes, see Tue Sep 25 2:45AM French Consumer Spending;
            # in that case we append the date/time where the error is to errors.csv
            try:
                for field in fields:
                    data = tr.select("td.calendar__cell.calendar__{}.{}".format(field, field))[0]
                    text = data.text.strip()
                    if field == "date" and text != "":
                        curr_date = text
                    elif field == "time" and text != "":
                        # time is sometimes "All Day" or "Day X" (eg. WEF Annual Meetings)
                        curr_time = "12:00am" if "Day" in text else text
                    elif field == "currency":
                        currency = text
                    elif field == "impact":
                        # when impact says "Non-Economic" on mouseover, the relevant
                        # class name is "Holiday", thus we do not use the classname
                        impact = data.find("span")["title"]
                    elif field == "event":
                        event = text
                    elif field == "actual":
                        actual = text
                    elif field == "forecast":
                        forecast = text
                    elif field == "previous":
                        previous = text
                dt = datetime.datetime.strptime(",".join([curr_year, curr_date, curr_time]),
                                                "%Y,%a%b %d,%I:%M%p")
                print(",".join([str(dt), currency, impact, event, actual, forecast, previous]))
            except Exception:
                # narrowed from a bare `except:` (which also swallowed KeyboardInterrupt)
                with open("errors.csv", "a") as f:
                    csv.writer(f).writerow([curr_year, curr_date, curr_time])
        # exit when the last available link has been reached
        if startlink == endlink:
            logging.info("Successfully retrieved data")
            return
        # get the link for the next week and follow it
        follow = soup.select("a.calendar__pagination.calendar__pagination--next.next")
        startlink = follow[0]["href"]
if __name__ == "__main__":
    # Run this using the command: python `script_name`.py >> `output_name`.csv
    # (the original put this note in a no-op string literal; a comment is clearer,
    # and the " | |" scrape artifacts made the pasted version invalid Python)
    setLogger()
    getEconomicCalendar("calendar.php?week=jan7.2007", "calendar.php?week=dec24.2017")
Super beginner coder here — any help would be appreciated. I'm trying to run this in Visual Studio, but every time I run it, it returns nothing. Any ideas what might be causing it not to print anything?
from bs4 import BeautifulSoup
from datetime import date, datetime
from typing import List
import urllib.request
import urllib.parse
import ssl
import json
from json import JSONEncoder
from pytz import timezone
class PyEcoElement(object):
    """A single economic-calendar event; all fields are kept as strings."""

    # The pasted version declared `init` (markdown ate the underscores), so
    # Python's default constructor ran instead and no attributes were ever
    # set; `__init__` restores the constructor.
    def __init__(self, currency: str, event: str, impact: str, time_utc: str,
                 actual: str, forecast: str, previous: str):
        self.currency = currency
        self.event = event
        self.impact = impact
        self.time_utc = time_utc
        self.actual = actual
        self.forecast = forecast
        self.previous = previous
class PyEcoRoot(object):
    """Top-level JSON container; serializes as {"eco_elements": [...]}."""

    # `init` -> `__init__` so the constructor actually runs. The annotation
    # uses a forward-reference string so this class no longer requires
    # PyEcoElement to be defined first.
    def __init__(self, eco_elements: List["PyEcoElement"]):
        self.eco_elements = eco_elements
class PyEcoCal:
    """Scrapes one day of the ForexFactory economic calendar into JSON."""

    def GetEconomicCalendar(self, query_date: datetime):
        """Return the calendar events for *query_date* as a pretty-printed JSON string.

        Page times are US/Eastern; they are converted to UTC and formatted
        as "%Y%m%dT%H:%M:%S %z".
        """
        base_url = "https://www.forexfactory.com/"
        # NOTE(review): disables TLS certificate verification process-wide;
        # tolerable for a throwaway scraper, not for shared/production code.
        ssl._create_default_https_context = ssl._create_unverified_context
        # e.g. https://www.forexfactory.com/calendar.php?day=jan7.2021
        urleco = f"{base_url}calendar.php?day={query_date.strftime('%b').lower()}{query_date.day}.{query_date.year}"
        date_string = query_date.strftime('%Y-%m-%d')
        opener = urllib.request.build_opener()
        # urllib's default User-Agent is rejected (503); pretend to be a browser.
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        response = opener.open(urleco)
        result = response.read().decode('utf-8', errors='replace')
        soup = BeautifulSoup(result, "html.parser")
        calendar_rows = soup.find_all("tr", class_="calendar_row")
        # (removed the unused `cal_date` lookup, which itself raised IndexError
        # when the header anchor was absent)
        eastern_tz = timezone('US/Eastern')  # hoisted: constant across rows
        eco_day = []
        for item in calendar_rows:
            row = {}  # renamed from `dict`, which shadowed the builtin
            row["Currency"] = item.find_all(
                "td", {"class": "calendar__cell calendar__currency currency"})[0].text.strip()
            row["Event"] = item.find_all(
                "span", {"class": "calendar__event-title"})[0].text.strip()
            try:
                time_eastern = item.find_all(
                    "td", {"class": "calendar__cell calendar__time time"})[0].div.text.strip()
                datetime_eastern = datetime.strptime(f"{date_string} {time_eastern}", '%Y-%m-%d %I:%M%p')
            except (IndexError, AttributeError, ValueError):
                # "All Day"-style rows carry no parseable clock time: use midnight.
                datetime_eastern = datetime.strptime(f"{date_string} 12:00am", '%Y-%m-%d %I:%M%p')
            # strptime already yields second == 0, so localize directly.
            row["Time_UTC"] = eastern_tz.localize(datetime_eastern).astimezone(
                timezone('utc')).strftime("%Y%m%dT%H:%M:%S %z")
            row["Impact"] = ""  # default so rows without an impact cell still serialize
            for impact_cell in item.find_all("td", {"class": "impact"}):
                row["Impact"] = impact_cell.find_all("span")[0]['title'].split(' ', 1)[0]
            try:
                # .text is always a str, so the original's `is None` else-branch was dead code.
                row["Actual"] = item.find_all(
                    "td", {"class": "calendar__cell calendar__actual actual"})[0].text.strip()
            except IndexError:
                row["Actual"] = ""
            try:
                row["Forecast"] = item.find_all("span", {"class": "calendar-forecast"})[0].text.strip()
            except IndexError:
                row["Forecast"] = ""
            try:
                row["Previous"] = item.find_all("span", {"class": "calendar-previous"})[0].text.strip()
            except IndexError:
                row["Previous"] = ""
            eco_day.append(row)
        events_array = [
            PyEcoElement(r["Currency"], r["Event"], r["Impact"], r["Time_UTC"],
                         r["Actual"], r["Forecast"], r["Previous"])
            for r in eco_day
        ]
        eco_cal = PyEcoRoot(events_array)
        # default= lets json serialize PyEcoElement instances via their __dict__.
        return json.dumps(eco_cal.__dict__, default=lambda o: o.__dict__, indent=3)
if __name__ == "__main__":
    # The paste lost the dunder underscores: `if name == "main"` raises
    # NameError at runtime. Also avoid rebinding the imported `json` module.
    eco = PyEcoCal()
    calendar_json = eco.GetEconomicCalendar(datetime.today())
    print(calendar_json)
Converts times to UTC timestamps and handles the edge case where the actual value is found in a span element.
```python
from bs4 import BeautifulSoup
from datetime import date, datetime
from typing import List
import urllib.request
import urllib.parse
import ssl
import json
from pytz import timezone
class PyEcoElement(object):
    """One row of the economic calendar, stored as plain string fields.

    Instances are serialized through ``__dict__``, so attribute order here
    determines key order in the JSON output.
    """

    def __init__(self, currency: str, event: str, impact: str, time_utc: str,
                 actual: str, forecast: str, previous: str):
        # Store every field verbatim, in declaration order.
        self.currency, self.event, self.impact = currency, event, impact
        self.time_utc = time_utc
        self.actual, self.forecast, self.previous = actual, forecast, previous
class PyEcoRoot(object):
    """Top-level container; serializes to JSON as {"eco_elements": [...]}."""

    def __init__(self, eco_elements : List[PyEcoElement]):
        # Keep a direct reference to the caller's list of events.
        self.eco_elements = eco_elements
class PyEcoCal:
    """Scrapes one day of the ForexFactory economic calendar into JSON."""

    def GetEconomicCalendar(self, query_date: datetime):
        """Return the calendar events for *query_date* as a pretty-printed JSON string.

        Page times are US/Eastern; they are converted to UTC and formatted
        as "%Y%m%dT%H:%M:%S %z".
        """
        base_url = "https://www.forexfactory.com/"
        # NOTE(review): disables TLS certificate verification process-wide;
        # tolerable for a throwaway scraper, not for shared/production code.
        ssl._create_default_https_context = ssl._create_unverified_context
        # e.g. https://www.forexfactory.com/calendar.php?day=jan7.2021
        urleco = f"{base_url}calendar.php?day={query_date.strftime('%b').lower()}{query_date.day}.{query_date.year}"
        date_string = query_date.strftime('%Y-%m-%d')
        opener = urllib.request.build_opener()
        # urllib's default User-Agent is rejected (503); pretend to be a browser.
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        response = opener.open(urleco)
        result = response.read().decode('utf-8', errors='replace')
        soup = BeautifulSoup(result, "html.parser")
        calendar_rows = soup.find_all("tr", class_="calendar_row")
        # (removed the unused `cal_date` lookup, which itself raised IndexError
        # when the header anchor was absent)
        eastern_tz = timezone('US/Eastern')  # hoisted: constant across rows
        eco_day = []
        for item in calendar_rows:
            row = {}  # renamed from `dict`, which shadowed the builtin
            row["Currency"] = item.find_all(
                "td", {"class": "calendar__cell calendar__currency currency"})[0].text.strip()
            row["Event"] = item.find_all(
                "span", {"class": "calendar__event-title"})[0].text.strip()
            try:
                time_eastern = item.find_all(
                    "td", {"class": "calendar__cell calendar__time time"})[0].div.text.strip()
                datetime_eastern = datetime.strptime(f"{date_string} {time_eastern}", '%Y-%m-%d %I:%M%p')
            except (IndexError, AttributeError, ValueError):
                # "All Day"-style rows carry no parseable clock time: use midnight.
                datetime_eastern = datetime.strptime(f"{date_string} 12:00am", '%Y-%m-%d %I:%M%p')
            # strptime already yields second == 0, so localize directly.
            row["Time_UTC"] = eastern_tz.localize(datetime_eastern).astimezone(
                timezone('utc')).strftime("%Y%m%dT%H:%M:%S %z")
            row["Impact"] = ""  # default so rows without an impact cell still serialize
            for impact_cell in item.find_all("td", {"class": "impact"}):
                row["Impact"] = impact_cell.find_all("span")[0]['title'].split(' ', 1)[0]
            try:
                # .text is always a str, so the original's `is None` else-branch was dead code.
                row["Actual"] = item.find_all(
                    "td", {"class": "calendar__cell calendar__actual actual"})[0].text.strip()
            except IndexError:
                row["Actual"] = ""
            try:
                row["Forecast"] = item.find_all("span", {"class": "calendar-forecast"})[0].text.strip()
            except IndexError:
                row["Forecast"] = ""
            try:
                row["Previous"] = item.find_all("span", {"class": "calendar-previous"})[0].text.strip()
            except IndexError:
                row["Previous"] = ""
            eco_day.append(row)
        events_array = [
            PyEcoElement(r["Currency"], r["Event"], r["Impact"], r["Time_UTC"],
                         r["Actual"], r["Forecast"], r["Previous"])
            for r in eco_day
        ]
        eco_cal = PyEcoRoot(events_array)
        # default= lets json serialize PyEcoElement instances via their __dict__.
        return json.dumps(eco_cal.__dict__, default=lambda o: o.__dict__, indent=3)
if __name__ == "__main__":
    # The paste lost the dunder underscores: `if name == "main"` raises
    # NameError at runtime. Also avoid rebinding the imported `json` module.
    eco = PyEcoCal()
    calendar_json = eco.GetEconomicCalendar(datetime.today())
    print(calendar_json)
```
And how can I fetch the events for the next week?
from bs4 import BeautifulSoup from datetime import date, datetime from typing import List import urllib.request import urllib.parse import ssl import json from pytz import timezone
class PyEcoElement(object):
    """A single economic-calendar event; all fields are kept as strings.

    (Reformatted: the pasted version had the whole class collapsed onto one
    line, which is not valid Python.)
    """

    def __init__(self, currency: str, event: str, impact: str, time_utc: str,
                 actual: str, forecast: str, previous: str):
        self.currency = currency
        self.event = event
        self.impact = impact
        self.time_utc = time_utc
        self.actual = actual
        self.forecast = forecast
        self.previous = previous
class PyEcoRoot(object):
    """Top-level JSON container; serializes as {"eco_elements": [...]}.

    (Reformatted from a collapsed one-line paste. The annotation uses a
    forward-reference string so this class does not depend on definition
    order.)
    """

    def __init__(self, eco_elements: List["PyEcoElement"]):
        self.eco_elements = eco_elements
class PyEcoCal:
    """Scrapes one day of the ForexFactory economic calendar into JSON.

    (Reformatted: the pasted version had the whole class collapsed onto a
    couple of unindented lines, which is not valid Python.)
    """

    def GetEconomicCalendar(self, query_date: datetime):
        """Return the calendar events for *query_date* as a pretty-printed JSON string.

        Page times are US/Eastern; they are converted to UTC and formatted
        as "%Y%m%dT%H:%M:%S %z".
        """
        base_url = "https://www.forexfactory.com/"
        # NOTE(review): disables TLS certificate verification process-wide;
        # tolerable for a throwaway scraper, not for shared/production code.
        ssl._create_default_https_context = ssl._create_unverified_context
        # e.g. https://www.forexfactory.com/calendar.php?day=jan7.2021
        urleco = f"{base_url}calendar.php?day={query_date.strftime('%b').lower()}{query_date.day}.{query_date.year}"
        date_string = query_date.strftime('%Y-%m-%d')
        opener = urllib.request.build_opener()
        # urllib's default User-Agent is rejected (503); pretend to be a browser.
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        response = opener.open(urleco)
        result = response.read().decode('utf-8', errors='replace')
        soup = BeautifulSoup(result, "html.parser")
        calendar_rows = soup.find_all("tr", class_="calendar_row")
        eastern_tz = timezone('US/Eastern')  # hoisted: constant across rows
        eco_day = []
        for item in calendar_rows:
            row = {}  # renamed from `dict`, which shadowed the builtin
            row["Currency"] = item.find_all(
                "td", {"class": "calendar__cell calendar__currency currency"})[0].text.strip()
            row["Event"] = item.find_all(
                "span", {"class": "calendar__event-title"})[0].text.strip()
            try:
                time_eastern = item.find_all(
                    "td", {"class": "calendar__cell calendar__time time"})[0].div.text.strip()
                datetime_eastern = datetime.strptime(f"{date_string} {time_eastern}", '%Y-%m-%d %I:%M%p')
            except (IndexError, AttributeError, ValueError):
                # "All Day"-style rows carry no parseable clock time: use midnight.
                datetime_eastern = datetime.strptime(f"{date_string} 12:00am", '%Y-%m-%d %I:%M%p')
            # strptime already yields second == 0, so localize directly.
            row["Time_UTC"] = eastern_tz.localize(datetime_eastern).astimezone(
                timezone('utc')).strftime("%Y%m%dT%H:%M:%S %z")
            row["Impact"] = ""  # default so rows without an impact cell still serialize
            for impact_cell in item.find_all("td", {"class": "impact"}):
                row["Impact"] = impact_cell.find_all("span")[0]['title'].split(' ', 1)[0]
            try:
                # .text is always a str, so the original's `is None` else-branch was dead code.
                row["Actual"] = item.find_all(
                    "td", {"class": "calendar__cell calendar__actual actual"})[0].text.strip()
            except IndexError:
                row["Actual"] = ""
            try:
                row["Forecast"] = item.find_all("span", {"class": "calendar-forecast"})[0].text.strip()
            except IndexError:
                row["Forecast"] = ""
            try:
                row["Previous"] = item.find_all("span", {"class": "calendar-previous"})[0].text.strip()
            except IndexError:
                row["Previous"] = ""
            eco_day.append(row)
        events_array = [
            PyEcoElement(r["Currency"], r["Event"], r["Impact"], r["Time_UTC"],
                         r["Actual"], r["Forecast"], r["Previous"])
            for r in eco_day
        ]
        eco_cal = PyEcoRoot(events_array)
        # default= lets json serialize PyEcoElement instances via their __dict__.
        return json.dumps(eco_cal.__dict__, default=lambda o: o.__dict__, indent=3)
if __name__ == "__main__":
    # Reformatted from a collapsed one-line paste; `if name == "main"` would
    # raise NameError. Also avoid rebinding the imported `json` module.
    eco = PyEcoCal()
    calendar_json = eco.GetEconomicCalendar(datetime.today())
    print(calendar_json)
This works, thanks buddy
PS C:\Users\Jasper> python -u "c:\Users\Jasper\Downloads\Lot Size Calculator\main.py"
2023-07-24 21:32:36,080 - INFO - Scraping data for link: calendar.php?week=jan7.2007
2023-07-24 21:32:36,219 - WARNING - Table not found on the page. Exiting...
Hey guys, discover this API providing access to all ForexFactory data. Unlock valuable insights and enhance your trading strategies efficiently.
Link to api: https://rapidapi.com/ousema.frikha/api/forex-factory-scraper1
Getting a 503 Error.
This is because the User-Agent for Python's urllib is so obviously not a browser. You could always fake the User-Agent, but that's not really good (or moral) practice.
Any reason why you replaced the requests library with urllib?