```python
from bs4 import BeautifulSoup
import requests
import datetime
import logging
import csv

def setLogger():
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(levelname)s - %(message)s',
                        filename='logs_file',
                        filemode='w')
    console = logging.StreamHandler()
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    console.setFormatter(formatter)
    logging.getLogger('').addHandler(console)

def getEconomicCalendar(startlink, endlink):
    # write current status to the console
    logging.info("Scraping data for link: {}".format(startlink))

    # get the page and make the soup
    baseURL = "https://www.forexfactory.com/"
    r = requests.get(baseURL + startlink)
    data = r.text
    soup = BeautifulSoup(data, "lxml")

    # get and parse table data, ignoring details and graph
    table = soup.find("table", class_="calendar__table")

    # do not use the ".calendar__row--grey" css selector (reserved for historical data)
    trs = table.select("tr.calendar__row.calendar_row")
    fields = ["date", "time", "currency", "impact", "event", "actual", "forecast", "previous"]

    # some rows do not have a date (cells merged)
    curr_year = startlink[-4:]
    curr_date = ""
    curr_time = ""
    for tr in trs:
        # fields may mess up sometimes, see Tue Sep 25 2:45AM French Consumer Spending;
        # in that case we append the date and time of the error to errors.csv
        try:
            for field in fields:
                data = tr.select("td.calendar__cell.calendar__{}.{}".format(field, field))[0]
                if field == "date" and data.text.strip() != "":
                    curr_date = data.text.strip()
                elif field == "time" and data.text.strip() != "":
                    # time is sometimes "All Day" or "Day X" (e.g. WEF Annual Meetings)
                    if data.text.strip().find("Day") != -1:
                        curr_time = "12:00am"
                    else:
                        curr_time = data.text.strip()
                elif field == "currency":
                    currency = data.text.strip()
                elif field == "impact":
                    # when impact says "Non-Economic" on mouseover, the relevant
                    # class name is "Holiday", thus we do not use the class name
                    impact = data.find("span")["title"]
                elif field == "event":
                    event = data.text.strip()
                elif field == "actual":
                    actual = data.text.strip()
                elif field == "forecast":
                    forecast = data.text.strip()
                elif field == "previous":
                    previous = data.text.strip()
            dt = datetime.datetime.strptime(",".join([curr_year, curr_date, curr_time]),
                                            "%Y,%a%b %d,%I:%M%p")
            print(",".join([str(dt), currency, impact, event, actual, forecast, previous]))
        except:
            with open("errors.csv", "a") as f:
                csv.writer(f).writerow([curr_year, curr_date, curr_time])

    # exit the recursion when the last available link has been reached
    if startlink == endlink:
        logging.info("Successfully retrieved data")
        return

    # get the link for the next week and follow it
    follow = soup.select("a.calendar__pagination.calendar__pagination--next.next")
    follow = follow[0]["href"]
    getEconomicCalendar(follow, endlink)

if __name__ == "__main__":
    """
    Run this using the command "python `script_name`.py >> `output_name`.csv"
    """
    setLogger()
    getEconomicCalendar("calendar.php?week=jan7.2007", "calendar.php?week=dec24.2017")
```
@successfulmike the script works by following the link for the next week via the `a.calendar__pagination.calendar__pagination--next.next` tag, which refers to the right arrow at the top. Thus, if your start page begins on a Monday, you need an end link that also begins on a Monday. For example, `"calendar.php?week=jan7.2019","calendar.php?week=jan28.2019"` would work.
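For reference, here is a minimal sketch (not part of the gist; the `week_link` helper name is mine) of how to generate Monday-aligned week links from a date:

```python
import datetime

def week_link(d: datetime.date) -> str:
    # Snap the date back to the Monday of its week, then format it the way
    # the calendar URLs above expect, e.g. "calendar.php?week=jan7.2019".
    monday = d - datetime.timedelta(days=d.weekday())
    return "calendar.php?week={}{}.{}".format(
        monday.strftime("%b").lower(), monday.day, monday.year)

print(week_link(datetime.date(2019, 1, 9)))   # calendar.php?week=jan7.2019
print(week_link(datetime.date(2019, 1, 28)))  # calendar.php?week=jan28.2019
```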
Be careful with the variable `curr_year`: it is quite possible that a given week overlaps two years, in which case the `dt` will be incorrect, I believe. To illustrate the problem, please run:

`getEconomicCalendar("calendar.php?week=dec31.2018","calendar.php?week=dec31.2018")`
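A possible guard (just a sketch, not part of the gist) would be to watch for the month rolling over from December to January while iterating and bump `curr_year` accordingly:

```python
# Illustrative year-rollover guard; assumes date strings match the "%a%b %d"
# format the script parses (e.g. "MonDec 31"), and uses made-up sample data.
curr_year = "2018"
prev_month = None
for curr_date in ["SunDec 30", "MonDec 31", "TueJan 1"]:
    month = curr_date[3:6]  # "Dec", "Dec", "Jan"
    if prev_month == "Dec" and month == "Jan":
        curr_year = str(int(curr_year) + 1)  # the week crossed into a new year
    prev_month = month
    print(curr_year, curr_date)
```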
Thank you!
Amazing job, congrats!! It was running for a while, and now it reports an error at line 32: "AttributeError: 'NoneType' object has no attribute 'select'". Does someone know how to fix it?
Kind regards,
[email protected]
Having the same problem. I found out that the request returns 'Service Temporarily Unavailable'. That means the website no longer allows script access, i.e., anything that is not a real browser. As a result, `soup` is empty and the line `table = soup.find("table", class_="calendar__table")` returns nothing, because there is no table on the HTML page. Any idea how to work around this problem?
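Until the blocking itself is solved, a defensive check at least avoids the AttributeError (a sketch; the `parse_calendar` helper is illustrative, not part of the gist):

```python
import logging
from bs4 import BeautifulSoup

def parse_calendar(html):
    # Guard against error pages (e.g. a 503) that carry no calendar table,
    # instead of crashing on table.select(...) with an AttributeError.
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("table", class_="calendar__table")
    if table is None:
        logging.warning("Table not found on the page. Exiting...")
        return []
    return table.select("tr.calendar__row.calendar_row")
```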
In the meantime, I found a solution; see the code below. I am not finished yet, though, because I still have to solve the loop problem with the date:
```python
from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
import datetime
import logging
import csv
from calendar import timegm

def setLogger():
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(levelname)s - %(message)s',
                        filename='logs_file',
                        filemode='w')
    console = logging.StreamHandler()
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    console.setFormatter(formatter)
    logging.getLogger('').addHandler(console)

def getEconomicCalendar(startlink, endlink):
    # write current status to the console
    logging.info("Scraping data for link: {}".format(startlink))
    baseURL = "https://www.forexfactory.com/"

    # get the page and make the soup
    ecolink = baseURL + startlink
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    response = opener.open(ecolink)
    result = response.read().decode('utf-8', errors='replace')
    soup = BeautifulSoup(result, "html.parser")

    # get and parse table data, ignoring details and graph
    table = soup.find("table", class_="calendar__table")

    # do not use the ".calendar__row--grey" css selector (reserved for historical data)
    trs = table.select("tr.calendar__row.calendar_row")
    fields = ["date", "time", "currency", "impact", "event", "actual", "forecast", "previous"]

    # some rows do not have a date (cells merged)
    curr_year = startlink[-4:]
    curr_date = ""
    curr_time = ""
    for tr in trs:
        # fields may mess up sometimes, see Tue Sep 25 2:45AM French Consumer Spending;
        # in that case we append the date and time of the error to errors.csv
        try:
            for field in fields:
                data = tr.select("td.calendar__cell.calendar__{}.{}".format(field, field))[0]
                if field == "date" and data.text.strip() != "":
                    curr_date = data.text.strip()
                elif field == "time" and data.text.strip() != "":
                    # time is sometimes "All Day" or "Day X" (e.g. WEF Annual Meetings)
                    if data.text.strip().find("Day") != -1:
                        curr_time = "12:00am"
                    else:
                        curr_time = data.text.strip()
                elif field == "currency":
                    currency = data.text.strip()
                elif field == "impact":
                    # when impact says "Non-Economic" on mouseover, the relevant
                    # class name is "Holiday", thus we do not use the class name
                    impact = data.find("span")["title"]
                elif field == "event":
                    event = data.text.strip()
                elif field == "actual":
                    actual = data.text.strip()
                elif field == "forecast":
                    forecast = data.text.strip()
                elif field == "previous":
                    previous = data.text.strip()
            dt = datetime.datetime.strptime(",".join([curr_year, curr_date, curr_time]),
                                            "%Y,%a%b %d,%I:%M%p")
            print(",".join([str(dt), currency, impact, event, actual, forecast, previous]))
        except:
            with open("errors.csv", "a") as f:
                csv.writer(f).writerow([curr_year, curr_date, curr_time])

    # exit the recursion when the last available link has been reached
    if startlink == endlink:
        logging.info("Successfully retrieved data")
        return

    # get the link for the next week and follow it
    follow = soup.select("a.calendar__pagination.calendar__pagination--next.next")
    follow = follow[0]["href"]
    getEconomicCalendar(follow, endlink)

if __name__ == "__main__":
    """
    Run this using the command "python `script_name`.py >> `output_name`.csv"
    """
    setLogger()
    getEconomicCalendar("calendar.php?week=jan11.2021", "calendar.php?week=jan12.2021")
```
Please share :-)
```python
from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
import pandas
import logging
import ssl
import json
from json import JSONEncoder

class ComplexEncoder(json.JSONEncoder):
    def default(self, obj):
        return obj.__dict__

class PyEcoRoot:
    def __init__(self, currency, eco_element):
        self.currency = currency
        self.eco_element = eco_element

class PyEcoElement:
    def __init__(self, currency, event, impact, time_eastern, actual, forecast, previous):
        self.currency = currency
        self.event = event
        self.impact = impact
        self.time_eastern = time_eastern
        self.actual = actual
        self.forecast = forecast
        self.previous = previous

class PyEcoCal:
    def __init__(self, p1=1):
        self.p1 = p1

    def GetEconomicCalendar(self, date):
        baseURL = "https://www.forexfactory.com/"
        ssl._create_default_https_context = ssl._create_unverified_context
        # ctx = ssl.create_default_context()
        # ctx.check_hostname = False
        # ctx.verify_mode = ssl.CERT_NONE
        # html = urllib.request.urlopen(url, context=ctx).read()
        # get the page and make the soup
        urleco = baseURL + date
        opener = urllib.request.build_opener()
        # opener = urllib.request.build_opener(urllib.request.HTTPSHandler(context=ctx))
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        response = opener.open(urleco)
        result = response.read().decode('utf-8', errors='replace')
        soup = BeautifulSoup(result, "html.parser")
        table = soup.find_all("tr", class_="calendar_row")
        ecoday = []
        for item in table:
            dict = {}
            dict["Currency"] = item.find_all("td", {"class": "calendar__currency"})[0].text.strip()  # Currency
            dict["Event"] = item.find_all("td", {"class": "calendar__event"})[0].text.strip()  # Event Name
            dict["Time_Eastern"] = item.find_all("td", {"class": "calendar__time"})[0].text  # Time Eastern
            impact = item.find_all("td", {"class": "impact"})
            for icon in range(0, len(impact)):
                dict["Impact"] = impact[icon].find_all("span")[0]['title'].split(' ', 1)[0]
            dict["Actual"] = item.find_all("td", {"class": "calendar__actual"})[0].text  # Actual Value
            dict["Forecast"] = item.find_all("td", {"class": "calendar__forecast"})[0].text  # Forecast Value
            dict["Previous"] = item.find_all("td", {"class": "calendar__previous"})[0].text  # Previous
            ecoday.append(dict)
        ecoDict = []
        for item in ecoday:
            rec = ComplexEncoder()
            ecoelem = PyEcoElement(
                item["Currency"],
                item["Event"],
                item["Impact"],
                item["Time_Eastern"],
                item["Actual"],
                item["Forecast"],
                item["Previous"]
            )
            rec.ecoobject = ecoelem
            ecoDict.append(rec)
        json_object = json.dumps(ComplexEncoder().encode(ecoDict), indent=3)
        return json_object

eco = PyEcoCal()
json = eco.GetEconomicCalendar("calendar.php?day=today")
print(json)
```
Dear Hiuwe,
Thanks a lot for sharing your useful program.
Getting a 503 Error.
This is because the User-Agent for Python's urllib is so obviously not a browser. You could always fake the User-Agent, but that's not really good (or moral) practice.
Any reason why you replaced the requests library with urllib?
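For what it's worth, requests can send the same browser-style header, so switching libraries should not be strictly necessary. A sketch, assuming the User-Agent header is all the site checks (it may block bots in other ways too):

```python
import requests

# Fake a browser User-Agent with requests instead of urllib (illustrative
# only; assumes the header is what the site checks).
headers = {"User-Agent": "Mozilla/5.0"}
r = requests.get("https://www.forexfactory.com/calendar.php?day=today",
                 headers=headers)
r.raise_for_status()  # surface a 503 instead of parsing an error page
print(r.text[:200])
```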
Super-beginner coder here; any help would be appreciated. I'm trying to run this in Visual Studio, but every time I run it, it returns empty. Any ideas why it might not print anything?
```python
from bs4 import BeautifulSoup
from datetime import date, datetime
from typing import List
import urllib.request
import urllib.parse
import ssl
import json
from json import JSONEncoder
from pytz import timezone

class PyEcoElement(object):
    def __init__(self, currency: str, event: str, impact: str, time_utc: str, actual: str, forecast: str, previous: str):
        self.currency = currency
        self.event = event
        self.impact = impact
        self.time_utc = time_utc
        self.actual = actual
        self.forecast = forecast
        self.previous = previous

class PyEcoRoot(object):
    def __init__(self, eco_elements: List[PyEcoElement]):
        self.eco_elements = eco_elements

class PyEcoCal:
    def GetEconomicCalendar(self, query_date: datetime):
        base_url = "https://www.forexfactory.com/"
        ssl._create_default_https_context = ssl._create_unverified_context
        # ctx = ssl.create_default_context()
        # ctx.check_hostname = False
        # ctx.verify_mode = ssl.CERT_NONE
        # html = urllib.request.urlopen(url, context=ctx).read()
        # get the page and make the soup
        urleco = f"{base_url}calendar.php?day={query_date.strftime('%b').lower()}{query_date.day}.{query_date.year}"
        date_string = query_date.strftime('%Y-%m-%d')
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        response = opener.open(urleco)
        result = response.read().decode('utf-8', errors='replace')
        soup = BeautifulSoup(result, "html.parser")
        table = soup.find_all("tr", class_="calendar_row")
        cal_date = soup.find_all("a", {"class": "highlight light options flexTitle"})[0].span.text.strip()
        eco_day = []
        for item in table:
            dict = {}
            dict["Currency"] = item.find_all("td",
                {"class": "calendar__cell calendar__currency currency"})[0].text.strip()  # Currency
            dict["Event"] = item.find_all("span",
                {"class": "calendar__event-title"})[0].text.strip()  # Event Name
            try:
                time_eastern = item.find_all("td",
                    {"class": "calendar__cell calendar__time time"})[0].div.text.strip()  # Time Eastern
                datetime_eastern = datetime.strptime(f"{date_string} {time_eastern}", '%Y-%m-%d %I:%M%p')
            except:
                datetime_eastern = datetime.strptime(f"{date_string} 12:00am", '%Y-%m-%d %I:%M%p')
            eastern_tz = timezone('US/Eastern')
            dict["Time_UTC"] = eastern_tz.localize(datetime(datetime_eastern.year, datetime_eastern.month,
                datetime_eastern.day, datetime_eastern.hour,
                datetime_eastern.minute, 0)).astimezone(timezone('utc')).strftime("%Y%m%dT%H:%M:%S %z")
            impact = item.find_all("td", {"class": "impact"})
            for icon in range(0, len(impact)):
                dict["Impact"] = impact[icon].find_all("span")[0]['title'].split(' ', 1)[0]
            try:
                # dict["Actual"] = item.find_all("td", {"class": "calendar__cell calendar__actual actual"})[0].span.text.strip()  # Actual Value
                actual_value = item.find_all("td", {"class": "calendar__cell calendar__actual actual"})[0].text
                if actual_value is not None:
                    dict["Actual"] = actual_value.strip()
                else:
                    dict["Actual"] = item.find_all("td",
                        {"class": "calendar__cell calendar__actual actual"})[0].span.text.strip()  # Actual Value
            except:
                dict["Actual"] = ""
            try:
                dict["Forecast"] = item.find_all("span", {"class": "calendar-forecast"})[0].text.strip()  # Forecast Value
            except:
                dict["Forecast"] = ""
            try:
                dict["Previous"] = item.find_all("span", {"class": "calendar-previous"})[0].text.strip()  # Previous
            except:
                dict["Previous"] = ""
            eco_day.append(dict)
        events_array = []
        for row_dict in eco_day:
            eco_elem = PyEcoElement(
                row_dict["Currency"],
                row_dict["Event"],
                row_dict["Impact"],
                row_dict["Time_UTC"],
                row_dict["Actual"],
                row_dict["Forecast"],
                row_dict["Previous"]
            )
            events_array.append(eco_elem)
        eco_cal = PyEcoRoot(events_array)
        json_object = json.dumps(eco_cal.__dict__, default=lambda o: o.__dict__, indent=3)
        return json_object

if __name__ == "__main__":
    eco = PyEcoCal()
    json = eco.GetEconomicCalendar(datetime.today())
    print(json)
```
Converts times to UTC timestamps and handles the edge case where the actual value is found in a span element.
```python
from bs4 import BeautifulSoup
from datetime import date, datetime
from typing import List
import urllib.request
import urllib.parse
import ssl
import json
from pytz import timezone

class PyEcoElement(object):
    def __init__(self, currency: str, event: str, impact: str, time_utc: str, actual: str, forecast: str, previous: str):
        self.currency = currency
        self.event = event
        self.impact = impact
        self.time_utc = time_utc
        self.actual = actual
        self.forecast = forecast
        self.previous = previous

class PyEcoRoot(object):
    def __init__(self, eco_elements: List[PyEcoElement]):
        self.eco_elements = eco_elements

class PyEcoCal:
    def GetEconomicCalendar(self, query_date: datetime):
        base_url = "https://www.forexfactory.com/"
        ssl._create_default_https_context = ssl._create_unverified_context
        # ctx = ssl.create_default_context()
        # ctx.check_hostname = False
        # ctx.verify_mode = ssl.CERT_NONE
        # html = urllib.request.urlopen(url, context=ctx).read()
        # get the page and make the soup
        urleco = f"{base_url}calendar.php?day={query_date.strftime('%b').lower()}{query_date.day}.{query_date.year}"
        date_string = query_date.strftime('%Y-%m-%d')
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        response = opener.open(urleco)
        result = response.read().decode('utf-8', errors='replace')
        soup = BeautifulSoup(result, "html.parser")
        table = soup.find_all("tr", class_="calendar_row")
        cal_date = soup.find_all("a", {"class": "highlight light options flexTitle"})[0].span.text.strip()
        eco_day = []
        for item in table:
            dict = {}
            dict["Currency"] = item.find_all("td",
                {"class": "calendar__cell calendar__currency currency"})[0].text.strip()  # Currency
            dict["Event"] = item.find_all("span",
                {"class": "calendar__event-title"})[0].text.strip()  # Event Name
            try:
                time_eastern = item.find_all("td",
                    {"class": "calendar__cell calendar__time time"})[0].div.text.strip()  # Time Eastern
                datetime_eastern = datetime.strptime(f"{date_string} {time_eastern}", '%Y-%m-%d %I:%M%p')
            except:
                datetime_eastern = datetime.strptime(f"{date_string} 12:00am", '%Y-%m-%d %I:%M%p')
            eastern_tz = timezone('US/Eastern')
            dict["Time_UTC"] = eastern_tz.localize(datetime(datetime_eastern.year, datetime_eastern.month,
                datetime_eastern.day, datetime_eastern.hour,
                datetime_eastern.minute, 0)).astimezone(timezone('utc')).strftime("%Y%m%dT%H:%M:%S %z")
            impact = item.find_all("td", {"class": "impact"})
            for icon in range(0, len(impact)):
                dict["Impact"] = impact[icon].find_all("span")[0]['title'].split(' ', 1)[0]
            try:
                actual_value = item.find_all("td", {"class": "calendar__cell calendar__actual actual"})[0].text
                if actual_value is not None:
                    dict["Actual"] = actual_value.strip()
                else:
                    dict["Actual"] = item.find_all("td",
                        {"class": "calendar__cell calendar__actual actual"})[0].span.text.strip()  # Actual Value
            except:
                dict["Actual"] = ""
            try:
                dict["Forecast"] = item.find_all("span", {"class": "calendar-forecast"})[0].text.strip()  # Forecast Value
            except:
                dict["Forecast"] = ""
            try:
                dict["Previous"] = item.find_all("span", {"class": "calendar-previous"})[0].text.strip()  # Previous
            except:
                dict["Previous"] = ""
            eco_day.append(dict)
        events_array = []
        for row_dict in eco_day:
            eco_elem = PyEcoElement(
                row_dict["Currency"],
                row_dict["Event"],
                row_dict["Impact"],
                row_dict["Time_UTC"],
                row_dict["Actual"],
                row_dict["Forecast"],
                row_dict["Previous"]
            )
            events_array.append(eco_elem)
        eco_cal = PyEcoRoot(events_array)
        json_object = json.dumps(eco_cal.__dict__, default=lambda o: o.__dict__, indent=3)
        return json_object

if __name__ == "__main__":
    eco = PyEcoCal()
    json = eco.GetEconomicCalendar(datetime.today())
    print(json)
```
And how can I fetch the events for the next week?
This works, thanks buddy
```
PS C:\Users\Jasper> python -u "c:\Users\Jasper\Downloads\Lot Size Calculator\main.py"
2023-07-24 21:32:36,080 - INFO - Scraping data for link: calendar.php?week=jan7.2007
2023-07-24 21:32:36,219 - WARNING - Table not found on the page. Exiting...
```
Hey guys, discover this API providing access to all ForexFactory data. Unlock valuable insights and enhance your trading strategies efficiently.
Link to api: https://rapidapi.com/ousema.frikha/api/forex-factory-scraper1
Hi pohzipohzi,
Not sure if you're aware, but your code is looping unnecessarily and also picking up dates that are outside the ones specified in the function arguments. Do you think something changed with Forex Factory's code that could be causing this?
P.S. I tested the code with the arguments `"calendar.php?week=jan7.2019","calendar.php?week=jan24.2019"`
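In case it helps with debugging, one way to rule out an infinite loop is to remember the links already scraped and stop on a repeat. A sketch of such a guard (my addition, not pohzipohzi's code):

```python
import logging

# Illustrative loop guard: track visited pagination links and stop if one
# repeats. getEconomicCalendar would call this check before scraping a link.
visited = set()

def should_scrape(startlink):
    if startlink in visited:
        logging.warning("Link {} was already scraped; stopping to avoid a loop".format(startlink))
        return False
    visited.add(startlink)
    return True
```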