Skip to content

Instantly share code, notes, and snippets.

@scrapehero-code
Created May 6, 2020 04:18
Show Gist options
  • Save scrapehero-code/6d87e1e1369ee701dcea8880b4b620e9 to your computer and use it in GitHub Desktop.
Save scrapehero-code/6d87e1e1369ee701dcea8880b4b620e9 to your computer and use it in GitHub Desktop.
Python 3 code to extract quotes from Yahoo Finance. https://www.scrapehero.com/scrape-yahoo-finance-stock-market-data/
from lxml import html
import requests
import json
import argparse
from collections import OrderedDict
def get_headers():
    """Return browser-like HTTP request headers for the quote-page request.

    A new dict is built on every call, so callers may modify the result
    without affecting later calls.

    Returns:
        dict: header names mapped to header values (all strings).
    """
    return {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        # "br" is deliberately not advertised: the HTTP stack underneath
        # requests only decodes Brotli when an optional Brotli package is
        # installed, so asking for it can yield a body we cannot read.
        "accept-encoding": "gzip, deflate",
        "accept-language": "en-GB,en;q=0.9,en-US;q=0.8,ml;q=0.7",
        "cache-control": "max-age=0",
        "dnt": "1",
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "none",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36",
    }
def parse(ticker):
    """Collect summary data for ``ticker`` from Yahoo Finance.

    Two requests are made: the HTML quote page, whose summary table rows
    are read as key/value pairs, and the ``quoteSummary`` JSON endpoint,
    from which the target mean price, trailing EPS and earnings dates are
    taken.

    Args:
        ticker: ticker symbol, interpolated as-is into both URLs.

    Returns:
        OrderedDict: the summary-table entries followed by the keys
        ``'1y Target Est'``, ``'EPS (TTM)'``, ``'Earnings Date'``,
        ``'ticker'`` and ``'url'``. On failure a plain dict with a single
        ``"error"`` key is returned instead.

    Raises:
        Errors raised while performing the two HTTP requests or while
        building the HTML tree happen outside the ``try`` block and
        propagate to the caller.
    """
    # HTTPS with default certificate verification: the previous
    # http:// + verify=False combination accepted any certificate.
    url = "https://finance.yahoo.com/quote/%s?p=%s" % (ticker, ticker)
    response = requests.get(url, headers=get_headers(), timeout=30)
    print("Parsing %s" % (url))
    parser = html.fromstring(response.text)
    summary_table = parser.xpath(
        '//div[contains(@data-test,"summary-table")]//tr')
    summary_data = OrderedDict()
    other_details_json_link = (
        "https://query2.finance.yahoo.com/v10/finance/quoteSummary/{0}"
        "?formatted=true&lang=en-US&region=US"
        "&modules=summaryProfile%2CfinancialData%2CrecommendationTrend"
        "%2CupgradeDowngradeHistory%2Cearnings%2CdefaultKeyStatistics"
        "%2CcalendarEvents&corsDomain=finance.yahoo.com"
    ).format(ticker)
    # A timeout here too, so a stalled connection cannot hang the script.
    summary_json_response = requests.get(other_details_json_link, timeout=30)
    try:
        json_loaded_summary = json.loads(summary_json_response.text)
        summary = json_loaded_summary["quoteSummary"]["result"][0]
        y_target_est = summary["financialData"]["targetMeanPrice"]["raw"]
        earnings_list = summary["calendarEvents"]["earnings"]
        eps = summary["defaultKeyStatistics"]["trailingEps"]["raw"]
        earnings_date = " to ".join(
            entry["fmt"] for entry in earnings_list["earningsDate"])

        # First cell of each row is the label, second cell is the value.
        for table_row in summary_table:
            table_key = "".join(
                table_row.xpath('.//td[1]//text()')).strip()
            table_value = "".join(
                table_row.xpath('.//td[2]//text()')).strip()
            summary_data[table_key] = table_value

        summary_data.update({
            '1y Target Est': y_target_est,
            'EPS (TTM)': eps,
            'Earnings Date': earnings_date,
            'ticker': ticker,
            'url': url,
        })
        return summary_data
    except ValueError:
        print("Failed to parse json response")
        return {"error": "Failed to parse json response"}
    except Exception as err:
        # Not a bare ``except``: KeyboardInterrupt and SystemExit must
        # still be able to stop the program. The cause is printed so a
        # missing JSON field is distinguishable from other failures.
        print("Unhandled error while parsing %s: %r" % (ticker, err))
        return {"error": "Unhandled Error"}
# Command-line entry point: scrape one ticker and write the result to
# "<ticker>-summary.json" in the current working directory.
if __name__ == "__main__":
    argparser = argparse.ArgumentParser(
        description="Extract quote summary data from Yahoo Finance.")
    # A non-empty help string so ``--help`` explains the required argument.
    argparser.add_argument(
        'ticker', help='ticker symbol to look up, e.g. AAPL')
    args = argparser.parse_args()
    ticker = args.ticker
    print("Fetching data for %s" % (ticker))
    scraped_data = parse(ticker)
    print("Writing data to output file")
    # Whatever parse() returned is written, including an {"error": ...} dict.
    with open('%s-summary.json' % (ticker), 'w') as fp:
        json.dump(scraped_data, fp, indent=4)
@madsoeni
Copy link

madsoeni commented Sep 1, 2020

Hi,
I noticed the first request to url = "http://finance.yahoo.com/quote/%s?p=%s" % (ticker, ticker) results in the data protection privacy consent page.
This results in summary_table being empty.
So the resulting .json for AAPL does not contain the complete list stated above:
{ "1y Target Est": 425.17, "EPS (TTM)": 13.185, "Earnings Date": "2020-10-28 to 2020-11-02", "ticker": "AAPL", "url": "https://finance.yahoo.com/quote/AAPL?p=AAPL" }

Same issue here!

@Vorotori
Copy link

Vorotori commented Dec 3, 2020

I had the same problem and found an alternative solution that grabs the same data in one line of code:

from yahoo_fin import stock_info as si
si.get_quote_table("aapl")

@mpassador
Copy link

Very helpful and very easy to use! Thanks for sharing

@CL33-cmd
Copy link

CL33-cmd commented Feb 25, 2021

line 51: table_key = ''.join(raw_table_key).strip() is giving me an invalid syntax error. Anyone know why?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment