Levi-Lesches · September 2, 2020 06:36
diff --git a/nytimes_headlines.py b/nytimes_headlines.py
 import requests
 from bs4 import BeautifulSoup
 import json

 # Page requested by get_html().
 URL = "https://www.nytimes.com/"
 # Prefix that get_script() matches on and get_script_data() slices off
 # before decoding the remainder as JSON.
 START_OF_JSON = "window.__preloadedData = "

 def get_html():
     """Download the page at ``URL`` and return its body as text.

     Raises:
         requests.RequestException: if the request fails, times out, or the
             server answers with an HTTP error status.
     """
     # Without a timeout, requests waits indefinitely on a stalled server.
     response = requests.get(URL, timeout=30)
     # Surface 4xx/5xx here instead of handing an error page to the parser.
     response.raise_for_status()
     return response.text

 def get_script(soup):
     """Return the text of the first script tag that starts with START_OF_JSON.

     Each tag's ``.string`` is converted with ``str()`` and stripped before the
     prefix check. Returns None when no script tag matches.
     """
     stripped = (str(tag.string).strip() for tag in soup.find_all("script"))
     return next(
         (text for text in stripped if text.startswith(START_OF_JSON)),
         None,
     )

 def get_script_data(soup):
     """Decode the JSON payload that follows START_OF_JSON in the page's script.

     Args:
         soup: parsed document, passed straight to ``get_script``.

     Returns:
         The value produced by ``json.loads`` on the script's payload.

     Raises:
         ValueError: if no matching script is found, or the payload is not
             valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
     """
     script = get_script(soup)
     if script is None:
         # get_script returns None when nothing matches; fail with a clear
         # message rather than a TypeError from slicing None.
         raise ValueError(
             f"no <script> starting with {START_OF_JSON!r} was found"
         )
     payload = script[len(START_OF_JSON):]
     # Drop the trailing statement terminator only when it is present, so the
     # closing brace is not chopped off if the page omits the semicolon.
     if payload.endswith(";"):
         payload = payload[:-1]
     return json.loads(payload)

 def parse_data(data):
     """Collect every non-empty string headline found under "initialState".

     Args:
         data: mapping with an "initialState" mapping; each of its values is
             inspected for a "headline" entry.

     Returns:
         A set of the distinct headline strings.

     Raises:
         KeyError: if ``data`` has no "initialState" key.
     """
     headlines = set()
     for entry in data["initialState"].values():
         # Only dict entries can carry a "headline" key. Strings, lists,
         # numbers and None would otherwise raise TypeError on lookup.
         if not isinstance(entry, dict):
             continue
         headline = entry.get("headline")
         # Keep plain, non-empty strings; skip nested objects and blanks.
         if isinstance(headline, str) and headline:
             headlines.add(headline)
     return headlines

 def get_headlines():
     """Fetch the page, parse it, and return the set of headlines it embeds."""
     soup = BeautifulSoup(get_html(), 'html.parser')
     return parse_data(get_script_data(soup))

 def main():
     """Print how many headlines were found, then the headlines one per line."""
     headlines = get_headlines()
     print(f"Found {len(headlines)} headlines")
     # get_headlines returns a set, whose iteration order is arbitrary; sort
     # so repeated runs list the same headlines in the same order.
     print("\n".join(sorted(headlines)))


 # Script entry point. There is no __main__ guard, so this call runs whenever
 # the module is loaded, including on import.
 main()
	import requests
	from bs4 import BeautifulSoup
	import json

	# Page requested by get_html().
	URL = "https://www.nytimes.com/"
	# Prefix that get_script() matches on and get_script_data() slices off
	# before decoding the remainder as JSON.
	START_OF_JSON = "window.__preloadedData = "

	def get_html():
	    """Download the page at ``URL`` and return its body as text.

	    Raises:
	        requests.RequestException: if the request fails, times out, or the
	            server answers with an HTTP error status.
	    """
	    # Without a timeout, requests waits indefinitely on a stalled server.
	    response = requests.get(URL, timeout=30)
	    # Surface 4xx/5xx here instead of handing an error page to the parser.
	    response.raise_for_status()
	    return response.text

	def get_script(soup):
	    """Return the text of the first script tag that starts with START_OF_JSON.

	    Each tag's ``.string`` is converted with ``str()`` and stripped before the
	    prefix check. Returns None when no script tag matches.
	    """
	    stripped = (str(tag.string).strip() for tag in soup.find_all("script"))
	    return next(
	        (text for text in stripped if text.startswith(START_OF_JSON)),
	        None,
	    )

	def get_script_data(soup):
	    """Decode the JSON payload that follows START_OF_JSON in the page's script.

	    Args:
	        soup: parsed document, passed straight to ``get_script``.

	    Returns:
	        The value produced by ``json.loads`` on the script's payload.

	    Raises:
	        ValueError: if no matching script is found, or the payload is not
	            valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
	    """
	    script = get_script(soup)
	    if script is None:
	        # get_script returns None when nothing matches; fail with a clear
	        # message rather than a TypeError from slicing None.
	        raise ValueError(
	            f"no <script> starting with {START_OF_JSON!r} was found"
	        )
	    payload = script[len(START_OF_JSON):]
	    # Drop the trailing statement terminator only when it is present, so the
	    # closing brace is not chopped off if the page omits the semicolon.
	    if payload.endswith(";"):
	        payload = payload[:-1]
	    return json.loads(payload)

	def parse_data(data):
	    """Collect every non-empty string headline found under "initialState".

	    Args:
	        data: mapping with an "initialState" mapping; each of its values is
	            inspected for a "headline" entry.

	    Returns:
	        A set of the distinct headline strings.

	    Raises:
	        KeyError: if ``data`` has no "initialState" key.
	    """
	    headlines = set()
	    for entry in data["initialState"].values():
	        # Only dict entries can carry a "headline" key. Strings, lists,
	        # numbers and None would otherwise raise TypeError on lookup.
	        if not isinstance(entry, dict):
	            continue
	        headline = entry.get("headline")
	        # Keep plain, non-empty strings; skip nested objects and blanks.
	        if isinstance(headline, str) and headline:
	            headlines.add(headline)
	    return headlines

	def get_headlines():
	    """Fetch the page, parse it, and return the set of headlines it embeds."""
	    soup = BeautifulSoup(get_html(), 'html.parser')
	    return parse_data(get_script_data(soup))

	def main():
	    """Print how many headlines were found, then the headlines one per line."""
	    headlines = get_headlines()
	    print(f"Found {len(headlines)} headlines")
	    # get_headlines returns a set, whose iteration order is arbitrary; sort
	    # so repeated runs list the same headlines in the same order.
	    print("\n".join(sorted(headlines)))


	# Script entry point. There is no __main__ guard, so this call runs whenever
	# the module is loaded, including on import.
	main()