Skip to content

Instantly share code, notes, and snippets.

@kinoute
Last active September 18, 2019 15:59
Show Gist options
  • Save kinoute/838c289d365b80e8a0607ee414ced0bb to your computer and use it in GitHub Desktop.
Save kinoute/838c289d365b80e8a0607ee414ced0bb to your computer and use it in GitHub Desktop.
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import random
from urllib.parse import urlsplit
# Scrape the stories linked from the index page at `url` and save them as a
# two-column table (title, story text) in "stories.csv".
stories_cols = ['title', 'story']

# Start from requests' default headers and override the User-Agent with a
# browser-like value.
headers = requests.utils.default_headers()
headers.update(
    {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:17.0) Gecko/20100101 Firefox/17.0',
    }
)

url = "http://www.tonightsbedtimestory.com/stories/"

# Seconds to wait for the server before giving up. requests has no default
# timeout, so without this a stalled connection would block forever.
REQUEST_TIMEOUT = 30

# Fetch the index page. A failure here is fatal: there is nothing to scrape.
requete = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
requete.raise_for_status()
index_soup = BeautifulSoup(requete.content, "html.parser")

# Each story teaser on the index page is a <div class="post">.
stories = index_soup.find_all('div', {'class': 'post'})

# Rows are collected in a plain list and turned into a DataFrame once at the
# end; growing a DataFrame one row at a time re-allocates on every insert.
rows = []
for story in stories:
    link = story.a
    if link is None or not link.get('href'):
        # Teaser without a usable link: nothing to download.
        continue
    story_url = link['href']

    # A single unreachable story should not abort the whole scrape.
    try:
        requete_story = requests.get(
            story_url, headers=headers, timeout=REQUEST_TIMEOUT
        )
        requete_story.raise_for_status()
    except requests.RequestException as error:
        print(f"Skipping {story_url}: {error}")
        continue

    story_soup = BeautifulSoup(requete_story.content, "html.parser")
    story_text = story_soup.find('div', {'class': 'body'})
    if story_text is None:
        print(f"Skipping {story_url}: no story body found")
        continue

    # Remove the first <em> element of the body before extracting the text
    # (the original author's note: "remove first line").
    intro = story_text.find('em')
    if intro is not None:
        intro.decompose()

    # `.string` is None when the teaser has more than one child node; fall
    # back to the teaser's full text in that case.
    story_name = story.string
    if story_name is None:
        story_name = story.get_text(strip=True)

    # str() detaches the title from the parse tree so the DataFrame holds
    # plain strings only.
    rows.append([str(story_name), story_text.get_text()])

csv_stories = pd.DataFrame(rows, columns=stories_cols)
csv_stories.to_csv("stories.csv")
print(csv_stories)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment