""" | |
# Setup: | |
`python3 -m pip install bs4` | |
# Usage: | |
```python3 | |
# Change the episode_id_param and num_pages | |
python3 transcripts.py | |
``` | |
""" | |
from bs4 import BeautifulSoup
import urllib.request

episode_id_param = 104  # Forum id for the show on foreverdreaming.org; it's the `f` query parameter in the forum URL (matches `.*/?f=(\d+).*`).
num_pages = 7  # Number of listing pages for the show's forum.
start_param = 0  # (don't change) The first page of results starts at offset 0.
increment_by = 25  # (don't change) Each page lists 25 topics, so offsets go 0, 25, 50, ...
page = "https://transcripts.foreverdreaming.org/viewforum.php?f={}&start={}"  # (don't change)

# Build one listing URL per page, e.g. ...?f=104&start=0, ...?f=104&start=25, and so on.
page_urls = [page.format(episode_id_param, page_start) for page_start in range(start_param, num_pages * increment_by, increment_by)]
each_pages_html = [urllib.request.urlopen(_url) for _url in page_urls]

# Collect every href on the listing pages; topic links are filtered out below.
links = []
for _html in each_pages_html:
    soup = BeautifulSoup(_html, "html.parser")
    links.extend(link.get('href') for link in soup.findAll('a'))

# Fetch each transcript topic and write its paragraph text to a file named after the link.
for link in links:
    if link and "./viewtopic" in link:
        full_url = "https://transcripts.foreverdreaming.org/" + link.replace("./", "")
        episode_html = urllib.request.urlopen(full_url)
        soup = BeautifulSoup(episode_html, 'html.parser')
        p_elements = [elem.text for elem in soup.findAll('p')]
        with open(link.replace("./", ""), "w") as out_file:
            print("\n".join(p_elements), file=out_file)