Last active
August 12, 2017 16:57
-
-
Save abehmiel/4298adb93dbf80b6b5879c1cd2993dcf to your computer and use it in GitHub Desktop.
Press-release scraper for pressreleasepoint.com — verbose output to the console, saving each release to a text file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Because this code take so long to run as-coded below, I recommended to follow it | |
up with a check for file duplicates (fdupes -dN in linux seems to work) | |
After downloading, you can combine them into a single corpus file by concatenating: | |
find . -name "*.txt" -exec cat '{}' ';' > dirty.txt | |
Then you can use whatever means you wish to clean up the text and remove unicode symbols | |
and so on | |
""" | |
from bs4 import BeautifulSoup
from glob import glob
from os import makedirs
from os.path import join
from urllib.parse import urljoin
import requests

BASE_URL = 'http://www.pressreleasepoint.com'
PAGE_PATTERN = '/prpage/best?page='
MAX_PAGES = 2497
# Bound every HTTP request; without a timeout a single dead server
# hangs the whole multi-hour run forever.
REQUEST_TIMEOUT = 30  # seconds

makedirs('press-releases', exist_ok=True)

# Walk every index page and download each press release linked from its
# <h2> headlines, saving one text file per release as <page>-<i>.txt.
for page in range(MAX_PAGES + 1):
    index_url = BASE_URL + PAGE_PATTERN + str(page)
    print(index_url)
    index_page = requests.get(index_url, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(index_page.content, 'lxml')
    i = 0  # per-page counter; makes each output filename unique
    for hed in soup.find_all('h2'):
        try:
            link = hed.find('a')
            if link is None:
                continue  # headline without a link — nothing to fetch
            landed_url = urljoin(BASE_URL, link.attrs['href'])
            print("Downloading from...", landed_url)
            pr_page = requests.get(landed_url, timeout=REQUEST_TIMEOUT)
            pr_soup = BeautifulSoup(pr_page.content, 'lxml')
            body = pr_soup.find(class_='content-press_release')
            if body is None:
                continue  # page lacks the expected container — skip it
            pr_text = body.text
            # Drop the site chrome and the first run of blank lines.
            pr_text = pr_text.replace("Printer-friendly version", "", 1)
            pr_text = pr_text.replace("\n\n", "")
            i += 1
            print(pr_text)
            # Context manager guarantees the handle is closed even if
            # write() raises; explicit encoding avoids crashes on
            # non-ASCII text under a platform-dependent default codec.
            out_path = 'press-releases/' + str(page) + '-' + str(i) + '.txt'
            with open(out_path, "w", encoding="utf-8") as text_file:
                text_file.write(pr_text)
        except (requests.RequestException, KeyError, OSError) as exc:
            # Best-effort scrape: report the failure and keep going,
            # instead of the old bare `except: pass` that silently
            # swallowed every error (including real bugs).
            print("Skipping a press release:", exc)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment