Last active
August 12, 2017 16:57
-
-
Save abehmiel/4298adb93dbf80b6b5879c1cd2993dcf to your computer and use it in GitHub Desktop.
Press-release scraper for pressreleasepoint.com — verbose output to the console, saving each release to a text file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Because this code take so long to run as-coded below, I recommended to follow it | |
up with a check for file duplicates (fdupes -dN in linux seems to work) | |
After downloading, you can combine them into a single corpus file by concatenating: | |
find . -name "*.txt" -exec cat '{}' ';' > dirty.txt | |
Then you can use whatever means you wish to clean up the text and remove unicode symbols | |
and so on | |
""" | |
from bs4 import BeautifulSoup
from glob import glob
from os import makedirs
from os.path import join
from urllib.parse import urljoin
import requests

BASE_URL = 'http://www.pressreleasepoint.com'
PAGE_PATTERN = '/prpage/best?page='
MAX_PAGES = 2497
# Bound every HTTP request; without a timeout a single dead server
# hangs the whole multi-hour run forever.
REQUEST_TIMEOUT = 30  # seconds

makedirs('press-releases', exist_ok=True)

# Walk every index page and download each press release linked from its
# <h2> headlines, saving one text file per release as <page>-<i>.txt.
for page in range(MAX_PAGES + 1):
    index_url = BASE_URL + PAGE_PATTERN + str(page)
    print(index_url)
    index_page = requests.get(index_url, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(index_page.content, 'lxml')
    i = 0  # per-page counter; makes each output filename unique
    for hed in soup.find_all('h2'):
        try:
            link = hed.find('a')
            if link is None:
                continue  # headline without a link — nothing to fetch
            landed_url = urljoin(BASE_URL, link.attrs['href'])
            print("Downloading from...", landed_url)
            pr_page = requests.get(landed_url, timeout=REQUEST_TIMEOUT)
            pr_soup = BeautifulSoup(pr_page.content, 'lxml')
            body = pr_soup.find(class_='content-press_release')
            if body is None:
                continue  # page lacks the expected container — skip it
            pr_text = body.text
            # Drop the site chrome and the first run of blank lines.
            pr_text = pr_text.replace("Printer-friendly version", "", 1)
            pr_text = pr_text.replace("\n\n", "")
            i += 1
            print(pr_text)
            # Context manager guarantees the handle is closed even if
            # write() raises; explicit encoding avoids crashes on
            # non-ASCII text under a platform-dependent default codec.
            out_path = 'press-releases/' + str(page) + '-' + str(i) + '.txt'
            with open(out_path, "w", encoding="utf-8") as text_file:
                text_file.write(pr_text)
        except (requests.RequestException, KeyError, OSError) as exc:
            # Best-effort scrape: report the failure and keep going,
            # instead of the old bare `except: pass` that silently
            # swallowed every error (including real bugs).
            print("Skipping a press release:", exc)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment