Last active
August 31, 2020 11:23
-
-
Save pamelafox/020683c814e71c8262534563fcc7363f to your computer and use it in GitHub Desktop.
Scrapers
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
import requests | |
import string | |
def save_to_file(filename, sayings):
    """Write the sayings to ``filename`` as UTF-8 text, one saying per line.

    Args:
        filename: Path of the file to create or overwrite.
        sayings: Iterable of text strings. They are joined with newlines;
            no trailing newline is written after the last saying.
    """
    import io  # local import: io.open accepts ``encoding`` on Python 2 and 3

    # Encode at the file boundary instead of writing pre-encoded bytes to a
    # text-mode file (which raises TypeError on Python 3). The context manager
    # also guarantees the handle is closed if the write fails.
    with io.open(filename, "w", encoding="utf-8") as out:
        out.write(u"\n".join(sayings))
def scrape_pickuplinesnet():
    """Scrape pages 1 through 71 of pickup-lines.net into pickuplinesnet.txt.

    Each page is fetched and parsed with the built-in HTML parser, and the
    text of every ``.loop-entry-line`` element found under an
    ``article.loop-entry`` is collected. The collected lines are handed to
    ``save_to_file`` once all pages have been visited.
    """
    collected = []
    for page in range(1, 72):
        url = "https://pickup-lines.net/page/" + str(page) + "/"
        response = requests.get(url)
        document = BeautifulSoup(response.text, "html.parser")
        entries = document.select('article.loop-entry .loop-entry-line')
        collected.extend(entry.get_text() for entry in entries)
    save_to_file("pickuplinesnet.txt", collected)
def lower_and_strip(s):
    """Return ``s`` lowercased with everything but the letters a-z removed.

    Whitespace, digits, punctuation and non-ASCII characters are all dropped,
    so two sayings that differ only in case or punctuation normalize to the
    same key.
    """
    kept = []
    for ch in s.lower():
        if ch in string.ascii_lowercase:
            kept.append(ch)
    return ''.join(kept)
def combine_sayings():
    """Copy the scraped saying files into sayings.txt and print a count.

    Every line of every input file is written to ``sayings.txt``. Each line
    is also normalized with ``lower_and_strip``, and the number of distinct
    normalized forms is printed at the end.

    Files are read and written as UTF-8.
    """
    import io  # local import: io.open accepts ``encoding`` on Python 2 and 3

    filenames = ['sayings_wikipedia.txt']
    unique_sayings = set()
    # Context managers close both files even if reading or writing fails.
    with io.open('sayings.txt', "w", encoding="utf-8") as outfile:
        for fname in filenames:
            with io.open(fname, encoding="utf-8") as infile:
                for line in infile:
                    unique_sayings.add(lower_and_strip(line))
                    # The last line of an input file may lack a newline;
                    # add one so it cannot fuse with the next line written.
                    if not line.endswith(u"\n"):
                        line += u"\n"
                    # NOTE(review): duplicates are counted above but are
                    # still written here -- confirm that is intended.
                    outfile.write(line)
    print(len(unique_sayings))
# Script entry point: this module-level call runs the pickup-lines.net scrape
# as soon as the file is executed (or imported).
scrape_pickuplinesnet() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
import requests | |
import string | |
def save_to_file(filename, sayings):
    """Write the sayings to ``filename`` as UTF-8 text, one saying per line.

    Args:
        filename: Path of the file to create or overwrite.
        sayings: Iterable of text strings. They are joined with newlines;
            no trailing newline is written after the last saying.
    """
    import io  # local import: io.open accepts ``encoding`` on Python 2 and 3

    # Encode at the file boundary instead of writing pre-encoded bytes to a
    # text-mode file (which raises TypeError on Python 3). The context manager
    # also guarantees the handle is closed if the write fails.
    with io.open(filename, "w", encoding="utf-8") as out:
        out.write(u"\n".join(sayings))
# Fetch the Wikiquote "English proverbs" page, collect text from its <li>
# elements, and hand the result to save_to_file as sayings_wikipedia.txt.
def scrape_wikipedia(): | |
r = requests.get("https://en.wikiquote.org/wiki/English_proverbs") | |
# NOTE(review): no parser is named here; per the Beautiful Soup docs the
# library then picks one itself -- consider passing "html.parser" explicitly.
soup = BeautifulSoup(r.text) | |
sayings = [] | |
for list_item in soup.find_all('li'): | |
saying = list_item.get_text() | |
# When the item text starts with a double quote, keep only the text after
# that opening quote, up to the next double quote if there is one.
if saying.startswith('"'): | |
saying = saying.split('"')[1] | |
# NOTE(review): indentation was lost in this copy, so it is unclear whether
# this append belongs inside the `if` above -- confirm against the original.
sayings.append(saying) | |
save_to_file("sayings_wikipedia.txt", sayings) | |
def scrape_truisms():
    """Scrape 1001truisms.webs.com into sayings_truisms.txt.

    Fetches the truisms page and collects the text of every ``<span>`` that
    is a direct child of a ``<p>`` and is the second span among its siblings,
    then passes the collected strings to ``save_to_file``.
    """
    r = requests.get("http://1001truisms.webs.com/truisms.htm")
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and no warning is emitted).
    soup = BeautifulSoup(r.text, "html.parser")
    sayings = [item.get_text() for item in soup.select('p > span:nth-of-type(2)')]
    save_to_file("sayings_truisms.txt", sayings)
def scrape_phrasesuk():
    """Scrape the phrases.org.uk proverb list into sayings_phrasesuk.txt.

    Fetches the proverbs page, takes the text of every ``p.phrase-list``
    element, prints each one as it is found, and passes the collected
    strings to ``save_to_file``.
    """
    r = requests.get("http://www.phrases.org.uk/meanings/proverbs.html")
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and no warning is emitted).
    soup = BeautifulSoup(r.text, "html.parser")
    sayings = []
    for item in soup.select('p.phrase-list'):
        saying = item.get_text()
        # Progress output; the call form prints the same on Python 2 and 3.
        print(saying)
        sayings.append(saying)
    save_to_file("sayings_phrasesuk.txt", sayings)
def scrape_twwproverbs():
    """Scrape the tww.id.au proverb list into sayings_twwproverbs.txt.

    Fetches the proverbs page, takes the text of every ``<li>`` nested in a
    ``<ul>``, prints each one as it is found, and passes the collected
    strings to ``save_to_file``.
    """
    r = requests.get("http://tww.id.au/proverbs/proverbs.html")
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and no warning is emitted).
    soup = BeautifulSoup(r.text, "html.parser")
    sayings = []
    for item in soup.select('ul li'):
        saying = item.get_text()
        # Progress output; the call form prints the same on Python 2 and 3.
        print(saying)
        sayings.append(saying)
    save_to_file("sayings_twwproverbs.txt", sayings)
def lower_and_strip(s):
    """Normalize ``s`` to a comparison key made only of the letters a-z.

    The input is lowercased and every character that is not an ASCII
    lowercase letter (spaces, digits, punctuation, accented letters, ...)
    is discarded.
    """
    lowered = s.lower()
    letters = [ch for ch in lowered if ch in string.ascii_lowercase]
    return ''.join(letters)
def combine_sayings():
    """Concatenate the scraped saying files into sayings.txt and print a count.

    Every line of every input file is written to ``sayings.txt``, in the
    order the files are listed. Each line is also normalized with
    ``lower_and_strip``, and the number of distinct normalized forms is
    printed at the end.

    Files are read and written as UTF-8.
    """
    import io  # local import: io.open accepts ``encoding`` on Python 2 and 3

    filenames = ['sayings_wikipedia.txt', 'sayings_truisms.txt', 'sayings_phrasesuk.txt', 'sayings_twwproverbs.txt']
    unique_sayings = set()
    # Context managers close both files even if reading or writing fails.
    with io.open('sayings.txt', "w", encoding="utf-8") as outfile:
        for fname in filenames:
            with io.open(fname, encoding="utf-8") as infile:
                for line in infile:
                    unique_sayings.add(lower_and_strip(line))
                    # The last line of an input file may lack a newline;
                    # add one so it cannot fuse with the first line of the
                    # next file.
                    if not line.endswith(u"\n"):
                        line += u"\n"
                    # NOTE(review): duplicates are counted above but are
                    # still written here -- confirm that is intended.
                    outfile.write(line)
    print(len(unique_sayings))
# Script entry point: this module-level call merges the scraped files as soon
# as the file is executed (or imported). None of the scrape_* functions is
# called here, so the input files must already exist.
combine_sayings() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment