Created April 26, 2018 18:21
import csv
import requests
import threading
import os
import sys
from Queue import Queue
from bs4 import BeautifulSoup as bs
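
# NOTE: written for Python 2 (Queue module, raw_input, print statement);
# requires the third-party "requests" and "beautifulsoup4" packages.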

# Globals
q = Queue()
lock = threading.Lock()
total_articles = 0
cur_article = 0
target_path = ''

# Represents one article to fetch
class Article():
    def __init__(self, name, link):
        self.headline = name
        self.link = link
        self.text = None
        self.done = False
        self.trys = 0
    # Returns the article text, stripped of markup and non-ASCII characters
    def parse_text(self, html):
        soup = bs(html, 'html.parser')
        for remove in soup(["script", "style"]):  # drop script/style elements
            remove.decompose()
        text = soup.get_text()
        text = ''.join([c for c in text if ord(c) < 128])  # keep ASCII only
        text = text.translate({ord(c): None for c in u'\r\n\t'})  # strip newlines/tabs
        if len(text):
            self.done = True
        return text
    # Fetches and parses the text for this article; gives up after two tries
    def getText(self):
        try:
            r = requests.get(self.link, timeout=10)  # timeout so a worker can't hang forever
            self.text = self.parse_text(r.content)
        except Exception:
            pass  # ignore network/parse errors; the retry logic below handles them
        self.trys += 1
        if self.trys >= 2:
            self.done = True
        return self.done
    # Saves the article text to a local .txt file
    # TODO: save to an s3 bucket / JSON instead
    def save(self):
        # build a filename from the headline, dropping spaces and unsafe characters
        fname = self.headline.replace(' ', '').replace('/', '').replace('.', '').replace("'", "") + '.txt'
        if self.text:
            try:
                with open(os.path.join(target_path, fname), 'w') as f:
                    f.write(self.text)
            except IOError:
                pass  # skip articles whose files can't be written

# Attempt to open the input file; exit if the path is bad
def openFile(path):
    try:
        return open(path, 'r')
    except IOError:
        print 'file open failed, please check path.'
        sys.exit(1)

# Skip the header row if the csv file has column headers
def skipHeaders(reader):
    x = raw_input('does file have column headers (y/n): ')
    if x == 'y' or x == '':
        next(reader)  # skip headers

# Build the work queue from the csv file
def init():
    global total_articles
    f = openFile(raw_input('relative path to the file: '))
    reader = csv.reader(f)
    skipHeaders(reader)
    url_col = int(raw_input('which column has the url [0-n]: '))
    name_col = raw_input('which column has article name (press enter if none): ')
    name_col = int(name_col) if name_col != '' else url_col
    for row in reader:
        if row[url_col]:
            article = Article(row[name_col], row[url_col])
            q.put(article)
            total_articles += 1
    f.close()

# Set data destination, creating the directory if needed
def setDestination():
    global target_path
    dst = raw_input('path of where to store text files: ')
    if not os.path.exists(dst):
        os.makedirs(dst)
    target_path = dst

# Updates the task-done count and prints progress; the lock guards the counter and stdout
def updateQueue():
    global cur_article
    global total_articles
    with lock:
        q.task_done()
        cur_article += 1
        s = '%d/%d articles complete' % (cur_article, total_articles)
        sys.stdout.write('\r' + s)
        if cur_article == total_articles:
            sys.stdout.write('\n')
        sys.stdout.flush()

# Thread function: pull articles off the queue until it is empty
def worker():
    while not q.empty():
        article = q.get()
        resp = article.getText()
        if resp:
            article.save()  # save before task_done so q.join() can't return mid-write
            updateQueue()
        else:
            # first fetch failed: balance the get() above, then re-queue for one retry
            q.task_done()
            q.put(article)
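
# NOTE: every q.get() above is matched by exactly one q.task_done() -- either
# inside updateQueue() or just before the article is re-queued -- so q.join()
# in the main block returns once each article has succeeded or used its two tries.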

# Build the threads
def initThreads():
    for i in range(30):
        t = threading.Thread(target=worker)
        t.daemon = True  # daemon threads die with the main thread
        t.start()

if __name__ == '__main__':
    init()
    total_articles = q.qsize()
    setDestination()
    initThreads()
    q.join()
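
A minimal sketch of a run, assuming the script is saved as scrape.py and pointed
at a hypothetical articles.csv whose first column holds the headline and second
the url (the file names, column layout, and counts here are made up, not from
the original):

$ python scrape.py
relative path to the file: articles.csv
does file have column headers (y/n): y
which column has the url [0-n]: 1
which column has article name (press enter if none): 0
path of where to store text files: articles/
12/12 articles complete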