Created
November 25, 2016 10:17
-
-
Save husio/982a56e9507e1db47324be5272865695 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from os import path | |
from urllib.request import urlopen | |
from bs4 import BeautifulSoup | |
import justext | |
# Paginated listing of the Guardian "world" section; %d is the page number.
WORLD_URL = 'https://www.theguardian.com/world?page=%d'
# Listing pages 1 through 10 are scanned for article links.
URLS = [WORLD_URL % page_no for page_no in range(1, 11)]
# Directory that receives one file per scraped article.
DEST_DIR = './articles'
def find_articles(url):
    """Return the article links found on one listing page.

    Downloads ``url``, parses it with BeautifulSoup's ``lxml`` parser and
    collects the ``href`` of every ``<a>`` tag whose ``data-link-name``
    attribute equals ``article``.

    Args:
        url: Address of the listing page to scan.

    Returns:
        A list of ``href`` strings. The list is empty when the page cannot
        be fetched or parsed; the failure is printed rather than raised.
    """
    try:
        # Close the HTTP response deterministically instead of leaking it.
        with urlopen(url) as response:
            html = response.read()
        soup = BeautifulSoup(html, "lxml")
        anchors = soup.find_all('a', attrs={'data-link-name': 'article'})
    except Exception as e:
        # Best-effort crawl: one bad page must not abort the whole run, but
        # report the cause so parser/setup errors are not mistaken for
        # network errors.
        print("cannot read {}: {}".format(url, e))
        return []
    # An anchor without an href would otherwise contribute None.
    return [a.get('href') for a in anchors if a.get('href')]
def jt(url):
    """Download ``url`` and return its non-boilerplate text as UTF-8 bytes.

    The page is passed to jusText with the English stoplist; the text of
    every paragraph not flagged as boilerplate is joined with newlines.

    Args:
        url: Address of the page to download.

    Returns:
        The extracted text encoded as UTF-8 ``bytes``.

    Raises:
        Whatever ``urlopen`` or jusText raise; nothing is caught here.
    """
    # Use the response as a context manager so the connection is closed
    # even when reading fails.
    with urlopen(url) as response:
        html = response.read()
    paragraphs = justext.justext(html, justext.get_stoplist("English"))
    text = "\n".join(p.text for p in paragraphs if not p.is_boilerplate)
    return text.encode("utf-8")
def run():
    """Scrape article links from every listing page and save each article.

    Links are gathered from all pages in ``URLS``. Each distinct article is
    downloaded and reduced to text by ``jt``, then written to a file in
    ``DEST_DIR`` named after the last path segment of its URL. A failure on
    one article is printed and does not stop the remaining downloads.
    """
    import os  # local import: the file only imports ``path`` from ``os``

    print('Scrapping article links...')
    # Collect the links from every listing page.
    articles = []
    for url in URLS:
        articles += find_articles(url)
    print('Scrapping links done:')

    # Drop duplicates before counting so the reported number matches the
    # downloads actually attempted (insertion order is kept on 3.7+).
    unique_articles = list(dict.fromkeys(articles))
    print("Fetching text from %d articles..." % len(unique_articles))

    # Without the directory every single write below would fail.
    os.makedirs(DEST_DIR, exist_ok=True)

    for a in unique_articles:
        try:
            text = jt(a)
            name = '{}/{}'.format(DEST_DIR, path.basename(a))
            with open(name, 'wb') as out:
                out.write(text)
        except Exception as e:
            # Best effort: report the article and carry on with the rest.
            print("error parsing {}:{}".format(a, e))
# Script entry point: scrape only when executed directly, not on import.
if __name__ == '__main__':
    run()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from flask import Flask, request, jsonify | |
from os import listdir, path | |
from urllib.request import urlopen | |
import justext | |
from gensim import corpora, models, similarities | |
import stopwords | |
app = Flask(__name__)


@app.route('/', methods=['POST'])
def find():
    """Return the stored articles most similar to the posted query.

    The raw request body is decoded as UTF-8. A body starting with
    ``http://`` or ``https://`` is treated as a URL whose text is fetched
    with ``jt``; any other body is used as the query text directly.

    Returns:
        A JSON response built from the result of ``similar_articles``.
    """
    text = request.get_data().decode('utf-8')
    if text.startswith(('https://', 'http://')):
        # NOTE(review): this fetches a caller-supplied URL from the server
        # (SSRF risk); acceptable only while the app is bound to localhost.
        text = jt(text)
        # jt returns UTF-8 bytes; decode so the query is a str for both
        # input forms instead of depending on how bytes tokens are handled
        # downstream.
        if isinstance(text, bytes):
            text = text.decode('utf-8')
    res = similar_articles(text)
    return jsonify(res)
def jt(url):
    """Download ``url`` and return its non-boilerplate text as UTF-8 bytes.

    The page is passed to jusText with the English stoplist; the text of
    every paragraph not flagged as boilerplate is joined with newlines.

    Args:
        url: Address of the page to download.

    Returns:
        The extracted text encoded as UTF-8 ``bytes``.

    Raises:
        Whatever ``urlopen`` or jusText raise; nothing is caught here.
    """
    # Use the response as a context manager so the connection is closed
    # even when reading fails.
    with urlopen(url) as response:
        html = response.read()
    paragraphs = justext.justext(html, justext.get_stoplist("English"))
    text = "\n".join(p.text for p in paragraphs if not p.is_boilerplate)
    return text.encode("utf-8")
def similar_articles(text):
    """Rank the indexed articles by similarity to ``text``.

    The query is lower-cased, split on whitespace and converted to a
    bag-of-words with the module-level ``dictionary``, projected through
    ``lsi`` and compared against ``index``.

    Args:
        text: Query text.

    Returns:
        A list of ``(score_as_string, article)`` tuples taken from the five
        best-scoring articles, keeping only scores greater than 0.7.
    """
    query_bow = dictionary.doc2bow(text.lower().split())
    query_lsi = lsi[query_bow]  # move the query into LSI space
    scores = index[query_lsi]
    ranked = sorted(enumerate(scores), key=lambda pair: -pair[1])

    matches = []
    for position, score in ranked[:5]:
        if score > 0.7:
            matches.append((str(score), articles[position]))
    return matches
# Module-level model state; every name stays None until build() assigns it.
articles = None    # article texts returned by load_files
dictionary = None  # corpora.Dictionary built from the tokenized articles
lsi = None         # models.LsiModel trained on the bag-of-words corpus
index = None       # similarities.MatrixSimilarity over the LSI corpus
def build():
    """Load the articles and populate the module-level model state.

    Reads every file under ``./articles/``, tokenizes it, and assigns the
    globals ``articles``, ``dictionary``, ``lsi`` and ``index``.
    """
    global articles, dictionary, lsi, index

    articles = load_files('./articles/')
    token_lists = [tokenize_text(document) for document in articles]

    dictionary = corpora.Dictionary(token_lists)
    bow_corpus = [dictionary.doc2bow(tokens) for tokens in token_lists]

    lsi = models.LsiModel(bow_corpus, id2word=dictionary)
    # Project the corpus into LSI space and index it for similarity lookups.
    index = similarities.MatrixSimilarity(lsi[bow_corpus])
def load_files(folder):
    """Read every regular file in ``folder`` as UTF-8 text.

    Args:
        folder: Directory holding one text file per article.

    Returns:
        A list with one string per file, formatted as
        ``'ARTICLE_<file name> <file contents>'`` and ordered by file name.
    """
    documents = []
    # Sort so document positions are reproducible between runs.
    for name in sorted(listdir(folder)):
        file_path = path.join(folder, name)
        # Sub-directories cannot be opened as files; skip them.
        if not path.isfile(file_path):
            continue
        # The scraper writes UTF-8 bytes, so do not rely on the platform's
        # default encoding when reading them back.
        with open(file_path, 'r', encoding='utf-8') as f:
            documents.append('ARTICLE_' + name + ' ' + f.read())
    return documents
def tokenize_text(text):
    """Split ``text`` into lower-cased tokens worth indexing.

    Args:
        text: Text to tokenize.

    Returns:
        The whitespace-separated, lower-cased words that are longer than 3
        and shorter than 40 characters and are not in ``stopwords.EN``.
    """
    tokens = []
    for word in text.lower().split():
        if not 3 < len(word) < 40:
            continue
        if word in stopwords.EN:
            continue
        tokens.append(word)
    return tokens
# Script entry point: build the model once, then serve on localhost:5000
# with Flask's debug mode enabled.
if __name__ == '__main__':
    print('building corpus')
    build()
    print("corpus built")
    app.run(host='127.0.0.1', port=5000, debug=True)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Stop-word set: common English words, single letters, punctuation marks
# and a few extra tokens. Written as a set literal with each entry listed
# once; the resulting set is the same as the original list-based one.
EN = {
    # a
    "a", "a's", "able", "about", "above", "according", "accordingly",
    "across", "actually", "after", "afterwards", "again", "against",
    "ain't", "all", "allow", "allows", "almost", "alone", "along",
    "already", "also", "although", "always", "am", "among", "amongst",
    "an", "and", "another", "any", "anybody", "anyhow", "anyone",
    "anything", "anyway", "anyways", "anywhere", "apart", "appear",
    "appreciate", "appropriate", "are", "aren't", "around", "as", "aside",
    "ask", "asking", "associated", "at", "available", "away", "awfully",
    # b
    "b", "be", "became", "because", "become", "becomes", "becoming",
    "been", "before", "beforehand", "behind", "being", "believe", "below",
    "beside", "besides", "best", "better", "between", "beyond", "both",
    "brief", "but", "by",
    # c
    "c", "c'mon", "c's", "came", "can", "can't", "cannot", "cant", "cause",
    "causes", "certain", "certainly", "changes", "clearly", "co", "com",
    "come", "comes", "concerning", "consequently", "consider",
    "considering", "contain", "containing", "contains", "corresponding",
    "could", "couldn't", "course", "currently",
    # d
    "d", "definitely", "described", "despite", "did", "didn't",
    "different", "do", "does", "doesn't", "doing", "don't", "done", "down",
    "downwards", "during",
    # e
    "e", "each", "edu", "eg", "eight", "either", "else", "elsewhere",
    "enough", "entirely", "especially", "et", "etc", "even", "ever",
    "every", "everybody", "everyone", "everything", "everywhere", "ex",
    "exactly", "example", "except",
    # f
    "f", "far", "few", "fifth", "first", "five", "followed", "following",
    "follows", "for", "former", "formerly", "forth", "four", "from",
    "further", "furthermore",
    # g
    "g", "get", "gets", "getting", "given", "gives", "go", "goes", "going",
    "gone", "got", "gotten", "greetings",
    # h
    "h", "had", "hadn't", "happens", "hardly", "has", "hasn't", "have",
    "haven't", "having", "he", "he's", "hello", "help", "hence", "her",
    "here", "here's", "hereafter", "hereby", "herein", "hereupon", "hers",
    "herself", "hi", "him", "himself", "his", "hither", "hopefully", "how",
    "howbeit", "however",
    # i
    "i", "i'd", "i'll", "i'm", "i've", "ie", "if", "ignored", "immediate",
    "in", "inasmuch", "inc", "indeed", "indicate", "indicated",
    "indicates", "inner", "insofar", "instead", "into", "inward", "is",
    "isn't", "it", "it'd", "it'll", "it's", "its", "itself",
    # j, k
    "j", "just",
    "k", "keep", "keeps", "kept", "know", "knows", "known",
    # l
    "l", "last", "lately", "later", "latter", "latterly", "least", "less",
    "lest", "let", "let's", "like", "liked", "likely", "little", "look",
    "looking", "looks", "ltd",
    # m
    "m", "mainly", "many", "may", "maybe", "me", "mean", "meanwhile",
    "merely", "might", "more", "moreover", "most", "mostly", "much",
    "must", "my", "myself",
    # n
    "n", "name", "namely", "nd", "near", "nearly", "necessary", "need",
    "needs", "neither", "never", "nevertheless", "new", "next", "nine",
    "no", "nobody", "non", "none", "noone", "nor", "normally", "not",
    "nothing", "novel", "now", "nowhere",
    # o
    "o", "obviously", "of", "off", "often", "oh", "ok", "okay", "old",
    "on", "once", "one", "ones", "only", "onto", "or", "other", "others",
    "otherwise", "ought", "our", "ours", "ourselves", "out", "outside",
    "over", "overall", "own",
    # p, q
    "p", "particular", "particularly", "per", "perhaps", "placed",
    "please", "plus", "possible", "presumably", "probably", "provides",
    "q", "que", "quite", "qv",
    # r
    "r", "rather", "rd", "re", "really", "reasonably", "regarding",
    "regardless", "regards", "relatively", "respectively", "right",
    # s
    "s", "said", "same", "saw", "say", "saying", "says", "second",
    "secondly", "see", "seeing", "seem", "seemed", "seeming", "seems",
    "seen", "self", "selves", "sensible", "sent", "serious", "seriously",
    "seven", "several", "shall", "she", "should", "shouldn't", "since",
    "six", "so", "some", "somebody", "somehow", "someone", "something",
    "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry",
    "specified", "specify", "specifying", "still", "sub", "such", "sup",
    "sure",
    # t
    "t", "t's", "take", "taken", "tell", "tends", "th", "than", "thank",
    "thanks", "thanx", "that", "that's", "thats", "the", "their", "theirs",
    "them", "themselves", "then", "thence", "there", "there's",
    "thereafter", "thereby", "therefore", "therein", "theres", "thereupon",
    "these", "they", "they'd", "they'll", "they're", "they've", "think",
    "third", "this", "thorough", "thoroughly", "those", "though", "three",
    "through", "throughout", "thru", "thus", "to", "together", "too",
    "took", "toward", "towards", "tried", "tries", "truly", "try",
    "trying", "twice", "two",
    # u, v
    "u", "un", "under", "unfortunately", "unless", "unlikely", "until",
    "unto", "up", "upon", "us", "use", "used", "useful", "uses", "using",
    "usually", "uucp",
    "v", "value", "various", "very", "via", "viz", "vs",
    # w
    "w", "want", "wants", "was", "wasn't", "way", "we", "we'd", "we'll",
    "we're", "we've", "welcome", "well", "went", "were", "weren't", "what",
    "what's", "whatever", "when", "whence", "whenever", "where", "where's",
    "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever",
    "whether", "which", "while", "whither", "who", "who's", "whoever",
    "whole", "whom", "whose", "why", "will", "willing", "wish", "with",
    "within", "without", "won't", "wonder", "would", "wouldn't",
    # x, y, z
    "x",
    "y", "yes", "yet", "you", "you'd", "you'll", "you're", "you've",
    "your", "yours", "yourself", "yourselves",
    "z", "zero",
    # punctuation and other non-word tokens
    ",", ".", "?", "!", "|", ":", "'", '"', ";", "<NUM>", "$", "km", "&",
    "#", "'s", "/", "dr.",
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment