husio · November 25, 2016 10:17
diff --git a/crawler.py b/crawler.py
 from os import path
 from urllib.request import urlopen

 from bs4 import BeautifulSoup
 import justext

 # Paginated listing of the Guardian "world" section; %d is the page number.
 WORLD_URL = 'https://www.theguardian.com/world?page=%d'

 # Listing pages 1 through 10 are the ones scanned for article links.
 URLS = [WORLD_URL % i for i in range(1, 11)]
 # Directory into which run() writes one file per fetched article.
 DEST_DIR = './articles'


 def find_articles(url):
    """Return the article links found on one listing page.

    Downloads ``url``, parses it with BeautifulSoup and collects the
    ``href`` of every ``<a>`` tag whose ``data-link-name`` attribute is
    ``"article"``.  Anchors without an ``href`` are skipped so callers
    only receive usable strings.

    This is best-effort: any failure while fetching or parsing is reported
    on stdout and an empty list is returned instead of raising.
    """
    try:
        # The context manager closes the HTTP response even if read() fails.
        with urlopen(url) as response:
            html = response.read()
        soup = BeautifulSoup(html, "lxml")
        anchors = soup.find_all('a', attrs={'data-link-name': 'article'})
        hrefs = (anchor.get('href') for anchor in anchors)
        return [href for href in hrefs if href]
    except Exception as err:
        # Include the cause so a failed page can be diagnosed from the log.
        print("cannot read {}: {}".format(url, err))
        return []


 def jt(url):
    """Fetch ``url`` and return its non-boilerplate text as UTF-8 bytes.

    The page is run through jusText with the English stoplist; paragraphs
    flagged as boilerplate are dropped and the remaining ones are joined
    with newlines.  Errors from fetching or parsing propagate to the caller.
    """
    # Close the HTTP response deterministically instead of leaking it.
    with urlopen(url) as response:
        html = response.read()
    paragraphs = justext.justext(html, justext.get_stoplist("English"))
    text = "\n".join(p.text for p in paragraphs if not p.is_boilerplate)
    return text.encode("utf-8")


 def run():
    """Crawl the listing pages and store each article's text in DEST_DIR.

    Collects article links from every URL in URLS, then downloads each
    distinct link with jt() and writes the resulting bytes to a file named
    after the last path segment of the link.  A failure on one article is
    reported on stdout and does not stop the crawl.
    """
    # Function-scope import: the module itself only imports os.path.
    from os import makedirs

    print('Scrapping article links...')
    links = set()  # a set drops links that show up on several pages
    for url in URLS:
        links.update(find_articles(url))

    print('Scrapping links done:')
    # Report the number of distinct links, i.e. what is actually fetched.
    print("Fetching text from %d articles..." % len(links))

    # Without the destination directory every open() below would fail.
    makedirs(DEST_DIR, exist_ok=True)
    for link in links:
        try:
            text = jt(link)
            name = '{}/{}'.format(DEST_DIR, path.basename(link))
            with open(name, 'wb') as out:
                out.write(text)
        except Exception as e:
            print("error parsing {}:{}".format(link, e))


 # Start the crawl only when executed as a script, not when imported.
 if __name__ == "__main__":
    run()
diff --git a/search.py b/search.py
 from flask import Flask, request, jsonify
 from os import listdir, path
 from urllib.request import urlopen

 import justext
 from gensim import corpora, models, similarities

 import stopwords

 # Flask application object; find() is registered on it as the POST / route.
 app = Flask(__name__)


 @app.route('/', methods=['POST'])
 def find():
    """Handle ``POST /``: return stored articles similar to the request body.

    The body is decoded as UTF-8.  If it is an http(s) URL, the page is
    downloaded and its extracted text becomes the query; otherwise the body
    itself is the query.  The result of similar_articles() is sent as JSON.
    """
    text = request.get_data().decode('utf-8')
    # Surrounding whitespace (e.g. a trailing newline) must not defeat the
    # URL check or end up inside the fetched address.
    candidate = text.strip()
    if candidate.startswith(('https://', 'http://')):
        # NOTE(review): this fetches a caller-supplied URL server-side;
        # restrict it before exposing the service beyond localhost.
        text = jt(candidate)
        if isinstance(text, bytes):
            # jt() returns UTF-8 bytes; keep the query a str like the
            # plain-text path so both are lower-cased and split alike.
            text = text.decode('utf-8')

    res = similar_articles(text)
    return jsonify(res)


 def jt(url):
    """Fetch ``url`` and return its non-boilerplate text as UTF-8 bytes.

    The page is run through jusText with the English stoplist; paragraphs
    flagged as boilerplate are dropped and the remaining ones are joined
    with newlines.  Errors from fetching or parsing propagate to the caller.
    """
    # Close the HTTP response deterministically instead of leaking it.
    with urlopen(url) as response:
        html = response.read()
    paragraphs = justext.justext(html, justext.get_stoplist("English"))
    text = "\n".join(p.text for p in paragraphs if not p.is_boilerplate)
    return text.encode("utf-8")


 def similar_articles(text, limit=5, min_score=0.7):
    """Return the indexed articles most similar to ``text``.

    ``text`` is lower-cased, split on whitespace, converted to LSI space
    and compared against the similarity index.  The module globals
    ``dictionary``, ``lsi``, ``index`` and ``articles`` must have been
    filled by build() beforehand.

    :param text: query text; UTF-8 bytes are accepted and decoded first.
    :param limit: how many of the best-scoring articles to consider.
    :param min_score: only matches scoring strictly above this are kept.
    :returns: list of ``(score_as_str, article_text)`` tuples, best first.
    """
    if isinstance(text, bytes):
        # jt() hands back encoded bytes; decode so lower()/split() work on
        # characters rather than raw bytes.
        text = text.decode('utf-8')
    vec_bow = dictionary.doc2bow(text.lower().split())
    vec_lsi = lsi[vec_bow]  # convert the query to LSI space
    scores = index[vec_lsi]
    ranked = sorted(enumerate(scores), key=lambda item: -item[1])
    return [(str(score), articles[pos])
            for pos, score in ranked[:limit] if score > min_score]


 # Module-level search state.  All four names start as None and are
 # assigned by build(); similar_articles() reads them.
 index = None  # similarity index over the LSI-transformed corpus
 dictionary = None  # gensim Dictionary built from the tokenized articles
 lsi = None  # LSI model built from the bag-of-words corpus
 articles = None  # article texts as returned by load_files()


 def build():
    """Load the stored articles and build the LSI search structures.

    Fills the module globals ``articles``, ``dictionary``, ``lsi`` and
    ``index``, which similar_articles() reads.
    """
    global articles, dictionary, lsi, index

    articles = load_files('./articles/')
    token_lists = [tokenize_text(doc) for doc in articles]

    dictionary = corpora.Dictionary(token_lists)
    bow_corpus = [dictionary.doc2bow(tokens) for tokens in token_lists]
    lsi = models.LsiModel(bow_corpus, id2word=dictionary)

    # Project the bag-of-words corpus into LSI space and index the result.
    index = similarities.MatrixSimilarity(lsi[bow_corpus])


 def load_files(folder):
    """Read every regular file in ``folder`` as UTF-8 text.

    :param folder: directory to read.
    :returns: one string per file, ``'ARTICLE_<name> <content>'``, ordered
        by file name so the result does not depend on directory order.
    """
    files = []
    for name in sorted(listdir(folder)):
        full_path = path.join(folder, name)
        if not path.isfile(full_path):
            continue  # open() would raise on a sub-directory
        # The crawler stores UTF-8 bytes, so decode explicitly rather than
        # relying on the platform default encoding.
        with open(full_path, 'r', encoding='utf-8') as f:
            files.append('ARTICLE_' + name + ' ' + f.read())
    return files


 def tokenize_text(text):
    """Split ``text`` into the lower-cased tokens worth indexing.

    A whitespace-separated token is kept when it is 4 to 39 characters
    long and is not listed in ``stopwords.EN``.
    """
    kept = []
    for token in text.lower().split():
        if not 3 < len(token) < 40:
            continue  # too short or too long to be useful
        if token in stopwords.EN:
            continue
        kept.append(token)
    return kept


 # Script entry point: build the search structures, then serve requests.
 if __name__ == "__main__":
    print('building corpus')
    # build() must finish first: similar_articles() reads the globals it sets.
    build()
    print("corpus built")
    # NOTE(review): debug=True is only reasonable while the server stays
    # bound to localhost; turn it off if the host is ever changed.
    app.run(host='127.0.0.1', port=5000, debug=True)
diff --git a/stopwords.py b/stopwords.py
 # English stop words plus punctuation and other non-word tokens.
 # Written as a set literal; entries the original list repeated ("would",
 # "a", "?", "s", "u") appear once, which leaves the resulting set unchanged.
 EN = {
    "a", "a's", "able", "about", "above", "according", "accordingly",
    "across", "actually", "after", "afterwards", "again", "against",
    "ain't", "all", "allow", "allows", "almost", "alone", "along",
    "already", "also", "although", "always", "am", "among", "amongst",
    "an", "and", "another", "any", "anybody", "anyhow", "anyone",
    "anything", "anyway", "anyways", "anywhere", "apart", "appear",
    "appreciate", "appropriate", "are", "aren't", "around", "as", "aside",
    "ask", "asking", "associated", "at", "available", "away", "awfully",
    "b", "be", "became", "because", "become", "becomes", "becoming",
    "been", "before", "beforehand", "behind", "being", "believe", "below",
    "beside", "besides", "best", "better", "between", "beyond", "both",
    "brief", "but", "by",
    "c", "c'mon", "c's", "came", "can", "can't", "cannot", "cant",
    "cause", "causes", "certain", "certainly", "changes", "clearly", "co",
    "com", "come", "comes", "concerning", "consequently", "consider",
    "considering", "contain", "containing", "contains", "corresponding",
    "could", "couldn't", "course", "currently",
    "d", "definitely", "described", "despite", "did", "didn't",
    "different", "do", "does", "doesn't", "doing", "don't", "done", "down",
    "downwards", "during",
    "e", "each", "edu", "eg", "eight", "either", "else", "elsewhere",
    "enough", "entirely", "especially", "et", "etc", "even", "ever",
    "every", "everybody", "everyone", "everything", "everywhere", "ex",
    "exactly", "example", "except",
    "f", "far", "few", "fifth", "first", "five", "followed", "following",
    "follows", "for", "former", "formerly", "forth", "four", "from",
    "further", "furthermore",
    "g", "get", "gets", "getting", "given", "gives", "go", "goes", "going",
    "gone", "got", "gotten", "greetings",
    "h", "had", "hadn't", "happens", "hardly", "has", "hasn't", "have",
    "haven't", "having", "he", "he's", "hello", "help", "hence", "her",
    "here", "here's", "hereafter", "hereby", "herein", "hereupon", "hers",
    "herself", "hi", "him", "himself", "his", "hither", "hopefully", "how",
    "howbeit", "however",
    "i", "i'd", "i'll", "i'm", "i've", "ie", "if", "ignored", "immediate",
    "in", "inasmuch", "inc", "indeed", "indicate", "indicated",
    "indicates", "inner", "insofar", "instead", "into", "inward", "is",
    "isn't", "it", "it'd", "it'll", "it's", "its", "itself",
    "j", "just", "k", "keep", "keeps", "kept", "know", "knows", "known",
    "l", "last", "lately", "later", "latter", "latterly", "least", "less",
    "lest", "let", "let's", "like", "liked", "likely", "little", "look",
    "looking", "looks", "ltd",
    "m", "mainly", "many", "may", "maybe", "me", "mean", "meanwhile",
    "merely", "might", "more", "moreover", "most", "mostly", "much",
    "must", "my", "myself",
    "n", "name", "namely", "nd", "near", "nearly", "necessary", "need",
    "needs", "neither", "never", "nevertheless", "new", "next", "nine",
    "no", "nobody", "non", "none", "noone", "nor", "normally", "not",
    "nothing", "novel", "now", "nowhere",
    "o", "obviously", "of", "off", "often", "oh", "ok", "okay", "old",
    "on", "once", "one", "ones", "only", "onto", "or", "other", "others",
    "otherwise", "ought", "our", "ours", "ourselves", "out", "outside",
    "over", "overall", "own",
    "p", "particular", "particularly", "per", "perhaps", "placed",
    "please", "plus", "possible", "presumably", "probably", "provides",
    "q", "que", "quite", "qv",
    "r", "rather", "rd", "re", "really", "reasonably", "regarding",
    "regardless", "regards", "relatively", "respectively", "right",
    "s", "said", "same", "saw", "say", "saying", "says", "second",
    "secondly", "see", "seeing", "seem", "seemed", "seeming", "seems",
    "seen", "self", "selves", "sensible", "sent", "serious", "seriously",
    "seven", "several", "shall", "she", "should", "shouldn't", "since",
    "six", "so", "some", "somebody", "somehow", "someone", "something",
    "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry",
    "specified", "specify", "specifying", "still", "sub", "such", "sup",
    "sure",
    "t", "t's", "take", "taken", "tell", "tends", "th", "than", "thank",
    "thanks", "thanx", "that", "that's", "thats", "the", "their", "theirs",
    "them", "themselves", "then", "thence", "there", "there's",
    "thereafter", "thereby", "therefore", "therein", "theres",
    "thereupon", "these", "they", "they'd", "they'll", "they're",
    "they've", "think", "third", "this", "thorough", "thoroughly", "those",
    "though", "three", "through", "throughout", "thru", "thus", "to",
    "together", "too", "took", "toward", "towards", "tried", "tries",
    "truly", "try", "trying", "twice", "two",
    "u", "un", "under", "unfortunately", "unless", "unlikely", "until",
    "unto", "up", "upon", "us", "use", "used", "useful", "uses", "using",
    "usually", "uucp",
    "v", "value", "various", "very", "via", "viz", "vs",
    "w", "want", "wants", "was", "wasn't", "way", "we", "we'd", "we'll",
    "we're", "we've", "welcome", "well", "went", "were", "weren't", "what",
    "what's", "whatever", "when", "whence", "whenever", "where", "where's",
    "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever",
    "whether", "which", "while", "whither", "who", "who's", "whoever",
    "whole", "whom", "whose", "why", "will", "willing", "wish", "with",
    "within", "without", "won't", "wonder", "would", "wouldn't",
    "x", "y", "yes", "yet", "you", "you'd", "you'll", "you're", "you've",
    "your", "yours", "yourself", "yourselves", "z", "zero",
    # Punctuation and other non-word tokens.
    ",", ".", "?", "!", "|", ":", "'", "\"", ";", "<NUM>", "$", "km", "&",
    "#", "'s", "/", "dr.",
 }
	from os import path
	from urllib.request import urlopen

	from bs4 import BeautifulSoup
	import justext

	# Paginated listing of the Guardian "world" section; %d is the page number.
	WORLD_URL = 'https://www.theguardian.com/world?page=%d'

	# Listing pages 1 through 10 are the ones scanned for article links.
	URLS = [WORLD_URL % i for i in range(1, 11)]
	# Directory into which run() writes one file per fetched article.
	DEST_DIR = './articles'


	def find_articles(url):
	    """Return the article links found on one listing page.

	    Downloads ``url``, parses it with BeautifulSoup and collects the
	    ``href`` of every ``<a>`` tag whose ``data-link-name`` attribute is
	    ``"article"``.  Anchors without an ``href`` are skipped so callers
	    only receive usable strings.

	    This is best-effort: any failure while fetching or parsing is
	    reported on stdout and an empty list is returned instead of raising.
	    """
	    try:
	        # The context manager closes the HTTP response even if read() fails.
	        with urlopen(url) as response:
	            html = response.read()
	        soup = BeautifulSoup(html, "lxml")
	        anchors = soup.find_all('a', attrs={'data-link-name': 'article'})
	        hrefs = (anchor.get('href') for anchor in anchors)
	        return [href for href in hrefs if href]
	    except Exception as err:
	        # Include the cause so a failed page can be diagnosed from the log.
	        print("cannot read {}: {}".format(url, err))
	        return []


	def jt(url):
	    """Fetch ``url`` and return its non-boilerplate text as UTF-8 bytes.

	    The page is run through jusText with the English stoplist; paragraphs
	    flagged as boilerplate are dropped and the remaining ones are joined
	    with newlines.  Errors from fetching or parsing propagate to the caller.
	    """
	    # Close the HTTP response deterministically instead of leaking it.
	    with urlopen(url) as response:
	        html = response.read()
	    paragraphs = justext.justext(html, justext.get_stoplist("English"))
	    text = "\n".join(p.text for p in paragraphs if not p.is_boilerplate)
	    return text.encode("utf-8")


	def run():
	    """Crawl the listing pages and store each article's text in DEST_DIR.

	    Collects article links from every URL in URLS, then downloads each
	    distinct link with jt() and writes the resulting bytes to a file named
	    after the last path segment of the link.  A failure on one article is
	    reported on stdout and does not stop the crawl.
	    """
	    # Function-scope import: the module itself only imports os.path.
	    from os import makedirs

	    print('Scrapping article links...')
	    links = set()  # a set drops links that show up on several pages
	    for url in URLS:
	        links.update(find_articles(url))

	    print('Scrapping links done:')
	    # Report the number of distinct links, i.e. what is actually fetched.
	    print("Fetching text from %d articles..." % len(links))

	    # Without the destination directory every open() below would fail.
	    makedirs(DEST_DIR, exist_ok=True)
	    for link in links:
	        try:
	            text = jt(link)
	            name = '{}/{}'.format(DEST_DIR, path.basename(link))
	            with open(name, 'wb') as out:
	                out.write(text)
	        except Exception as e:
	            print("error parsing {}:{}".format(link, e))


	# Start the crawl only when executed as a script, not when imported.
	if __name__ == "__main__":
	    run()
	from flask import Flask, request, jsonify
	from os import listdir, path
	from urllib.request import urlopen

	import justext
	from gensim import corpora, models, similarities

	import stopwords

	# Flask application object; find() is registered on it as the POST / route.
	app = Flask(__name__)


	@app.route('/', methods=['POST'])
	def find():
	    """Handle ``POST /``: return stored articles similar to the request body.

	    The body is decoded as UTF-8.  If it is an http(s) URL, the page is
	    downloaded and its extracted text becomes the query; otherwise the body
	    itself is the query.  The result of similar_articles() is sent as JSON.
	    """
	    text = request.get_data().decode('utf-8')
	    # Surrounding whitespace (e.g. a trailing newline) must not defeat the
	    # URL check or end up inside the fetched address.
	    candidate = text.strip()
	    if candidate.startswith(('https://', 'http://')):
	        # NOTE(review): this fetches a caller-supplied URL server-side;
	        # restrict it before exposing the service beyond localhost.
	        text = jt(candidate)
	        if isinstance(text, bytes):
	            # jt() returns UTF-8 bytes; keep the query a str like the
	            # plain-text path so both are lower-cased and split alike.
	            text = text.decode('utf-8')

	    res = similar_articles(text)
	    return jsonify(res)


	def jt(url):
	    """Fetch ``url`` and return its non-boilerplate text as UTF-8 bytes.

	    The page is run through jusText with the English stoplist; paragraphs
	    flagged as boilerplate are dropped and the remaining ones are joined
	    with newlines.  Errors from fetching or parsing propagate to the caller.
	    """
	    # Close the HTTP response deterministically instead of leaking it.
	    with urlopen(url) as response:
	        html = response.read()
	    paragraphs = justext.justext(html, justext.get_stoplist("English"))
	    text = "\n".join(p.text for p in paragraphs if not p.is_boilerplate)
	    return text.encode("utf-8")


	def similar_articles(text, limit=5, min_score=0.7):
	    """Return the indexed articles most similar to ``text``.

	    ``text`` is lower-cased, split on whitespace, converted to LSI space
	    and compared against the similarity index.  The module globals
	    ``dictionary``, ``lsi``, ``index`` and ``articles`` must have been
	    filled by build() beforehand.

	    :param text: query text; UTF-8 bytes are accepted and decoded first.
	    :param limit: how many of the best-scoring articles to consider.
	    :param min_score: only matches scoring strictly above this are kept.
	    :returns: list of ``(score_as_str, article_text)`` tuples, best first.
	    """
	    if isinstance(text, bytes):
	        # jt() hands back encoded bytes; decode so lower()/split() work on
	        # characters rather than raw bytes.
	        text = text.decode('utf-8')
	    vec_bow = dictionary.doc2bow(text.lower().split())
	    vec_lsi = lsi[vec_bow]  # convert the query to LSI space
	    scores = index[vec_lsi]
	    ranked = sorted(enumerate(scores), key=lambda item: -item[1])
	    return [(str(score), articles[pos])
	            for pos, score in ranked[:limit] if score > min_score]


	# Module-level search state.  All four names start as None and are
	# assigned by build(); similar_articles() reads them.
	index = None  # similarity index over the LSI-transformed corpus
	dictionary = None  # gensim Dictionary built from the tokenized articles
	lsi = None  # LSI model built from the bag-of-words corpus
	articles = None  # article texts as returned by load_files()


	def build():
	    """Load the stored articles and build the LSI search structures.

	    Fills the module globals ``articles``, ``dictionary``, ``lsi`` and
	    ``index``, which similar_articles() reads.
	    """
	    global articles, dictionary, lsi, index

	    articles = load_files('./articles/')
	    token_lists = [tokenize_text(doc) for doc in articles]

	    dictionary = corpora.Dictionary(token_lists)
	    bow_corpus = [dictionary.doc2bow(tokens) for tokens in token_lists]
	    lsi = models.LsiModel(bow_corpus, id2word=dictionary)

	    # Project the bag-of-words corpus into LSI space and index the result.
	    index = similarities.MatrixSimilarity(lsi[bow_corpus])


	def load_files(folder):
	    """Read every regular file in ``folder`` as UTF-8 text.

	    :param folder: directory to read.
	    :returns: one string per file, ``'ARTICLE_<name> <content>'``, ordered
	        by file name so the result does not depend on directory order.
	    """
	    files = []
	    for name in sorted(listdir(folder)):
	        full_path = path.join(folder, name)
	        if not path.isfile(full_path):
	            continue  # open() would raise on a sub-directory
	        # The crawler stores UTF-8 bytes, so decode explicitly rather than
	        # relying on the platform default encoding.
	        with open(full_path, 'r', encoding='utf-8') as f:
	            files.append('ARTICLE_' + name + ' ' + f.read())
	    return files


	def tokenize_text(text):
	    """Split ``text`` into the lower-cased tokens worth indexing.

	    A whitespace-separated token is kept when it is 4 to 39 characters
	    long and is not listed in ``stopwords.EN``.
	    """
	    kept = []
	    for token in text.lower().split():
	        if not 3 < len(token) < 40:
	            continue  # too short or too long to be useful
	        if token in stopwords.EN:
	            continue
	        kept.append(token)
	    return kept


	# Script entry point: build the search structures, then serve requests.
	if __name__ == "__main__":
	    print('building corpus')
	    # build() must finish first: similar_articles() reads the globals it sets.
	    build()
	    print("corpus built")
	    # NOTE(review): debug=True is only reasonable while the server stays
	    # bound to localhost; turn it off if the host is ever changed.
	    app.run(host='127.0.0.1', port=5000, debug=True)
	# English stop words plus punctuation and other non-word tokens.
	# Written as a set literal; entries the original list repeated ("would",
	# "a", "?", "s", "u") appear once, which leaves the resulting set unchanged.
	# The pipe token is a plain "|": the pasted "\|" was an invalid escape that
	# produced a two-character string.
	EN = {
	    "a", "a's", "able", "about", "above", "according", "accordingly",
	    "across", "actually", "after", "afterwards", "again", "against",
	    "ain't", "all", "allow", "allows", "almost", "alone", "along",
	    "already", "also", "although", "always", "am", "among", "amongst",
	    "an", "and", "another", "any", "anybody", "anyhow", "anyone",
	    "anything", "anyway", "anyways", "anywhere", "apart", "appear",
	    "appreciate", "appropriate", "are", "aren't", "around", "as", "aside",
	    "ask", "asking", "associated", "at", "available", "away", "awfully",
	    "b", "be", "became", "because", "become", "becomes", "becoming",
	    "been", "before", "beforehand", "behind", "being", "believe", "below",
	    "beside", "besides", "best", "better", "between", "beyond", "both",
	    "brief", "but", "by",
	    "c", "c'mon", "c's", "came", "can", "can't", "cannot", "cant",
	    "cause", "causes", "certain", "certainly", "changes", "clearly", "co",
	    "com", "come", "comes", "concerning", "consequently", "consider",
	    "considering", "contain", "containing", "contains", "corresponding",
	    "could", "couldn't", "course", "currently",
	    "d", "definitely", "described", "despite", "did", "didn't",
	    "different", "do", "does", "doesn't", "doing", "don't", "done", "down",
	    "downwards", "during",
	    "e", "each", "edu", "eg", "eight", "either", "else", "elsewhere",
	    "enough", "entirely", "especially", "et", "etc", "even", "ever",
	    "every", "everybody", "everyone", "everything", "everywhere", "ex",
	    "exactly", "example", "except",
	    "f", "far", "few", "fifth", "first", "five", "followed", "following",
	    "follows", "for", "former", "formerly", "forth", "four", "from",
	    "further", "furthermore",
	    "g", "get", "gets", "getting", "given", "gives", "go", "goes", "going",
	    "gone", "got", "gotten", "greetings",
	    "h", "had", "hadn't", "happens", "hardly", "has", "hasn't", "have",
	    "haven't", "having", "he", "he's", "hello", "help", "hence", "her",
	    "here", "here's", "hereafter", "hereby", "herein", "hereupon", "hers",
	    "herself", "hi", "him", "himself", "his", "hither", "hopefully", "how",
	    "howbeit", "however",
	    "i", "i'd", "i'll", "i'm", "i've", "ie", "if", "ignored", "immediate",
	    "in", "inasmuch", "inc", "indeed", "indicate", "indicated",
	    "indicates", "inner", "insofar", "instead", "into", "inward", "is",
	    "isn't", "it", "it'd", "it'll", "it's", "its", "itself",
	    "j", "just", "k", "keep", "keeps", "kept", "know", "knows", "known",
	    "l", "last", "lately", "later", "latter", "latterly", "least", "less",
	    "lest", "let", "let's", "like", "liked", "likely", "little", "look",
	    "looking", "looks", "ltd",
	    "m", "mainly", "many", "may", "maybe", "me", "mean", "meanwhile",
	    "merely", "might", "more", "moreover", "most", "mostly", "much",
	    "must", "my", "myself",
	    "n", "name", "namely", "nd", "near", "nearly", "necessary", "need",
	    "needs", "neither", "never", "nevertheless", "new", "next", "nine",
	    "no", "nobody", "non", "none", "noone", "nor", "normally", "not",
	    "nothing", "novel", "now", "nowhere",
	    "o", "obviously", "of", "off", "often", "oh", "ok", "okay", "old",
	    "on", "once", "one", "ones", "only", "onto", "or", "other", "others",
	    "otherwise", "ought", "our", "ours", "ourselves", "out", "outside",
	    "over", "overall", "own",
	    "p", "particular", "particularly", "per", "perhaps", "placed",
	    "please", "plus", "possible", "presumably", "probably", "provides",
	    "q", "que", "quite", "qv",
	    "r", "rather", "rd", "re", "really", "reasonably", "regarding",
	    "regardless", "regards", "relatively", "respectively", "right",
	    "s", "said", "same", "saw", "say", "saying", "says", "second",
	    "secondly", "see", "seeing", "seem", "seemed", "seeming", "seems",
	    "seen", "self", "selves", "sensible", "sent", "serious", "seriously",
	    "seven", "several", "shall", "she", "should", "shouldn't", "since",
	    "six", "so", "some", "somebody", "somehow", "someone", "something",
	    "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry",
	    "specified", "specify", "specifying", "still", "sub", "such", "sup",
	    "sure",
	    "t", "t's", "take", "taken", "tell", "tends", "th", "than", "thank",
	    "thanks", "thanx", "that", "that's", "thats", "the", "their", "theirs",
	    "them", "themselves", "then", "thence", "there", "there's",
	    "thereafter", "thereby", "therefore", "therein", "theres",
	    "thereupon", "these", "they", "they'd", "they'll", "they're",
	    "they've", "think", "third", "this", "thorough", "thoroughly", "those",
	    "though", "three", "through", "throughout", "thru", "thus", "to",
	    "together", "too", "took", "toward", "towards", "tried", "tries",
	    "truly", "try", "trying", "twice", "two",
	    "u", "un", "under", "unfortunately", "unless", "unlikely", "until",
	    "unto", "up", "upon", "us", "use", "used", "useful", "uses", "using",
	    "usually", "uucp",
	    "v", "value", "various", "very", "via", "viz", "vs",
	    "w", "want", "wants", "was", "wasn't", "way", "we", "we'd", "we'll",
	    "we're", "we've", "welcome", "well", "went", "were", "weren't", "what",
	    "what's", "whatever", "when", "whence", "whenever", "where", "where's",
	    "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever",
	    "whether", "which", "while", "whither", "who", "who's", "whoever",
	    "whole", "whom", "whose", "why", "will", "willing", "wish", "with",
	    "within", "without", "won't", "wonder", "would", "wouldn't",
	    "x", "y", "yes", "yet", "you", "you'd", "you'll", "you're", "you've",
	    "your", "yours", "yourself", "yourselves", "z", "zero",
	    # Punctuation and other non-word tokens.
	    ",", ".", "?", "!", "|", ":", "'", "\"", ";", "<NUM>", "$", "km", "&",
	    "#", "'s", "/", "dr.",
	}