Attempt to classify the top sentences in the Wikipedia article on New York City using TF-IDF
# Authored by Konrad Kollnig
# Oxford, 20 April 2019

import wikipedia
import numpy as np
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Fetch the article text and split it into sentences
data = wikipedia.page("New York City").content
allSentences = sent_tokenize(data)

# Keep only sentences at least half as long as the average sentence
lengths = [len(s) for s in allSentences]
m = np.array(lengths).mean()
longSentences = [s for s in allSentences if len(s) > m / 2]
sentences = longSentences

# Get the number of tokens per sentence
cv = CountVectorizer(stop_words='english', token_pattern=r"(?u)\b[A-Za-z\-]{4,}\b")  # can also use \w
cv_fit = cv.fit(sentences)
counts = cv_fit.transform(longSentences).toarray().sum(axis=1)

# Get the weighting of words, compared to other sentences
tf = TfidfVectorizer(stop_words='english', token_pattern=r"(?u)\b[A-Za-z\-]{4,}\b")  # can also use \w
tf_fit = tf.fit(sentences)
weights = np.asarray(tf_fit.transform(longSentences).sum(axis=1).squeeze()).flatten()

# Score each sentence by its average TF-IDF weight per token
results = weights / counts

# Take the positions of the 20 highest-scoring sentences,
# then re-sort the positions to restore document order
topPositions = np.sort(np.flip(np.argsort(results))[0:20])
topSentences = [longSentences[p] for p in topPositions]
topLengths = [len(s) for s in topSentences]

print(topSentences)
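The score in `results` is a sentence's summed TF-IDF weights divided by its token count, i.e. the mean TF-IDF weight per (four-or-more-letter, non-stopword) token, so longer sentences are not favoured merely for containing more words. As a quick way to inspect the ranking, a minimal follow-on sketch like the one below can pair each selected sentence with its score; it assumes the script above has already run (so `results`, `topPositions`, and `longSentences` are in scope) and that the NLTK sentence-tokenizer data is installed.

# Minimal sketch: print each top sentence with its per-token TF-IDF score.
# Assumes `results`, `topPositions`, and `longSentences` from the script above.
for p in topPositions:
    print(f"{results[p]:.3f}  {longSentences[p][:80]}...")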