Attempt to classify the top sentences in the Wikipedia article on New York City using TF-IDF
# Authored by Konrad Kollnig
# Oxford, 20 April 2019

import wikipedia
import numpy as np
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Fetch the article text and split it into sentences
data = wikipedia.page("New York City").content
allSentences = sent_tokenize(data)

# Keep only sentences at least half as long as the average sentence
lengths = [len(s) for s in allSentences]
m = np.array(lengths).mean()
longSentences = [s for s in allSentences if len(s) > m / 2]
sentences = longSentences

# Get the number of tokens per sentence
cv = CountVectorizer(stop_words='english', token_pattern=r"(?u)\b[A-Za-z\-]{4,}\b")  # can also use \w
cv_fit = cv.fit(sentences)
counts = cv_fit.transform(longSentences).toarray().sum(axis=1)

# Get the weighting of words, compared to other sentences
tf = TfidfVectorizer(stop_words='english', token_pattern=r"(?u)\b[A-Za-z\-]{4,}\b")  # can also use \w
tf_fit = tf.fit(sentences)
weights = np.asarray(tf_fit.transform(longSentences).sum(axis=1).squeeze()).flatten()

# Score each sentence by its average TF-IDF weight per token
results = weights / counts

# Take the positions of the 20 highest-scoring sentences,
# then re-sort the positions to restore document order
topPositions = np.sort(np.flip(np.argsort(results))[0:20])
topSentences = [longSentences[p] for p in topPositions]
topLengths = [len(s) for s in topSentences]

print(topSentences)
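The score in `results` is a sentence's summed TF-IDF weights divided by its token count, i.e. the mean TF-IDF weight per (four-or-more-letter, non-stopword) token, so longer sentences are not favoured merely for containing more words. As a quick way to inspect the ranking, a minimal follow-on sketch like the one below can pair each selected sentence with its score; it assumes the script above has already run (so `results`, `topPositions`, and `longSentences` are in scope) and that the NLTK sentence-tokenizer data is installed.

# Minimal sketch: print each top sentence with its per-token TF-IDF score.
# Assumes `results`, `topPositions`, and `longSentences` from the script above.
for p in topPositions:
    print(f"{results[p]:.3f}  {longSentences[p][:80]}...")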