Attempt to classify top sentences in the Wikipedia article on New York
# Authored by Konrad Kollnig
# Oxford, 20 April 2019
import wikipedia
from nltk.tokenize import sent_tokenize
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# Download the article text and split it into sentences
data = wikipedia.page("New York City").content
allSentences = sent_tokenize(data)

# Keep only sentences that are at least half the mean sentence length
lengths = [len(s) for s in allSentences]
m = np.array(lengths).mean()
longSentences = [s for s in allSentences if len(s) > m / 2]
sentences = longSentences
# Count the number of tokens per sentence (words of at least four letters, excluding stop words)
cv = CountVectorizer(stop_words='english', token_pattern=r"(?u)\b[A-Za-z\-]{4,}\b")  # can also use \w
cv_fit = cv.fit(sentences)
counts = cv_fit.transform(longSentences).toarray().sum(axis=1)
# Sum the tf-idf weights of each sentence's words, relative to the other sentences
tf = TfidfVectorizer(stop_words='english', token_pattern=r"(?u)\b[A-Za-z\-]{4,}\b")  # can also use \w
tf_fit = tf.fit(sentences)
weights = np.asarray(tf_fit.transform(longSentences).sum(axis=1).squeeze()).flatten()
# Score each sentence by its average tf-idf weight per token
results = weights / counts
# Extract the 20 highest-scoring sentences, kept in their original order
topPositions = np.sort(np.flip(np.argsort(results))[0:20])
topSentences = [longSentences[p] for p in topPositions]
topLengths = [len(s) for s in topSentences]
print(topSentences)
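
# Running the script requires the wikipedia, nltk, scikit-learn, and numpy packages,
# and sent_tokenize needs NLTK's punkt tokenizer data. A minimal one-time setup sketch
# (package names inferred from the imports above):
#   pip install wikipedia nltk scikit-learn numpy
#   python -c "import nltk; nltk.download('punkt')"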