@sloev
Created March 11, 2018 17:48
spacy==2.0.0
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz
networkx>=1.11
"""Python implementation of the TextRank algoritm.
From this paper:
https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf
Based on:
https://gist.github.com/voidfiles/1646117
https://github.com/davidadamojr/TextRank
"""
import itertools

import networkx as nx
import spacy

__LANGUAGES = {}


def load_languages(languages=None):
    """Load and cache a spaCy model for each requested language code."""
    languages = languages or ['en']
    for language in languages:
        __LANGUAGES[language] = spacy.load(language)
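

# Usage sketch (an illustrative addition, not part of the original gist;
# it assumes the en_core_web_sm model from the pinned URL above is
# installed and linked under the shortcut 'en'):
def _demo_load_languages():
    load_languages(['en'])
    doc = __LANGUAGES['en']('TextRank ranks words in a graph.')
    print([(token.text, token.tag_) for token in doc])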


def filter_for_tags(tagged, tags=('v', 'n', 'j')):
    """Apply syntactic filters based on POS tags.

    Keeps tokens whose Penn Treebank tag starts with N (noun),
    V (verb) or J (adjective).
    """
    return [item for item in tagged if item[1] and item[1].lower()[0] in tags]


def normalize(tagged):
    """Return a list of tuples with the first item's periods removed."""
    return [(item[0].replace('.', ''), item[1]) for item in tagged]
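

# Illustrative checks (an addition; the tags below are standard Penn
# Treebank tags of the kind spaCy exposes via token.tag_):
def _demo_filter_and_normalize():
    tagged = [('U.S.', 'NNP'), ('economy', 'NN'), ('over', 'IN')]
    assert filter_for_tags(tagged) == [('U.S.', 'NNP'), ('economy', 'NN')]
    assert normalize([('U.S.', 'NNP')]) == [('US', 'NNP')]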


def unique_everseen(iterable, key=None):
    """Yield unique elements, in order of first appearance.

    Examples:
        unique_everseen('AAAABBBCCDAABBB') --> A B C D
        unique_everseen('ABBCcAD', str.lower) --> A B C D
    """
    seen = set()
    seen_add = seen.add
    if key is None:
        for element in iterable:
            if element not in seen:
                seen_add(element)
                yield element
    else:
        for element in iterable:
            k = key(element)
            if k not in seen:
                seen_add(k)
                yield element
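

# Sanity check mirroring the docstring examples (an illustrative addition):
def _demo_unique_everseen():
    assert list(unique_everseen('AAAABBBCCDAABBB')) == list('ABCD')
    assert list(unique_everseen('ABBCcAD', str.lower)) == list('ABCD')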


def levenshtein_distance(first, second):
    """Return the Levenshtein distance between two strings.

    Based on:
    http://rosettacode.org/wiki/Levenshtein_distance#Python
    """
    if len(first) > len(second):
        first, second = second, first
    distances = range(len(first) + 1)
    for index2, char2 in enumerate(second):
        new_distances = [index2 + 1]
        for index1, char1 in enumerate(first):
            if char1 == char2:
                new_distances.append(distances[index1])
            else:
                new_distances.append(1 + min((distances[index1],
                                              distances[index1 + 1],
                                              new_distances[-1])))
        distances = new_distances
    return distances[-1]
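

# Worked example (an illustrative addition): "kitten" -> "sitting" needs
# two substitutions and one insertion, so the distance is 3.
def _demo_levenshtein_distance():
    assert levenshtein_distance('kitten', 'sitting') == 3
    assert levenshtein_distance('abc', 'abc') == 0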


def build_graph(nodes):
    """Return a networkx graph instance.

    :param nodes: List of hashables that represent the nodes of a graph.
    """
    gr = nx.Graph()  # initialize an undirected graph
    gr.add_nodes_from(nodes)
    node_pairs = list(itertools.combinations(nodes, 2))

    # add edges to the graph (weighted by Levenshtein distance)
    for first_string, second_string in node_pairs:
        lev_distance = levenshtein_distance(first_string, second_string)
        gr.add_edge(first_string, second_string, weight=lev_distance)
    return gr
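

# Illustrative check (an addition): three nodes yield a complete graph of
# three edges; 'cat' -> 'cart' is one insertion, so that edge weighs 1.
def _demo_build_graph():
    gr = build_graph(['cat', 'cart', 'dog'])
    assert gr.number_of_edges() == 3
    assert gr['cat']['cart']['weight'] == 1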


def extract_key_phrases(text, language='en'):
    """Return a set of key phrases.

    :param text: A string.
    :param language: Key of a model previously loaded via load_languages().
    """
    # tokenize and POS-tag the text using spaCy
    spacy_nlp = __LANGUAGES[language]
    doc = spacy_nlp(text)
    tagged = [(token.text, token.tag_) for token in doc]

    # keep the full token sequence; it is used later to find adjacent
    # words when constructing multi-word keyphrases
    textlist = [x[0] for x in tagged]

    tagged = filter_for_tags(tagged)
    tagged = normalize(tagged)

    unique_word_set = unique_everseen([x[0] for x in tagged])
    word_set_list = list(unique_word_set)

    graph = build_graph(word_set_list)

    # rank the words with PageRank, using the Levenshtein distances as edge
    # weights (networkx defaults for damping factor and error tolerance)
    calculated_page_rank = nx.pagerank(graph, weight='weight')

    # most important words in descending order of importance
    keyphrases = sorted(calculated_page_rank, key=calculated_page_rank.get,
                        reverse=True)

    # take keyphrases with multiple words into consideration as done in the
    # paper - if two or three words are adjacent in the text and are all
    # selected as keywords, join them together
    modified_key_phrases = set()
    # keeps track of individual keywords that have been joined to form a
    # keyphrase
    dealt_with = set()

    # scan adjacent (first, second, third) windows backwards from the end
    i = len(textlist) - 3
    j = len(textlist) - 2
    k = len(textlist) - 1
    while i >= 0:
        first = textlist[i]
        second = textlist[j]
        third = textlist[k]
        if first in keyphrases and second in keyphrases \
                and third in keyphrases:
            modified_key_phrases.add(first + ' ' + second + ' ' + third)
            dealt_with.update((first, second, third))
        elif first in keyphrases and second in keyphrases:
            modified_key_phrases.add(first + ' ' + second)
            dealt_with.update((first, second))
        elif second in keyphrases and third in keyphrases:
            modified_key_phrases.add(second + ' ' + third)
            dealt_with.update((second, third))
        else:
            if first in keyphrases and first not in dealt_with:
                modified_key_phrases.add(first)
            # the last two words never occupy the `first` slot when
            # scanning backwards, so handle them on the first pass
            if k == len(textlist) - 1:
                if second in keyphrases and second not in dealt_with:
                    modified_key_phrases.add(second)
                if third in keyphrases and third not in dealt_with:
                    modified_key_phrases.add(third)
        i -= 1
        j -= 1
        k -= 1
    return modified_key_phrases
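

# End-to-end usage sketch (an illustrative addition; assumes the 'en'
# model is installed, and uses a sentence from the TextRank paper's
# running example):
def _demo_extract_key_phrases():
    load_languages(['en'])
    phrases = extract_key_phrases(
        'Compatibility of systems of linear constraints over '
        'the set of natural numbers.')
    print(sorted(phrases))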


def orderby_ngram_rank(phrases):
    """Order phrases by word count, longest n-grams first; phrases of
    equal length keep their original relative order."""
    _, sorted_phrases = zip(*sorted(enumerate(phrases),
                                    key=lambda x: (len(x[1].split()), -x[0]),
                                    reverse=True))
    return sorted_phrases
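

# Illustrative check (an addition): longer n-grams come first, and ties
# preserve the input order thanks to the -x[0] tiebreaker.
def _demo_orderby_ngram_rank():
    phrases = ['natural numbers', 'linear', 'strict inequations', 'types']
    assert orderby_ngram_rank(phrases) == (
        'natural numbers', 'strict inequations', 'linear', 'types')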