@sloev
Created March 11, 2018 17:48
spacy==2.0.0
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz
networkx>=1.11
"""Python implementation of the TextRank algoritm.
From this paper:
https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf
Based on:
https://gist.github.com/voidfiles/1646117
https://github.com/davidadamojr/TextRank
"""
import itertools

import networkx as nx
import spacy

__LANGUAGES = {}


def load_languages(languages=None):
    """Load and cache a spaCy model for each requested language code."""
    languages = languages or ['en']
    for language in languages:
        __LANGUAGES[language] = spacy.load(language)
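

# Usage sketch (an illustrative addition, not part of the original gist;
# it assumes the en_core_web_sm model from the pinned URL above is
# installed and linked under the shortcut 'en'):
def _demo_load_languages():
    load_languages(['en'])
    doc = __LANGUAGES['en']('TextRank ranks words in a graph.')
    print([(token.text, token.tag_) for token in doc])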


def filter_for_tags(tagged, tags=('v', 'n', 'j')):
    """Apply syntactic filters based on POS tags.

    Keeps tokens whose Penn Treebank tag starts with N (noun),
    V (verb) or J (adjective).
    """
    return [item for item in tagged if item[1] and item[1].lower()[0] in tags]


def normalize(tagged):
    """Return a list of tuples with the first item's periods removed."""
    return [(item[0].replace('.', ''), item[1]) for item in tagged]
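

# Illustrative checks (an addition; the tags below are standard Penn
# Treebank tags of the kind spaCy exposes via token.tag_):
def _demo_filter_and_normalize():
    tagged = [('U.S.', 'NNP'), ('economy', 'NN'), ('over', 'IN')]
    assert filter_for_tags(tagged) == [('U.S.', 'NNP'), ('economy', 'NN')]
    assert normalize([('U.S.', 'NNP')]) == [('US', 'NNP')]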


def unique_everseen(iterable, key=None):
    """Yield unique elements, in order of first appearance.

    Examples:
        unique_everseen('AAAABBBCCDAABBB') --> A B C D
        unique_everseen('ABBCcAD', str.lower) --> A B C D
    """
    seen = set()
    seen_add = seen.add
    if key is None:
        for element in iterable:
            if element not in seen:
                seen_add(element)
                yield element
    else:
        for element in iterable:
            k = key(element)
            if k not in seen:
                seen_add(k)
                yield element
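

# Sanity check mirroring the docstring examples (an illustrative addition):
def _demo_unique_everseen():
    assert list(unique_everseen('AAAABBBCCDAABBB')) == list('ABCD')
    assert list(unique_everseen('ABBCcAD', str.lower)) == list('ABCD')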


def levenshtein_distance(first, second):
    """Return the Levenshtein distance between two strings.

    Based on:
    http://rosettacode.org/wiki/Levenshtein_distance#Python
    """
    if len(first) > len(second):
        first, second = second, first
    distances = range(len(first) + 1)
    for index2, char2 in enumerate(second):
        new_distances = [index2 + 1]
        for index1, char1 in enumerate(first):
            if char1 == char2:
                new_distances.append(distances[index1])
            else:
                new_distances.append(1 + min((distances[index1],
                                              distances[index1 + 1],
                                              new_distances[-1])))
        distances = new_distances
    return distances[-1]
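

# Worked example (an illustrative addition): "kitten" -> "sitting" needs
# two substitutions and one insertion, so the distance is 3.
def _demo_levenshtein_distance():
    assert levenshtein_distance('kitten', 'sitting') == 3
    assert levenshtein_distance('abc', 'abc') == 0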


def build_graph(nodes):
    """Return a networkx graph instance.

    :param nodes: List of hashables that represent the nodes of a graph.
    """
    gr = nx.Graph()  # initialize an undirected graph
    gr.add_nodes_from(nodes)
    node_pairs = list(itertools.combinations(nodes, 2))

    # add edges to the graph (weighted by Levenshtein distance)
    for first_string, second_string in node_pairs:
        lev_distance = levenshtein_distance(first_string, second_string)
        gr.add_edge(first_string, second_string, weight=lev_distance)
    return gr
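

# Illustrative check (an addition): three nodes yield a complete graph of
# three edges; 'cat' -> 'cart' is one insertion, so that edge weighs 1.
def _demo_build_graph():
    gr = build_graph(['cat', 'cart', 'dog'])
    assert gr.number_of_edges() == 3
    assert gr['cat']['cart']['weight'] == 1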


def extract_key_phrases(text, language='en'):
    """Return a set of key phrases.

    :param text: A string.
    :param language: Key of a model previously loaded via load_languages().
    """
    # tokenize and POS-tag the text using spaCy
    spacy_nlp = __LANGUAGES[language]
    doc = spacy_nlp(text)
    tagged = [(token.text, token.tag_) for token in doc]

    # keep the full token sequence; it is used later to find adjacent
    # words when constructing multi-word keyphrases
    textlist = [x[0] for x in tagged]

    tagged = filter_for_tags(tagged)
    tagged = normalize(tagged)

    unique_word_set = unique_everseen([x[0] for x in tagged])
    word_set_list = list(unique_word_set)

    graph = build_graph(word_set_list)

    # rank the words with PageRank, using the Levenshtein distances as edge
    # weights (networkx defaults for damping factor and error tolerance)
    calculated_page_rank = nx.pagerank(graph, weight='weight')

    # most important words in descending order of importance
    keyphrases = sorted(calculated_page_rank, key=calculated_page_rank.get,
                        reverse=True)

    # take keyphrases with multiple words into consideration as done in the
    # paper - if two or three words are adjacent in the text and are all
    # selected as keywords, join them together
    modified_key_phrases = set()
    # keeps track of individual keywords that have been joined to form a
    # keyphrase
    dealt_with = set()

    # scan adjacent (first, second, third) windows backwards from the end
    i = len(textlist) - 3
    j = len(textlist) - 2
    k = len(textlist) - 1
    while i >= 0:
        first = textlist[i]
        second = textlist[j]
        third = textlist[k]
        if first in keyphrases and second in keyphrases \
                and third in keyphrases:
            modified_key_phrases.add(first + ' ' + second + ' ' + third)
            dealt_with.update((first, second, third))
        elif first in keyphrases and second in keyphrases:
            modified_key_phrases.add(first + ' ' + second)
            dealt_with.update((first, second))
        elif second in keyphrases and third in keyphrases:
            modified_key_phrases.add(second + ' ' + third)
            dealt_with.update((second, third))
        else:
            if first in keyphrases and first not in dealt_with:
                modified_key_phrases.add(first)
            # the last two words never occupy the `first` slot when
            # scanning backwards, so handle them on the first pass
            if k == len(textlist) - 1:
                if second in keyphrases and second not in dealt_with:
                    modified_key_phrases.add(second)
                if third in keyphrases and third not in dealt_with:
                    modified_key_phrases.add(third)
        i -= 1
        j -= 1
        k -= 1
    return modified_key_phrases
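

# End-to-end usage sketch (an illustrative addition; assumes the 'en'
# model is installed, and uses a sentence from the TextRank paper's
# running example):
def _demo_extract_key_phrases():
    load_languages(['en'])
    phrases = extract_key_phrases(
        'Compatibility of systems of linear constraints over '
        'the set of natural numbers.')
    print(sorted(phrases))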


def orderby_ngram_rank(phrases):
    """Order phrases by word count, longest n-grams first; phrases of
    equal length keep their original relative order."""
    _, sorted_phrases = zip(*sorted(enumerate(phrases),
                                    key=lambda x: (len(x[1].split()), -x[0]),
                                    reverse=True))
    return sorted_phrases
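

# Illustrative check (an addition): longer n-grams come first, and ties
# preserve the input order thanks to the -x[0] tiebreaker.
def _demo_orderby_ngram_rank():
    phrases = ['natural numbers', 'linear', 'strict inequations', 'types']
    assert orderby_ngram_rank(phrases) == (
        'natural numbers', 'strict inequations', 'linear', 'types')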