TextRank in Python, kudos to https://github.com/davidadamojr/TextRank/blob/master/requirements.txt
requirements.txt:

    spacy==2.0.0
    https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz
    networkx>=1.11
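A quick sanity check (a sketch, not part of the gist) that the pinned dependencies are importable; loading the model under the shortcut name 'en' assumes it was linked after install (e.g. with spaCy 2.0's link command):

    import spacy
    import networkx

    print(spacy.__version__)     # expect 2.0.x, matching the pin above
    print(networkx.__version__)  # expect >= 1.11
    spacy.load('en')             # raises OSError if the 'en' shortcut is missing

The main module below implements the TextRank pipeline: POS-filter the tokens, build a fully connected word graph weighted by Levenshtein distance, rank the words with PageRank, then stitch adjacent keywords back into multi-word phrases.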
| """Python implementation of the TextRank algoritm. | |
| From this paper: | |
| https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf | |
| Based on: | |
| https://gist.github.com/voidfiles/1646117 | |
| https://github.com/davidadamojr/TextRank | |
| """ | |
| import io | |
| import itertools | |
| import networkx as nx | |
| import spacy | |
| import os | |
__LANGUAGES = {}


def load_languages(languages=None):
    """Load and cache a spaCy model for each requested language."""
    languages = languages or ['en']
    for language in languages:
        __LANGUAGES[language] = spacy.load(language)
def filter_for_tags(tagged, tags=('v', 'n', 'j')):
    """Apply syntactic filters based on POS tags.

    Keeps verbs, nouns and adjectives (Penn Treebank tags starting
    with 'V', 'N' or 'J').
    """
    return [item for item in tagged if item[1] and item[1].lower()[0] in tags]


def normalize(tagged):
    """Return a list of tuples with the first item's periods removed."""
    return [(item[0].replace('.', ''), item[1]) for item in tagged]
def unique_everseen(iterable, key=None):
    """Yield unique elements in order of appearance.

    Examples:
        unique_everseen('AAAABBBCCDAABBB') --> A B C D
        unique_everseen('ABBCcAD', str.lower) --> A B C D
    """
    seen = set()
    seen_add = seen.add
    if key is None:
        for element in iterable:
            if element not in seen:
                seen_add(element)
                yield element
    else:
        for element in iterable:
            k = key(element)
            if k not in seen:
                seen_add(k)
                yield element
def levenshtein_distance(first, second):
    """Return the Levenshtein distance between two strings.

    Based on:
    http://rosettacode.org/wiki/Levenshtein_distance#Python
    """
    if len(first) > len(second):
        first, second = second, first
    distances = range(len(first) + 1)
    for index2, char2 in enumerate(second):
        new_distances = [index2 + 1]
        for index1, char1 in enumerate(first):
            if char1 == char2:
                new_distances.append(distances[index1])
            else:
                new_distances.append(1 + min((distances[index1],
                                              distances[index1 + 1],
                                              new_distances[-1])))
        distances = new_distances
    return distances[-1]
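
# Hand-checked examples for levenshtein_distance (a sketch, not part of
# the original gist):
#
#     levenshtein_distance('kitten', 'sitting')  # -> 3
#     levenshtein_distance('', 'abc')            # -> 3
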
def build_graph(nodes):
    """Return a networkx graph instance.

    :param nodes: List of hashables that represent the nodes of a graph.
    """
    gr = nx.Graph()  # initialize an undirected graph
    gr.add_nodes_from(nodes)
    nodePairs = list(itertools.combinations(nodes, 2))

    # add edges to the graph (weighted by Levenshtein distance)
    for pair in nodePairs:
        firstString = pair[0]
        secondString = pair[1]
        levDistance = levenshtein_distance(firstString, secondString)
        gr.add_edge(firstString, secondString, weight=levDistance)

    return gr
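
# For example (hypothetical words, not from the gist), build_graph yields a
# complete weighted graph over the vocabulary:
#
#     g = build_graph(['cat', 'cart', 'dog'])
#     g['cat']['cart']['weight']  # -> 1 (one insertion turns 'cat' into 'cart')
#     g.number_of_edges()         # -> 3
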
def extract_key_phrases(text, language='en'):
    """Return a set of key phrases.

    :param text: A string.
    """
    # tokenize and POS-tag the text using spaCy
    spacy_nlp = __LANGUAGES[language]
    doc = spacy_nlp(text)
    tagged = [(token.text, token.tag_) for token in doc]

    # keep the full token sequence so adjacent words can later be joined
    # into keyphrases with multiple words
    textlist = [x[0] for x in tagged]

    # keep only verbs, nouns and adjectives, and strip periods
    tagged = filter_for_tags(tagged)
    tagged = normalize(tagged)

    unique_word_set = unique_everseen([x[0] for x in tagged])
    word_set_list = list(unique_word_set)

    graph = build_graph(word_set_list)

    # run PageRank with the networkx defaults, using the Levenshtein
    # distances as edge weights
    calculated_page_rank = nx.pagerank(graph, weight='weight')
    # most important words in descending order of importance
    keyphrases = sorted(calculated_page_rank, key=calculated_page_rank.get,
                        reverse=True)
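    # note: the paper suggests keeping only about the top third of the
    # ranked vocabulary as keywords; this gist keeps every ranked word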
    # take keyphrases with multiple words into consideration as done in the
    # paper - if two words are adjacent in the text and are selected as
    # keywords, join them together
    modified_key_phrases = set()
    # keeps track of individual keywords that have been joined to form a
    # keyphrase
    dealt_with = set()

    # walk a three-word window backwards over the original token sequence
    i = len(textlist) - 3
    j = len(textlist) - 2
    k = len(textlist) - 1
    while k > 1:
        first = textlist[i]
        second = textlist[j]
        third = textlist[k]
        if first in keyphrases and second in keyphrases and third in keyphrases:
            keyphrase = first + ' ' + second + ' ' + third
            modified_key_phrases.add(keyphrase)
            dealt_with.add(first)
            dealt_with.add(second)
            dealt_with.add(third)
        elif first in keyphrases and second in keyphrases:
            keyphrase = first + ' ' + second
            modified_key_phrases.add(keyphrase)
            dealt_with.add(first)
            dealt_with.add(second)
        elif second in keyphrases and third in keyphrases:
            keyphrase = second + ' ' + third
            modified_key_phrases.add(keyphrase)
            dealt_with.add(second)
            dealt_with.add(third)
        else:
            if first in keyphrases and first not in dealt_with:
                modified_key_phrases.add(first)
            # the last two words of the text never occupy the `first` slot,
            # so check them on the first (right-most) iteration - at this
            # point they have no chance of joining a keyphrase
            if k == len(textlist) - 1:
                if second in keyphrases and second not in dealt_with:
                    modified_key_phrases.add(second)
                if third in keyphrases and third not in dealt_with:
                    modified_key_phrases.add(third)
        i -= 1
        j -= 1
        k -= 1

    return modified_key_phrases
def orderby_ngram_rank(phrases):
    """Order phrases longest-first by word count, breaking ties in favour
    of phrases appearing earlier in the input."""
    _, sorted_phrases = zip(*sorted(enumerate(phrases),
                                    key=lambda x: (len(x[1].split()), -x[0]),
                                    reverse=True))
    return sorted_phrases
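
A minimal usage sketch (assuming the module above is saved as textrank.py; the filename is not part of the captured gist, and the example text is adapted from the example abstract in the Mihalcea & Tarau paper):

    import textrank

    textrank.load_languages(['en'])

    text = ("Compatibility of systems of linear constraints over the set of "
            "natural numbers. Criteria of compatibility of a system of linear "
            "Diophantine equations are considered.")
    phrases = textrank.extract_key_phrases(text)

    # extract_key_phrases returns a set, so order the phrases before printing
    for phrase in textrank.orderby_ngram_rank(list(phrases)):
        print(phrase)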