Created
May 18, 2015 22:04
-
-
Save cigrainger/89c2bb16335ffeea0824 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from nltk.corpus import stopwords | |
from nltk import wordpunct_tokenize | |
from nltk.tokenize import RegexpTokenizer | |
from nltk.stem import WordNetLemmatizer | |
import nltk.data | |
import re | |
# Module-level resources, built once so the per-record functions below can
# reuse them instead of re-loading/re-compiling on every call.
tagger = nltk.data.load("trained_brill.pickle")  # pre-trained Brill POS tagger
tokenizer = RegexpTokenizer(r'\w+')              # tokens = runs of word characters
quoted = re.compile(r'"[^"]*"')                  # a double-quoted CSV field
numbers = re.compile(r'[0-9]+')                  # digit runs (patent-id digits)
def _calculate_languages_ratios(text):
    """Score *text* against every stopword corpus language.

    The score for a language is the number of distinct lowercase tokens of
    *text* that appear in that language's stopword list.
    """
    unique_words = {token.lower() for token in wordpunct_tokenize(text)}
    return {
        language: len(unique_words & set(stopwords.words(language)))
        for language in stopwords.fileids()
    }
def detect_language(text):
    """Return the language whose stopword list best matches *text*."""
    scores = _calculate_languages_ratios(text)
    return max(scores, key=scores.get)
def get_id(line):
    """Extract the patent id from the first CSV field of *line*.

    The first comma-separated field is expected to look like '"US1234567"';
    the digits inside the first quoted span are returned as an ASCII-encoded
    string. Returns the sentinel 'NA' when the field is empty, has no quoted
    span, or the quoted span contains no digits.

    Rewritten with guard clauses instead of the original three-deep
    if/else pyramid; patterns are matched via ``re`` directly (the module
    caches compiled patterns) so the function is self-contained.
    """
    head = line.split(',', 1)[0]
    if not head:
        return 'NA'
    quoted_fields = re.findall(r'"[^"]*"', head)
    if not quoted_fields:
        return 'NA'
    digit_runs = re.findall(r'[0-9]+', quoted_fields[0])
    if not digit_runs:
        return 'NA'
    # NOTE: kept from the original (written for Python 2, where this strips
    # non-ASCII chars); under Python 3 it returns bytes.
    return digit_runs[0].encode('ascii', 'ignore')
def remove_stopwords(line):
    """Tokenize *line*, lowercase the tokens, and drop English stopwords.

    Fixes two defects in the original: it tokenized an undefined name
    ``text`` instead of the ``line`` parameter (NameError on every call),
    and it tested membership in ``stop`` *before* lowercasing, so
    capitalized stopwords ("The", "And") were never removed.

    Relies on the module-level ``tokenizer`` and ``stop`` list.
    """
    tokens = tokenizer.tokenize(line)
    kept = [word for word in (t.lower() for t in tokens) if word not in stop]
    return ' '.join(kept)
def get_wordnet_pos(tag):
    """Map a Penn Treebank POS tag to the matching WordNet POS constant.

    Returns '' for tags with no WordNet counterpart; the caller treats that
    as "no usable POS". Requires ``from nltk.corpus import wordnet``, which
    the original file never imported (NameError at first call) — added to
    the top-of-file imports.
    """
    # The first letter of a Treebank tag determines the coarse word class.
    mapping = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return mapping.get(tag[:1], '')
def lemmatize(line):
    """Lemmatize each word of a space-delimited *line* using POS tags.

    Fixes several defects in the original:
    - it called ``bt.tag`` on an undefined name ``bt`` (the module-level
      tagger is named ``tagger``);
    - it tagged one word at a time, whereas NLTK taggers take the whole
      token sequence (and need sentence context to tag well);
    - it called ``WordNetLemmatizer.lemmatize`` on the class instead of an
      instance, so ``word`` was bound to ``self``;
    - ``get_wordnet_pos`` can return '', which is not a valid WordNet POS —
      we fall back to the lemmatizer's default (noun) in that case.
    """
    tagged = tagger.tag(line.split(' '))
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word, tag in tagged:
        pos = get_wordnet_pos(tag)
        lemmas.append(lemmatizer.lemmatize(word, pos) if pos
                      else lemmatizer.lemmatize(word))
    return ' '.join(lemmas)
# ---- Spark driver: load abstracts, keep English patents with known IDs ----
# `sc` is the SparkContext injected by the pyspark / spark-submit runtime.
data = sc.textFile("hdfs:///abstracts", 640)  # 640 partitions

# Keep only records whose patent id can be parsed from the first CSV field.
known_ids = data.filter(lambda x: get_id(x) != 'NA')
patent_ids = known_ids.map(get_id)

# Stopword-overlap language detection: keep English-language abstracts.
english_lines = known_ids.filter(lambda x: detect_language(x) == 'english')

# Stopword removal, then lemmatization. `stop` must exist before the lazy
# maps are *executed*, which Spark guarantees since actions come later.
stop = stopwords.words('english')
no_stop_lines = english_lines.map(remove_stopwords)
lemmatized_lines = no_stop_lines.map(lemmatize)

# Small sample for eyeballing the intermediate result.
test = english_lines.takeSample(False, 3)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment