@cigrainger
Created May 18, 2015 22:04
from nltk.corpus import stopwords, wordnet
from nltk import wordpunct_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
import nltk.data
import re

# pre-trained Brill POS tagger, a word tokenizer, a lemmatizer instance, and
# regexes for quoted CSV fields and digit runs
tagger = nltk.data.load("trained_brill.pickle")
tokenizer = RegexpTokenizer(r'\w+')
lemmatizer = WordNetLemmatizer()
quoted = re.compile('"[^"]*"')
numbers = re.compile('[0-9]+')
def _calculate_languages_ratios(text):
    # count, for each language NLTK ships stopwords for, how many of that
    # language's stopwords occur in the text; detect_language picks the winner
    languages_ratios = {}
    tokens = wordpunct_tokenize(text)
    words_set = set(word.lower() for word in tokens)
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        languages_ratios[language] = len(words_set.intersection(stopwords_set))
    return languages_ratios
def detect_language(text):
    ratios = _calculate_languages_ratios(text)
    return max(ratios, key=ratios.get)
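# Example (hypothetical input): detect_language('the quick brown fox and the lazy dog')
# should return 'english', since more English stopwords ('the', 'and') match the
# tokens than stopwords of any other language.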
def get_id(line):
    # the patent ID is the first comma-separated field: a quoted string whose
    # digits we extract; any malformed row falls through to 'NA'
    first_field = line.split(',', 1)[0]
    if first_field:
        pat_id = quoted.findall(first_field)
        if pat_id:
            digits = numbers.findall(pat_id[0])
            if digits:
                return digits[0].encode('ascii', 'ignore')
    return 'NA'
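# Example (hypothetical line format, assuming CSV rows like '"US1234567","abstract..."'):
# get_id('"US1234567","abstract..."') would return '1234567'.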
def remove_stopwords(line):
    # drop English stopwords, comparing on the lowercased token so that
    # capitalized stopwords ('The', 'Of') are removed as well
    tokens = tokenizer.tokenize(line)
    words = [word.lower() for word in tokens if word.lower() not in stop]
    return ' '.join(words)
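# Example (hypothetical input, assuming `stop` holds the English stopword set
# defined below): remove_stopwords('The system of the invention') -> 'system invention'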
def get_wordnet_pos(tag):
    # map Penn Treebank tags to WordNet POS constants; fall back to NOUN so
    # WordNetLemmatizer.lemmatize always receives a valid pos argument
    if not tag:
        return wordnet.NOUN
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
def lemmatize(line):
    # POS-tag the whole token sequence at once (the Brill tagger expects a
    # list of tokens), then lemmatize each word under its WordNet POS
    tagged = tagger.tag(line.split(' '))
    lemmatized = [lemmatizer.lemmatize(word, get_wordnet_pos(tag))
                  for word, tag in tagged]
    return ' '.join(lemmatized)
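# Example (hypothetical): lemmatize('patents were filed') -> 'patent be file',
# assuming the trained Brill tagger tags 'patents' as a noun and 'were'/'filed'
# as verbs.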
# load in data and keep English patents with known IDs
data = sc.textFile("hdfs:///abstracts", 640)
known_ids = data.filter(lambda x: get_id(x) != 'NA')
patent_ids = known_ids.map(get_id)
english_lines = known_ids.filter(lambda x: detect_language(x) == 'english')

# stopword removal and lemmatization
stop = set(stopwords.words('english'))  # set membership makes per-token lookups O(1)
no_stop_lines = english_lines.map(remove_stopwords)
lemmatized_lines = no_stop_lines.map(lemmatize)

# sample a few English lines to sanity-check the pipeline input
test = english_lines.takeSample(False, 3)
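# a minimal sketch, assuming you want to eyeball the sampled raw lines on the
# driver before running the full pipeline
for sample in test:
    print(sample)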