Skip to content

Instantly share code, notes, and snippets.

@vinovator
Created January 11, 2016 13:30
Show Gist options
  • Save vinovator/f74c51adcd1507d6ea34 to your computer and use it in GitHub Desktop.
Save vinovator/f74c51adcd1507d6ea34 to your computer and use it in GitHub Desktop.
Extract keywords from a text by eliminating stop words, punctuation, etc.
# keywordExtractor.py
# Python 2.7.6
import requests
import re
from collections import Counter
"""
Step 1: Obtain Text
Step 2: Strip punctuation, special characters, etc.
Step 3: Strip "simple" words (aka stop words)
Step 4: Split on Spaces
- Loop Over Split Text
- Add word to Array/HashTable/Etc if it doesn't exist;
if it does, increment counter for that word
"""
# Source of stop words - generic words to ignore
stopwords_url = "http://www.textfixer.com/resources/common-english-words.txt"
# The source file from which keywords are to be extracted
source_file = "test_resume2.txt"

# Fetch the comma-separated stop-word list.
# A timeout keeps the script from hanging on an unresponsive server, and
# raise_for_status() stops an HTTP error page from being used as stop words.
resp = requests.get(stopwords_url, timeout=10)
resp.raise_for_status()
# resp.text is decoded text on both Python 2 and 3 (resp.content is bytes on
# Python 3 and cannot be split on a str separator). Entries are stripped and
# lower-cased so stray whitespace/newlines do not defeat the comparison below.
stopwords_lst = [
    entry.strip().lower() for entry in resp.text.split(",") if entry.strip()
]
# Set membership is O(1); the list above is kept for backward compatibility.
stopwords_set = frozenset(stopwords_lst)

# Step 1 - Get Text
# The handle gets its own name so that `source_file` keeps holding the
# file name instead of being rebound to a (closed) file object.
with open("Resumes/" + source_file, "r") as resume_file:
    extracted_text = resume_file.read()

# Step 2 - Strip punctuation, special characters etc
# NOTE: \W+ also removes '-' and '@', so hyphenated terms and e-mail
# addresses are split into separate words. A pattern that keeps them:
#   re.sub(r"[^A-Za-z0-9\-_@]+", " ", extracted_text)
extracted_text = re.sub(r"\W+", " ", extracted_text)

# Step 3 - Strip "simple" words (aka stop words)
# Words are compared case-insensitively but keep their original casing.
extracted_text = " ".join(
    word for word in extracted_text.split()
    if word.lower() not in stopwords_set
)

# Step 4 - Split on spaces and count the occurrences of each word
key_count = Counter(extracted_text.split())

# Extract keywords in descending order of frequency
keywords = [word for word, count in key_count.most_common()]
# Parenthesised form prints the same on Python 2 and also works on Python 3
print(keywords)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment