vinovator · January 11, 2016 13:30
diff --git a/keywordExtractor.py b/keywordExtractor.py
 # keywordExtractor.py
 # Python 2.7.6

 import requests
 import re
 from collections import Counter

 """
 Step 1: Obtain Text
 Step 2: Strip punctuation, special characters, etc.
 Step 3: Strip "simple" words (aka stop words)
 Step 4: Split on Spaces
      - Loop Over Split Text
      - Add word to Array/HashTable/Etc if it doesn't exist;
       if it does, increment counter for that word
 """


 # Keyword extraction script: downloads a comma-separated stop word list,
 # reads a resume text file, drops punctuation and stop words, and prints
 # the remaining words ordered from most to least frequent.

 # Source of stop words - generic words to ignore
 stopwords_url = "http://www.textfixer.com/resources/common-english-words.txt"

 # The source from which key words are to be extracted
 source_file = "test_resume2.txt"


 # A timeout keeps the script from hanging forever on an unresponsive host,
 # and raise_for_status() stops an HTTP error page from being silently
 # treated as the stop word list.
 resp = requests.get(stopwords_url, timeout=30)
 resp.raise_for_status()

 # resp.text is decoded text, so splitting on "," works on Python 2 and 3.
 # Surrounding whitespace (e.g. a trailing newline) is stripped from entries.
 stopwords_lst = [entry.strip() for entry in resp.text.split(",")]

 # Step 1 - Get Text
 # The handle gets its own name so that `source_file` keeps holding the
 # file name instead of being rebound to a (closed) file object.
 source_path = "Resumes/" + source_file
 with open(source_path, "r") as resume_file:
     extracted_text = resume_file.read()


 # Step 2 - Strip punctuation, special characters etc
 # \W+ collapses every run of non-word characters into a single space.
 # This also splits on "-" and "@"; a narrower alternative that keeps them
 # would be the pattern '[^A-Za-z0-9\-\_\@]+'.
 extracted_text = re.sub(r"\W+", " ", extracted_text)

 # Step 3: Strip "simple" words (aka stop words)
 # A set gives constant-time membership tests; the comparison is
 # case-insensitive on the word side, so stop words are lower-cased too.
 stopwords = {entry.lower() for entry in stopwords_lst}
 words = [word for word in extracted_text.split()
          if word.lower() not in stopwords]
 extracted_text = " ".join(words)


 # Step 4: Count occurrences of each remaining word.
 # Counting is case-sensitive: "Python" and "python" are separate keys.
 key_count = Counter(words)

 # Extract keywords by order of frequency (most frequent first)
 keywords = [word for word, _ in key_count.most_common()]

 # Parenthesised form prints the same list on Python 2 and Python 3.
 print(keywords)
	# keywordExtractor.py
	# Python 2.7.6

	import requests
	import re
	from collections import Counter

	"""
	Step 1: Obtain Text
	Step 2: Strip punctuation, special characters, etc.
	Step 3: Strip "simple" words (aka stop words)
	Step 4: Split on Spaces
	     - Loop Over Split Text
	     - Add word to Array/HashTable/Etc if it doesn't exist;
	       if it does, increment counter for that word
	"""


	# Keyword extraction script: downloads a comma-separated stop word list,
	# reads a resume text file, drops punctuation and stop words, and prints
	# the remaining words ordered from most to least frequent.

	# Source of stop words - generic words to ignore
	stopwords_url = "http://www.textfixer.com/resources/common-english-words.txt"

	# The source from which key words are to be extracted
	source_file = "test_resume2.txt"


	# A timeout keeps the script from hanging forever on an unresponsive host,
	# and raise_for_status() stops an HTTP error page from being silently
	# treated as the stop word list.
	resp = requests.get(stopwords_url, timeout=30)
	resp.raise_for_status()

	# resp.text is decoded text, so splitting on "," works on Python 2 and 3.
	# Surrounding whitespace (e.g. a trailing newline) is stripped from entries.
	stopwords_lst = [entry.strip() for entry in resp.text.split(",")]

	# Step 1 - Get Text
	# The handle gets its own name so that `source_file` keeps holding the
	# file name instead of being rebound to a (closed) file object.
	source_path = "Resumes/" + source_file
	with open(source_path, "r") as resume_file:
	    extracted_text = resume_file.read()


	# Step 2 - Strip punctuation, special characters etc
	# \W+ collapses every run of non-word characters into a single space.
	# This also splits on "-" and "@"; a narrower alternative that keeps them
	# would be the pattern '[^A-Za-z0-9\-\_\@]+'.
	extracted_text = re.sub(r"\W+", " ", extracted_text)

	# Step 3: Strip "simple" words (aka stop words)
	# A set gives constant-time membership tests; the comparison is
	# case-insensitive on the word side, so stop words are lower-cased too.
	stopwords = {entry.lower() for entry in stopwords_lst}
	words = [word for word in extracted_text.split()
	         if word.lower() not in stopwords]
	extracted_text = " ".join(words)


	# Step 4: Count occurrences of each remaining word.
	# Counting is case-sensitive: "Python" and "python" are separate keys.
	key_count = Counter(words)

	# Extract keywords by order of frequency (most frequent first)
	keywords = [word for word, _ in key_count.most_common()]

	# Parenthesised form prints the same list on Python 2 and Python 3.
	print(keywords)