""" | |
NATURAL LANGUAGE PROCESSING | |
using the nltk library, i am going to read a pdf file and then remove all the | |
stop words from the pdf file and then save the rest of the document in a file. | |
""" | |
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords
from concurrent import futures
import PyPDF2

def yieldTextFromPdf(pdfname):
    """
    this generator reads the pdf one page at a time and yields the text of
    each page, so the whole document never has to sit in memory at once.
    :param pdfname: the name of the pdf file.
    :yield: the text of a single page.
    """
    # open the pdf file as you would open any other binary file; the with
    # statement makes sure the file handle is closed afterwards.
    with open(pdfname, "rb") as file_descriptor:
        # use the PyPDF2 module imported above to parse the binary data.
        pdf_file = PyPDF2.PdfFileReader(file_descriptor)
        # loop through the pages of the pdf, extract the text of each page and yield it.
        for i in range(pdf_file.numPages):
            yield pdf_file.getPage(i).extractText()
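
# A minimal usage sketch added for illustration (not part of the original
# gist; "sample.pdf" is a hypothetical file name): since yieldTextFromPdf is
# a generator, pages are parsed lazily, one per loop iteration.
def demoPageGenerator():
    for page_text in yieldTextFromPdf("sample.pdf"):
        print(len(page_text), "characters extracted from this page")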

def length(pdfname):
    """Return the number of pages in the pdf file."""
    with open(pdfname, "rb") as file_descriptor:
        return PyPDF2.PdfFileReader(file_descriptor).numPages

def detectLanguage(text):
    """
    this function uses the nltk stopword corpora to detect the language a
    text is written in: the language whose stopword list overlaps most with
    the words of the text wins.
    :param text: the text to analyse.
    :return: the name of the detected language.
    """
    # using the wordpunct_tokenize function, generate the list of words in the
    # text, then lowercase them all so the stopword comparison is case-insensitive.
    tokens = wordpunct_tokenize(text)
    words = {x.lower() for x in tokens}
    # for every language nltk ships stopwords for, count how many of its
    # stopwords occur in the text; the language with the highest count is
    # taken to be the language of the text.
    languages = {x: len(words.intersection(set(stopwords.words(x)))) for x in stopwords.fileids()}
    return max(languages, key=languages.get)
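
# A small sketch of the heuristic (added here as an assumption about typical
# output, not taken from the original gist): "the", "is", "on" and "a" are
# all English stopwords, so the English overlap count should dominate.
def demoDetectLanguage():
    sample = "the cat is sleeping on a warm mat"
    return detectLanguage(sample)  # expected to return "english"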

def removeStopWords(text):
    """
    this function takes the text extracted from one pdf page and removes the
    stop words of the detected language from it.
    :param text: the text obtained from the pdf file.
    :return: the list of words remaining after removing stop words.
    """
    # a set drops duplicate tokens; word order is not preserved.
    tokens = set(wordpunct_tokenize(text))
    # keep the stopwords in a set for O(1) lookups and compare lowercased,
    # otherwise capitalised stop words would slip through the filter.
    stopwordsSet = set(stopwords.words(detectLanguage(text)))
    return [x for x in tokens if x.lower() not in stopwordsSet and len(x) > 1]
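
# Another illustrative sketch (not in the original gist): the stop words and
# the one-letter token are filtered out, leaving only the content words, in
# arbitrary order because the tokens pass through a set.
def demoRemoveStopWords():
    return removeStopWords("the cat is sleeping on a warm mat")
    # expected to contain "cat", "sleeping", "warm" and "mat"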

def main(pdfname):
    """
    this is the main function that does all the work: it drives the page
    generator, delegates each page to a worker thread so the pages are
    filtered concurrently, and writes the surviving words to a file.
    :param pdfname: the name of the pdf file
    :return: none. saves to the file "filtered".
    """
    with futures.ThreadPoolExecutor(max_workers=length(pdfname)) as executor:
        # a plain for loop already absorbs the generator's StopIteration,
        # so no explicit try/except is needed around it.
        future_list = [executor.submit(removeStopWords, page) for page in yieldTextFromPdf(pdfname)]
        with open("filtered", "w") as file:
            for f in futures.as_completed(future_list):
                for word in f.result():
                    file.write(word + "\n")
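
# Design note (an added observation, not from the original author): the pool
# is sized to one worker per page, which is fine for short documents; for a
# very long pdf a fixed cap such as
#     futures.ThreadPoolExecutor(max_workers=min(8, length(pdfname)))
# would bound the thread count while keeping the same structure.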

# sample documents; only pdf_location is passed to main below.
pdfLoc = "danny.pdf"
pdf_location = "french.pdf"

if __name__ == '__main__':
    main(pdf_location)
# this code has been tested and debugged.
# it should be free of any errors.
# (c) Ayikpah Danny Mcwaves.