Created
June 28, 2020 15:17
-
-
Save Klepvink/1838351241b1baee6f38732634fad562 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from tika import parser | |
import sys | |
import os | |
import re | |
from docx import Document | |
# Shared Word document: regexFilter() appends headings and bullet entries to
# it, and the command-line section at the bottom of the file saves it.
document = Document()
def filecheck(filepath):
    """Report whether *filepath* names an existing regular file.

    Returns True when ``os.path.isfile`` accepts the path and False
    otherwise (for example for directories or paths that do not exist).
    """
    is_regular_file = os.path.isfile(filepath)
    return is_regular_file
def regexFilter(filepath):
    """Collect email addresses and phone numbers found in a file.

    The file is parsed with ``parser.from_file``; its text content is
    searched with two regular expressions. Every hit is printed and added
    as a bullet paragraph under its own level-1 heading in the module-level
    ``document``. When a category has no hits, only a notice is printed.

    Args:
        filepath: Path of the file handed to ``parser.from_file``.
    """
    print("[Textractor] Checking for email-addresses...")
    raw = parser.from_file(filepath)
    # The parsed content may be absent or None (presumably when no text
    # could be extracted); searching None with ``re`` raises TypeError, so
    # fall back to an empty string and report "nothing found" instead.
    text = raw.get('content') or ''

    emails = re.findall(r'[\w\.-]+@[\w\.-]+', text)
    if not emails:
        print("No email-addresses found.")
    else:
        document.add_heading('Email-addresses', level=1)
        for email in emails:
            document.add_paragraph(email, style='List Bullet')
            print(email)

    print("[Textractor] Checking for phonenumbers...")
    # The phone pattern contains capture groups, so ``re.findall`` would
    # return tuples of the groups rather than the matched number. Iterate
    # the match objects and take group(0) to get the full matched text.
    phone_pattern = r'\(?([0-9]{3})\)?([ .-]?)([0-9]{3})\2([0-9]{4})'
    phone_numbers = [m.group(0) for m in re.finditer(phone_pattern, text)]
    if not phone_numbers:
        print("No phonenumbers found.")
    else:
        document.add_heading('Phonenumbers', level=1)
        for number in phone_numbers:
            document.add_paragraph(number, style='List Bullet')
            print(number)
# Command-line entry point.
#   argv[1]: file to scan (required).
#   argv[2]: optional output path; when given, the collected results are
#            written there via document.save().
if len(sys.argv) < 2:
    print("[Textractor] Please append the filename you wish to extract the e-mail addresses from.")
elif filecheck(sys.argv[1]):
    regexFilter(sys.argv[1])
    if len(sys.argv) >= 3:
        document.save(sys.argv[2])
        print("[Textractor] Wordfile written to " + sys.argv[2])
else:
    # Tell the user why nothing happened instead of exiting silently when
    # the given path is not an existing file.
    print("[Textractor] File not found: " + sys.argv[1])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment