Skip to content

Instantly share code, notes, and snippets.

@Klepvink
Created June 28, 2020 15:17
Show Gist options
  • Save Klepvink/1838351241b1baee6f38732634fad562 to your computer and use it in GitHub Desktop.
Save Klepvink/1838351241b1baee6f38732634fad562 to your computer and use it in GitHub Desktop.
from tika import parser
import sys
import os
import re
from docx import Document
# Module-level Word document shared by the whole script: regexFilter() adds
# headings and bullet paragraphs to it, and the command-line section at the
# bottom saves it when an output path is supplied.
document = Document()
def filecheck(filepath: str) -> bool:
    """Return True if *filepath* names an existing regular file.

    Directories and non-existent paths yield False.
    """
    return os.path.isfile(filepath)
def _record_matches(heading, matches, none_message):
    """Print *matches* and add them to ``document`` as bullets under *heading*.

    If *matches* is empty, only *none_message* is printed and the document is
    left untouched.
    """
    if not matches:
        print(none_message)
        return
    document.add_heading(heading, level=1)
    for item in matches:
        document.add_paragraph(item, style='List Bullet')
        print(item)


def regexFilter(filepath):
    """Extract e-mail addresses and phone numbers from a file.

    The file is parsed with ``parser.from_file`` and its ``'content'`` text is
    scanned with two regular expressions. Each hit is printed and added as a
    bullet paragraph to the module-level ``document`` under an
    'Email-addresses' or 'Phonenumbers' heading. A category without hits only
    prints a notice and adds no heading.

    Args:
        filepath: Path of the file to hand to the parser.
    """
    print("[Textractor] Checking for email-addresses...")
    raw = parser.from_file(filepath)
    # NOTE(review): 'content' appears to be None when the parser extracts no
    # text; re would raise TypeError on None, so fall back to an empty string.
    text = raw['content'] or ""

    emails = re.findall(r'[\w\.-]+@[\w\.-]+', text)
    _record_matches('Email-addresses', emails, "No email-addresses found.")

    print("[Textractor] Checking for phonenumbers...")
    # The pattern contains capture groups, so re.findall() would return tuples
    # of the groups (area code, separator, ...) rather than the matched text.
    # Take group(0) of each match to get the full phone number as a string.
    phone_pattern = r'\(?([0-9]{3})\)?([ .-]?)([0-9]{3})\2([0-9]{4})'
    phones = [m.group(0) for m in re.finditer(phone_pattern, text)]
    _record_matches('Phonenumbers', phones, "No phonenumbers found.")
# Command-line handling: argv[1] is the file to scan; an optional argv[2] is
# the path the Word document is saved to.
if len(sys.argv) < 2:
    print("[Textractor] Please append the filename you wish to extract the e-mail addresses from.")
elif filecheck(sys.argv[1]):
    regexFilter(sys.argv[1])
    # Only write the Word file when an output path was supplied.
    # NOTE(review): assumes saving was meant to happen only after a successful
    # scan; the original nesting is not recoverable from the source.
    if len(sys.argv) >= 3:
        document.save(sys.argv[2])
        print("[Textractor] Wordfile written to " + sys.argv[2])
else:
    # Report a missing input file instead of exiting without any output.
    print("[Textractor] File not found: " + sys.argv[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment