Skip to content

Instantly share code, notes, and snippets.

@grafuls
Last active June 3, 2019 16:27
Show Gist options
  • Save grafuls/954cadbf17b411fcd46d0f6f47e78c7a to your computer and use it in GitHub Desktop.
Save grafuls/954cadbf17b411fcd46d0f6f47e78c7a to your computer and use it in GitHub Desktop.
from pathlib import Path
import textract
import numpy as np
import re
def main(_file):
    """Extract number-like tokens from a document and save them to a CSV file.

    The text of ``_file`` is extracted with ``textract`` (``pdfminer``
    method), every match of the pattern below is collected, and the matches
    are written on a single line, joined by ``|``, to ``<stem>.csv`` in the
    current working directory. ``<stem>`` is the input file name without
    its final extension. The output file is created (or truncated) even
    when nothing matches; in that case it is left empty.

    Args:
        _file: Path of the document to process.
    """
    text = textract.process(_file, method="pdfminer")
    # Path.stem drops only the final suffix. Slicing the name with
    # ``[:-len(suffix)]`` breaks for suffix-less names: ``[:-0]`` is ``[:0]``,
    # which produced an empty base name and an output file called ".csv".
    output_name = "%s.csv" % Path(_file).stem
    with open(output_name, "w+") as output:
        # find orders and DNIs (original author's note); the pattern matches
        # dot-separated digit groups followed by a comma and two digits.
        coords = re.compile(r"\d?\.?\d+\.+\d+\,\d{2}")
        results = coords.findall(text.decode())
        if results:
            output.write("|".join(results))
if __name__ == "__main__":
    # Script entry point: process the hard-coded sample document.
    main("c://home/1598615.pdf")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment