Created
February 3, 2018 12:31
-
-
Save baffioso/79e3c5d32dd11cf677168ab39b72386a to your computer and use it in GitHub Desktop.
Opdater lokalplan tabel i postgres med tekst fra plandokument pdf
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import io
from urllib.parse import quote

import psycopg2
import PyPDF2
import requests
from sqlalchemy import create_engine, Table, MetaData, update, select
def get_document(url, timeout=60):
    """Download the PDF at *url* and return the text of all its pages.

    Args:
        url: Address of the PDF document to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled host cannot hang the caller indefinitely.

    Returns:
        The text extracted from every page, concatenated in page order.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.

    Errors raised by PyPDF2 while parsing a malformed document are not
    caught here and propagate to the caller.
    """
    r = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx instead of handing an HTML error page to the PDF parser.
    r.raise_for_status()

    pdf_file = io.BytesIO(r.content)

    # PyPDF2 renamed its API in 2.x and the old names raise in 3.x; prefer the
    # new names when present and fall back to the legacy ones otherwise.
    reader_cls = getattr(PyPDF2, "PdfReader", None) or PyPDF2.PdfFileReader
    reader = reader_cls(pdf_file)

    page_texts = []
    for page in reader.pages:
        extract = getattr(page, "extract_text", None) or page.extractText
        page_texts.append(extract())

    # Join once at the end rather than growing a string page by page.
    return "".join(page_texts)
# Database credentials.
# NOTE(review): these are hard-coded placeholders; consider reading the real
# values from the environment or a config file rather than committing them.
user = 'gc2'
pw = 'xxx'
port = 5432
host = 'myhost'
db = 'ballerup'

# Create the engine. The password is percent-encoded so that characters such
# as '@', ':' or '/' in it cannot break parsing of the connection URL.
engine = create_engine(
    'postgresql://{0}:{1}@{2}:{3}/{4}'.format(user, quote(pw, safe=''), host, port, db)
)
metadata = MetaData()

# Reflect the existing table so the UPDATE can be built from its columns.
plan = Table('lokalplan_dokument', metadata, autoload=True, autoload_with=engine)

# Fill the "document" column with the text extracted from each plan's PDF.
connection = engine.connect()
try:
    # Fetch every row up front so no database cursor is held open while the
    # (potentially slow) PDF downloads are running.
    result_set = connection.execute(
        "SELECT id, doklink FROM lokalplan_dokument where kommunenavn = 'Ballerup'"
    ).fetchall()

    for r in result_set:
        try:
            document = get_document(r['doklink'])
        except Exception as e:
            # One unreachable or unreadable PDF should not abort the whole
            # batch: report the row and carry on with the next one.
            print('Skipping plan id {0} ({1}): {2!r}'.format(r['id'], r['doklink'], e))
            continue

        # PostgreSQL text values cannot contain NUL characters, which text
        # extracted from PDFs occasionally does.
        document = document.replace('\x00', '')

        plan_update = plan.update().values(document=document).where(plan.c.id == r['id'])
        connection.execute(plan_update)
finally:
    # Always hand the connection back, even if an update fails.
    connection.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment