Skip to content

Instantly share code, notes, and snippets.

@yuru-sha
Last active April 18, 2019 15:14
Show Gist options
  • Save yuru-sha/49732f331060ddeacda49f97098af8ed to your computer and use it in GitHub Desktop.
Save yuru-sha/49732f331060ddeacda49f97098af8ed to your computer and use it in GitHub Desktop.
酒造マップPDFのスクレイピング
# -*- coding: utf-8 -*-
import json
import os
from io import BytesIO
from bs4 import BeautifulSoup
from pdfminer.converter import HTMLConverter
from pdfminer.layout import LAParams
from pdfminer.pdfcolor import LITERAL_DEVICE_GRAY
from pdfminer.pdfcolor import LITERAL_DEVICE_RGB
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdftypes import LITERALS_DCT_DECODE
from pdfminer.utils import enc
# When True, the script entry point also writes the intermediate HTML
# generated for each PDF to '<pdf path>.html' so it can be inspected.
DEBUG = True
## SakeBreweriesHTMLConverter
##
class SakeBreweriesHTMLConverter(HTMLConverter):
    """HTMLConverter that writes an <img> tag with or without an image writer.

    When ``self.imagewriter`` is set, the image is exported through it and
    the name it returns becomes the ``src``. Otherwise the ``src`` is a file
    name computed by :meth:`get_image_name`; no image file is written in
    that case.
    """

    def get_image_name(self, image):
        """Return a file name for *image*: ``image.name`` plus an extension.

        The extension is chosen from the stream filters, bit depth and
        colorspace of the image:

        * a single DCT filter          -> ``.jpg``
        * 1 bit, or 8 bit RGB/gray     -> ``.<width>x<height>.bmp``
        * anything else                -> ``.<bits>.<width>x<height>.img``
        """
        filters = image.stream.get_filters()
        width, height = image.srcsize

        if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE:
            return image.name + '.jpg'

        bmp_colorspaces = (LITERAL_DEVICE_RGB, LITERAL_DEVICE_GRAY)
        if image.bits == 1 or (image.bits == 8 and image.colorspace in bmp_colorspaces):
            return image.name + '.%dx%d.bmp' % (width, height)

        return image.name + '.%d.%dx%d.img' % (image.bits, width, height)

    def place_image(self, item, borderwidth, x, y, w, h):
        """Write an absolutely positioned <img> tag for *item*.

        *x*, *y*, *w* and *h* are multiplied by ``self.scale``; the vertical
        position is measured from ``self._yoffset``.
        """
        # Only the source name depends on whether an image writer exists;
        # the emitted markup is the same either way.
        if self.imagewriter is not None:
            src = self.imagewriter.export_image(item)
        else:
            src = self.get_image_name(item)

        left = x * self.scale
        top = (self._yoffset - y) * self.scale
        width = w * self.scale
        height = h * self.scale
        self.write('<img src="%s" border="%d" style="position:absolute; left:%dpx; top:%dpx;" '
                   'width="%d" height="%d" />\n' %
                   (enc(src, None), borderwidth, left, top, width, height))
def convert_pdf_to_html(path):
    """Convert the PDF file at *path* to an HTML string.

    Every page of the PDF is rendered through SakeBreweriesHTMLConverter
    with vertical-text detection enabled.

    Args:
        path: File system path of the PDF to read.

    Returns:
        The generated HTML, decoded as UTF-8.

    Raises:
        OSError: If *path* cannot be opened.
        Any exception raised by pdfminer while parsing or rendering is
        propagated; the file, converter and buffer are still closed.
    """
    codec = 'utf-8'
    rsrcmgr = PDFResourceManager()
    laparams = LAParams(detect_vertical=True)

    with BytesIO() as retstr:
        device = SakeBreweriesHTMLConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(path, 'rb') as fp:
                # Empty page set and maxpages=0: no page restriction is requested.
                pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True)
                for page in pages:
                    interpreter.process_page(page)
            # NOTE(review): the HTML is captured before device.close(), as the
            # original code did, so anything the converter writes on close is
            # not part of the returned string - confirm this is intended.
            html = retstr.getvalue().decode(codec)
        finally:
            # Close the converter even when page processing fails.
            device.close()
    return html
def parse_html(html):
    """Extract brewery records from the HTML produced for a brewery-map PDF.

    Spans are classified by the font name found in their ``style``
    attribute:

    * ``UDShinMGoPr6-Regular``  - a company name when the text contains a
      company-type abbreviation such as ``(株)``; otherwise the text is
      remembered as the current location.
    * ``UDShinGoPr6-Medium``    - the main brand.
    * ``UDShinGoCOeizPr6-Med``  - the telephone number; this span completes
      a record.

    Args:
        html: HTML text to parse.

    Returns:
        A list of dicts. Each dict has ``telephone_number`` and ``location``
        and, when the corresponding spans were seen since the previous
        record, ``breweries`` and ``mainbrand``.

    Raises:
        IndexError: If the HTML contains no ``div`` nested directly in a ``div``.
        KeyError: If a span inside the selected ``div`` has no ``style`` attribute.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Remove every <br> tag from the tree.
    while soup.br:
        soup.br.unwrap()

    # Work on the first div that is a direct child of another div, and
    # flatten any divs nested inside it.
    container = soup.select('div > div')[0]
    while container.div:
        container.div.unwrap()

    company_markers = ('(株)', '(有)', '(資)', '(同)', '(名)')
    records = []
    current = {}
    location = ""

    for span in container.find_all('span'):
        style = span.attrs['style']

        if "b'DUCEDZ+UDShinMGoPr6-Regular'" in style:
            # Normalise the single-character abbreviation before matching.
            text = span.text.replace('㈲', '(有)')
            if any(marker in text for marker in company_markers):
                current['breweries'] = text.strip()
            else:
                location = text.strip()
        elif "b'OAEEKS+UDShinGoPr6-Medium'" in style:
            current['mainbrand'] = span.text.strip()
        elif "b'SUGDVW+UDShinGoCOeizPr6-Med'" in style:
            # The telephone number closes the record; start a fresh one.
            current['telephone_number'] = span.text.strip()
            current['location'] = location
            records.append(current)
            current = {}

    return records
if __name__ == '__main__':
    # Convert every '.pdf' found under the 'paper' directory tree. For each
    # PDF, a '<pdf path>.json' file is written next to it (plus a
    # '<pdf path>.html' file when DEBUG is set).
    for root, _dirs, files in os.walk(u'paper'):
        for file_ in files:
            if os.path.splitext(file_)[1] != u'.pdf':
                continue

            # os.walk descends into subdirectories, so the path has to be
            # built from the directory currently being visited.
            pdf_path = os.path.join(root, file_)
            print(file_ + " is being processed...")

            # convert pdf to html
            html = convert_pdf_to_html(pdf_path)
            if DEBUG:
                # The HTML was decoded from UTF-8; write it back the same way
                # instead of relying on the platform default encoding.
                with open('%s.html' % pdf_path, "w", encoding='utf-8') as f:
                    f.write(html)

            # parse html
            items = parse_html(html)

            # save json
            # ensure_ascii=False emits non-ASCII text as-is, so the file
            # encoding is set explicitly.
            # NOTE(review): the objects are written back-to-back with no
            # separator, so the file is a concatenation of JSON objects
            # rather than one JSON document - confirm consumers expect this.
            with open('%s.json' % pdf_path, "w", encoding='utf-8') as f:
                for item in items:
                    json.dump(item, f, ensure_ascii=False)
            print("converted!")
@yuru-sha
Copy link
Author

add 合資会社、合同会社の略称判定の追加
add location

@yuru-sha
Copy link
Author

fix 有限会社の略称判定の修正

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment