Last active
April 18, 2019 15:14
-
-
Save yuru-sha/49732f331060ddeacda49f97098af8ed to your computer and use it in GitHub Desktop.
酒造マップPDFのスクレイピング
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import json | |
import os | |
from io import BytesIO | |
from bs4 import BeautifulSoup | |
from pdfminer.converter import HTMLConverter | |
from pdfminer.layout import LAParams | |
from pdfminer.pdfcolor import LITERAL_DEVICE_GRAY | |
from pdfminer.pdfcolor import LITERAL_DEVICE_RGB | |
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter | |
from pdfminer.pdfpage import PDFPage | |
from pdfminer.pdftypes import LITERALS_DCT_DECODE | |
from pdfminer.utils import enc | |
# When true, the script entry point also writes the intermediate HTML
# generated for each PDF to a '<pdf path>.html' file.
DEBUG = True
## SakeBreweriesHTMLConverter | |
## | |
class SakeBreweriesHTMLConverter(HTMLConverter):
    """HTMLConverter that always emits an ``<img>`` tag for placed images.

    When an image writer is configured the image is exported through it and
    the returned name is used; otherwise a name is synthesised by
    ``get_image_name`` and the tag is written without exporting anything.
    """

    def get_image_name(self, image):
        """Return a file name for *image* built from its name and properties.

        The extension is chosen as follows:
          * '.jpg' when the stream has exactly one filter and it is a DCT
            decode filter;
          * '.<width>x<height>.bmp' for 1-bit images, or 8-bit images whose
            colorspace is device RGB or device gray;
          * '.<bits>.<width>x<height>.img' for everything else.
        """
        filters = image.stream.get_filters()
        width, height = image.srcsize

        if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE:
            return image.name + '.jpg'

        bmp_colorspaces = (LITERAL_DEVICE_RGB, LITERAL_DEVICE_GRAY)
        if image.bits == 1 or (image.bits == 8 and image.colorspace in bmp_colorspaces):
            return image.name + '.%dx%d.bmp' % (width, height)

        return image.name + '.%d.%dx%d.img' % (image.bits, width, height)

    def place_image(self, item, borderwidth, x, y, w, h):
        """Write an absolutely positioned ``<img>`` tag for *item*.

        The ``src`` is the name returned by the image writer when one is set,
        or the name computed by ``get_image_name`` otherwise. Position and
        size are multiplied by ``self.scale``; the top offset is measured
        from ``self._yoffset``.
        """
        if self.imagewriter is None:
            name = self.get_image_name(item)
        else:
            name = self.imagewriter.export_image(item)

        template = ('<img src="%s" border="%d" style="position:absolute; left:%dpx; top:%dpx;" '
                    'width="%d" height="%d" />\n')
        self.write(template %
                   (enc(name, None), borderwidth,
                    x * self.scale, (self._yoffset - y) * self.scale,
                    w * self.scale, h * self.scale))
def convert_pdf_to_html(path):
    """Render the pages of the PDF at *path* to HTML and return it as text.

    Pages are processed through ``SakeBreweriesHTMLConverter`` with vertical
    text detection enabled, writing into an in-memory buffer whose contents
    are decoded as UTF-8.

    The HTML is read from the buffer before the converter is closed, so
    anything the converter writes while closing is not part of the result.

    Args:
        path: File system path of the PDF to read.

    Returns:
        The generated HTML as a ``str``.
    """
    codec = 'utf-8'
    rsrcmgr = PDFResourceManager()
    laparams = LAParams(detect_vertical=True)
    retstr = BytesIO()
    try:
        device = SakeBreweriesHTMLConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # The context manager guarantees the PDF handle is released even
            # when a page fails to parse.
            with open(path, 'rb') as fp:
                # Empty page set and maxpages=0: presumably "no restriction"
                # -- confirm against the pdfminer documentation.
                pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True)
                for page in pages:
                    interpreter.process_page(page)
            html = retstr.getvalue().decode(codec)
        finally:
            # Close the converter while the buffer it writes to is still open.
            device.close()
    finally:
        retstr.close()
    return html
def parse_html(html):
    """Extract brewery records from the converter's HTML output.

    All ``<br>`` tags are unwrapped, the first ``div > div`` match is taken
    as the container, and every ``<div>`` nested inside it is unwrapped. The
    container's ``<span>`` elements are then classified by a marker found in
    their ``style`` attribute (the markers look like embedded font names):

      * name marker: text containing a company-form abbreviation becomes
        ``breweries``; any other text becomes the current location, which is
        carried over to the records that follow;
      * brand marker: text becomes ``mainbrand``;
      * phone marker: text becomes ``telephone_number``, the current location
        is attached, and the record is completed.

    Args:
        html: HTML text to parse.

    Returns:
        A list of dicts, one per completed record.
    """
    name_marker = "b'DUCEDZ+UDShinMGoPr6-Regular'"
    brand_marker = "b'OAEEKS+UDShinGoPr6-Medium'"
    phone_marker = "b'SUGDVW+UDShinGoCOeizPr6-Med'"
    company_forms = ('(株)', '(有)', '(資)', '(同)', '(名)')

    soup = BeautifulSoup(html, 'html.parser')
    while soup.br:
        soup.br.unwrap()

    container = soup.select('div > div')[0]
    while container.div:
        container.div.unwrap()

    records = []
    current = {}
    location = ""
    for span in container.find_all('span'):
        style = span.attrs['style']
        if name_marker in style:
            # Normalise the single-character abbreviation so it matches the
            # parenthesised form checked below.
            text = span.text.replace('㈲', '(有)')
            if any(form in text for form in company_forms):
                current['breweries'] = text.strip()
            else:
                location = text.strip()
        elif brand_marker in style:
            current['mainbrand'] = span.text.strip()
        elif phone_marker in style:
            # The telephone number is the last field of a record: flush it.
            current['telephone_number'] = span.text.strip()
            current['location'] = location
            records.append(current)
            current = {}
    return records
if __name__ == '__main__':
    # Convert every PDF found under the 'paper' directory tree: PDF -> HTML
    # -> list of records -> '<pdf path>.json'.
    for root, dirs, files in os.walk(u'paper'):
        for file_ in files:
            if os.path.splitext(file_)[1] != u'.pdf':
                continue

            # Join with the directory currently being walked so that PDFs in
            # sub-directories of 'paper' resolve to their real location.
            pdf_path = os.path.join(root, file_)
            print(file_ + " is being processed...")

            # convert pdf to html
            html = convert_pdf_to_html(pdf_path)
            if DEBUG:
                # Explicit UTF-8: the content is Japanese text and must not
                # depend on the platform's default encoding.
                with open('%s.html' % pdf_path, "w", encoding='utf-8') as f:
                    f.write(html)

            # parse html
            items = parse_html(html)

            # save json
            # NOTE(review): records are written back to back with no
            # separator, so the file is not a single valid JSON document.
            # The format is kept as-is; confirm what consumers expect.
            with open('%s.json' % pdf_path, "w", encoding='utf-8') as f:
                for item in items:
                    json.dump(item, f, ensure_ascii=False)

            print("converted!")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
fix 有限会社の略称判定の修正