meganehouser · December 23, 2015 12:25
diff --git a/megro.py b/megro.py
 # coding: utf-8

 from enum import Enum
 import requests
 from os import path
 from html.parser import HTMLParser
 from time import sleep
 from random import randint
 from jinja2 import Environment, FileSystemLoader
 import datetime
 import sqlite3
 import json
 import math


 # Interest keywords, keyed by book field ('title', 'author', 'publisher');
 # the script's search step looks each field up in this mapping.
 # JSON is UTF-8 by specification, so the encoding is given explicitly
 # instead of being left to the platform default.
 with open('interests.json', 'r', encoding='utf-8') as f:
    interests = json.load(f)

 # SQLite database stored next to this script, sharing its base name.
 db_file = path.splitext(__file__)[0] + '.db'

 # Entry page of the new-arrivals search; get_books() hands it to
 # PageInfoParser.get() to read the total number of books.
 root_url = 'http://www.meguro-library.jp/opw/OPW/OPWNEWBOOK.CSP?ReloginFlag=1&DB=LIB&MODE=1'

 # Listing URL template; `{page}` is replaced with the page number.
 # WRTCOUNT=100 presumably sets the page size: get_books() divides the total
 # book count by 100 to get the number of pages.  An earlier form of this URL
 # also carried an `SID={sid}` parameter; the current form omits it.
 base_url = (
    "http://www.meguro-library.jp/opw/OPW/OPWNEWBOOK.CSP"
    "?DB=LIB&MODE=1&PID2=OPWNEWBOOK&FLG=LIST&"
    "SRCID=6&SORT=-D&HOLD=NOHOLD&WRTCOUNT=100&CMODE=&CNO=&"
    "SOFTKEY=&PAGE={page}"
 )

 # User-Agent header value sent with the HTTP requests.
 user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.73 Safari/537.3'

 class BookCursor(Enum):
    """Field of the current book that BooksParser is collecting text for."""
    title = 0
    author = 1
    publisher = 2
    # Not inside any book field; text is ignored in this state.
    none = 9


 class BooksParser(HTMLParser):
    """Collect book records from a listing page of the library site.

    Feed the page text with ``feed()``; afterwards ``books`` holds one dict
    per book with the keys ``title``, ``author`` and ``publisher``.

    The parser is a small state machine driven by start tags: a tag whose
    ``href`` contains ``LID=`` opens a new book and starts its title, the
    next ``<td>`` starts the author and the ``<td>`` after that the
    publisher.  Any other start tag leaves the current field.
    """

    def __init__(self):
        super().__init__()
        # Book being assembled, or None between books.
        self.current_book = None
        # Field that incoming text belongs to.
        self.current_cursor = BookCursor.none
        # Completed books, in document order.
        self.books = []

    def is_book_start(self, attrs):
        """Return True if *attrs* contains an ``href`` with ``LID=`` in it."""
        # HTMLParser reports a value-less attribute (``<a href>``) with the
        # value None, which must not reach the ``in`` test.
        return any(attr == 'href' and value is not None and 'LID=' in value
                   for attr, value in attrs)

    def handle_starttag(self, tag, attrs):
        """Advance the field cursor according to the start tag just seen."""
        if self.is_book_start(attrs):
            self.current_book = {'title': '', 'author': 'None', 'publisher': 'None'}
            self.current_cursor = BookCursor.title
        elif tag == 'td' and self.current_cursor == BookCursor.title:
            # The title may have been gathered from several text chunks,
            # including surrounding whitespace; tidy it once it is complete.
            self.current_book['title'] = self.current_book['title'].strip()
            self.current_cursor = BookCursor.author
        elif tag == 'td' and self.current_cursor == BookCursor.author:
            self.current_cursor = BookCursor.publisher
        else:
            self.current_cursor = BookCursor.none

    def handle_data(self, data):
        """Store *data* in the field selected by the cursor."""
        if self.current_cursor == BookCursor.title:
            self.current_book['title'] += data
        elif self.current_cursor == BookCursor.author:
            # Keep only the part before the first full-width slash, without
            # spaces and CRLF line breaks.
            self.current_book['author'] = data.split('／')[0].replace(' ', '').replace('\r\n', '')
        elif self.current_cursor == BookCursor.publisher:
            self.current_book['publisher'] = data
            self.books.append(self.current_book)
            self.current_book = None
            # The book is finished: leave the publisher state so that any
            # further text before the next start tag is ignored instead of
            # being written into the now-missing current book.
            self.current_cursor = BookCursor.none


 class Bookshelf:
    """Track the library's new-arrival list in a SQLite database.

    Every row of the ``books`` table carries a ``status``: ``'new'`` for a
    book first seen by the latest ``save_books()`` call and ``'stay'`` for a
    book that was already stored before it.  The object can be used as a
    context manager, which closes the connection on exit.
    """

    def __init__(self, file_name):
        """Open the database at *file_name*, creating the table if needed."""
        conn = sqlite3.connect(file_name)
        conn.row_factory = sqlite3.Row
        conn.execute("create table if not exists books "
                     "(title, author, publisher, status)")
        self.db = conn

    def __enter__(self):
        return self

    def __exit__(self, exec_type, value, traceback):
        self.db.close()

    def save_books(self, books):
        """Replace the stored list with *books* and flag the newcomers.

        Books already stored are marked ``'stay'``, unknown books are
        inserted as ``'new'`` and stored books missing from *books* are
        deleted.  *books* is an iterable of dicts with the keys ``title``,
        ``author`` and ``publisher``.  The whole update is one transaction:
        it is committed on success and rolled back if an error occurs.
        """
        with self.db:
            self.db.execute("update books set status='old'")

            seen = set()
            for book in books:
                key = (book['title'], book['author'], book['publisher'])
                # A book listed twice must be handled once; otherwise the
                # second pass matches the row just inserted and demotes it
                # from 'new' to 'stay'.
                if key in seen:
                    continue
                seen.add(key)

                cursor = self.db.execute(
                    "update books set status='stay' "
                    "where title=? and author=? and publisher=? ", key)
                if cursor.rowcount <= 0:
                    self.db.execute(
                        "insert into books(title, author, publisher, status)"
                        "values(?,?,?,?)", key + ('new',))

            self.db.execute("delete from books where status='old'")

    def get_new_books(self):
        """Return the books whose status is ``'new'`` as a list of dicts."""
        rows = self.db.execute(
            "select title, author, publisher from books where status='new'")
        return [{'title': row['title'],
                 'author': row['author'],
                 'publisher': row['publisher']} for row in rows]

 class PageInfoParser(HTMLParser):
    """Extract the session id and the total book count from a page.

    After ``feed()``, ``sid`` holds the value of the hidden ``SID`` input
    (``''`` if none was seen) and ``book_num`` the number read from the text
    chunk that follows the chunk containing ``【結果`` (``0`` if none was
    seen).
    """
    book_num = 0
    # True while the next text chunk is expected to hold the book count.
    is_pgnum = False
    sid = ''
    # True when the most recent start tag was the hidden SID input.
    is_sid = False

    def handle_starttag(self, tag, attrs):
        """Remember the value of ``<input type="hidden" name="SID">``."""
        if tag == 'input' and ('type', 'hidden') in attrs and ('name', 'SID') in attrs:
            self.is_sid = True
            # An SID input without a usable value keeps sid empty instead of
            # raising IndexError.
            self.sid = next((v for (name, v) in attrs
                             if name == 'value' and v is not None), '')
        else:
            self.is_sid = False

    def handle_data(self, data):
        """Read the book count from the chunk after the ``【結果`` marker."""
        if self.is_pgnum:
            # Drop all whitespace, the '件' suffix and thousands separators,
            # whatever the line ending is; int() still raises ValueError on
            # any other unexpected content.
            count = ''.join(data.split()).replace('件', '').replace(',', '')
            self.book_num = int(count)
            self.is_pgnum = False
        if '【結果' in data:
            self.is_pgnum = True

    @staticmethod
    def get(url, useragent=user_agent):
        """Fetch *url* and return a PageInfoParser fed with its body.

        Args:
            url: Address of the page to download.
            useragent: Value sent as the ``User-Agent`` header.

        Returns:
            The parser, with ``sid`` and ``book_num`` filled in, or ``''``
            when the response status is not 200.  Callers must check for the
            empty string before reading attributes.
        """
        response = requests.get(url, headers={'User-Agent': useragent})
        if response.status_code != 200:
            return ''

        parser = PageInfoParser()
        parser.feed(response.text)
        return parser


 def get_books(root_url, book_url, useragent=user_agent, wait=lambda: sleep(1)):
    """Get all new books from the library site.

    Args:
        root_url: Page from which the total number of books is read.
        book_url: Listing URL template containing a ``{page}`` placeholder.
        useragent: Value sent as the ``User-Agent`` header of every request.
        wait: Callable invoked before each listing page is requested.

    Returns:
        A list of book dicts (``title``, ``author``, ``publisher``) on
        success.  When a page cannot be downloaded the tuple
        ``([], 'Url open Error')`` is returned instead, so callers must
        check for a tuple before using the result.
    """
    pageinfo = PageInfoParser.get(root_url, useragent)
    # PageInfoParser.get() answers '' when the page could not be fetched.
    if not isinstance(pageinfo, PageInfoParser):
        return ([], 'Url open Error')
    print('sid:', pageinfo.sid)
    print('total books num:', pageinfo.book_num)

    hdrs = {'User-Agent': useragent}
    books = []
    # The listing is requested in pages of 100 books.
    for i in range(1, math.ceil(pageinfo.book_num / 100) + 1):
        wait()
        url = book_url.format(page=i)
        print('request {}'.format(url))

        # Up to three attempts; only connection errors are retried.
        req = None
        for _ in range(3):
            try:
                req = requests.get(url, headers=hdrs)
            except requests.exceptions.ConnectionError:
                continue
            break
        # req is still None when every attempt failed to connect.
        if req is None or req.status_code != 200:
            return ([], 'Url open Error')

        parser = BooksParser()
        parser.feed(req.text)
        books.extend(parser.books)
    return books

 if __name__ == '__main__':
    # Download the current listing, store it, report the books matching the
    # interest keywords and write all new books to a dated HTML page.
    with Bookshelf(db_file) as bookshelf:
        books = get_books(root_url, base_url, wait=lambda: sleep(randint(3, 6)))
        # get_books() reports a failed download as ([], message) instead of
        # a list; stop before the stored books are touched.
        if isinstance(books, tuple):
            raise SystemExit('get_books failed: {}'.format(books[1]))
        bookshelf.save_books(books)

        new_books = bookshelf.get_new_books()
        print('new books count:{}'.format(len(new_books)))

        def search(key):
            """Return the new books whose *key* field contains an interest word."""
            matched = []
            for book in new_books:
                # any() stops at the first hit, so a book matching several
                # keywords is printed and collected only once.
                if any(v in book[key] for v in interests.get(key, [])):
                    print('{}/{}/{}'.format(book['title'], book['author'], book['publisher']))
                    matched.append(book)
            return matched

        # A book can match on more than one field; keep it once.
        interests_books = []
        for field in ('title', 'author', 'publisher'):
            for book in search(field):
                if book not in interests_books:
                    interests_books.append(book)

        # send to slack
        # make a html page
        # Book fields are text parsed from a web page and can contain '&'
        # or '<', so they are escaped when rendered.
        env = Environment(loader=FileSystemLoader('./', encoding='utf8'),
                          autoescape=True)
        template = env.get_template('newbooks.html')

        today = datetime.datetime.today()
        yyyyMMddHHmm = today.year, today.month, today.day, today.hour, today.minute
        date = u"{0}年{1}月{2}日 {3}時 {4}分".format(*yyyyMMddHHmm)
        html = template.render({'date': date, 'books': new_books})

        file_name = "newbooks_{0}{1:0>2}{2:0>2}{3:0>2}{4:0>2}.html".format(*yyyyMMddHHmm)
        with open(file_name, mode='w', encoding='utf-8') as html_file:
            html_file.write(html)
diff --git a/newbooks.html b/newbooks.html
 <html>
    <head>
        <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
        <title>目黒区立図書館 新着書籍 {{ date }}</title>
    </head>
    <body>
        <h1>新着書籍 日付：{{ date }}</h1>
        <table border="1">
            <tr><th>書籍名</th><th>著者</th><th>出版社</th></tr>
            {% for book in books %}
            <tr>
                <td>{{ book['title'] }}</td>
                <td>{{ book['author'] }}</td>
                <td>{{ book['publisher'] }}</td>
            </tr>
            {% endfor %}
        </table>
    </body>
 </html>
	# coding: utf-8

	from enum import Enum
	import requests
	from os import path
	from html.parser import HTMLParser
	from time import sleep
	from random import randint
	from jinja2 import Environment, FileSystemLoader
	import datetime
	import sqlite3
	import json
	import math


	# Interest keywords, keyed by book field ('title', 'author', 'publisher');
	# the script's search step looks each field up in this mapping.
	# JSON is UTF-8 by specification, so the encoding is given explicitly
	# instead of being left to the platform default.
	with open('interests.json', 'r', encoding='utf-8') as f:
	    interests = json.load(f)

	# SQLite database stored next to this script, sharing its base name.
	db_file = path.splitext(__file__)[0] + '.db'

	# Entry page of the new-arrivals search; get_books() hands it to
	# PageInfoParser.get() to read the total number of books.
	root_url = 'http://www.meguro-library.jp/opw/OPW/OPWNEWBOOK.CSP?ReloginFlag=1&DB=LIB&MODE=1'

	# Listing URL template; `{page}` is replaced with the page number.
	# WRTCOUNT=100 presumably sets the page size: get_books() divides the total
	# book count by 100 to get the number of pages.  An earlier form of this URL
	# also carried an `SID={sid}` parameter; the current form omits it.
	base_url = (
	    "http://www.meguro-library.jp/opw/OPW/OPWNEWBOOK.CSP"
	    "?DB=LIB&MODE=1&PID2=OPWNEWBOOK&FLG=LIST&"
	    "SRCID=6&SORT=-D&HOLD=NOHOLD&WRTCOUNT=100&CMODE=&CNO=&"
	    "SOFTKEY=&PAGE={page}"
	)

	# User-Agent header value sent with the HTTP requests.
	user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.73 Safari/537.3'

	class BookCursor(Enum):
	    """Field of the current book that BooksParser is collecting text for."""
	    title = 0
	    author = 1
	    publisher = 2
	    # Not inside any book field; text is ignored in this state.
	    none = 9


	class BooksParser(HTMLParser):
	    """Collect book records from a listing page of the library site.

	    Feed the page text with ``feed()``; afterwards ``books`` holds one dict
	    per book with the keys ``title``, ``author`` and ``publisher``.

	    The parser is a small state machine driven by start tags: a tag whose
	    ``href`` contains ``LID=`` opens a new book and starts its title, the
	    next ``<td>`` starts the author and the ``<td>`` after that the
	    publisher.  Any other start tag leaves the current field.
	    """

	    def __init__(self):
	        super().__init__()
	        # Book being assembled, or None between books.
	        self.current_book = None
	        # Field that incoming text belongs to.
	        self.current_cursor = BookCursor.none
	        # Completed books, in document order.
	        self.books = []

	    def is_book_start(self, attrs):
	        """Return True if *attrs* contains an ``href`` with ``LID=`` in it."""
	        # HTMLParser reports a value-less attribute (``<a href>``) with the
	        # value None, which must not reach the ``in`` test.
	        return any(attr == 'href' and value is not None and 'LID=' in value
	                   for attr, value in attrs)

	    def handle_starttag(self, tag, attrs):
	        """Advance the field cursor according to the start tag just seen."""
	        if self.is_book_start(attrs):
	            self.current_book = {'title': '', 'author': 'None', 'publisher': 'None'}
	            self.current_cursor = BookCursor.title
	        elif tag == 'td' and self.current_cursor == BookCursor.title:
	            # The title may have been gathered from several text chunks,
	            # including surrounding whitespace; tidy it once it is complete.
	            self.current_book['title'] = self.current_book['title'].strip()
	            self.current_cursor = BookCursor.author
	        elif tag == 'td' and self.current_cursor == BookCursor.author:
	            self.current_cursor = BookCursor.publisher
	        else:
	            self.current_cursor = BookCursor.none

	    def handle_data(self, data):
	        """Store *data* in the field selected by the cursor."""
	        if self.current_cursor == BookCursor.title:
	            self.current_book['title'] += data
	        elif self.current_cursor == BookCursor.author:
	            # Keep only the part before the first full-width slash, without
	            # spaces and CRLF line breaks.
	            self.current_book['author'] = data.split('／')[0].replace(' ', '').replace('\r\n', '')
	        elif self.current_cursor == BookCursor.publisher:
	            self.current_book['publisher'] = data
	            self.books.append(self.current_book)
	            self.current_book = None
	            # The book is finished: leave the publisher state so that any
	            # further text before the next start tag is ignored instead of
	            # being written into the now-missing current book.
	            self.current_cursor = BookCursor.none


	class Bookshelf:
	    """Track the library's new-arrival list in a SQLite database.

	    Every row of the ``books`` table carries a ``status``: ``'new'`` for a
	    book first seen by the latest ``save_books()`` call and ``'stay'`` for a
	    book that was already stored before it.  The object can be used as a
	    context manager, which closes the connection on exit.
	    """

	    def __init__(self, file_name):
	        """Open the database at *file_name*, creating the table if needed."""
	        conn = sqlite3.connect(file_name)
	        conn.row_factory = sqlite3.Row
	        conn.execute("create table if not exists books "
	                     "(title, author, publisher, status)")
	        self.db = conn

	    def __enter__(self):
	        return self

	    def __exit__(self, exec_type, value, traceback):
	        self.db.close()

	    def save_books(self, books):
	        """Replace the stored list with *books* and flag the newcomers.

	        Books already stored are marked ``'stay'``, unknown books are
	        inserted as ``'new'`` and stored books missing from *books* are
	        deleted.  *books* is an iterable of dicts with the keys ``title``,
	        ``author`` and ``publisher``.  The whole update is one transaction:
	        it is committed on success and rolled back if an error occurs.
	        """
	        with self.db:
	            self.db.execute("update books set status='old'")

	            seen = set()
	            for book in books:
	                key = (book['title'], book['author'], book['publisher'])
	                # A book listed twice must be handled once; otherwise the
	                # second pass matches the row just inserted and demotes it
	                # from 'new' to 'stay'.
	                if key in seen:
	                    continue
	                seen.add(key)

	                cursor = self.db.execute(
	                    "update books set status='stay' "
	                    "where title=? and author=? and publisher=? ", key)
	                if cursor.rowcount <= 0:
	                    self.db.execute(
	                        "insert into books(title, author, publisher, status)"
	                        "values(?,?,?,?)", key + ('new',))

	            self.db.execute("delete from books where status='old'")

	    def get_new_books(self):
	        """Return the books whose status is ``'new'`` as a list of dicts."""
	        rows = self.db.execute(
	            "select title, author, publisher from books where status='new'")
	        return [{'title': row['title'],
	                 'author': row['author'],
	                 'publisher': row['publisher']} for row in rows]

	class PageInfoParser(HTMLParser):
	    """Extract the session id and the total book count from a page.

	    After ``feed()``, ``sid`` holds the value of the hidden ``SID`` input
	    (``''`` if none was seen) and ``book_num`` the number read from the text
	    chunk that follows the chunk containing ``【結果`` (``0`` if none was
	    seen).
	    """
	    book_num = 0
	    # True while the next text chunk is expected to hold the book count.
	    is_pgnum = False
	    sid = ''
	    # True when the most recent start tag was the hidden SID input.
	    is_sid = False

	    def handle_starttag(self, tag, attrs):
	        """Remember the value of ``<input type="hidden" name="SID">``."""
	        if tag == 'input' and ('type', 'hidden') in attrs and ('name', 'SID') in attrs:
	            self.is_sid = True
	            # An SID input without a usable value keeps sid empty instead of
	            # raising IndexError.
	            self.sid = next((v for (name, v) in attrs
	                             if name == 'value' and v is not None), '')
	        else:
	            self.is_sid = False

	    def handle_data(self, data):
	        """Read the book count from the chunk after the ``【結果`` marker."""
	        if self.is_pgnum:
	            # Drop all whitespace, the '件' suffix and thousands separators,
	            # whatever the line ending is; int() still raises ValueError on
	            # any other unexpected content.
	            count = ''.join(data.split()).replace('件', '').replace(',', '')
	            self.book_num = int(count)
	            self.is_pgnum = False
	        if '【結果' in data:
	            self.is_pgnum = True

	    @staticmethod
	    def get(url, useragent=user_agent):
	        """Fetch *url* and return a PageInfoParser fed with its body.

	        Args:
	            url: Address of the page to download.
	            useragent: Value sent as the ``User-Agent`` header.

	        Returns:
	            The parser, with ``sid`` and ``book_num`` filled in, or ``''``
	            when the response status is not 200.  Callers must check for the
	            empty string before reading attributes.
	        """
	        response = requests.get(url, headers={'User-Agent': useragent})
	        if response.status_code != 200:
	            return ''

	        parser = PageInfoParser()
	        parser.feed(response.text)
	        return parser


	def get_books(root_url, book_url, useragent=user_agent, wait=lambda: sleep(1)):
	    """Get all new books from the library site.

	    Args:
	        root_url: Page from which the total number of books is read.
	        book_url: Listing URL template containing a ``{page}`` placeholder.
	        useragent: Value sent as the ``User-Agent`` header of every request.
	        wait: Callable invoked before each listing page is requested.

	    Returns:
	        A list of book dicts (``title``, ``author``, ``publisher``) on
	        success.  When a page cannot be downloaded the tuple
	        ``([], 'Url open Error')`` is returned instead, so callers must
	        check for a tuple before using the result.
	    """
	    pageinfo = PageInfoParser.get(root_url, useragent)
	    # PageInfoParser.get() answers '' when the page could not be fetched.
	    if not isinstance(pageinfo, PageInfoParser):
	        return ([], 'Url open Error')
	    print('sid:', pageinfo.sid)
	    print('total books num:', pageinfo.book_num)

	    hdrs = {'User-Agent': useragent}
	    books = []
	    # The listing is requested in pages of 100 books.
	    for i in range(1, math.ceil(pageinfo.book_num / 100) + 1):
	        wait()
	        url = book_url.format(page=i)
	        print('request {}'.format(url))

	        # Up to three attempts; only connection errors are retried.
	        req = None
	        for _ in range(3):
	            try:
	                req = requests.get(url, headers=hdrs)
	            except requests.exceptions.ConnectionError:
	                continue
	            break
	        # req is still None when every attempt failed to connect.
	        if req is None or req.status_code != 200:
	            return ([], 'Url open Error')

	        parser = BooksParser()
	        parser.feed(req.text)
	        books.extend(parser.books)
	    return books

	if __name__ == '__main__':
	    # Download the current listing, store it, report the books matching the
	    # interest keywords and write all new books to a dated HTML page.
	    with Bookshelf(db_file) as bookshelf:
	        books = get_books(root_url, base_url, wait=lambda: sleep(randint(3, 6)))
	        # get_books() reports a failed download as ([], message) instead of
	        # a list; stop before the stored books are touched.
	        if isinstance(books, tuple):
	            raise SystemExit('get_books failed: {}'.format(books[1]))
	        bookshelf.save_books(books)

	        new_books = bookshelf.get_new_books()
	        print('new books count:{}'.format(len(new_books)))

	        def search(key):
	            """Return the new books whose *key* field contains an interest word."""
	            matched = []
	            for book in new_books:
	                # any() stops at the first hit, so a book matching several
	                # keywords is printed and collected only once.
	                if any(v in book[key] for v in interests.get(key, [])):
	                    print('{}/{}/{}'.format(book['title'], book['author'], book['publisher']))
	                    matched.append(book)
	            return matched

	        # A book can match on more than one field; keep it once.
	        interests_books = []
	        for field in ('title', 'author', 'publisher'):
	            for book in search(field):
	                if book not in interests_books:
	                    interests_books.append(book)

	        # send to slack
	        # make a html page
	        # Book fields are text parsed from a web page and can contain '&'
	        # or '<', so they are escaped when rendered.
	        env = Environment(loader=FileSystemLoader('./', encoding='utf8'),
	                          autoescape=True)
	        template = env.get_template('newbooks.html')

	        today = datetime.datetime.today()
	        yyyyMMddHHmm = today.year, today.month, today.day, today.hour, today.minute
	        date = u"{0}年{1}月{2}日 {3}時 {4}分".format(*yyyyMMddHHmm)
	        html = template.render({'date': date, 'books': new_books})

	        file_name = "newbooks_{0}{1:0>2}{2:0>2}{3:0>2}{4:0>2}.html".format(*yyyyMMddHHmm)
	        with open(file_name, mode='w', encoding='utf-8') as html_file:
	            html_file.write(html)
	<html>
	<head>
	<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
	<title>目黒区立図書館 新着書籍 {{ date }}</title>
	</head>
	<body>
	<h1>新着書籍 日付：{{ date }}</h1>
	<table border="1">
	<tr><th>書籍名</th><th>著者</th><th>出版社</th></tr>
	{% for book in books %}
	<tr>
	<td>{{ book['title'] }}</td>
	<td>{{ book['author'] }}</td>
	<td>{{ book['publisher'] }}</td>
	</tr>
	{% endfor %}
	</table>
	</body>
	</html>