Created
December 23, 2015 12:25
-
-
Save meganehouser/5204c678959a3192bd3e to your computer and use it in GitHub Desktop.
Get new book information from the Meguro library
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
from enum import Enum | |
import requests | |
from os import path | |
from html.parser import HTMLParser | |
from time import sleep | |
from random import randint | |
from jinja2 import Environment, FileSystemLoader | |
import datetime | |
import sqlite3 | |
import json | |
import math | |
# Keywords used by the __main__ section to pick out interesting books.
# The file is indexed by 'title', 'author' and 'publisher'; each entry is
# iterated and its items are substring-matched against the book field.
# JSON is read explicitly as UTF-8 so non-ASCII keywords load the same way
# on every platform instead of depending on the locale's default encoding.
with open('interests.json', 'r', encoding='utf-8') as f:
    interests = json.load(f)

# SQLite database stored next to this script, sharing its base name.
db_file = path.splitext(__file__)[0] + '.db'

# Entry page: parsed for the session id and the total number of new books.
root_url = 'http://www.meguro-library.jp/opw/OPW/OPWNEWBOOK.CSP?ReloginFlag=1&DB=LIB&MODE=1'

# Earlier variant of the listing URL that also carried the session id:
#base_url = "http://www.meguro-library.jp/opw/OPW/OPWNEWBOOK.CSP" \
#        "?DB=LIB&SID={sid}&MODE=1&PID2=OPWNEWBOOK&FLG=LIST&" \
#        "SRCID=6&SORT=-D&HOLD=NOHOLD&WRTCOUNT=100&CMODE=&CNO=&" \
#        "SOFTKEY=&PAGE={page}"

# Listing page template; {page} is the 1-based page number and
# WRTCOUNT=100 asks for 100 books per page.
base_url = "http://www.meguro-library.jp/opw/OPW/OPWNEWBOOK.CSP" \
        "?DB=LIB&MODE=1&PID2=OPWNEWBOOK&FLG=LIST&" \
        "SRCID=6&SORT=-D&HOLD=NOHOLD&WRTCOUNT=100&CMODE=&CNO=&" \
        "SOFTKEY=&PAGE={page}"

# User-Agent header sent with every request.
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.73 Safari/537.3'
class BookCursor(Enum):
    """Which field of a book the HTML parser expects to read next."""

    # The three table cells of a book row, in the order they are read.
    title = 0
    author = 1
    publisher = 2

    # Not inside a book row; incoming text is ignored.
    none = 9
class BooksParser(HTMLParser):
    """Parse book information out of a listing page's HTML.

    Feed the page text with ``feed()``; afterwards ``books`` holds one dict
    per book with the keys 'title', 'author' and 'publisher'.

    A book starts at any tag whose ``href`` contains 'LID='. The text up to
    the next ``<td>`` is the title, the following cell is the author and the
    cell after that is the publisher. Any other start tag seen while a book
    is in progress abandons it.
    """

    def __init__(self):
        super().__init__()
        self.current_book = None
        self.current_cursor = BookCursor.none
        self.books = []

    def is_book_start(self, attrs):
        """Return True if *attrs* contains an href pointing at a book (LID=)."""
        # HTMLParser reports valueless attributes (e.g. ``<a href>``) with a
        # value of None, so guard before the substring test.
        return any(attr == 'href' and value is not None and 'LID=' in value
                   for attr, value in attrs)

    def handle_starttag(self, tag, attrs):
        """Advance the cursor: book link -> title -> author -> publisher."""
        if self.is_book_start(attrs):
            self.current_book = {'title': '', 'author': 'None', 'publisher': 'None'}
            self.current_cursor = BookCursor.title
        elif tag == 'td' and self.current_cursor == BookCursor.title:
            # The title cell is finished; its text may have arrived in pieces.
            self.current_book['title'] = self.current_book['title'].strip()
            self.current_cursor = BookCursor.author
        elif tag == 'td' and self.current_cursor == BookCursor.author:
            self.current_cursor = BookCursor.publisher
        else:
            self.current_cursor = BookCursor.none

    def handle_data(self, data):
        """Store *data* into the field the cursor currently points at."""
        if self.current_cursor == BookCursor.title:
            self.current_book['title'] += data
        elif self.current_cursor == BookCursor.author:
            # Keep only the part before the first '/', without spaces or CRLF.
            self.current_book['author'] = data.split('/')[0].replace(' ', '').replace('\r\n', '')
        elif self.current_cursor == BookCursor.publisher:
            self.current_book['publisher'] = data
            self.books.append(self.current_book)
            self.current_book = None
            # The book is complete. Leave the publisher state so that further
            # text before the next tag is ignored instead of being written
            # into the now-missing current_book (which raised TypeError).
            self.current_cursor = BookCursor.none
class Bookshelf:
    """Manage the list of new books in a SQLite database.

    Each row of the ``books`` table carries a ``status``:
    'new' for books first seen in the latest ``save_books`` call and
    'stay' for books that were already stored and are still listed.
    Books that are no longer listed are deleted.

    Usable as a context manager; the connection is closed on exit.
    """

    def __init__(self, file_name):
        """Open (and if necessary initialise) the database at *file_name*."""
        conn = sqlite3.connect(file_name)
        # Rows are read by column name in get_new_books.
        conn.row_factory = sqlite3.Row
        conn.execute("create table if not exists books (title, author, publisher, status)")
        self.db = conn

    def __enter__(self):
        return self

    def __exit__(self, exec_type, value, traceback):
        self.db.close()

    def save_books(self, books):
        """Synchronise the table with *books*, the currently listed books.

        *books* is an iterable of mappings with the keys 'title', 'author'
        and 'publisher'. The whole update runs in one transaction: it is
        committed on success and rolled back if anything raises.
        """
        same_book = "where title=? and author=? and publisher=?"
        with self.db:
            # Mark everything as stale, then revive what is still listed.
            self.db.execute("update books set status='old'")
            for book in books:
                key = (book['title'], book['author'], book['publisher'])
                revived = self.db.execute(
                    "update books set status='stay' " + same_book + " and status='old'",
                    key)
                if revived.rowcount > 0:
                    continue
                # Nothing stale matched: the book is either brand new or a
                # repeat of one already handled in this call. Only insert in
                # the first case, so a duplicate listing neither adds a second
                # row nor demotes a freshly inserted 'new' book to 'stay'.
                seen = self.db.execute(
                    "select 1 from books " + same_book + " limit 1", key).fetchone()
                if seen is None:
                    self.db.execute(
                        "insert into books(title, author, publisher, status) "
                        "values(?,?,?,?)", key + ('new',))
            self.db.execute("delete from books where status='old'")

    def get_new_books(self):
        """Return the books whose status is 'new' as a list of dicts."""
        rows = self.db.execute(
            "select title, author, publisher from books where status='new'")
        return [{'title': row['title'],
                 'author': row['author'],
                 'publisher': row['publisher']} for row in rows]
class PageInfoParser(HTMLParser):
    """Parser that extracts page information (session id, book count).

    After ``feed()``:
      sid      -- value of the hidden ``<input name="SID">``; '' if none seen.
      book_num -- integer parsed from the text chunk that follows the chunk
                  containing '【結果'; 0 if that marker was never seen.
    """

    # Class-level defaults; an instance gets its own value on first assignment.
    book_num = 0
    is_pgnum = False
    sid = ''
    is_sid = False

    def handle_starttag(self, tag, attrs):
        """Record the SID carried by a hidden ``<input name="SID">`` tag."""
        self.is_sid = (tag == 'input'
                       and ('type', 'hidden') in attrs
                       and ('name', 'SID') in attrs)
        if not self.is_sid:
            return
        values = [v for (name, v) in attrs if name == 'value']
        # Keep the previous sid if the tag has no value attribute instead of
        # failing with IndexError.
        if values:
            self.sid = values[0]

    def handle_data(self, data):
        """Read the book count from the chunk after the '【結果' marker."""
        if self.is_pgnum:
            # Drop spaces and the '件' counter suffix; int() itself ignores
            # surrounding whitespace, so any line ending is accepted.
            # An unparsable chunk still raises ValueError on purpose: a
            # silently wrong count of 0 would make the caller fetch nothing.
            self.book_num = int(data.replace(' ', '').replace('件', ''))
            self.is_pgnum = False
        if '【結果' in data:
            self.is_pgnum = True

    @staticmethod
    def get(url, useragent=user_agent):
        """Fetch *url* and return a fed PageInfoParser.

        Returns the empty string '' instead of a parser when the response
        status is not 200, so callers must check the result before reading
        ``sid`` or ``book_num``.
        """
        response = requests.get(url, headers={'User-Agent': useragent})
        if response.status_code != 200:
            return ''
        parser = PageInfoParser()
        parser.feed(response.text)
        return parser
def get_books(root_url, book_url, useragent=user_agent, wait=lambda: sleep(1)):
    """Get all new books from the library site.

    root_url  -- entry page, parsed for the total number of books.
    book_url  -- listing URL template with a ``{page}`` placeholder.
    useragent -- User-Agent header used for every request.
    wait      -- callable invoked before each listing request (rate limit).

    Returns the list of book dicts collected from every page. On failure
    (entry page not available, or a listing page still failing after three
    attempts) returns the tuple ``([], 'Url open Error')`` instead.
    """
    # Forward the caller's user agent so both kinds of request agree.
    pageinfo = PageInfoParser.get(root_url, useragent)
    if not pageinfo:
        # PageInfoParser.get returns '' on a non-200 response.
        return ([], 'Url open Error')
    print('sid:', pageinfo.sid)
    print('total books num:', pageinfo.book_num)

    headers = {'User-Agent': useragent}
    # The listing URL requests 100 books per page (WRTCOUNT=100).
    page_count = math.ceil(pageinfo.book_num / 100)
    books = []
    for page in range(1, page_count + 1):
        wait()
        url = book_url.format(page=page)
        print('request {}'.format(url))
        # Try each page up to three times before giving up.
        for _ in range(3):
            try:
                response = requests.get(url, headers=headers)
            except requests.exceptions.ConnectionError:
                continue
            if response.status_code == 200:
                break
        else:
            return ([], 'Url open Error')
        parser = BooksParser()
        parser.feed(response.text)
        books.extend(parser.books)
    return books
if __name__ == '__main__':
    # Scrape the site, sync the database and collect the newly added books.
    with Bookshelf(db_file) as bookshelf:
        books = get_books(root_url, base_url, wait=lambda: sleep(randint(3, 6)))
        if isinstance(books, tuple):
            # get_books reports failure as ([], message). Stop here rather
            # than handing that tuple to save_books.
            raise SystemExit(books[1])
        bookshelf.save_books(books)
        new_books = bookshelf.get_new_books()
    print('new books count:{}'.format(len(new_books)))

    def search(key):
        """Return (and print) the new books whose *key* field contains any
        keyword listed under *key* in the interests file."""
        keywords = interests.get(key, [])
        matched = []
        for book in new_books:
            # any() stops at the first hit, so a book matching several
            # keywords is reported once.
            if any(word in book[key] for word in keywords):
                print('{}/{}/{}'.format(book['title'], book['author'], book['publisher']))
                matched.append(book)
        return matched

    interests_books = []
    interests_books.extend(search('title'))
    interests_books.extend(search('author'))
    interests_books.extend(search('publisher'))

    # send to slack

    # make a html page
    # Autoescape because the rendered fields are text scraped from the web.
    env = Environment(loader=FileSystemLoader('./', encoding='utf8'),
                      autoescape=True)
    template = env.get_template('newbooks.html')
    today = datetime.datetime.today()
    yyyyMMddHHmm = today.year, today.month, today.day, today.hour, today.minute
    date = u"{0}年{1}月{2}日 {3}時 {4}分".format(*yyyyMMddHHmm)
    html = template.render({'date': date, 'books': new_books})

    # One output file per run, named after the current minute.
    file_name = "newbooks_{0}{1:0>2}{2:0>2}{3:0>2}{4:0>2}.html".format(*yyyyMMddHHmm)
    with open(file_name, mode='w', encoding='utf-8') as html_file:
        html_file.write(html)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{# Rendered with `date` (a formatted date string) and `books` (an iterable of
   mappings with 'title', 'author' and 'publisher').  The book fields are
   scraped text, so they are escaped explicitly with the `e` filter. #}
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>目黒区立図書館 新着書籍 {{ date }}</title>
</head>
<body>
<h1>新着書籍 日付:{{ date }}</h1>
<table border="1">
<tr><th>書籍名</th><th>著者</th><th>出版社</th></tr>
{% for book in books %}
<tr>
<td>{{ book['title'] | e }}</td>
<td>{{ book['author'] | e }}</td>
<td>{{ book['publisher'] | e }}</td>
</tr>
{% endfor %}
</table>
</body>
</html>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment