Skip to content

Instantly share code, notes, and snippets.

@meganehouser
Created December 23, 2015 12:25
Show Gist options
  • Save meganehouser/5204c678959a3192bd3e to your computer and use it in GitHub Desktop.
Save meganehouser/5204c678959a3192bd3e to your computer and use it in GitHub Desktop.
Get new-book information from the Meguro library
# coding: utf-8
from enum import Enum
import requests
from os import path
from html.parser import HTMLParser
from time import sleep
from random import randint
from jinja2 import Environment, FileSystemLoader
import datetime
import sqlite3
import json
import math
# Keywords to look for, keyed by book field name.  JSON text is UTF-8, so the
# encoding is stated explicitly instead of relying on the platform default
# (which is not UTF-8 on every system).
with open('interests.json', 'r', encoding='utf-8') as f:
    interests = json.load(f)

# The SQLite file sits next to this script and shares its base name.
db_file = path.splitext(__file__)[0] + '.db'

# Entry page; it is fetched first to read the total number of new books.
root_url = 'http://www.meguro-library.jp/opw/OPW/OPWNEWBOOK.CSP?ReloginFlag=1&DB=LIB&MODE=1'

# Listing page template; ``{page}`` is filled with the 1-based page number.
# NOTE: an earlier form of this URL also carried ``SID={sid}`` right after
# ``DB=LIB``; the current form omits it.
base_url = (
    "http://www.meguro-library.jp/opw/OPW/OPWNEWBOOK.CSP"
    "?DB=LIB&MODE=1&PID2=OPWNEWBOOK&FLG=LIST&"
    "SRCID=6&SORT=-D&HOLD=NOHOLD&WRTCOUNT=100&CMODE=&CNO=&"
    "SOFTKEY=&PAGE={page}"
)

# Default User-Agent header sent with every request.
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.73 Safari/537.3'
class BookCursor(Enum):
    """Name the book field that incoming text currently belongs to."""

    # Fields of one book record.
    title = 0
    author = 1
    publisher = 2

    # Not positioned on any field.
    none = 9
class BooksParser(HTMLParser):
    """Collect book records from the HTML of a listing page.

    A record starts at any tag carrying an ``href`` that contains ``LID=``;
    the text that follows is accumulated as the title.  The text after each
    of the next two ``<td>`` start tags is taken as the author and the
    publisher.  Completed records are appended to ``self.books`` as dicts
    with the keys ``title``, ``author`` and ``publisher``.
    """

    def __init__(self):
        super().__init__()
        self.current_book = None
        self.current_cursor = BookCursor.none
        self.books = []

    def is_book_start(self, attrs):
        """Return True when *attrs* holds an ``href`` containing ``LID=``."""
        # A valueless attribute (``<a href>``) is reported with value None,
        # which must not reach the ``in`` test.
        return any(
            name == 'href' and value is not None and 'LID=' in value
            for name, value in attrs
        )

    def handle_starttag(self, tag, attrs):
        """Advance the cursor: link -> title, then ``<td>`` -> author -> publisher."""
        if self.is_book_start(attrs):
            self.current_book = {'title': '', 'author': 'None', 'publisher': 'None'}
            self.current_cursor = BookCursor.title
        elif tag == 'td' and self.current_cursor == BookCursor.title:
            # The title is finished once the next cell opens.
            self.current_book['title'] = self.current_book['title'].strip()
            self.current_cursor = BookCursor.author
        elif tag == 'td' and self.current_cursor == BookCursor.author:
            self.current_cursor = BookCursor.publisher
        else:
            # Any other start tag abandons the record in progress.
            self.current_cursor = BookCursor.none

    def handle_data(self, data):
        """Store *data* in the field the cursor points at."""
        cursor = self.current_cursor
        if cursor == BookCursor.title:
            self.current_book['title'] += data
        elif cursor == BookCursor.author:
            # Keep only the part before the first '/', without spaces or CRLF.
            author = data.split('/')[0]
            self.current_book['author'] = author.replace(' ', '').replace('\r\n', '')
        elif cursor == BookCursor.publisher:
            self.current_book['publisher'] = data
            self.books.append(self.current_book)
            self.current_book = None
            # The record is complete.  Leave the publisher state so that text
            # arriving before the next start tag is not written into the
            # now-missing record (that used to raise TypeError).
            self.current_cursor = BookCursor.none
class Bookshelf:
    """Keep the latest list of books in a SQLite ``books`` table.

    Every row has a ``status``: ``'new'`` for books first stored by the most
    recent :meth:`save_books` call, ``'stay'`` for books that were already
    stored before it.  Usable as a context manager; leaving the ``with``
    block closes the connection.
    """

    def __init__(self, file_name):
        """Open the database *file_name* and create ``books`` if it is missing."""
        conn = sqlite3.connect(file_name)
        # Rows are read by column name in get_new_books().
        conn.row_factory = sqlite3.Row
        conn.execute("create table if not exists books "
                     "(title, author, publisher, status)")
        self.db = conn

    def __enter__(self):
        return self

    def __exit__(self, exec_type, value, traceback):
        self.db.close()

    def save_books(self, books):
        """Replace the stored list with *books* and mark which ones are new.

        *books* is an iterable of mappings with the keys ``title``,
        ``author`` and ``publisher``.  Stored books missing from *books* are
        deleted.  A book that appears more than once in *books* is handled
        once, so a duplicate listing can no longer turn a just-inserted
        ``'new'`` row into ``'stay'``.
        """
        seen = set()
        # The connection context manager commits on success and rolls back
        # if any statement raises, so a failed batch leaves the table as-is.
        with self.db:
            self.db.execute("update books set status='old'")
            for book in books:
                key = (book['title'], book['author'], book['publisher'])
                if key in seen:
                    continue
                seen.add(key)
                cursor = self.db.execute(
                    "update books set status='stay' "
                    "where title=? and author=? and publisher=? ",
                    key)
                if cursor.rowcount <= 0:
                    # Nothing matched: this book was not stored before.
                    self.db.execute(
                        "insert into books(title, author, publisher, status)"
                        "values(?,?,?,?)",
                        key + ('new',))
            # Whatever is still 'old' was absent from this batch.
            self.db.execute("delete from books where status='old'")

    def get_new_books(self):
        """Return the rows with status ``'new'`` as a list of plain dicts."""
        rows = self.db.execute(
            "select title, author, publisher from books where status='new'")
        return [
            {'title': row['title'],
             'author': row['author'],
             'publisher': row['publisher']}
            for row in rows
        ]
class PageInfoParser(HTMLParser):
    """Read the session id and the number of listed books from a page."""

    book_num = 0      # count parsed from the text chunk that follows '【結果'
    is_pgnum = False  # True while the next text chunk should hold that count
    sid = ''          # value of the hidden input named SID
    is_sid = False    # True when the most recent start tag was that input

    def handle_starttag(self, tag, attrs):
        """Remember the ``value`` of ``<input type="hidden" name="SID">``."""
        if tag == 'input' and ('type', 'hidden') in attrs and ('name', 'SID') in attrs:
            self.is_sid = True
            # An input without a value attribute yields '' instead of an
            # IndexError.
            self.sid = next((v for name, v in attrs if name == 'value'), '') or ''
        else:
            self.is_sid = False

    def handle_data(self, data):
        """Parse the count from the first non-blank chunk after '【結果'.

        Raises ValueError when that chunk does not start with a number, so
        an unexpected page layout is not mistaken for "zero books".
        """
        if self.is_pgnum and data.strip():
            # Keep what precedes the counter suffix, whatever the line
            # ending after it; drop inner spaces and thousands separators.
            count_text = data.partition('件')[0]
            self.book_num = int(count_text.replace(' ', '').replace(',', ''))
            self.is_pgnum = False
        if '【結果' in data:
            self.is_pgnum = True

    @staticmethod
    def get(url, useragent=user_agent, timeout=30):
        """Fetch *url* and return a parser that has been fed the response body.

        Returns an empty string instead when the HTTP status is not 200.
        *timeout* (seconds) is handed to ``requests.get`` so that a stalled
        server cannot block the call indefinitely.
        """
        response = requests.get(url, headers={'User-Agent': useragent},
                                timeout=timeout)
        if response.status_code != 200:
            return ''
        parser = PageInfoParser()
        parser.feed(response.text)
        return parser
def get_books(root_url, book_url, useragent=user_agent, wait=lambda: sleep(1),
              per_page=100, timeout=30):
    """Download every listing page and return all parsed books as one list.

    Args:
        root_url: page from which the total number of books is read.
        book_url: listing URL template containing a ``{page}`` placeholder.
        useragent: value for the ``User-Agent`` request header.
        wait: callable invoked before each listing request (rate limiting).
        per_page: number of books per listing page; it has to agree with
            the page size requested in *book_url*.
        timeout: seconds passed to ``requests.get`` for listing requests.

    Raises:
        requests.exceptions.ConnectionError: when the root page is not
            answered with status 200, or a listing page still fails after
            three attempts.  (A tuple used to be returned in the latter
            case, which callers iterating over the result could not
            handle.)
    """
    pageinfo = PageInfoParser.get(root_url, useragent)
    if not pageinfo:
        # PageInfoParser.get() answers '' for a non-200 response.
        raise requests.exceptions.ConnectionError('Url open Error')
    print('sid:', pageinfo.sid)
    print('total books num:', pageinfo.book_num)

    hdrs = {'User-Agent': useragent}
    books = []
    for i in range(1, math.ceil(pageinfo.book_num / per_page) + 1):
        wait()
        url = book_url.format(page=i)
        print('request {}'.format(url))
        req = None
        # Up to three attempts per page; the ``else`` runs only when none of
        # them ended in ``break``.
        for _ in range(3):
            try:
                req = requests.get(url, headers=hdrs, timeout=timeout)
            except (requests.exceptions.ConnectionError,
                    requests.exceptions.Timeout):
                continue
            if req.status_code == 200:
                break
        else:
            raise requests.exceptions.ConnectionError('Url open Error')
        parser = BooksParser()
        parser.feed(req.text)
        books.extend(parser.books)
    return books
if __name__ == '__main__':
    # Refresh the database from the site and read back what is new.
    with Bookshelf(db_file) as bookshelf:
        books = get_books(root_url, base_url, wait=lambda: sleep(randint(3, 6)))
        bookshelf.save_books(books)
        new_books = bookshelf.get_new_books()
    print('new books count:{}'.format(len(new_books)))

    def search(key):
        """Return the new books whose *key* field contains a keyword of interest.

        Each matching book is printed and returned once, however many of
        the keywords in ``interests[key]`` it contains.  A *key* missing
        from ``interests`` matches nothing.
        """
        keywords = interests.get(key, [])
        matched = []
        for book in new_books:
            if any(word in book[key] for word in keywords):
                print('{}/{}/{}'.format(book['title'], book['author'], book['publisher']))
                matched.append(book)
        return matched

    # Union of the matches over all three fields, without repeats.
    interests_books = []
    for field in ('title', 'author', 'publisher'):
        for book in search(field):
            if book not in interests_books:
                interests_books.append(book)

    # send to slack

    # make a html page
    # The rendered values come from a scraped web page, so they are escaped
    # rather than inserted into the HTML verbatim.
    env = Environment(loader=FileSystemLoader('./', encoding='utf8'),
                      autoescape=True)
    template = env.get_template('newbooks.html')
    today = datetime.datetime.today()
    yyyyMMddHHmm = today.year, today.month, today.day, today.hour, today.minute
    date = u"{0}年{1}月{2}日 {3}時 {4}分".format(*yyyyMMddHHmm)
    html = template.render({'date': date, 'books': new_books})
    file_name = "newbooks_{0}{1:0>2}{2:0>2}{3:0>2}{4:0>2}.html".format(*yyyyMMddHHmm)
    with open(file_name, mode='w', encoding='utf-8') as html_file:
        html_file.write(html)
{# newbooks.html: rendered with `date` (a formatted timestamp string) and
   `books` (a list of mappings with title / author / publisher). #}
<!DOCTYPE html>
<html lang="ja">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>目黒区立図書館 新着書籍 {{ date }}</title>
</head>
<body>
<h1>新着書籍 日付:{{ date }}</h1>
<table border="1">
<tr><th>書籍名</th><th>著者</th><th>出版社</th></tr>
{# The fields hold scraped text, so each one is HTML-escaped. #}
{% for book in books %}
<tr>
<td>{{ book['title']|e }}</td>
<td>{{ book['author']|e }}</td>
<td>{{ book['publisher']|e }}</td>
</tr>
{% endfor %}
</table>
</body>
</html>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment