Created
March 12, 2017 07:47
-
-
Save creamidea/1a946cbe07f9a523e6d861171e3c2813 to your computer and use it in GitHub Desktop.
获取苏州职业大学图书馆书籍信息(DEMO)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/python | |
# -*- coding: utf-8 -*- | |
from time import sleep | |
from argparse import ArgumentParser | |
import requests | |
from bs4 import BeautifulSoup | |
from bs4.element import Tag | |
class Search(object):
    """
    Search the book catalogue of the JSSVC library OPAC and collect the
    matching entries across all result pages.

    Typical use:
        books = Search().do('python')   # -> list of dicts
    """

    # Base endpoint of the OPAC search API.
    api = "http://opac.jssvc.edu.cn:8080/opac/openlink.php"
    # Impersonate a desktop browser; the OPAC may serve different markup
    # (or refuse) otherwise.
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }

    def __init__(self):
        """
        Initialise per-instance state.

        Bug fix: ``books`` and ``cookies`` used to be *class* attributes,
        so every Search instance shared one result list and one cookie
        jar, and a second search accumulated the first one's results.
        They are now instance attributes.
        """
        # Session cookies returned by the OPAC; replayed on follow-up
        # pagination requests.
        self.cookies = None
        # Accumulated bs4.element.Tag entries, one per book, across pages.
        self.books = []

    def toDict(self, books=None):
        """
        Convert a list of bs4.element.Tag book entries into dicts.

        Bug fix: the default used to be a mutable ``[]`` argument,
        which Python shares across calls; use None instead.
        """
        if books is None:
            books = []
        return [self.getInfo(book) for book in books]

    def getInfo(self, book):
        """
        Extract one book's fields from its bs4.element.Tag entry.

        Returns a dict with keys: title, link, code, author, publisher.
        NOTE(review): the index arithmetic below (children[0],
        split('.')[1], text lines [3:5]) encodes the OPAC page layout at
        the time of writing — verify against the live markup if parsing
        breaks.
        """
        heading = book.findChildren()[0]          # first child: the <h3> heading
        title = heading.find('a')
        link = 'http://opac.jssvc.edu.cn:8080/opac/{item}'.format(item=title['href'])
        # The call number is the heading's own text, excluding child tags.
        code = heading.find(text=True, recursive=False).strip()
        # Lines 3-4 of the <p> block hold author and publisher
        # (presumably; layout-dependent — confirm against the page).
        author, publisher = [part.strip()
                             for part in book.find('p').text.split('\n')[3:5]]
        return {
            # Link text looks like "1.Some Title" — drop the ordinal.
            'title': title.text.split('.')[1],
            'link': link,
            'code': code,
            'author': author,
            'publisher': publisher
        }

    def do(self, sKey, sType="title"):
        """
        Run a search and return the results as a list of dicts.

        sKey  -- the search keyword.
        sType -- the OPAC search field (default "title").
        """
        searchs = '?strSearchType={sType}&strText={sKey}&doctype=ALL'.format(
            sType=sType, sKey=sKey)
        self.request(searchs)
        return self.toDict(self.books)

    def request(self, searchs):
        """
        Fetch one result page and hand it to the parser.

        searchs -- the query string (or relative pagination href).
        """
        url = '{api}{searchs}'.format(api=self.api, searchs=searchs)
        r = requests.get(url, headers=self.headers, cookies=self.cookies)
        # Keep the server's cookies so pagination stays in one session.
        self.cookies = r.cookies
        self.parse(r.text)

    def parse(self, content):
        """
        Parse a result page: collect its book entries and, if a
        next-page link exists, recurse into it (rate-limited).
        """
        soup = BeautifulSoup(content, 'html5lib')
        bookContent = soup.find('div', {'id': 'book_content'})

        # Locate the next-page link. On the first page the pager's first
        # anchor is "next"; on later pages it is the second (the first
        # being "previous"). The last page has no second anchor.
        btn = bookContent.find('div', {'class': 'num'}).findAll('a')
        # Bug fix: was ``len(self.books) is 0`` — identity comparison
        # with an int literal, which only works by CPython accident.
        if not self.books:
            searchs = btn[0]['href']
        else:
            try:
                searchs = btn[1]['href']
            except IndexError:
                searchs = None

        # Collect this page's entries.
        self.books.extend(bookContent.findAll('div', {'class': 'list_books'}))

        # Follow the next page, if any.
        if searchs:
            print('>>> next...')
            sleep(1.1)  # throttle so we don't hammer the server
            self.request(searchs)
if __name__ == '__main__':
    # Parse the command line: -k keyword (required), -o output file.
    parser = ArgumentParser(description="Get the books from the library.")
    parser.add_argument('-k', dest="keyword", required=True,
                        help='keyword')
    parser.add_argument('-o', dest="output_filename",
                        default="books.txt",
                        help='output the file [default: books.txt]')
    args = parser.parse_args()

    # Run the search.
    search = Search()
    books = search.do(args.keyword)

    # Write one tab-separated line per book.
    filename = args.output_filename
    with open(filename, 'w') as f:
        content = '\n'.join(
            '{code}\t{title}\t{author}\t{publisher}\t{link}'.format(**book)
            for book in books)
        f.write(content)

    # Bug fix: the literal had no {filename} placeholder, so the keyword
    # argument to .format() was silently ignored and the file name never
    # appeared in the message (the sample run shows "DONE. Save in books.txt").
    print('DONE. Save in {filename}'.format(filename=filename))
$ ./request-jssvc-lib.py -k python
>>> next...
>>> next...
DONE. Save in books.txt
之后,查看文件 books.txt 即可。或者导入 excel。
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
一个信息的 HTML 结构
