Last active
November 21, 2022 13:59
-
-
Save ahxxm/5b012985008708d96b22102258c1b90d to your computer and use it in GitHub Desktop.
title + <div id="info" /> => local redis
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns doubanbook-cralwer.core
  (:require [clj-http.client :as http]
            [clojure.core.async :refer [go-loop chan <! >!! <!!] :as a]
            [clojure.set]
            [clojure.string]
            [taoensso.carmine :as car :refer (wcar)])
  (:gen-class))
;; Browser User-Agent string sent with every crawl request.
(def ua "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36")

;; Carmine connection options: default pool settings, Redis on localhost, db 0.
(def redis-opts
  {:pool {}
   :spec {:uri "redis://127.0.0.1:6379/0"}})

;; Runs the given Carmine commands against the connection in `redis-opts`.
(defmacro wcar*
  [& body]
  `(car/wcar redis-opts ~@body))

;; Atom holding the set of all keys found in Redis when this namespace is
;; loaded; crawlers add every id they discover to it.
(def seen
  (atom (set (wcar* (car/keys "*")))))
(defn get-ua
  "GET `url` using the browser User-Agent in `ua`.

  The request is made with `:throw-exceptions false` and `:max-redirects 5`.
  If the call still throws, the exception message is printed and an empty map
  is returned instead of a response."
  [url]
  (let [request-opts {:headers          {"User-Agent" ua}
                      :throw-exceptions false
                      :max-redirects    5}]
    (try
      (http/get url request-opts)
      (catch Exception e
        (println (str "caught exception: " (.getMessage e)))
        {}))))
(defn extract-rsp
  "Extract what the crawler keeps from the response map `rsp`.

  Returns a vector `[full books]`:
  - `full`  is the page <title> text with \"(豆瓣)\" removed and whitespace
    trimmed, immediately followed by the first `<div id=\"info\" ...>...</div>`
    fragment (nothing is appended when the page has no such div);
  - `books` is the set of numeric subject ids that follow
    `book.douban.com/subject/` anywhere in the body.

  A response without `:body` is treated as an empty page."
  [rsp]
  (let [html      (or (:body rsp) "")
        book-ids  (set (re-seq #"(?i)(?<=book.douban.com/subject/)\d+" html))
        raw-title (or (re-find #"(?<=title\>)[\s\S]*?(?=</title>)" html) "")
        title     (-> raw-title
                      (clojure.string/replace "(豆瓣)" "")
                      clojure.string/trim)
        info-div  (re-find #"<div id=\"info\"[\s\S]*?</div>" html)]
    [(str title info-div) book-ids]))
(defn go-crawl
  "Blocking crawl loop; never returns, so run it on its own thread.

  Repeatedly takes a book id from `id-ch`, fetches its Douban subject page,
  and puts `[id info]` on `info-ch` for the writer. Every subject id found on
  the page that is not yet in `seen` is stored in Redis with a nil value
  (so an interrupted run can be resumed from those keys) and put on `id-ch`.

  Both channel puts are blocking (`>!!`), so the loop stalls whenever
  `info-ch` or `id-ch` is full."
  [id-ch info-ch]
  (loop [i 0]
    (let [id         (<!! id-ch)
          url        (str "https://book.douban.com/subject/" id "/")
          before     (quot (System/nanoTime) 1000000)
          rsp        (get-ua url)
          after      (quot (System/nanoTime) 1000000)
          [info ids] (extract-rsp rsp)]
      ;; Log the timing of every 100th request handled by this worker.
      (when (zero? (mod i 100))
        (println "GET book" id "finished in" (- after before) "ms"))
      (>!! info-ch [id info])
      (doseq [new-id ids]
        (when-not (@seen new-id)
          ;; Placeholder for the *discovered* id, not the page just crawled:
          ;; keys holding nil are what -main re-queues after a restart.
          (wcar* (car/set new-id nil))
          ;; Enqueue the discovered id itself (not the loop counter).
          (>!! id-ch new-id)))
      (swap! seen clojure.set/union ids)
      (recur (inc i)))))
(defn go-write
  "Single Redis writer: consumes `[id info]` pairs from `info-ch`.

  - If `id` is not yet a key in Redis, `info` is stored as-is.
  - If the key already exists, it is overwritten only when `info` is
    non-empty, so a failed fetch (empty `info`) does not replace an existing
    value.

  Returns the channel of the go-loop, which completes once `info-ch` is
  closed."
  [info-ch]
  (go-loop []
    ;; A closed channel yields nil; stop instead of spinning on nil forever.
    (when-let [[id info] (<! info-ch)]
      ;; EXISTS returns 0/1. The previous `(car/keys id)` check returned a
      ;; vector, which is truthy even when empty, so the branch never ran.
      (if (zero? (wcar* (car/exists id)))
        (wcar* (car/set id info))
        ;; `info` is always a string, so test for content rather than nil.
        (when (seq info)
          (wcar* (car/set id info))))
      (recur))))
(defn -main
  "Entry point: start a progress reporter, re-queue unfinished ids, launch the
  crawler threads, then block on the single Redis writer."
  []
  (let [bookid-ch (chan 500000)
        info-ch   (chan 10000)]
    ;; Progress report every 15 seconds.
    (a/thread
      (loop []
        (<!! (a/timeout (* 15 1000)))
        (println "seen urls" (count @seen) "book remaining" (count (.buf bookid-ch)))
        (recur)))
    ;; Bootstrap: collect every known key whose stored value is empty
    ;; (nil or ""), then queue them all for crawling.
    (let [pending (filterv #(empty? (wcar* (car/get %))) @seen)]
      (doseq [k pending]
        (>!! bookid-ch k)))
    ;; 12 crawler threads.
    (dotimes [_ 12]
      (a/thread (go-crawl bookid-ch info-ch)))
    ;; Single Redis writer; block the main thread on it.
    (<!! (go-write info-ch))))
导出 books-remaining.csv,其中 isbns 为 Goodreads 没有的 ISBN 列表,需要从网页上整理:
def extract_info(data):
    """Parse one stored crawl record into an ISBN and a CSV row.

    ``data`` is the stored text for one book: the page title immediately
    followed by the ``<div id="info" ...>`` fragment.

    Returns ``(isbn, [author, title, year, publisher])``. Returns
    ``(None, None)`` when no ISBN is present, which includes records that
    have no info fragment at all. ``year`` and ``publisher`` are ``""`` when
    absent.
    """
    # Target columns: Title, Author, ISBN,
    # Publisher, Binding, Year Published, Original Publication Year,
    # Date Read, Date Added, Bookshelves, My Review

    # partition() never raises: a record without the marker simply yields an
    # empty `remain`, which then falls out through the "no ISBN" return.
    title, _, remain = data.partition('<div id="info')

    # Author label first, translator as a fallback; both colon placements.
    author_patterns = (
        r"(?<=作者</span>:)[\s\S]*?</span>",
        r"(?<=作者:</span>)[\s\S]*?</span>",
        r"(?<=译者:</span>)[\s\S]*?</span>",
        r"(?<=译者</span>:)[\s\S]*?</span>",
    )
    author = "Anonymous"  # HACK: placeholder when no author/translator found
    for pattern in author_patterns:
        found = re.findall(pattern, remain)
        if found:
            author = found[0].strip()
            break

    if "<a" in author:
        # Keep only the text of the first link.
        link_texts = re.findall(r"(?<=>)[\s\S]*?(?=<)", author)
        if link_texts:
            author = link_texts[0].strip()
    elif author.endswith("</span>"):
        # The patterns above include the closing tag; drop it for plain text.
        author = author[: -len("</span>")].strip()

    isbn = re.findall(r"(?<=ISBN:</span> )\d+", remain)
    if not isbn:
        return None, None
    isbn = isbn[0]

    # Non-greedy so the value stops at the first tag after the label.
    year = re.findall(r"(?<=出版年:</span>).*?(?=<)", remain)
    year = year[0].strip() if year else ""

    publisher = re.findall(r"出版社.*</span>.*?(?=<)", remain)
    publisher = publisher[0].split("</span>")[1].strip() if publisher else ""

    return isbn, [author, title, year, publisher]
# Index every crawled record by ISBN: the parsed row, and the Redis key it
# came from.
isbn_to_info = {}
isbn_to_id = {}
keys = r.keys()  # everything the crawler stored
for k in keys:
    data = r.get(k).decode()
    # Records whose whole text is a "page/entry does not exist" message
    # carry no book data.
    if data in ("页面不存在", "条目不存在"):
        continue
    isbn, row = extract_info(data)
    if isbn:
        isbn_to_info[isbn] = row
        isbn_to_id[isbn] = k
# Uses isbn_added and isbn_shelf, which are built earlier (not in this snippet).
# Builds one CSV row per wanted ISBN that was actually crawled:
# [isbn, date added, shelf, author, title, year, publisher].
books = []
for isbn in isbns:
    isbn = str(isbn)
    row = isbn_to_info.get(isbn)
    if not row:
        continue
    added = isbn_added.get(isbn, "")
    # Books with no recorded shelf default to "currently-reading".
    shelf = isbn_shelf.get(isbn, "currently-reading")
    books.append([isbn, added, shelf] + row)
# remaining dump
# newline="" is what the csv module expects for files it writes; the explicit
# encoding keeps non-ASCII titles intact regardless of the platform default.
with open("books-remaining.csv", "w", newline="", encoding="utf-8") as f:
    wr = csv.writer(f, quoting=csv.QUOTE_ALL)
    # Header order mirrors the rows in `books`:
    # [isbn, added, shelf, author, title, year, publisher].
    wr.writerow(["ISBN", "Date Added", "Bookshelves", "Author", "Title", "Year Published", "Publisher"])
    wr.writerows(books)
创建新书:
```python
import requests
import re
import time
# Every one of these Goodreads cookies is required (values are blank in this snippet).
s = requests.Session()
s.cookies.set("ccsid", "")
s.cookies.set("u", "")
s.cookies.set("p", "")
s.cookies.set("_session_id2", "")
s.headers["User-Agent"] = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"
def create_one(isbn, author, title, publisher):
    """Submit the Goodreads "new book" form for a single book.

    Fetches the form page with the module-level session ``s`` to read its
    CSRF token, then POSTs the book fields to the same URL. Prints a
    confirmation line when the response page contains ``title``.

    Raises:
        RuntimeError: if the form page contains no CSRF token.
    """
    form_url = "https://www.goodreads.com/book/new"
    rsp = s.get(form_url)
    # Capture exactly the attribute value; a greedy `.*` could run on to a
    # later quote on the same line.
    token = re.search(r'csrf-token" content="([^"]*)"', rsp.text)
    if token is None:
        raise RuntimeError(f"no csrf-token found in {rsp.url} (HTTP {rsp.status_code})")
    pdata = {
        "utf-8": "✓",
        "authenticity_token": token.group(1),
        "book[title]": title,
        "book[sort_by_title]": title,
        "author[name]": author,
        "book[isbn]": isbn,
        "book[publisher]": publisher,
        "book[update_default_description]": 1,
        "work[media_type]": "book",
        "commit": "Create book",
    }
    rsp = s.post(form_url, data=pdata)
    # Heuristic only: the title appearing in the response is taken as success.
    if title in rsp.text:
        print(title, "created at", rsp.url)
# csv_data is a list of lists, one 7-field row per book.
for isbn, _added, _shelf, author, title, _year, publisher in csv_data:
    create_one(isbn, author, title, publisher)
    time.sleep(1)  # pause one second between submissions
现在 https://www.goodreads.com/book/new 页面需要 reCAPTCHA,是不是没法用脚本添加书籍了?
大概是吧…… 我用的时候还没有recaptcha
…On Wed, Apr 15, 2020, 23:56 Leo Liang ***@***.***> wrote:
***@***.**** commented on this gist.
------------------------------
现在 https://www.goodreads.com/book/new 页面需要 reCAPTCHA,是不是没法用脚本添加书籍了?
—
You are receiving this because you authored the thread.
Reply to this email directly, view it on GitHub
<https://gist.github.com/5b012985008708d96b22102258c1b90d#gistcomment-3255641>,
or unsubscribe
<https://github.com/notifications/unsubscribe-auth/AAJ2AUOATWY72M5E4F7MSPTRMXKKTANCNFSM4LD77JYA>
.
现在没办法了么?
现在要librarian才能创建图书,不如去neodb,自动爬各处图书,维护也比较积极
neodb真不错
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
第一个csv