Last active
January 5, 2019 02:55
Save Chitsing/7e1fe9a31fb647405feedc3e2dbf7b67 to your computer and use it in GitHub Desktop.
爬取十余个p2p网站的爬虫 python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
本爬虫可以爬取几个著名互联网金融平台(如小赢理财、爱钱进、玖富普惠、积木盒子等)P2P 产品的产品名、期限、利息和加息等信息。
主要方法:用 requests 获取网页内容,再用 bs4、lxml(XPath)、pyquery(CSS 选择器)等筛选定位所需信息,或者直接用 json 读取相应接口内容;数据先用 numpy 的 array 暂存,最后用 pandas 保存到 CSV 里,以供继续分析。
相关网页链接主要通过谷歌浏览器抓包获得,APP 相关网页信息接口主要通过 Stream 抓包工具获得。
本爬虫仅供技术交流,请勿商用。
如有问题,欢迎随时给我留言。
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-  (declares the file encoding so the Chinese text in this file is read correctly)
import requests #访问网页 | |
from lxml import etree #用xpath解析网页找信息 | |
import json #json化网页数据方便提取 | |
from bs4 import BeautifulSoup #用soup解析网页 | |
from fake_useragent import UserAgent #防反爬虫识别UA | |
import time #时间,休眠,防止服务器反爬 | |
import random #随机函数,设定随机时间 | |
import pandas as pd #计算,存储 | |
from pyquery import PyQuery #用jpy选择器找信息 | |
import numpy as np #用矩阵来暂存并新增信息 | |
import datetime #给保存的信息加上时间戳 | |
# Timestamps are captured once, at import time, and stamped on the header row
# of every CSV section written by the scrapers below.
# NOTE(review): the right-hand sides were truncated in the available source;
# the strftime() receivers (and the whole of `nowdate`) are reconstructed from
# the surviving format strings and comments -- confirm against the original.
_started_at = datetime.datetime.now()
nowTime = _started_at.strftime('%Y-%m-%d %H:%M:%S')  # current date and time
nowdate = _started_at.strftime('%Y-%m-%d')  # current date only
nowTime_hms = _started_at.strftime('%H:%M:%S')  # current time (h:m:s)
# 获取小赢理财信息的函数 | |
def get_xiaoying():
    """Scrape the 小赢理财 product list and append it to ``p2p.csv``.

    Page 1 is parsed as HTML with XPath.  Pages 2-9 come from an AJAX
    endpoint whose JSON payload embeds an HTML fragment under
    ``['data']['html']``; that fragment is parsed with BeautifulSoup.
    Each product is printed and stored as a (name, term, rate) row below a
    header row holding the site name and the module-level ``nowTime`` stamp.
    """
    headers = {
        'accept': 'application/json, text/javascript, */*; q=0.01',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7,zh-TW;q=0.6',
        'cache-control': 'no-cache',
        'pragma': 'no-cache',
        'referer': '',
        'x-requested-with': 'XMLHttpRequest',
    }  # browser-like request headers
    # A random User-Agent makes the crawler harder to fingerprint.
    headers['User-Agent'] = UserAgent().random

    # NOTE(review): the URL literals (and the referer) are empty in the
    # available source; restore them before running.
    url_1 = ''
    # The timeout keeps a stalled server from hanging the run forever.
    first_page = requests.get(url_1, headers=headers, timeout=10).text
    tree = etree.HTML(first_page)

    info = np.array(['小赢理财', nowTime, ''])  # header row: site name + stamp
    for idx in range(1, 11):
        path_name = '//*[@id="contentList"]/ul/li[{}]/div[1]/div/a/text()'.format(idx)
        path_month = ' // *[ @ id = "contentList"]/ul/li[{}]/div[1]/ul/li[2]/p[1]/span/text()'.format(idx)
        path_price = '//*[@id="contentList"]/ul/li[{}]/div[1]/ul/li[1]/p[1]/text()'.format(idx)
        # xpath() returns a list of text nodes; join them into one string.
        name_ = ','.join(tree.xpath(path_name))
        month_ = ','.join(tree.xpath(path_month))
        price_ = ','.join(tree.xpath(path_price))
        print(name_, month_, price_)
        info = np.vstack((info, [name_, month_, price_]))

    # Pages after the first are served through the AJAX endpoint.
    for p in range(2, 10):
        time.sleep(random.randint(1, 5))  # random pause between requests
        print("page", p)
        url_p = '{}&_fromAjax_=1&_csrfToken_=d41d8cd98f00b204e9800998ecf8427e&_=1527152826113'.format(
            p)
        r2 = requests.get(url_p, headers=headers, timeout=10).text
        # NOTE(review): this call was truncated in the available source and
        # is reconstructed as json.loads(r2).
        json_content = json.loads(r2)
        soup = BeautifulSoup(json_content['data']['html'], 'html.parser')
        p_list = soup.find_all(class_="fl card-info")
        if not p_list:
            # An empty page means there is nothing further to read.
            break
        for card in p_list:
            rate = card.find(class_="light-txt").string
            time2 = card.find(class_="big-txt").string
            p_name = card.find(class_="weak-fontc").string
            print(p_name, time2, rate)
            info = np.vstack((info, [p_name, time2, rate]))

    # Append (mode='a') so every scraper shares the same output file.
    pd.DataFrame(info).to_csv('p2p.csv', encoding='utf-8-sig', mode='a')
#爱钱进网站 | |
def get_iqianjin():
    """Scrape 爱钱进 demand and fixed-term products and append them to ``p2p.csv``.

    Two JSON endpoints are read: one describing the demand product (read
    from its ``bean`` object) and one listing fixed-term plans (``bean`` is
    iterated as a list).  Both sections are printed and written under a
    header row holding the site name and the module-level ``nowTime`` stamp.
    """
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7,zh-TW;q=0.6',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Host': '',
        'Pragma': 'no-cache',
        'Referer': '',
        'X-Requested-With': 'XMLHttpRequest',
    }  # browser-like request headers
    # NOTE(review): the URL/Host/Referer literals are empty in the available
    # source; restore them before running.
    url2 = ''  # endpoint found via the "plandata?" request: fixed-term plans
    url3 = ''  # endpoint found via the "data?15271777318601" request: demand product
    # A random User-Agent makes the crawler harder to fingerprint.
    headers['User-Agent'] = UserAgent().random

    # The timeout keeps a stalled server from hanging the run forever.
    net_content2 = requests.get(url2, headers=headers, timeout=10).text
    net_content3 = requests.get(url3, headers=headers, timeout=10).text

    demand = json.loads(net_content3)['bean']
    demand_row = [demand['fullTimeDesc'], demand['avgYield'], demand['interestlimit']]
    print('活期产品锁定期,利率,新手加息')
    print(demand_row[0], demand_row[1], demand_row[2])

    info = np.array(['爱钱进', nowTime, ''])
    info = np.vstack((info, ['(零存宝)期限-天', '利率', '新手加息']))
    info = np.vstack((info, demand_row))

    json_content = json.loads(net_content2)
    print('整存宝(定期产品期限,利率,新手加息)')
    product_info = [
        (item.get('period', 'NA'), item.get('basicProfileRate', 'NA'), item.get('extraReward', 'NA'))
        for item in json_content['bean']
    ]
    info = np.vstack((info, ['(定存宝)期限-月', '利率%', '新手加息%']))
    if product_info:
        # Stacking an empty list would raise a shape-mismatch ValueError.
        info = np.vstack((info, product_info))

    # Append (mode='a') so every scraper shares the same output file.
    pd.DataFrame(info).to_csv('p2p.csv', encoding='utf-8-sig', mode='a')
    for period, base_rate, extra in product_info:
        print(period, base_rate, extra)
# 陆金服网站 | |
def get_lup2p():
    """Scrape the P2P list page of 陆金所/陆金服 and append it to ``p2p.csv``.

    Names, terms and rates are pulled with XPath from the ``#p2p-list``
    element.  All matches of each kind are joined into ONE comma-separated
    string, so the CSV gets a single data row for the whole page.
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Connection': 'keep-alive',
        'Host': '',
        'Referer': '',
        'Upgrade-Insecure-Requests': '1',
    }  # browser-like request headers
    # NOTE(review): the URL/Host/Referer literals are empty in the available
    # source; restore them before running.
    url = ''  # product listing page
    # A random User-Agent makes the crawler harder to fingerprint.
    headers['User-Agent'] = UserAgent().random

    # The timeout keeps a stalled server from hanging the run forever.
    net_content = requests.get(url, headers=headers, timeout=10).text
    tree = etree.HTML(net_content)

    path_name = '//*[@id="p2p-list"]/div/div[2]/ul/li/a/@title'
    path_period = '//*[@id="p2p-list"]/div/div[2]/ul/li/ul/li[2]/p/text()'
    path_rate = '//*[@id="p2p-list"]/div/div[2]/ul/li/ul/li[1]/p/text()'

    # xpath() returns lists; join each into one string, then trim the
    # surrounding whitespace of the joined result.
    name_ = ','.join(tree.xpath(path_name))
    period_ = ','.join(tree.xpath(path_period)).strip()
    rate_ = ','.join(tree.xpath(path_rate)).strip()
    print(name_, period_, rate_)

    info = np.array(['陆金所', nowTime, ''])
    info = np.vstack((info, ['名称', '期限(月)', '利率%']))
    info = np.vstack((info, [name_, period_.replace('个月', ''), rate_.replace('%', '')]))

    # Append (mode='a') so every scraper shares the same output file.
    pd.DataFrame(info).to_csv('p2p.csv', encoding='utf-8-sig', mode='a')
# 获取玖富普惠相关数据 可自定义需要爬取的优选产品页数 | |
def _jiufu_plan_rows(html_text, limit=6):
    """Return up to ``limit`` (name, term, rate) tuples from a plan-list page."""
    tree = etree.HTML(html_text)
    names = tree.xpath('//div[@class="opname clearfix"]/em/a/text()')
    periods = tree.xpath('//li[@class="opinfo-li-r"]/span/text()')
    rates = tree.xpath('//div[@class="oplixi clearfix"]/h2/em/text()')
    # zip() stops at the shortest list, so a page with fewer than `limit`
    # products no longer raises IndexError.
    return list(zip(names, periods, rates))[:limit]


def get_9fph(page=5):
    """Scrape 玖富普惠 products and append them to ``p2p.csv``.

    Collects, in order: newcomer products (JSON endpoint), the special-offer
    product on the home page, the "BBJH" plan list and ``page`` pages of the
    "yx" plan list (at most 6 products are read per list page).

    Args:
        page: number of "yx" plan pages to fetch (default 5).
    """
    headers = {
        'Referer': '',
        'Host': '',
        'Origin': '',
        'X-Requested-With': 'XMLHttpRequest',
        'Accept': 'text/html, */*; q=0.01',
    }  # browser-like request headers
    # NOTE(review): every URL literal is empty in the available source;
    # restore them before running.
    url = ''  # product-list endpoint
    url_h = ''  # home page
    n_url = ''  # newcomer-product endpoint
    # A random User-Agent makes the crawler harder to fingerprint.
    headers['User-Agent'] = UserAgent().random
    payload = {
        'queryProfit': '',
        'queryPeriod': '',
        'queryProductCode': '',
        'showType': 'K',
        'page': 0,
        'orderby': '0',
        'orderByType': 'desc',
        'productType': '',
    }

    # NOTE(review): the four request calls below were truncated in the
    # available source ("=, headers=headers)").  They are reconstructed as
    # requests.post(...) because every surviving call in this file is
    # requests.get and two of the truncated ones pass data=payload --
    # confirm the HTTP method against the original.
    # The timeout keeps a stalled server from hanging the run forever.
    h_content = requests.post(url_h, headers=headers, timeout=10).text
    s1 = etree.HTML(h_content)
    info = np.array(['玖富普惠', nowTime, '', ''])

    # Newcomer products.
    json_n = json.loads(requests.post(n_url, headers=headers, timeout=10).text)
    result = [
        (item.get('productName', 'NA'), item.get('period', 'NA'),
         item.get('standardProfit', 'NA'), item.get('plusProfit', 'NA'))
        for item in json_n
    ]
    info = np.vstack((info, ['名称', '期限(天)', '利率%', '加息']))
    if result:
        # Stacking an empty list would raise a shape-mismatch ValueError.
        info = np.vstack((info, result))
    for row in result:
        print(row[0], row[1], row[2], row[3])

    # Special-offer product shown on the home page.  The fixed indexes
    # ([2] and [1]) pick one match out of several on that page.
    name_1_1 = ",".join(s1.xpath('//div[@class ="subprod bg_ff box-bor po_re"]/div/div/a/text()'))
    period_1_1 = s1.xpath('//p[@class ="fr"]/em/text()')[2]
    rate_1_1 = s1.xpath('//div[@class ="fl"]/em/text()')[1].replace("%", "")
    print(name_1_1, period_1_1, rate_1_1)
    info = np.vstack((info, [name_1_1, period_1_1, rate_1_1, 0]))

    # "BBJH" plan list.
    payload['productType'] = "BBJH"
    bbjh_content = requests.post(url, data=payload, headers=headers, timeout=10).text
    for name_2, period_2, rate_2 in _jiufu_plan_rows(bbjh_content):
        print(name_2, period_2, rate_2)
        info = np.vstack((info, [name_2, period_2, rate_2, 0]))

    # "yx" plan list, page by page.
    for p in range(page):
        payload['page'] = p
        payload['productType'] = "yx"
        net_content = requests.post(url, data=payload, headers=headers, timeout=10).text
        t = random.randint(1, 5)
        print("正在获取第{}页数据,下一页数据将在{}秒后显示".format(p + 1, t))
        time.sleep(t)  # random pause between requests
        for name_3, period_3, rate_3 in _jiufu_plan_rows(net_content):
            print(name_3, period_3, rate_3)
            info = np.vstack((info, [name_3, period_3, rate_3, 0]))

    # Append (mode='a') so every scraper shares the same output file.
    pd.DataFrame(info).to_csv('p2p.csv', encoding='utf-8-sig', mode='a')
# 积木盒子 | |
def get_jimu(num = 20 ,page = 3):
    """Scrape 积木盒子 products and append them to ``p2p.csv``.

    Two sections are collected: the first listing page (``num`` entries,
    labelled 轻松投 in the source) and ``page`` pages of the second listing
    (12 entries per page, labelled 自选投).  Text fields come from lxml
    XPath queries; the rate comes from a PyQuery CSS selector.

    Args:
        num: how many entries to read from the first listing (default 20).
        page: how many pages of the second listing to fetch (default 3).
    """
    headers = {
        'Host': '',
        'Referer': '',
        'Upgrade-Insecure-Requests': '1',
    }  # browser-like request headers
    # NOTE(review): the URL/Host/Referer literals are empty in the available
    # source; restore them before running.
    url_q = ''  # first listing (轻松投)
    # The timeout keeps a stalled server from hanging the run forever.
    content_q = requests.get(url_q, headers=headers, timeout=10).text
    q = etree.HTML(content_q)
    jpyq = PyQuery(content_q)

    infoj = np.array(['积木盒子', nowTime, '', '', ''])
    infoj = np.vstack((infoj, (['名称', '期限(月)', '利率', '加息', '开放状态'])))

    # First listing.
    for i in range(num):
        nth = i + 1  # XPath and :nth-child() are 1-based
        path_name = '//html/body/div[4]/div[2]/div/a[{}]/div/div[1]/text()'.format(nth)
        path_period = '/html/body/div[4]/div[2]/div/a[{}]/div/div[3]/div[2]/div[1]/text()'.format(nth)
        path_status = '/html/body/div[4]/div[2]/div/a[{}]/div/div[3]/div[3]/div[1]/text()'.format(nth)
        name = ','.join(q.xpath(path_name))
        period = ','.join(q.xpath(path_period)).strip()
        status = ','.join(q.xpath(path_status))
        # NOTE(review): this selector is garbled in the available source
        # ("div > > div.rate" is missing a step) and is kept verbatim; it
        # needs to be repaired from the original before it can match.
        ratejpyq = jpyq(
            'body > div.container.venus-container > div.project-container > div > a:nth-child({}) > div > > div.rate > div.num.invest-item-profit'.format(
                nth)).text()
        # A rate shown as "base+bonus" is split into its two parts.
        rate = ratejpyq.split('+')
        if len(rate) == 2:
            bonus = rate[1].replace('%', '')
            print(name, period, rate[0], bonus, status)
            infoj = np.vstack((infoj, (name, period.replace(',', ''), rate[0], bonus, status)))
        else:
            base = rate[0].replace('%', '')
            print(name, period, base, status)
            infoj = np.vstack((infoj, (name, period.replace(',', ''), base, '0', status)))

    # Second listing.
    infoj = np.vstack((infoj, (['名称', '期限(月)', '利率', '加息', '募集进度/已募集金额(万元)'])))
    for page_z in range(page):
        t = random.randint(1, 5)
        print('正在获取第{}页数据,下一页数据将在{}秒后显示'.format(page_z+1,t))
        time.sleep(t)  # random pause between requests
        url_z = '{}&category=&status='.format(page_z+1)  # second listing (自选投)
        content_z = requests.get(url_z, headers=headers, timeout=10).text
        z = etree.HTML(content_z)
        jpyz = PyQuery(content_z)
        for a in range(12):
            nth = a + 1
            path_name_1 = "/html/body/div[6]/div/div[{}]/a/div/div[1]/div[1]/text()".format(nth)
            path_period_1 = "/html/body/div[6]/div/div[{}]/a/div/div[4]/div/div[3]/div[1]/span/text()".format(nth)
            path_status_1 = "/html/body/div[6]/div/div[{}]/a/div/p/span[1]/text()".format(nth)
            name_1 = ','.join(z.xpath(path_name_1)).strip()
            ratejpyz = jpyz(
                'body > div.container.project-list > div > div:nth-child({}) > a > div:nth-child(1) > div.invest-item-features > div > div.invest-item-feature.invest-item-rate > div:nth-child(1) > span'.format(
                    nth
                )).text()
            period_1 = ','.join(z.xpath(path_period_1)).strip()
            status_1 = ','.join(z.xpath(path_status_1)).strip()
            rate_1 = ratejpyz.replace('%', '')
            infoj = np.vstack((infoj, (name_1, period_1, rate_1, '0', status_1)))
            print(name_1, period_1, rate_1, status_1)

    # Append (mode='a') so every scraper shares the same output file.
    pd.DataFrame(infoj).to_csv('p2p.csv', encoding='utf-8-sig', mode='a')
#获取拍拍贷 | |
def get_ppd():
    """Scrape 拍拍贷 products from its JSON endpoint and append them to ``p2p.csv``.

    A JSON body is posted to the endpoint and the product list is read from
    ``resultContent.produts`` in the response.  Each product becomes a
    (title, days, rate, bonus rate) row; missing fields are written as 'NA'.
    """
    headers = {
        'Content-Type': 'application/json;charset=UTF-8',
        'Origin': '',
        'Referer': '',
    }  # browser-like request headers
    # NOTE(review): the URL/Origin/Referer literals are empty in the
    # available source; restore them before running.
    url = ''  # product endpoint
    # A random User-Agent makes the crawler harder to fingerprint.
    headers['User-Agent'] = UserAgent().random
    payload = {
        'appid': 'h5',
        'source': '1',
        'version': '1',
    }

    # NOTE(review): this call was truncated in the available source
    # ("=, data=json.dumps(payload), headers=headers)"); it is reconstructed
    # as requests.post(url, ...) -- confirm against the original.
    # The timeout keeps a stalled server from hanging the run forever.
    h_content = requests.post(url, data=json.dumps(payload), headers=headers, timeout=10).text
    json_content = json.loads(h_content)

    info = np.array(['【拍拍贷】', nowTime, '', ''])
    info = np.vstack((info, ['名称', '期限(天)', '利率%', '加息%']))
    # 'produts' (sic) is the key spelling used in the source.
    result = [
        (item.get('title', 'NA'), item.get('days', 'NA'),
         item.get('rate', 'NA'), item.get('addInterestRate', 'NA'))
        for item in json_content["resultContent"]['produts']
    ]
    if result:
        # Stacking an empty list would raise a shape-mismatch ValueError.
        info = np.vstack((info, result))

    # Append (mode='a') so every scraper shares the same output file.
    pd.DataFrame(info).to_csv('p2p.csv', encoding='utf-8-sig', mode='a')
    for productinfo in result:
        print(productinfo)
# 信而富 | |
def get_xinerfu(pro_num = 11):
    """Scrape 信而富 products and append them to ``p2p.csv``.

    For each of the first ``pro_num`` product rows on the page, reads the
    name, the term and the rate range, and stores the lower bound, upper
    bound and their mean.  Rows whose rate is missing or not numeric are
    skipped instead of aborting the whole run.

    Args:
        pro_num: number of product rows to read from the page (default 11).
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7,zh-TW;q=0.6',
        'Connection': 'keep-alive',
        'Host': '',
        'Upgrade-Insecure-Requests': '1',
    }  # browser-like request headers
    # NOTE(review): the URL/Host literals are empty in the available source;
    # restore them before running.
    url = ''  # product listing page
    # A random User-Agent makes the crawler harder to fingerprint.
    headers['User-Agent'] = UserAgent().random
    # The timeout keeps a stalled server from hanging the run forever.
    h_content = requests.get(url, headers=headers, timeout=10).text
    h = etree.HTML(h_content)

    info = np.array(['信而富', nowTime, '', '', ''])
    info = np.vstack((info, ['名称', '期限(天)', '利率下限%', '利率上限%', '平均利率']))
    row_xpath = '/html/body/div[1]/div/div[3]/div[2]/ul[{}]'
    for i in range(pro_num):
        base = row_xpath.format(i + 1)
        name = ",".join(h.xpath(base + '/li[1]/a/text()'))  # product name
        period = ",".join(h.xpath(base + '/li[4]/text()'))  # term in days
        rate = ",".join(h.xpath(base + '/li[5]/*/text()'))  # rate bounds
        rate2 = rate.split(',')
        # A single value is treated as a degenerate range (low == high).
        low = rate2[0]
        high = rate2[1] if len(rate2) > 1 else rate2[0]
        try:
            # Always the mean of the two bounds; dividing by len(rate2)
            # gave a wrong value whenever extra text nodes were matched.
            meanrate = (float(low) + float(high)) / 2
        except ValueError:
            # No such row on the page, or the rate text is not numeric.
            continue
        print(name, period, low, high, meanrate)
        info = np.vstack((info, [name, period, low, high, meanrate]))

    # Append (mode='a') so every scraper shares the same output file.
    pd.DataFrame(info).to_csv('p2p.csv', encoding='utf-8-sig', mode='a')
# 人人贷 | |
def get_rrd():
    """Scrape the 人人贷 home page and append its products to ``p2p.csv``.

    Reads four sections by absolute XPath: the newcomer product, the
    优选计划 product, six U计划 entries and the 薪计划 product.  Each row is
    (name, term, rate, bonus rate).
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7,zh-TW;q=0.6',
        'Connection': 'keep-alive',
        'Host': '',
        'Upgrade-Insecure-Requests': '1',
    }  # browser-like request headers
    # NOTE(review): the URL/Host literals are empty in the available source;
    # restore them before running.
    url = ''  # home page
    # A random User-Agent makes the crawler harder to fingerprint.
    headers['User-Agent'] = UserAgent().random
    # The timeout keeps a stalled server from hanging the run forever.
    h_content = requests.get(url, headers=headers, timeout=10).text
    h = etree.HTML(h_content)

    def text_at(path):
        """Join every text node matched by ``path`` with commas."""
        return ",".join(h.xpath(path))

    info = np.array(['人人贷', nowTime, '', ''])
    info = np.vstack((info, ['名称', '期限(月)', '利率%', '加息%']))

    # Newcomer product.
    name = text_at('/html/body/div[4]/div[3]/div[3]/div[1]/div/span/text()')
    period = text_at('/html/body/div[4]/div[3]/div[3]/div[2]/div[2]/div[1]/text()').replace('个月', '')
    rate = text_at('/html/body/div[4]/div[3]/div[3]/div[2]/div[1]/div[1]/text()')
    print(name, period, rate)
    info = np.vstack((info, [name, period, rate, '0']))

    # 优选计划: base rate and bonus rate live in different nodes.
    name = text_at('/html/body/div[4]/div[4]/div[1]/div/span/text()')
    period = text_at('/html/body/div[4]/div[4]/div[2]/div[2]/div[1]/text()').replace('个月', '')
    rate1 = text_at('/html/body/div[4]/div[4]/div[2]/div[1]/div[1]/text()')  # base rate
    rate2 = text_at('/html/body/div[4]/div[4]/div[2]/div[1]/div[1]/*/text()')  # bonus rate
    rate = (rate1.replace("%", "") + rate2.replace("%", "")).replace(",+", " ")
    info = np.vstack((info, [name, period, rate1, rate2.replace("%", "").replace(",+", " ")]))
    print(name, period, rate)

    # U计划: six entries; a bonus, when present, follows ",+" in the text.
    for u in range(6):
        period = text_at('/html/body/div[4]/div[5]/div[2]/ul/li[{}]/a/p[1]/span/em/text()'.format(u+1)).replace('个月', '')
        rate = text_at('/html/body/div[4]/div[5]/div[2]/ul/li[{}]/a/p[2]/span[1]/i/text()'.format(u+1))
        rates = rate.split(',+')
        bonus = rates[1] if len(rates) == 2 else ''
        info = np.vstack((info, ["U计划", period, rates[0], bonus]))
        print("U计划", period, rate.replace(",+", " "))

    # 薪计划.
    name = text_at('/html/body/div[4]/div[6]/div[1]/div[1]/div/div/text()')
    period = text_at('/html/body/div[4]/div[6]/div[1]/div[2]/div[3]/div[1]/text()').replace('个月', '')
    rate = text_at('/html/body/div[4]/div[6]/div[1]/div[2]/div[1]/div[1]/text()')
    print(name, period, rate)
    info = np.vstack((info, [name, period, rate, '0']))

    # Append (mode='a') so every scraper shares the same output file.
    pd.DataFrame(info).to_csv('p2p.csv', encoding='utf-8-sig', mode='a')
# 网信普惠 | |
def get_wxph(page=5):
    """Scrape 网信普惠 products and append them to ``p2p.csv``.

    On the first page the three 智多鑫 entries are read and their rate range
    is averaged and their term converted from days to months (days / 30).
    On every page, ten loan entries are read from ``#conbd``.

    Args:
        page: number of listing pages to fetch (default 5).
    """
    info = np.array(['网信普惠', nowTime, '', ''])
    info = np.vstack((info, ['名称1', '名称2', '期限(月)', '利率%']))
    # The header set does not change between pages, so build it once.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7,zh-TW;q=0.6',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': '',
        'Upgrade-Insecure-Requests': '1',
    }  # browser-like request headers
    ua = UserAgent()
    for p in range(page):
        # NOTE(review): the URL prefix (and Host) is empty in the available
        # source; restore it before running.
        url = '{}'.format(p+1)  # listing page
        # A fresh random User-Agent per page, as in the original.
        headers['User-Agent'] = ua.random
        # The timeout keeps a stalled server from hanging the run forever.
        h_content = requests.get(url, headers=headers, timeout=10).text
        jpy = PyQuery(h_content)
        t = random.randint(1, 5)
        print("第{}页产品信息下载中,下一页将在{}秒后下载".format(p+1,t))
        time.sleep(t)  # random pause between requests

        # 智多鑫 entries are only read from the first page.
        if p == 0:
            for i in range(3):
                jpyzdx = jpy("#duotou > div.ph_zdxlist > div:nth-child({})".format(i + 1))
                name1 = jpyzdx('div > div.con_l > h3 > a').text()
                rate1 = jpyzdx('div>div>div>p>span>i').text()
                period1 = jpyzdx('div> p> em').text()
                try:
                    # "a%~ b%" -> [a, b]; a single value is also accepted.
                    bounds = [float(part) for part in rate1.replace('%', '').split('~ ')]
                    days = float(period1.replace('天可申请转让/退出', '').replace('天', ''))
                except ValueError:
                    # Entry absent or not numeric: skip it, keep the rest.
                    continue
                rate3 = sum(bounds) / len(bounds)
                period3 = days / 30
                info = np.vstack((info, [name1, '', period3, rate3]))
                print(name1, period3, rate3)

        # Loan entries (消费贷 / 经营贷).
        for i in range(10):
            jpyp2p = jpy("#conbd > div:nth-child({})".format(i + 1))
            name = jpyp2p('div > div.con_l > h3 > a').text().replace(' ', '')
            namegyl = jpyp2p('div > div.con_l > h3 > span').text().replace(' ', '')
            rate = jpyp2p('div>div>div>p>span>i').text().replace('进度条', '')
            period = jpyp2p('div> p> em').text().replace('个月', '')
            info = np.vstack((info, [name, namegyl, period, rate]))
            print(name, namegyl, period, rate)

    # Append (mode='a') so every scraper shares the same output file.
    pd.DataFrame(info).to_csv('p2p.csv', encoding='utf-8-sig', mode='a')
# 微贷网 | |
def get_weidai(p_yx=5,p_sb=5):
    """Scrape 微贷网 products from its JSON endpoints and append them to ``p2p.csv``.

    Three sections are collected: X计划 for the 3/6/12-month terms, up to
    ``p_yx`` pages of 优选智投 (goodsType=PACKAGE) and up to ``p_sb`` pages
    of 散标 (goodsType=BIDDING).  Paging stops early on an empty page.

    Args:
        p_yx: maximum number of 优选智投 pages to fetch (default 5).
        p_sb: maximum number of 散标 pages to fetch (default 5).
    """
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7,zh-TW;q=0.6',
        'Connection': 'keep-alive',
        'Host': '',
        'Referer': '',
        'X-Requested-With': 'XMLHttpRequest',
    }  # browser-like request headers
    # A random User-Agent makes the crawler harder to fingerprint.
    headers['User-Agent'] = UserAgent().random
    info = np.array(['微贷网', nowTime, '', '', ''])
    info = np.vstack((info, ['名称', '期限(天)', '期限(月)', '利率%', '加息%']))
    fields = ('goodsTitle', 'days', 'month', 'baseRate', 'addRate')

    # NOTE(review): the URL prefixes (and Host/Referer) are empty in the
    # available source; restore them before running.
    # X计划: one request per term.
    for pe in [3, 6, 12]:
        url_xplan = '{}&_stamp=1527673964336'.format(pe)
        # The timeout keeps a stalled server from hanging the run forever.
        xplan_json = json.loads(requests.get(url_xplan, headers=headers, timeout=10).text)
        plan = xplan_json['data']
        period, baserate, addrate = plan['month'], plan['baseRate'], plan['addRate']
        print("X计划 0 {} {} {}".format(period, baserate, addrate))
        # The rates are multiplied by 100 to fit the "%" columns.
        info = np.vstack((info, ['X计划', '0', period, baserate*100, addrate*100]))
        time.sleep(1)

    # 优选智投.
    for page in range(p_yx):
        time.sleep(random.randint(1, 5))  # random pause between requests
        url_yx = '{}&rows=10&goodsType=PACKAGE'.format(page+1)
        yx_json = json.loads(requests.get(url_yx, headers=headers, timeout=10).text)
        if yx_json['resultCode'] != '1000':
            # Unexpected status code from the endpoint: stop paging.
            print('网页状态有误')
            break
        items = yx_json['data']['data']
        if not items:
            print('数据信息为空')
            break
        info1 = [tuple(item.get(key, 'NA') for key in fields) for item in items]
        info = np.vstack((info, np.array(info1)))
        print(info1)

    # 散标.
    for page in range(p_sb):
        time.sleep(random.randint(1, 5))  # random pause between requests
        url_sb = '{}&rows=10&goodsType=BIDDING'.format(page+1)
        sb_json = json.loads(requests.get(url_sb, headers=headers, timeout=10).text)
        items = (sb_json.get('data') or {}).get('data')
        if not items:
            # Stacking an empty page would raise a shape-mismatch ValueError;
            # treat it as the end of the listing, like the loop above.
            print('数据信息为空')
            break
        info2 = [tuple(item.get(key, 'NA') for key in fields) for item in items]
        info = np.vstack((info, np.array(info2)))
        for title, days, month, base_rate, add_rate in info2:
            print(title.replace(" ", ""), days, month, base_rate, add_rate)

    # Append (mode='a') so every scraper shares the same output file.
    pd.DataFrame(info).to_csv('p2p.csv', encoding='utf-8-sig', mode='a')
# 星火 | |
def get_xinghuo():
    """Scrape 星火 products from two JSON endpoints and append them to ``p2p.csv``.

    Rows are (name, term, rate, bonus rate, remaining quota).  For the first
    endpoint (``url_ltb``) the term is divided by 30 and rounded to two
    decimals; for the second (``url_yyb``) it is stored as received.
    Missing fields are written as 'NA'.
    """
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7,zh-TW;q=0.6',
        'Connection': 'keep-alive',
        'Host': '',
        'Referer': '',
        'X-Requested-With': 'XMLHttpRequest',
    }  # browser-like request headers
    # A random User-Agent makes the crawler harder to fingerprint.
    headers['User-Agent'] = UserAgent().random
    info = np.array(['星火', nowTime, '', '', ''])
    info = np.vstack((info, ['名称', '期限(月)', '利率%', '加息%', '剩余额度(元)']))
    # NOTE(review): the URL/Host/Referer literals are empty in the available
    # source; restore them before running.
    url_ltb = ''  # first product endpoint
    url_yyb = ''  # second product endpoint

    def to_months(days):
        """Convert a day count to months, or 'NA' when it is not numeric."""
        try:
            return round(float(days) / 30, 2)
        except (TypeError, ValueError):
            # Previously a missing period made 'NA' / 30 raise TypeError.
            return 'NA'

    def rows_from(items, convert_period):
        """Build one output tuple per product dict."""
        rows = []
        for item in items:
            period = item.get('productPeriod', 'NA')
            if convert_period:
                period = to_months(period)
            rows.append((item.get('productName', 'NA'), period,
                         item.get('annualRate', 'NA'),
                         item.get('floatAnnualRate', 'NA'),
                         item.get('productQuota', 'NA')))
        return rows

    # The timeout keeps a stalled server from hanging the run forever.
    ltb_json = json.loads(requests.get(url_ltb, headers=headers, timeout=10).text)
    data_ltb = rows_from(ltb_json['data'], convert_period=True)
    if data_ltb:
        # Stacking an empty list would raise a shape-mismatch ValueError.
        info = np.vstack((info, np.array(data_ltb)))
    for ltb in data_ltb:
        print(ltb[0], ltb[1], ltb[2], ltb[3], ltb[4])

    yyb_json = json.loads(requests.get(url_yyb, headers=headers, timeout=10).text)
    data_yyb = rows_from(yyb_json['data'], convert_period=False)
    if data_yyb:
        info = np.vstack((info, np.array(data_yyb)))
    for yyb in data_yyb:
        print(yyb[0], yyb[1], yyb[2], yyb[3], yyb[4])

    # Append (mode='a') so every scraper shares the same output file.
    pd.DataFrame(info).to_csv('p2p.csv', encoding='utf-8-sig', mode='a')
#投米 | |
def get_toumi():
    """Scrape 投米 products from its JSON endpoint and append them to ``p2p.csv``.

    Products are read from ``data.list`` in the response.  Rows are
    (name, term, rate, bonus rate, can-buy flag); the term is ``runDays``
    divided by 30 and rounded to two decimals.  Missing fields are written
    as 'NA'.
    """
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7,zh-TW;q=0.6',
        'Connection': 'keep-alive',
        'Host': '',
        'Referer': '',
        'X-Requested-With': 'XMLHttpRequest',
    }  # browser-like request headers
    # A random User-Agent makes the crawler harder to fingerprint.
    headers['User-Agent'] = UserAgent().random
    # NOTE(review): the URL/Host/Referer literals are empty in the available
    # source; restore them before running.
    url_tm = ''  # product endpoint
    # The timeout keeps a stalled server from hanging the run forever.
    tm_content = requests.get(url_tm, headers=headers, timeout=10).text

    info = np.array(['投米', nowTime, '', '', ''])
    info = np.vstack((info, ['产品名称', '期限(月)', '利率%', '加息%', '可购状态']))
    tm_json = json.loads(tm_content)

    def to_months(days):
        """Convert a day count to months, or 'NA' when it is not numeric."""
        try:
            return round(float(days) / 30, 2)
        except (TypeError, ValueError):
            # Previously a missing runDays made float('NA') raise ValueError.
            return 'NA'

    data_tm = [
        (item.get('subProductName', 'NA'), to_months(item.get('runDays', 'NA')),
         item.get('annualRate', 'NA'), item.get('floatAnnualRate', 'NA'),
         item.get('canBuy', 'NA'))
        for item in tm_json['data']['list']
    ]
    if data_tm:
        # Stacking an empty list would raise a shape-mismatch ValueError.
        info = np.vstack((info, np.array(data_tm)))
    for tm in data_tm:
        print(tm[0], tm[1], tm[2], tm[3], tm[4])

    # Append (mode='a') so every scraper shares the same output file.
    pd.DataFrame(info).to_csv('p2p.csv', encoding='utf-8-sig', mode='a')
#指旺 | |
import requests | |
import json | |
import time | |
# Request settings for the 指旺 app endpoint (product list shown to a
# logged-out user).
# NOTE(review): the URL prefix and the Host value are empty in the available
# source and must be restored.  "&timestamp" below repairs "×tamp", which is
# what "&times" + "tamp" turns into when "&times" is decoded as an HTML
# entity.
url_newuser_product_list = '&timestamp=1529284324564.817&user_id=0'
headers = {
    'Host': '',
    'Accept': '*/*',
    'User-Agent': 'ZW/4.7.3 (iPhone; iOS 11.4; Scale/2.00)',
    'Accept-Language': 'en-CN;q=1, zh-Hans-CN;q=0.9',
    'Accept-Encoding': 'br, gzip, deflate',
    'Connection': 'keep-alive',
}
def _zhiwang_coupon_label(icons):
    """Map a product's ``prod_list_icons`` list to a coupon-usability label."""
    # NOTE(review): both icon identifiers are empty in the available source;
    # restore them, otherwise the single-icon case cannot tell the two apart.
    coupon_icon = ''
    red_packet_icon = ''
    if len(icons) == 2:
        return '可用红包、券'
    if len(icons) == 0:
        return '不可用券'
    label = '未知'  # fallback; the original left the label unset here
    if len(icons) == 1:
        if coupon_icon in icons:
            label = '可用券'
        if red_packet_icon in icons:
            # Checked second, so it wins when both match (as before).
            label = '可用红包'
    return label


def get_product_info(data=None):
    """Print the 指旺 product list and return the printed rows.

    Walks ``product_categories[0].product_groups`` of the response, skips
    groups with an empty ``group_name``, and prints one line per product:
    name, term, rate, bonus rate and a coupon-usability label.

    Args:
        data: decoded JSON response.  When omitted, the module-level
            ``jsoninfo`` is used, which keeps the original zero-argument
            call working.

    Returns:
        A list of (name, period, rate1, rate2, label) tuples, one per
        printed line.  It is empty when ``return_code`` is non-zero and
        partial when the payload turns out to be malformed part-way.
    """
    payload = jsoninfo if data is None else data
    if payload['return_code'] != 0:
        print('json error')
        return []

    rows = []
    try:
        pro_group = payload['product_categories'][0]['product_groups']
        for group in pro_group:
            if group['group_name'] == '':
                continue
            products = group['products']
            if len(products) > 1:
                for product in products:
                    items = product['product_list_items']
                    name = product['name']
                    period = items[1]['value']
                    rate1 = items[0]['value']
                    # NOTE(review): '%' is replaced by '0' here, whereas the
                    # single-product branch strips it -- kept as in the
                    # source; confirm which one is intended.
                    rate2 = items[0]['extra'].replace('%', '0')
                    quan = _zhiwang_coupon_label(product['prod_list_icons'])
                    print(name, period, rate1, rate2, quan)
                    rows.append((name, period, rate1, rate2, quan))
            elif len(products) == 1:
                # A lone product carries its rates under annual_rate_info.
                product = products[0]
                name = product.get('name')
                period = product['product_list_items'][1].get('value')
                rate_info = product['annual_rate_info']
                rate1 = rate_info.get('annual_rate_str')
                rate2 = rate_info.get('added_annual_rate_str').replace('+', '').replace('%', '')
                if group['group_name'] == '福卡专享':
                    quan = '不可用券'
                else:
                    quan = _zhiwang_coupon_label(product['prod_list_icons'])
                print(name, period, rate1, rate2, quan)
                rows.append((name, period, rate1, rate2, quan))
    except (KeyError, IndexError, TypeError, AttributeError):
        # The payload did not have the expected shape; report it instead of
        # hiding every possible error behind a bare except.
        print('超出list')
    return rows
# Fetch the 指旺 product list as seen by a logged-out user and print it.
# NOTE: this runs at import time, before the __main__ guard below.
# The timeout keeps a stalled server from hanging the run forever.
html = requests.get(url_newuser_product_list, headers=headers, timeout=10)
jsoninfo = json.loads(html.text)  # read by get_product_info() as a global
get_product_info()
time.sleep(5)
if __name__ == '__main__':
    # Uncomment the scrapers you want to run; each one appends its own
    # section to p2p.csv.
    get_xinghuo()  # 星火
    # get_toumi()  # 投米
    # get_xiaoying()  # 小赢理财: product name, term, rate
    # get_iqianjin()  # 爱钱进
    # get_lup2p()  # 陆金所
    # get_9fph(5)  # 玖富普惠: argument = number of 优选 pages (usually 5-10, default 5)
    # get_jimu()  # 积木盒子: arguments = number of 轻松投 products and number of 自选投 pages (defaults 20 and 3)
    # get_ppd()  # 拍拍贷: name, term, rate, bonus rate
    # get_xinerfu()  # 信而富: term is in days and the rate is a range; argument = number of products on the page (default 11)
    # get_rrd()  # 人人贷: 散标 and promotions are not scraped and must be checked by hand
    # get_wxph(10)  # 网信普惠: argument = number of pages (usually 5 or 10, default 5)
    # get_weidai(10,10)  # 微贷网: arguments = 优选 pages (usually 2-3) and 散标 pages (usually 5 or 10, default 5)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment