Last active
March 31, 2019 10:55
-
-
Save incidunt/8a2cf96c20b06186defa7225219b21b3 to your computer and use it in GitHub Desktop.
用pandas过滤出WordPress精品插件.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# 首先引入所有需要用的库\n", | |
"\n", | |
"#读取jsonl文件的库\n", | |
"import jsonlines\n", | |
"\n", | |
"# 数据分析的库\n", | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"\n", | |
"\n", | |
"import maya\n", | |
"import json\n", | |
"\n", | |
"import requests\n", | |
"\n", | |
"import functools\n", | |
"\n", | |
"import html\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Fields to keep from each scraped plugin record; every other key is dropped.\n", | |
"keepkeys = [\n", | |
"    \"slug\",\n", | |
"    \"name\",\n", | |
"    \"author\",\n", | |
"    \"downloaded\",\n", | |
"    \"rating\",\n", | |
"    \"num_ratings\",\n", | |
"    \"added\",\n", | |
"    \"last_updated\",\n", | |
"    \"tested\",\n", | |
"    \"support_threads\",\n", | |
"    \"support_threads_resolved\",\n", | |
"]\n", | |
"\n", | |
"# output.jsonl was obtained with a crawler; for details see the article\n", | |
"# \"Scraping all plugins from WordPress.org with Python\":\n", | |
"# https://bestscreenshot.com/scrap-all-plugins-from-wordpress-org/\n", | |
"with jsonlines.open('../output.jsonl') as reader:\n", | |
"    # Build a trimmed copy of every record, keeping the record's own key order.\n", | |
"    plugins = [\n", | |
"        {key: value for key, value in record.items() if key in keepkeys}\n", | |
"        for record in reader\n", | |
"    ]\n", | |
"\n", | |
"df = pd.DataFrame(plugins)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Decode the HTML escape sequences found in the plugin names.\n", | |
"df['name'] = df['name'].apply(html.unescape)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Number of days between the current date and each plugin's last update.\n", | |
"# The current time is captured once so that every row is measured against the\n", | |
"# same instant, instead of calling maya.now() again for each row.\n", | |
"reference_time = maya.now()\n", | |
"df['last_updated_days'] = df['last_updated'].apply(\n", | |
"    lambda stamp: (reference_time - maya.when(stamp)).days\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def conjunction(*conditions):\n", | |
"    \"\"\"Combine boolean masks with an element-wise logical AND.\n", | |
"\n", | |
"    The masks are folded left to right with ``np.logical_and``, so the result\n", | |
"    is true only where every condition is true. A single mask is returned\n", | |
"    unchanged; calling with no masks raises ``TypeError`` (from\n", | |
"    ``functools.reduce``, which is given no initial value).\n", | |
"    \"\"\"\n", | |
"    def both(left, right):\n", | |
"        return np.logical_and(left, right)\n", | |
"\n", | |
"    return functools.reduce(both, conditions)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# All filter conditions, one boolean mask per criterion.\n", | |
"\n", | |
"# last_updated_days is at most 365\n", | |
"last_updated_days_bool = df['last_updated_days'].le(365)\n", | |
"\n", | |
"# downloaded is greater than 1000\n", | |
"downloaded_bool = df['downloaded'].gt(1000)\n", | |
"\n", | |
"# rating is at least 90\n", | |
"rating_bool = df['rating'].ge(90)\n", | |
"\n", | |
"# num_ratings is at least 100\n", | |
"num_ratings_bool = df['num_ratings'].ge(100)\n", | |
"\n", | |
"# support_threads is greater than zero\n", | |
"support_threads_bool = df['support_threads'].gt(0)\n", | |
"\n", | |
"# support_threads_resolved is greater than zero\n", | |
"support_threads_resolved_bool = df[\"support_threads_resolved\"].gt(0)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Keep only the rows for which every filter condition holds.\n", | |
"data_filtered = df.loc[\n", | |
"    conjunction(\n", | |
"        last_updated_days_bool,\n", | |
"        downloaded_bool,\n", | |
"        rating_bool,\n", | |
"        num_ratings_bool,\n", | |
"        support_threads_bool,\n", | |
"        support_threads_resolved_bool,\n", | |
"    )\n", | |
"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"added 312\n", | |
"author 312\n", | |
"downloaded 312\n", | |
"last_updated 312\n", | |
"name 312\n", | |
"num_ratings 312\n", | |
"rating 312\n", | |
"slug 312\n", | |
"support_threads 312\n", | |
"support_threads_resolved 312\n", | |
"tested 312\n", | |
"last_updated_days 312\n", | |
"dtype: int64" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Non-null value count per column of the filtered frame.\n", | |
"data_filtered.count()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Convert the filtered result into a dictionary keyed by slug.\n", | |
"data_filtered_dict = (\n", | |
"    data_filtered\n", | |
"    .set_index('slug')\n", | |
"    .T  # transpose so each slug becomes a column, i.e. a top-level dict key\n", | |
"    .to_dict()\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Convert the dictionary into a list of records, putting the slug key first\n", | |
"# and the remaining fields after it.\n", | |
"data_filtered_list = [\n", | |
"    {'slug': slug, **fields}\n", | |
"    for slug, fields in data_filtered_dict.items()\n", | |
"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'slug': 'accelerated-mobile-pages',\n", | |
" 'added': '2016-02-07',\n", | |
" 'author': '<a href=\"https://ampforwp.com/\">Ahmed Kaludi, Mohammed Kaludi</a>',\n", | |
" 'downloaded': 4137343,\n", | |
" 'last_updated': '2019-02-28 12:52pm GMT',\n", | |
" 'name': 'AMP for WP – Accelerated Mobile Pages',\n", | |
" 'num_ratings': 780,\n", | |
" 'rating': 90,\n", | |
" 'support_threads': 286,\n", | |
" 'support_threads_resolved': 60,\n", | |
" 'tested': '5.1',\n", | |
" 'last_updated_days': 30}" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Display the record at index 1 to inspect the shape of one entry.\n", | |
"data_filtered_list[1]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Save the result list as a JSON file.\n", | |
"# The context manager closes the file when the block exits, so the data is\n", | |
"# flushed to disk right away instead of whenever the handle is collected.\n", | |
"with open(\"data_filtered_result.json\", \"w\") as out_file:\n", | |
"    json.dump(data_filtered_list, out_file)\n" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment