Skip to content

Instantly share code, notes, and snippets.

@Joshua1989
Last active October 9, 2018 20:18
Show Gist options
  • Save Joshua1989/0be39aa42dd8490f170618d91e3c8dd0 to your computer and use it in GitHub Desktop.
Save Joshua1989/0be39aa42dd8490f170618d91e3c8dd0 to your computer and use it in GitHub Desktop.
Bangumi_Average_Rating_By_Company.ipynb
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"metadata": {},
"cell_type": "markdown",
"source": "适用于已把制作公司标注为每个条目最后一个 tag 的情况"
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2018-10-09T20:17:05.918454Z",
"end_time": "2018-10-09T20:17:05.929882Z"
},
"trusted": true
},
"cell_type": "code",
"source": "from bs4 import BeautifulSoup\nimport requests\nimport pandas as pd\nuser_id, page_num = 'hentyclopedia', 40\ndata, companies = {}, set()\nfor i in range(1, page_num + 1):\n url = f'https://bgm.tv/anime/list/{user_id}/collect?page={i}'\n html = requests.get(url).content.decode('utf8')\n soup = BeautifulSoup(html, 'lxml')\n items = soup.findAll('div', attrs={'class':'inner'})[2:-1]\n for item in items:\n try:\n subID = item.a['href'].split('/')[-1]\n name = item.a.string\n rate = int(item.findAll('span', attrs={'class': 'starsinfo'})[0]['class'][0][6:])\n tags = item.findAll('span', attrs={'class': 'tip'})[0].string.split()[1:]\n data[subID] = {'name': name, 'rate': rate, 'tags': set(tags)}\n # I put the company info as the last tag\n companies.add(tags[-1])\n except Exception:\n pass\n# There are some anime lacking this info so I need to manually remove them\ncompanies -= {'军事','后宫','奇幻','恋爱','战斗', '搞笑','日常','校园','母爱','治愈','游戏改','漫改','燃','猎奇','百合','肉番','里番'}\n# Split genre tags and company tags\nfor k, v in data.items():\n v['company'] = companies & v['tags']\n v['tags'] -= v['company']\n# Compute statistics\nstat = {x: {'rate_sum': 0, 'count': 0} for x in companies}\nfor k, v in data.items():\n for c in v['company']:\n stat[c]['rate_sum'] += v['rate']\n stat[c]['count'] += 1\ndf = pd.DataFrame(stat).transpose()\ndf['avg'] = df['rate_sum'] / df['count']\n# Show average rating for companies whose count >= 10\ndf[df['count'] >= 10].sort_values(by='avg', ascending=False)",
"execution_count": 110,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 110,
"data": {
"text/plain": " count rate_sum avg\nWHITEFOX 12 95 7.916667\nSHAFT 29 223 7.689655\nMADHouse 27 201 7.444444\nSUNRISE 21 156 7.428571\nufotable 19 141 7.421053\nBONES 29 214 7.379310\ndavidproduction 11 80 7.272727\nP.A.WORKS 21 152 7.238095\nProduction.I.G 17 123 7.235294\nBrainsBase 11 79 7.181818\nA-1Pictures 43 308 7.162791\n京阿尼 35 250 7.142857\nWITSTUDIO 14 99 7.071429\nJ.C.STAFF 60 419 6.983333\nfeel 23 158 6.869565\nstudiodeen 26 178 6.846154\nTMSEntertainment 13 89 6.846154\nLerche 27 182 6.740741\n动画工房 20 134 6.700000\nStudio五组 22 144 6.545455\nSILVERLINK. 51 330 6.470588\nTNK 12 77 6.416667\nAIC 25 160 6.400000\nDiomedéa 23 147 6.391304\nprojectNo.9 10 63 6.300000\nLIDENFILMS 12 75 6.250000\nXEBEC 21 131 6.238095\n8bit 13 81 6.230769\nGONZO 10 62 6.200000\nArms 11 66 6.000000\nProductionIMS 14 82 5.857143",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>count</th>\n <th>rate_sum</th>\n <th>avg</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>WHITEFOX</th>\n <td>12</td>\n <td>95</td>\n <td>7.916667</td>\n </tr>\n <tr>\n <th>SHAFT</th>\n <td>29</td>\n <td>223</td>\n <td>7.689655</td>\n </tr>\n <tr>\n <th>MADHouse</th>\n <td>27</td>\n <td>201</td>\n <td>7.444444</td>\n </tr>\n <tr>\n <th>SUNRISE</th>\n <td>21</td>\n <td>156</td>\n <td>7.428571</td>\n </tr>\n <tr>\n <th>ufotable</th>\n <td>19</td>\n <td>141</td>\n <td>7.421053</td>\n </tr>\n <tr>\n <th>BONES</th>\n <td>29</td>\n <td>214</td>\n <td>7.379310</td>\n </tr>\n <tr>\n <th>davidproduction</th>\n <td>11</td>\n <td>80</td>\n <td>7.272727</td>\n </tr>\n <tr>\n <th>P.A.WORKS</th>\n <td>21</td>\n <td>152</td>\n <td>7.238095</td>\n </tr>\n <tr>\n <th>Production.I.G</th>\n <td>17</td>\n <td>123</td>\n <td>7.235294</td>\n </tr>\n <tr>\n <th>BrainsBase</th>\n <td>11</td>\n <td>79</td>\n <td>7.181818</td>\n </tr>\n <tr>\n <th>A-1Pictures</th>\n <td>43</td>\n <td>308</td>\n <td>7.162791</td>\n </tr>\n <tr>\n <th>京阿尼</th>\n <td>35</td>\n <td>250</td>\n <td>7.142857</td>\n </tr>\n <tr>\n <th>WITSTUDIO</th>\n <td>14</td>\n <td>99</td>\n <td>7.071429</td>\n </tr>\n <tr>\n <th>J.C.STAFF</th>\n <td>60</td>\n <td>419</td>\n <td>6.983333</td>\n </tr>\n <tr>\n <th>feel</th>\n <td>23</td>\n <td>158</td>\n <td>6.869565</td>\n </tr>\n <tr>\n <th>studiodeen</th>\n <td>26</td>\n <td>178</td>\n <td>6.846154</td>\n </tr>\n <tr>\n <th>TMSEntertainment</th>\n <td>13</td>\n <td>89</td>\n <td>6.846154</td>\n </tr>\n <tr>\n <th>Lerche</th>\n <td>27</td>\n <td>182</td>\n <td>6.740741</td>\n </tr>\n <tr>\n <th>动画工房</th>\n <td>20</td>\n <td>134</td>\n <td>6.700000</td>\n 
</tr>\n <tr>\n <th>Studio五组</th>\n <td>22</td>\n <td>144</td>\n <td>6.545455</td>\n </tr>\n <tr>\n <th>SILVERLINK.</th>\n <td>51</td>\n <td>330</td>\n <td>6.470588</td>\n </tr>\n <tr>\n <th>TNK</th>\n <td>12</td>\n <td>77</td>\n <td>6.416667</td>\n </tr>\n <tr>\n <th>AIC</th>\n <td>25</td>\n <td>160</td>\n <td>6.400000</td>\n </tr>\n <tr>\n <th>Diomedéa</th>\n <td>23</td>\n <td>147</td>\n <td>6.391304</td>\n </tr>\n <tr>\n <th>projectNo.9</th>\n <td>10</td>\n <td>63</td>\n <td>6.300000</td>\n </tr>\n <tr>\n <th>LIDENFILMS</th>\n <td>12</td>\n <td>75</td>\n <td>6.250000</td>\n </tr>\n <tr>\n <th>XEBEC</th>\n <td>21</td>\n <td>131</td>\n <td>6.238095</td>\n </tr>\n <tr>\n <th>8bit</th>\n <td>13</td>\n <td>81</td>\n <td>6.230769</td>\n </tr>\n <tr>\n <th>GONZO</th>\n <td>10</td>\n <td>62</td>\n <td>6.200000</td>\n </tr>\n <tr>\n <th>Arms</th>\n <td>11</td>\n <td>66</td>\n <td>6.000000</td>\n </tr>\n <tr>\n <th>ProductionIMS</th>\n <td>14</td>\n <td>82</td>\n <td>5.857143</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"metadata": {}
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "对于没有打制作公司 tag 的情况，需要逐个条目页面地爬取制作公司信息（非常慢）"
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2018-10-09T20:15:58.690428Z",
"end_time": "2018-10-09T20:15:58.693522Z"
},
"trusted": true
},
"cell_type": "code",
"source": "def get_company(subject_id):\n delim = '×/+'\n url = f'https://bgm.tv/subject/{subject_id}'\n html = requests.get(url).content.decode('utf8')\n soup = BeautifulSoup(html, 'lxml')\n for x in soup.findAll('li')[47:]:\n if x.span and x.span.string == '动画制作: ':\n return {z.string for z in x.findAll('a')}\n return {}\n\nuser_id, page_num = 'hentyclopedia', 40\ndata, companies = {}, set()\nfor i in range(1, page_num + 1):\n url = f'https://bgm.tv/anime/list/{user_id}/collect?page={i}'\n html = requests.get(url).content.decode('utf8')\n soup = BeautifulSoup(html, 'lxml')\n items = soup.findAll('div', attrs={'class':'inner'})[2:-1]\n for item in items:\n try:\n subID = item.a['href'].split('/')[-1]\n name = item.a.string\n rate = int(item.findAll('span', attrs={'class': 'starsinfo'})[0]['class'][0][6:])\n tags = item.findAll('span', attrs={'class': 'tip'})[0].string.split()[1:]\n company = get_company(subID)\n data[subID] = {'name': name, 'rate': rate, 'tags': set(tags), 'company': company}\n print(subID, data[subID])\n except Exception:\n pass\n# Compute statistics\nstat = {x: {'rate_sum': 0, 'count': 0} for x in companies}\nfor k, v in data.items():\n for c in v['company']:\n stat[c]['rate_sum'] += v['rate']\n stat[c]['count'] += 1\ndf = pd.DataFrame(stat).transpose()\ndf['avg'] = df['rate_sum'] / df['count']\n# Show average rating for companies whose count >= 10\ndf[df['count'] >= 10].sort_values(by='avg', ascending=False)",
"execution_count": 109,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "",
"execution_count": null,
"outputs": []
}
],
"metadata": {
"gist": {
"id": "",
"data": {
"description": "Bangumi_Average_Rating_By_Company.ipynb",
"public": true
}
},
"hide_input": false,
"kernelspec": {
"name": "python3",
"display_name": "Python 3",
"language": "python"
},
"language_info": {
"name": "python",
"version": "3.6.5",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"varInspector": {
"window_display": false,
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"library": "var_list.py",
"delete_cmd_prefix": "del ",
"delete_cmd_postfix": "",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"library": "var_list.r",
"delete_cmd_prefix": "rm(",
"delete_cmd_postfix": ") ",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
]
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment