Last active
February 12, 2021 19:12
-
-
Save rejsmont/ae0269b541ca768c9b869e7fda325d39 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"scrolled": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"DC Comics superheroines: 166\n", | |
"Marvel Comics superheroines: 170\n" | |
] | |
} | |
], | |
"source": [ | |
"import requests\n",
"from bs4 import BeautifulSoup\n",
"\n",
"url = 'https://en.wikipedia.org/wiki/List_of_superheroines'\n",
"# A timeout keeps a stalled connection from hanging the kernel.\n",
"with requests.get(url, timeout=30) as r:\n",
"    # Stop on HTTP errors instead of counting entries on an error page.\n",
"    r.raise_for_status()\n",
"    soup = BeautifulSoup(r.content, 'lxml')\n",
"\n",
"# List items of every <ul> that directly follows a section heading.\n",
"ulist = soup.select('h2 + ul > li')\n",
"# The publisher name is matched against each item's full markup (text and link targets).\n",
"print('DC Comics superheroines:',\n",
"      sum('DC' in str(x) for x in ulist))\n",
"print('Marvel Comics superheroines:',\n",
"      sum('Marvel' in str(x) for x in ulist))"
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Marvel superheroes: 684 including 180 female (26.32%)\n", | |
"DC superheroes: 622 including 144 female (23.15%)\n" | |
] | |
} | |
], | |
"source": [ | |
"import requests\n",
"\n",
"API_URL = 'https://en.wikipedia.org/w/api.php'\n",
"template = {'action': \"query\", 'list': 'categorymembers', 'cmlimit': '200', 'format': \"json\"}\n",
"categories = ['Marvel Comics superheroes', 'DC Comics superheroes',\n",
"              'Marvel Comics female superheroes', 'DC Comics female superheroes']\n",
"tags = ['Marvel', 'DC', 'Marvel female', 'DC female']\n",
"\n",
"params = [dict({'cmtitle': 'Category: ' + c}, **template) for c in categories]\n",
"\n",
"\n",
"def fetch_category_titles(query):\n",
"    \"\"\"Return the sorted, de-duplicated page titles of one Wikipedia category.\n",
"\n",
"    Pages through the listing until the API stops returning a 'continue'\n",
"    object. Everything from the first ' (' onwards is dropped from each title\n",
"    and entries that are themselves categories are skipped.\n",
"\n",
"    Raises requests.HTTPError when the API answers with an error status.\n",
"    \"\"\"\n",
"    query = dict(query)  # work on a copy so the caller's parameters are not modified\n",
"    titles = set()\n",
"    while True:\n",
"        r = requests.get(API_URL, params=query, timeout=30)\n",
"        # Fail loudly: silently retrying a failed request would loop forever.\n",
"        r.raise_for_status()\n",
"        data = r.json()\n",
"        for member in data['query']['categorymembers']:\n",
"            title = member['title'].split(' (')[0]\n",
"            if not title.startswith('Category:'):\n",
"                titles.add(title)\n",
"        if 'continue' not in data:\n",
"            # Sorted so the result is the same on every run.\n",
"            return sorted(titles)\n",
"        # Send every continuation parameter back with the next request.\n",
"        query.update(data['continue'])\n",
"\n",
"\n",
"results = {tag: fetch_category_titles(p) for tag, p in zip(tags, params)}\n",
"\n",
"# Keep only heroines that also appear in the publisher's overall list, so the\n",
"# female count is always a subset of the total.\n",
"for universe in ('Marvel', 'DC'):\n",
"    everyone = set(results[universe])\n",
"    results[universe + ' female'] = [name for name in results[universe + ' female']\n",
"                                     if name in everyone]\n",
"\n",
"ml_total = len(results['Marvel'])\n",
"ml_female = len(results['Marvel female'])\n",
"ml_percent = 100 * ml_female / ml_total\n",
"dc_total = len(results['DC'])\n",
"dc_female = len(results['DC female'])\n",
"dc_percent = 100 * dc_female / dc_total\n",
"\n",
"print(f'Marvel superheroes: {ml_total} including {ml_female} female ({ml_percent:.2f}%)')\n",
"print(f'DC superheroes: {dc_total} including {dc_female} female ({dc_percent:.2f}%)')"
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Significant: False, p-value:0.09457\n" | |
] | |
}, | |
{ | |
"data": { | |
"image/png": "\n", | |
"text/plain": [ | |
"<Figure size 432x288 with 1 Axes>" | |
] | |
}, | |
"metadata": { | |
"needs_background": "light" | |
}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"import seaborn as sns\n",
"import matplotlib.ticker as mticker\n",
"import pandas as pd\n",
"from scipy.stats import chi2_contingency\n",
"\n",
"# Label gender per universe: a name used by both publishers must not turn the\n",
"# other publisher's character into a heroine.\n",
"records = []\n",
"for universe in ('Marvel', 'DC'):\n",
"    heroines = set(results[universe + ' female'])\n",
"    records += [{'Name': name,\n",
"                 'Universe': universe,\n",
"                 'Gender': 'Female' if name in heroines else 'Male'}\n",
"                for name in results[universe]]\n",
"\n",
"superheroes = pd.DataFrame(records, columns=['Name', 'Universe', 'Gender']).sort_values('Name')\n",
"\n",
"# Female counts are drawn on the negative side of the axis so the two genders\n",
"# extend in opposite directions from zero.\n",
"gender_counts = superheroes.groupby(['Universe', 'Gender'], as_index=False).size()\n",
"gender_counts['size'] = gender_counts['size'].where(gender_counts['Gender'] != 'Female',\n",
"                                                    -gender_counts['size'])\n",
"ax = sns.barplot(x='size', y='Universe', hue='Gender', data=gender_counts, dodge=False, orient='h')\n",
"# Symmetric limits based on the largest bar, whichever gender it belongs to.\n",
"limit = gender_counts['size'].abs().max() * 1.1\n",
"ax.set_xlim(-limit, limit)\n",
"ticks_loc = ax.get_xticks().tolist()\n",
"# Pin the tick positions before relabelling them with absolute values.\n",
"ax.xaxis.set_major_locator(mticker.FixedLocator(ticks_loc))\n",
"ax.set_xticklabels([str(abs(int(t))) for t in ticks_loc])\n",
"ax.set_ylabel('Universe')\n",
"ax.set_xlabel('Number of superheroes by gender')\n",
"\n",
"# Gender is a categorical variable, so the universes are compared with a\n",
"# chi-square test of independence on the Universe x Gender contingency table\n",
"# rather than a rank test on the string labels.\n",
"contingency = pd.crosstab(superheroes['Universe'], superheroes['Gender'])\n",
"chi2, pvalue, dof, expected = chi2_contingency(contingency)\n",
"print(f'Significant: {pvalue < 0.05}, p-value:{pvalue:0.5f}')"
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.0" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment