lawlesst · October 8, 2021 20:45
diff --git a/.gitignore b/.gitignore
 .ipynb_checkpoints/*
diff --git a/README.md b/README.md
diff --git a/evidence-from-country-x.ipynb b/evidence-from-country-x.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "id": "925ca2bc",
   "metadata": {},
   "source": [
    "# Evidence from Country X\n",
    "\n",
    "A response to this Tweet:\n",
    "\n",
    "<blockquote class=\"twitter-tweet\"><p lang=\"en\" dir=\"ltr\">Idea for bored computational social scientists: scrape JSTOR or Google Scholar for titles with the format &quot;: Evidence from &lt;Country X&gt;&quot; and count in how many articles Country X = United States. Is it more than 0?</p>&mdash; Fabrizio Gilardi 💬 (@fgilardi) <a href=\"https://twitter.com/fgilardi/status/1445848352088735751?ref_src=twsrc%5Etfw\">October 6, 2021</a></blockquote> <script async src=\"https://platform.twitter.com/widgets.js\" charset=\"utf-8\"></script>\n",
    "\n",
    "[Constellate](https://constellate.org) contains all of JSTOR plus an additional ~16 million scholarly articles and book chapters via Portico. Read more here: https://constellate.org/docs/about"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d84b26c0",
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import Counter\n",
    "import csv\n",
    "\n",
    "import constellate\n",
    "from nltk import ngrams\n",
    "import requests"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b0abfe8d",
   "metadata": {},
   "source": [
    "Download the metadata from the [Constellate dataset](https://constellate.org/dataset/e3c57e9d-05ce-9586-e9c7-17aa93fe78e0)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7c02624d",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "DATASET_ID = \"e3c57e9d-05ce-9586-e9c7-17aa93fe78e0\"\n",
    "\n",
    "# Fetch the dataset's metadata file; `metadata` is later passed to open().\n",
    "metadata = constellate.download(DATASET_ID, \"metadata\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1f4fc478",
   "metadata": {},
   "source": [
    "Load a list of country names from datahub.io."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3a4b553f",
   "metadata": {},
   "outputs": [],
   "source": [
    "COUNTRY_LIST_URL = \"https://datahub.io/core/country-list/r/data.json\"\n",
    "\n",
    "# Fail fast on HTTP errors and hung connections instead of surfacing a\n",
    "# confusing JSON decode error from an error page.\n",
    "response = requests.get(COUNTRY_LIST_URL, timeout=30)\n",
    "response.raise_for_status()\n",
    "countries = response.json()\n",
    "\n",
    "# Lower-cased names, so they compare equal to the lower-cased title n-grams.\n",
    "country_names = [c[\"Name\"].lower() for c in countries]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a717739b",
   "metadata": {},
   "source": [
    "Create a helper function to generate ngrams from a block of text."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7b147aff",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_ngrams(text, n):\n",
    "    \"\"\"Return every word n-gram of ``text`` for sizes 1 through ``n``.\n",
    "\n",
    "    ``text`` is lower-cased and split on whitespace. Each n-gram comes back\n",
    "    as one space-joined string, with all n-grams of a size listed before\n",
    "    those of the next larger size.\n",
    "    \"\"\"\n",
    "    words = text.lower().split()\n",
    "    return [\n",
    "        \" \".join(window)\n",
    "        for width in range(1, n + 1)\n",
    "        for window in ngrams(words, width)\n",
    "    ]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4a04e7ba",
   "metadata": {},
   "source": [
    "Split the title by the defined text `: Evidence from` and attempt to identify country names in the remaining string. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "766c362e",
   "metadata": {},
   "outputs": [],
   "source": [
    "country_count = Counter()\n",
    "\n",
    "# A set keeps each n-gram membership test constant-time.\n",
    "known_countries = set(country_names)\n",
    "# Punctuation that can cling to a country name in a title, e.g. `China,` or `(India)`.\n",
    "EDGE_PUNCTUATION = \".,;:!?()[]{}\"\n",
    "\n",
    "# The csv module expects files opened with newline=\"\" so quoted fields\n",
    "# containing line breaks are parsed correctly.\n",
    "with open(metadata, newline=\"\") as mfile:\n",
    "    for row in csv.DictReader(mfile):\n",
    "        # DictReader fills fields missing from a short row with None.\n",
    "        title = row[\"title\"] or \"\"\n",
    "        chunks = title.split(\": Evidence from\")\n",
    "        if len(chunks) > 1:\n",
    "            title_segment = chunks[1].strip().lower()\n",
    "            for ngram in get_ngrams(title_segment, 4):\n",
    "                # Try the n-gram as written first, then without edge punctuation.\n",
    "                if ngram in known_countries:\n",
    "                    candidate = ngram\n",
    "                else:\n",
    "                    candidate = ngram.strip(EDGE_PUNCTUATION)\n",
    "                if candidate in known_countries:\n",
    "                    country_count[candidate] += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4b62ee3d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One line per country, most frequent first; names are padded to 20 columns.\n",
    "for country, total in country_count.most_common():\n",
    "    print(f\"{country:<20} {total}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "819d3ecf",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.9"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": true,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"id": "925ca2bc",
	"metadata": {},
	"source": [
	"# Evidence from Country X\n",
	"\n",
	"A response to this Tweet:\n",
	"\n",
	"<blockquote class=\"twitter-tweet\"><p lang=\"en\" dir=\"ltr\">Idea for bored computational social scientists: scrape JSTOR or Google Scholar for titles with the format &quot;: Evidence from &lt;Country X&gt;&quot; and count in how many articles Country X = United States. Is it more than 0?</p>&mdash; Fabrizio Gilardi 💬 (@fgilardi) <a href=\"https://twitter.com/fgilardi/status/1445848352088735751?ref_src=twsrc%5Etfw\">October 6, 2021</a></blockquote> <script async src=\"https://platform.twitter.com/widgets.js\" charset=\"utf-8\"></script>\n",
	"\n",
	"[Constellate](https://constellate.org) contains all of JSTOR plus an additional ~16 million scholarly articles and book chapters via Portico. Read more here: https://constellate.org/docs/about"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "d84b26c0",
	"metadata": {},
	"outputs": [],
	"source": [
	"from collections import Counter\n",
	"import csv\n",
	"\n",
	"import constellate\n",
	"from nltk import ngrams\n",
	"import requests"
	]
	},
	{
	"cell_type": "markdown",
	"id": "b0abfe8d",
	"metadata": {},
	"source": [
	"Download the metadata from the [Constellate dataset](https://constellate.org/dataset/e3c57e9d-05ce-9586-e9c7-17aa93fe78e0)."
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "7c02624d",
	"metadata": {
	"scrolled": true
	},
	"outputs": [],
	"source": [
	"DATASET_ID = \"e3c57e9d-05ce-9586-e9c7-17aa93fe78e0\"\n",
	"\n",
	"# Fetch the dataset's metadata file; `metadata` is later passed to open().\n",
	"metadata = constellate.download(DATASET_ID, \"metadata\")"
	]
	},
	{
	"cell_type": "markdown",
	"id": "1f4fc478",
	"metadata": {},
	"source": [
	"Load a list of country names from datahub.io."
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "3a4b553f",
	"metadata": {},
	"outputs": [],
	"source": [
	"COUNTRY_LIST_URL = \"https://datahub.io/core/country-list/r/data.json\"\n",
	"\n",
	"# Fail fast on HTTP errors and hung connections instead of surfacing a\n",
	"# confusing JSON decode error from an error page.\n",
	"response = requests.get(COUNTRY_LIST_URL, timeout=30)\n",
	"response.raise_for_status()\n",
	"countries = response.json()\n",
	"\n",
	"# Lower-cased names, so they compare equal to the lower-cased title n-grams.\n",
	"country_names = [c[\"Name\"].lower() for c in countries]"
	]
	},
	{
	"cell_type": "markdown",
	"id": "a717739b",
	"metadata": {},
	"source": [
	"Create a helper function to generate ngrams from a block of text."
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "7b147aff",
	"metadata": {},
	"outputs": [],
	"source": [
	"def get_ngrams(text, n):\n",
	"    \"\"\"Return every word n-gram of ``text`` for sizes 1 through ``n``.\n",
	"\n",
	"    ``text`` is lower-cased and split on whitespace. Each n-gram comes back\n",
	"    as one space-joined string, with all n-grams of a size listed before\n",
	"    those of the next larger size.\n",
	"    \"\"\"\n",
	"    words = text.lower().split()\n",
	"    return [\n",
	"        \" \".join(window)\n",
	"        for width in range(1, n + 1)\n",
	"        for window in ngrams(words, width)\n",
	"    ]"
	]
	},
	{
	"cell_type": "markdown",
	"id": "4a04e7ba",
	"metadata": {},
	"source": [
	"Split the title by the defined text `: Evidence from` and attempt to identify country names in the remaining string. "
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "766c362e",
	"metadata": {},
	"outputs": [],
	"source": [
	"country_count = Counter()\n",
	"\n",
	"# A set keeps each n-gram membership test constant-time.\n",
	"known_countries = set(country_names)\n",
	"# Punctuation that can cling to a country name in a title, e.g. `China,` or `(India)`.\n",
	"EDGE_PUNCTUATION = \".,;:!?()[]{}\"\n",
	"\n",
	"# The csv module expects files opened with newline=\"\" so quoted fields\n",
	"# containing line breaks are parsed correctly.\n",
	"with open(metadata, newline=\"\") as mfile:\n",
	"    for row in csv.DictReader(mfile):\n",
	"        # DictReader fills fields missing from a short row with None.\n",
	"        title = row[\"title\"] or \"\"\n",
	"        chunks = title.split(\": Evidence from\")\n",
	"        if len(chunks) > 1:\n",
	"            title_segment = chunks[1].strip().lower()\n",
	"            for ngram in get_ngrams(title_segment, 4):\n",
	"                # Try the n-gram as written first, then without edge punctuation.\n",
	"                if ngram in known_countries:\n",
	"                    candidate = ngram\n",
	"                else:\n",
	"                    candidate = ngram.strip(EDGE_PUNCTUATION)\n",
	"                if candidate in known_countries:\n",
	"                    country_count[candidate] += 1"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "4b62ee3d",
	"metadata": {},
	"outputs": [],
	"source": [
	"# One line per country, most frequent first; names are padded to 20 columns.\n",
	"for country, total in country_count.most_common():\n",
	"    print(f\"{country:<20} {total}\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "819d3ecf",
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3 (ipykernel)",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.8.9"
	},
	"toc": {
	"base_numbering": 1,
	"nav_menu": {},
	"number_sections": true,
	"sideBar": true,
	"skip_h1_title": true,
	"title_cell": "Table of Contents",
	"title_sidebar": "Contents",
	"toc_cell": false,
	"toc_position": {},
	"toc_section_display": true,
	"toc_window_display": false
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}