christopherkullenberg · November 16, 2017 21:47
diff --git a/instagramscraperparser.ipynb b/instagramscraperparser.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import json\n",
    "import pandas as pd\n",
    "import networkx as nx\n",
    "import re"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In case the JSON file won't parse, try:\n",
    "```python\n",
    "try:\n",
    "    jsonobject = json.load(jsonfile)\n",
    "except json.JSONDecodeError:\n",
    "    print(\"error\") # there is just one error at the beginning of the file\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "sourcefile = \"fotografiska.json\"\n",
    "# The with-block closes the file handle as soon as parsing finishes;\n",
    "# the encoding is explicit so reading does not depend on the platform default.\n",
    "with open(sourcefile, encoding=\"utf-8\") as jsonfile:\n",
    "    jsonobject = json.load(jsonfile)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def commentstodataframe(jsonobject):\n",
    "    '''Build a dataframe with one row per post in jsonobject.\n",
    "\n",
    "    For every post the shortcode becomes the 'id' column, the\n",
    "    shortcode appended to the Instagram post URL prefix becomes\n",
    "    'url', and 'comments' holds a list of (username, text)\n",
    "    tuples taken from post['comments']['data']. The total\n",
    "    number of comments is printed before the dataframe is\n",
    "    returned.'''\n",
    "    records = []\n",
    "    for post in jsonobject:\n",
    "        shortcode = post['shortcode']\n",
    "        records.append({\n",
    "            \"id\": shortcode,\n",
    "            \"url\": 'https://www.instagram.com/p/' + shortcode,\n",
    "            \"comments\": [(entry['owner']['username'], entry['text'])\n",
    "                         for entry in post['comments']['data']],\n",
    "        })\n",
    "    total = sum(len(record[\"comments\"]) for record in records)\n",
    "    print(\"Number of comments added to dataframe: \" + str(total))\n",
    "    return pd.DataFrame(records, columns=['id', 'url', 'comments'])\n",
    "       \n",
    "df = commentstodataframe(jsonobject)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print every post URL followed by its comments, one \"username text\" line each.\n",
    "# Columns are read with .iloc because plain integer keys on a label-indexed\n",
    "# Series (comment[1][1]) rely on a positional fallback that pandas has deprecated.\n",
    "for _, post in df.iterrows():\n",
    "    print(\"\\n\")\n",
    "    print(post.iloc[1])\n",
    "    for c in post.iloc[2]:\n",
    "        print(c[0], c[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def makepostdirectednetwork(df):\n",
    "    '''Makes a directed network from a user to a\n",
    "    post. Can be visualised as an in- or outdegree\n",
    "    network depending on your question.\n",
    "\n",
    "    df is read by position: column 0 is used as the post id and\n",
    "    column 2 as an iterable of comments whose first element is\n",
    "    the commenting username. The graph is written to\n",
    "    sourcefile + \"postdirected.gexf\" (sourcefile is a notebook\n",
    "    global) and the counts are printed. Returns None.'''\n",
    "    G = nx.DiGraph()\n",
    "    postcounter = 0\n",
    "    interactionscounter = 0\n",
    "    users = set()\n",
    "    for _, row in df.iterrows():\n",
    "        # .iloc keeps the positional column access without relying on the\n",
    "        # deprecated integer-key fallback of a label-indexed Series.\n",
    "        postid = row.iloc[0]\n",
    "        postcounter += 1\n",
    "        for c in row.iloc[2]:\n",
    "            username = c[0]\n",
    "            interactionscounter += 1\n",
    "            users.add(username)\n",
    "            G.add_edge(username, postid) # direction of graph, from user to post\n",
    "    nx.write_gexf(G, sourcefile + \"postdirected.gexf\")\n",
    "    print(\"Posts: \" + str(postcounter))\n",
    "    print(\"Interactions (incl. multiple interactions with same post: \"\n",
    "          + str(interactionscounter))\n",
    "    print(\"Unique users: \" + str(len(users)))\n",
    "\n",
    "makepostdirectednetwork(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def makeuserdirectednetwork(df):\n",
    "    '''Makes a directed network when users\n",
    "    ping each other using @\n",
    "\n",
    "    Column 2 of df is read as an iterable of (username, text)\n",
    "    comments. Every @mention found in a text adds one edge from\n",
    "    the commenting user to the mentioned name. The graph is\n",
    "    written to sourcefile + \"userdirected.gexf\" (sourcefile is a\n",
    "    notebook global) and summary counts are printed. Returns None.'''\n",
    "    # A mention is an @ that does not follow a name character (this skips\n",
    "    # e-mail addresses) plus the handle after it. Unlike the previous\n",
    "    # lookahead-for-whitespace pattern this also finds mentions at the very\n",
    "    # end of a comment and leaves out trailing punctuation.\n",
    "    # NOTE(review): assumes handles only contain ASCII letters, digits,\n",
    "    # '_' and '.', and never end with '.' - confirm against Instagram's rules.\n",
    "    mentionpattern = re.compile(\n",
    "        \"(?<![A-Za-z0-9_.])@([A-Za-z0-9_](?:[A-Za-z0-9_.]*[A-Za-z0-9_])?)\")\n",
    "    G = nx.MultiDiGraph()\n",
    "    interactionscounter = 0\n",
    "    users = set()\n",
    "    for _, row in df.iterrows():\n",
    "        # .iloc keeps the positional column access without relying on the\n",
    "        # deprecated integer-key fallback of a label-indexed Series.\n",
    "        for c in row.iloc[2]:\n",
    "            source = c[0]\n",
    "            for m in mentionpattern.findall(c[1]):\n",
    "                interactionscounter += 1\n",
    "                G.add_edge(source, m)\n",
    "                users.add(source)\n",
    "                users.add(m)\n",
    "    nx.write_gexf(G, sourcefile + \"userdirected.gexf\")\n",
    "    print(\"Number of interactions: \" + str(interactionscounter))\n",
    "    print(\"Number of unique users: \" + str(len(users)))\n",
    "    # nx.info() is no longer available in NetworkX 3.x, so the summary is printed directly.\n",
    "    print(\"Type: \" + type(G).__name__)\n",
    "    print(\"Number of nodes: \" + str(G.number_of_nodes()))\n",
    "    print(\"Number of edges: \" + str(G.number_of_edges()))\n",
    "                        \n",
    "                        \n",
    "makeuserdirectednetwork(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import json\n",
	"import pandas as pd\n",
	"import networkx as nx\n",
	"import re"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"In case the JSON file won't parse, try:\n",
	"```python\n",
	"try:\n",
	"    jsonobject = json.load(jsonfile)\n",
	"except json.JSONDecodeError:\n",
	"    print(\"error\") # there is just one error at the beginning of the file\n",
	"```"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true,
	"scrolled": true
	},
	"outputs": [],
	"source": [
	"sourcefile = \"fotografiska.json\"\n",
	"# The with-block closes the file handle as soon as parsing finishes;\n",
	"# the encoding is explicit so reading does not depend on the platform default.\n",
	"with open(sourcefile, encoding=\"utf-8\") as jsonfile:\n",
	"    jsonobject = json.load(jsonfile)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"def commentstodataframe(jsonobject):\n",
	"    '''Build a dataframe with one row per post in jsonobject.\n",
	"\n",
	"    For every post the shortcode becomes the 'id' column, the\n",
	"    shortcode appended to the Instagram post URL prefix becomes\n",
	"    'url', and 'comments' holds a list of (username, text)\n",
	"    tuples taken from post['comments']['data']. The total\n",
	"    number of comments is printed before the dataframe is\n",
	"    returned.'''\n",
	"    records = []\n",
	"    for post in jsonobject:\n",
	"        shortcode = post['shortcode']\n",
	"        records.append({\n",
	"            \"id\": shortcode,\n",
	"            \"url\": 'https://www.instagram.com/p/' + shortcode,\n",
	"            \"comments\": [(entry['owner']['username'], entry['text'])\n",
	"                         for entry in post['comments']['data']],\n",
	"        })\n",
	"    total = sum(len(record[\"comments\"]) for record in records)\n",
	"    print(\"Number of comments added to dataframe: \" + str(total))\n",
	"    return pd.DataFrame(records, columns=['id', 'url', 'comments'])\n",
	" \n",
	"df = commentstodataframe(jsonobject)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Print every post URL followed by its comments, one \"username text\" line each.\n",
	"# Columns are read with .iloc because plain integer keys on a label-indexed\n",
	"# Series (comment[1][1]) rely on a positional fallback that pandas has deprecated.\n",
	"for _, post in df.iterrows():\n",
	"    print(\"\\n\")\n",
	"    print(post.iloc[1])\n",
	"    for c in post.iloc[2]:\n",
	"        print(c[0], c[1])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"def makepostdirectednetwork(df):\n",
	"    '''Makes a directed network from a user to a\n",
	"    post. Can be visualised as an in- or outdegree\n",
	"    network depending on your question.\n",
	"\n",
	"    df is read by position: column 0 is used as the post id and\n",
	"    column 2 as an iterable of comments whose first element is\n",
	"    the commenting username. The graph is written to\n",
	"    sourcefile + \"postdirected.gexf\" (sourcefile is a notebook\n",
	"    global) and the counts are printed. Returns None.'''\n",
	"    G = nx.DiGraph()\n",
	"    postcounter = 0\n",
	"    interactionscounter = 0\n",
	"    users = set()\n",
	"    for _, row in df.iterrows():\n",
	"        # .iloc keeps the positional column access without relying on the\n",
	"        # deprecated integer-key fallback of a label-indexed Series.\n",
	"        postid = row.iloc[0]\n",
	"        postcounter += 1\n",
	"        for c in row.iloc[2]:\n",
	"            username = c[0]\n",
	"            interactionscounter += 1\n",
	"            users.add(username)\n",
	"            G.add_edge(username, postid) # direction of graph, from user to post\n",
	"    nx.write_gexf(G, sourcefile + \"postdirected.gexf\")\n",
	"    print(\"Posts: \" + str(postcounter))\n",
	"    print(\"Interactions (incl. multiple interactions with same post: \"\n",
	"          + str(interactionscounter))\n",
	"    print(\"Unique users: \" + str(len(users)))\n",
	"\n",
	"makepostdirectednetwork(df)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"def makeuserdirectednetwork(df):\n",
	"    '''Makes a directed network when users\n",
	"    ping each other using @\n",
	"\n",
	"    Column 2 of df is read as an iterable of (username, text)\n",
	"    comments. Every @mention found in a text adds one edge from\n",
	"    the commenting user to the mentioned name. The graph is\n",
	"    written to sourcefile + \"userdirected.gexf\" (sourcefile is a\n",
	"    notebook global) and summary counts are printed. Returns None.'''\n",
	"    # A mention is an @ that does not follow a name character (this skips\n",
	"    # e-mail addresses) plus the handle after it. Unlike the previous\n",
	"    # lookahead-for-whitespace pattern this also finds mentions at the very\n",
	"    # end of a comment and leaves out trailing punctuation.\n",
	"    # NOTE(review): assumes handles only contain ASCII letters, digits,\n",
	"    # '_' and '.', and never end with '.' - confirm against Instagram's rules.\n",
	"    mentionpattern = re.compile(\n",
	"        \"(?<![A-Za-z0-9_.])@([A-Za-z0-9_](?:[A-Za-z0-9_.]*[A-Za-z0-9_])?)\")\n",
	"    G = nx.MultiDiGraph()\n",
	"    interactionscounter = 0\n",
	"    users = set()\n",
	"    for _, row in df.iterrows():\n",
	"        # .iloc keeps the positional column access without relying on the\n",
	"        # deprecated integer-key fallback of a label-indexed Series.\n",
	"        for c in row.iloc[2]:\n",
	"            source = c[0]\n",
	"            for m in mentionpattern.findall(c[1]):\n",
	"                interactionscounter += 1\n",
	"                G.add_edge(source, m)\n",
	"                users.add(source)\n",
	"                users.add(m)\n",
	"    nx.write_gexf(G, sourcefile + \"userdirected.gexf\")\n",
	"    print(\"Number of interactions: \" + str(interactionscounter))\n",
	"    print(\"Number of unique users: \" + str(len(users)))\n",
	"    # nx.info() is no longer available in NetworkX 3.x, so the summary is printed directly.\n",
	"    print(\"Type: \" + type(G).__name__)\n",
	"    print(\"Number of nodes: \" + str(G.number_of_nodes()))\n",
	"    print(\"Number of edges: \" + str(G.number_of_edges()))\n",
	" \n",
	" \n",
	"makeuserdirectednetwork(df)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python [default]",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.5.2"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}