Last active
December 25, 2024 05:25
-
-
Save dsignr/c3f7a67fcfb1fb93698a507f4cce8eef to your computer and use it in GitHub Desktop.
A Python script to extract data from CSV files and convert it into Gephi-compatible GML.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"import sys, time\n", | |
"import pandas as pd\n", | |
"import datetime as dt\n", | |
"from IPython.display import display\n", | |
"\n", | |
"import plotly.plotly as py # interactive graphing\n", | |
"from plotly.graph_objs import Bar, Scatter, Marker, Layout " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"FILE_NAME = \"output.gml\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"authors = pd.read_csv('authors.csv', sep=' ')\n", | |
"occurrence = pd.read_csv('occurrence.csv', sep=' ')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#Utility functions\n", | |
"def progress(v):\n", | |
" v = str(v)\n", | |
" sys.stdout.flush()\n", | |
" sys.stdout.write('\\r')\n", | |
" sys.stdout.flush()\n", | |
" sys.stdout.write(v)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>AUTHOR_ID</th>\n", | |
" <th>CO-AUTHOR_ID</th>\n", | |
" <th>NO_OF_BOOKS</th>\n", | |
" <th>AUTHOR</th>\n", | |
" <th>CO-AUTHOR</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>190</td>\n", | |
" <td>7</td>\n", | |
" <td>RADZIHOVSKY, L</td>\n", | |
" <td>TONER, J</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>66</th>\n", | |
" <td>1</td>\n", | |
" <td>2281</td>\n", | |
" <td>1</td>\n", | |
" <td>RADZIHOVSKY, L</td>\n", | |
" <td>FREY, E</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>93</th>\n", | |
" <td>1</td>\n", | |
" <td>3896</td>\n", | |
" <td>1</td>\n", | |
" <td>RADZIHOVSKY, L</td>\n", | |
" <td>GINZBURG, VV</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>101</th>\n", | |
" <td>1</td>\n", | |
" <td>3897</td>\n", | |
" <td>2</td>\n", | |
" <td>RADZIHOVSKY, L</td>\n", | |
" <td>CLARK, NA</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>110</th>\n", | |
" <td>1</td>\n", | |
" <td>12347</td>\n", | |
" <td>1</td>\n", | |
" <td>RADZIHOVSKY, L</td>\n", | |
" <td>JACOBSEN, B</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>113</th>\n", | |
" <td>1</td>\n", | |
" <td>12348</td>\n", | |
" <td>1</td>\n", | |
" <td>RADZIHOVSKY, L</td>\n", | |
" <td>SAUNDERS, K</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>116</th>\n", | |
" <td>1</td>\n", | |
" <td>12700</td>\n", | |
" <td>1</td>\n", | |
" <td>RADZIHOVSKY, L</td>\n", | |
" <td>LINK, DR</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>123</th>\n", | |
" <td>1</td>\n", | |
" <td>12701</td>\n", | |
" <td>1</td>\n", | |
" <td>RADZIHOVSKY, L</td>\n", | |
" <td>NATALE, G</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>130</th>\n", | |
" <td>1</td>\n", | |
" <td>12702</td>\n", | |
" <td>1</td>\n", | |
" <td>RADZIHOVSKY, L</td>\n", | |
" <td>MACLENNAN, JE</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>137</th>\n", | |
" <td>1</td>\n", | |
" <td>12703</td>\n", | |
" <td>1</td>\n", | |
" <td>RADZIHOVSKY, L</td>\n", | |
" <td>WALSH, M</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>144</th>\n", | |
" <td>1</td>\n", | |
" <td>12704</td>\n", | |
" <td>1</td>\n", | |
" <td>RADZIHOVSKY, L</td>\n", | |
" <td>KEAST, SS</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>151</th>\n", | |
" <td>1</td>\n", | |
" <td>12705</td>\n", | |
" <td>1</td>\n", | |
" <td>RADZIHOVSKY, L</td>\n", | |
" <td>NEUBERT, ME</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>55</th>\n", | |
" <td>1</td>\n", | |
" <td>1075</td>\n", | |
" <td>3</td>\n", | |
" <td>RADZIHOVSKY, L</td>\n", | |
" <td>MARCHETTI, MC</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>35</th>\n", | |
" <td>1</td>\n", | |
" <td>562</td>\n", | |
" <td>2</td>\n", | |
" <td>RADZIHOVSKY, L</td>\n", | |
" <td>NELSON, DR</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td>1</td>\n", | |
" <td>201</td>\n", | |
" <td>3</td>\n", | |
" <td>RADZIHOVSKY, L</td>\n", | |
" <td>BALENTS, L</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>159</th>\n", | |
" <td>2</td>\n", | |
" <td>1237</td>\n", | |
" <td>2</td>\n", | |
" <td>FRISCHAT, SD</td>\n", | |
" <td>DORON, E</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>158</th>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>1</td>\n", | |
" <td>FRISCHAT, SD</td>\n", | |
" <td>KUHN, R</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>160</th>\n", | |
" <td>3</td>\n", | |
" <td>2</td>\n", | |
" <td>1</td>\n", | |
" <td>KUHN, R</td>\n", | |
" <td>FRISCHAT, SD</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>311</th>\n", | |
" <td>4</td>\n", | |
" <td>10757</td>\n", | |
" <td>9</td>\n", | |
" <td>BEENAKKER, CWJ</td>\n", | |
" <td>PATRA, M</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>215</th>\n", | |
" <td>4</td>\n", | |
" <td>891</td>\n", | |
" <td>2</td>\n", | |
" <td>BEENAKKER, CWJ</td>\n", | |
" <td>LEYRONAS, X</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>219</th>\n", | |
" <td>4</td>\n", | |
" <td>1785</td>\n", | |
" <td>1</td>\n", | |
" <td>BEENAKKER, CWJ</td>\n", | |
" <td>BUTTIKER, M</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>243</th>\n", | |
" <td>4</td>\n", | |
" <td>2212</td>\n", | |
" <td>5</td>\n", | |
" <td>BEENAKKER, CWJ</td>\n", | |
" <td>LANGEN, SAV</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>200</th>\n", | |
" <td>4</td>\n", | |
" <td>722</td>\n", | |
" <td>1</td>\n", | |
" <td>BEENAKKER, CWJ</td>\n", | |
" <td>FRAHM, K</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>255</th>\n", | |
" <td>4</td>\n", | |
" <td>2264</td>\n", | |
" <td>1</td>\n", | |
" <td>BEENAKKER, CWJ</td>\n", | |
" <td>BLANTER, YM</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>196</th>\n", | |
" <td>4</td>\n", | |
" <td>7</td>\n", | |
" <td>4</td>\n", | |
" <td>BEENAKKER, CWJ</td>\n", | |
" <td>JONG, MJMD</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>268</th>\n", | |
" <td>4</td>\n", | |
" <td>2876</td>\n", | |
" <td>9</td>\n", | |
" <td>BEENAKKER, CWJ</td>\n", | |
" <td>SCHOMERUS, H</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>290</th>\n", | |
" <td>4</td>\n", | |
" <td>3232</td>\n", | |
" <td>5</td>\n", | |
" <td>BEENAKKER, CWJ</td>\n", | |
" <td>MISIRPASHAEV, TS</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>306</th>\n", | |
" <td>4</td>\n", | |
" <td>8994</td>\n", | |
" <td>2</td>\n", | |
" <td>BEENAKKER, CWJ</td>\n", | |
" <td>TWORZYDLO, J</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>293</th>\n", | |
" <td>4</td>\n", | |
" <td>3478</td>\n", | |
" <td>7</td>\n", | |
" <td>BEENAKKER, CWJ</td>\n", | |
" <td>FRAHM, KM</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>162</th>\n", | |
" <td>4</td>\n", | |
" <td>5</td>\n", | |
" <td>5</td>\n", | |
" <td>BEENAKKER, CWJ</td>\n", | |
" <td>MELSEN, JA</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>67404</th>\n", | |
" <td>16718</td>\n", | |
" <td>8621</td>\n", | |
" <td>1</td>\n", | |
" <td>RENZ, F</td>\n", | |
" <td>JAKOB, G</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>87838</th>\n", | |
" <td>16718</td>\n", | |
" <td>16716</td>\n", | |
" <td>1</td>\n", | |
" <td>RENZ, F</td>\n", | |
" <td>TREMEL, W</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>87566</th>\n", | |
" <td>16719</td>\n", | |
" <td>13396</td>\n", | |
" <td>1</td>\n", | |
" <td>GUETLICH, P</td>\n", | |
" <td>RITTER, C</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>83218</th>\n", | |
" <td>16719</td>\n", | |
" <td>13170</td>\n", | |
" <td>1</td>\n", | |
" <td>GUETLICH, P</td>\n", | |
" <td>WESTERBURG, W</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>87855</th>\n", | |
" <td>16719</td>\n", | |
" <td>16718</td>\n", | |
" <td>1</td>\n", | |
" <td>GUETLICH, P</td>\n", | |
" <td>RENZ, F</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>87847</th>\n", | |
" <td>16719</td>\n", | |
" <td>16717</td>\n", | |
" <td>1</td>\n", | |
" <td>GUETLICH, P</td>\n", | |
" <td>WALDECK, M</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>67405</th>\n", | |
" <td>16719</td>\n", | |
" <td>8621</td>\n", | |
" <td>1</td>\n", | |
" <td>GUETLICH, P</td>\n", | |
" <td>JAKOB, G</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>87839</th>\n", | |
" <td>16719</td>\n", | |
" <td>16716</td>\n", | |
" <td>1</td>\n", | |
" <td>GUETLICH, P</td>\n", | |
" <td>TREMEL, W</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>87831</th>\n", | |
" <td>16719</td>\n", | |
" <td>16715</td>\n", | |
" <td>1</td>\n", | |
" <td>GUETLICH, P</td>\n", | |
" <td>FELSER, C</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>87823</th>\n", | |
" <td>16719</td>\n", | |
" <td>16714</td>\n", | |
" <td>1</td>\n", | |
" <td>GUETLICH, P</td>\n", | |
" <td>LANG, O</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>80930</th>\n", | |
" <td>16720</td>\n", | |
" <td>11553</td>\n", | |
" <td>1</td>\n", | |
" <td>HAEUSSLER, R</td>\n", | |
" <td>LOEHNEYSEN, HV</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>51427</th>\n", | |
" <td>16720</td>\n", | |
" <td>15834</td>\n", | |
" <td>1</td>\n", | |
" <td>HAEUSSLER, R</td>\n", | |
" <td>SCHEER, E</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>91849</th>\n", | |
" <td>16720</td>\n", | |
" <td>16721</td>\n", | |
" <td>1</td>\n", | |
" <td>HAEUSSLER, R</td>\n", | |
" <td>WEBER, HB</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>51428</th>\n", | |
" <td>16721</td>\n", | |
" <td>15834</td>\n", | |
" <td>1</td>\n", | |
" <td>WEBER, HB</td>\n", | |
" <td>SCHEER, E</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>80931</th>\n", | |
" <td>16721</td>\n", | |
" <td>11553</td>\n", | |
" <td>1</td>\n", | |
" <td>WEBER, HB</td>\n", | |
" <td>LOEHNEYSEN, HV</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>91846</th>\n", | |
" <td>16721</td>\n", | |
" <td>16720</td>\n", | |
" <td>1</td>\n", | |
" <td>WEBER, HB</td>\n", | |
" <td>HAEUSSLER, R</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>74577</th>\n", | |
" <td>16723</td>\n", | |
" <td>14983</td>\n", | |
" <td>1</td>\n", | |
" <td>LEUNG, MA</td>\n", | |
" <td>CARR, LD</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>68050</th>\n", | |
" <td>16723</td>\n", | |
" <td>4256</td>\n", | |
" <td>1</td>\n", | |
" <td>LEUNG, MA</td>\n", | |
" <td>REINHARDT, WP</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>41242</th>\n", | |
" <td>16724</td>\n", | |
" <td>8897</td>\n", | |
" <td>1</td>\n", | |
" <td>CORNISH, SL</td>\n", | |
" <td>WIEMAN, CE</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>80425</th>\n", | |
" <td>16724</td>\n", | |
" <td>16725</td>\n", | |
" <td>1</td>\n", | |
" <td>CORNISH, SL</td>\n", | |
" <td>CLAUSSEN, NR</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>80429</th>\n", | |
" <td>16724</td>\n", | |
" <td>16726</td>\n", | |
" <td>1</td>\n", | |
" <td>CORNISH, SL</td>\n", | |
" <td>ROBERTS, JL</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>41176</th>\n", | |
" <td>16724</td>\n", | |
" <td>5350</td>\n", | |
" <td>1</td>\n", | |
" <td>CORNISH, SL</td>\n", | |
" <td>CORNELL, EA</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>41177</th>\n", | |
" <td>16725</td>\n", | |
" <td>5350</td>\n", | |
" <td>1</td>\n", | |
" <td>CLAUSSEN, NR</td>\n", | |
" <td>CORNELL, EA</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>80430</th>\n", | |
" <td>16725</td>\n", | |
" <td>16726</td>\n", | |
" <td>1</td>\n", | |
" <td>CLAUSSEN, NR</td>\n", | |
" <td>ROBERTS, JL</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>41243</th>\n", | |
" <td>16725</td>\n", | |
" <td>8897</td>\n", | |
" <td>1</td>\n", | |
" <td>CLAUSSEN, NR</td>\n", | |
" <td>WIEMAN, CE</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>80421</th>\n", | |
" <td>16725</td>\n", | |
" <td>16724</td>\n", | |
" <td>1</td>\n", | |
" <td>CLAUSSEN, NR</td>\n", | |
" <td>CORNISH, SL</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>41244</th>\n", | |
" <td>16726</td>\n", | |
" <td>8897</td>\n", | |
" <td>1</td>\n", | |
" <td>ROBERTS, JL</td>\n", | |
" <td>WIEMAN, CE</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>80426</th>\n", | |
" <td>16726</td>\n", | |
" <td>16725</td>\n", | |
" <td>1</td>\n", | |
" <td>ROBERTS, JL</td>\n", | |
" <td>CLAUSSEN, NR</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>80422</th>\n", | |
" <td>16726</td>\n", | |
" <td>16724</td>\n", | |
" <td>1</td>\n", | |
" <td>ROBERTS, JL</td>\n", | |
" <td>CORNISH, SL</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>41178</th>\n", | |
" <td>16726</td>\n", | |
" <td>5350</td>\n", | |
" <td>1</td>\n", | |
" <td>ROBERTS, JL</td>\n", | |
" <td>CORNELL, EA</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>95188 rows × 5 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" AUTHOR_ID CO-AUTHOR_ID NO_OF_BOOKS AUTHOR CO-AUTHOR\n", | |
"0 1 190 7 RADZIHOVSKY, L TONER, J\n", | |
"66 1 2281 1 RADZIHOVSKY, L FREY, E\n", | |
"93 1 3896 1 RADZIHOVSKY, L GINZBURG, VV\n", | |
"101 1 3897 2 RADZIHOVSKY, L CLARK, NA\n", | |
"110 1 12347 1 RADZIHOVSKY, L JACOBSEN, B\n", | |
"113 1 12348 1 RADZIHOVSKY, L SAUNDERS, K\n", | |
"116 1 12700 1 RADZIHOVSKY, L LINK, DR\n", | |
"123 1 12701 1 RADZIHOVSKY, L NATALE, G\n", | |
"130 1 12702 1 RADZIHOVSKY, L MACLENNAN, JE\n", | |
"137 1 12703 1 RADZIHOVSKY, L WALSH, M\n", | |
"144 1 12704 1 RADZIHOVSKY, L KEAST, SS\n", | |
"151 1 12705 1 RADZIHOVSKY, L NEUBERT, ME\n", | |
"55 1 1075 3 RADZIHOVSKY, L MARCHETTI, MC\n", | |
"35 1 562 2 RADZIHOVSKY, L NELSON, DR\n", | |
"11 1 201 3 RADZIHOVSKY, L BALENTS, L\n", | |
"159 2 1237 2 FRISCHAT, SD DORON, E\n", | |
"158 2 3 1 FRISCHAT, SD KUHN, R\n", | |
"160 3 2 1 KUHN, R FRISCHAT, SD\n", | |
"311 4 10757 9 BEENAKKER, CWJ PATRA, M\n", | |
"215 4 891 2 BEENAKKER, CWJ LEYRONAS, X\n", | |
"219 4 1785 1 BEENAKKER, CWJ BUTTIKER, M\n", | |
"243 4 2212 5 BEENAKKER, CWJ LANGEN, SAV\n", | |
"200 4 722 1 BEENAKKER, CWJ FRAHM, K\n", | |
"255 4 2264 1 BEENAKKER, CWJ BLANTER, YM\n", | |
"196 4 7 4 BEENAKKER, CWJ JONG, MJMD\n", | |
"268 4 2876 9 BEENAKKER, CWJ SCHOMERUS, H\n", | |
"290 4 3232 5 BEENAKKER, CWJ MISIRPASHAEV, TS\n", | |
"306 4 8994 2 BEENAKKER, CWJ TWORZYDLO, J\n", | |
"293 4 3478 7 BEENAKKER, CWJ FRAHM, KM\n", | |
"162 4 5 5 BEENAKKER, CWJ MELSEN, JA\n", | |
"... ... ... ... ... ...\n", | |
"67404 16718 8621 1 RENZ, F JAKOB, G\n", | |
"87838 16718 16716 1 RENZ, F TREMEL, W\n", | |
"87566 16719 13396 1 GUETLICH, P RITTER, C\n", | |
"83218 16719 13170 1 GUETLICH, P WESTERBURG, W\n", | |
"87855 16719 16718 1 GUETLICH, P RENZ, F\n", | |
"87847 16719 16717 1 GUETLICH, P WALDECK, M\n", | |
"67405 16719 8621 1 GUETLICH, P JAKOB, G\n", | |
"87839 16719 16716 1 GUETLICH, P TREMEL, W\n", | |
"87831 16719 16715 1 GUETLICH, P FELSER, C\n", | |
"87823 16719 16714 1 GUETLICH, P LANG, O\n", | |
"80930 16720 11553 1 HAEUSSLER, R LOEHNEYSEN, HV\n", | |
"51427 16720 15834 1 HAEUSSLER, R SCHEER, E\n", | |
"91849 16720 16721 1 HAEUSSLER, R WEBER, HB\n", | |
"51428 16721 15834 1 WEBER, HB SCHEER, E\n", | |
"80931 16721 11553 1 WEBER, HB LOEHNEYSEN, HV\n", | |
"91846 16721 16720 1 WEBER, HB HAEUSSLER, R\n", | |
"74577 16723 14983 1 LEUNG, MA CARR, LD\n", | |
"68050 16723 4256 1 LEUNG, MA REINHARDT, WP\n", | |
"41242 16724 8897 1 CORNISH, SL WIEMAN, CE\n", | |
"80425 16724 16725 1 CORNISH, SL CLAUSSEN, NR\n", | |
"80429 16724 16726 1 CORNISH, SL ROBERTS, JL\n", | |
"41176 16724 5350 1 CORNISH, SL CORNELL, EA\n", | |
"41177 16725 5350 1 CLAUSSEN, NR CORNELL, EA\n", | |
"80430 16725 16726 1 CLAUSSEN, NR ROBERTS, JL\n", | |
"41243 16725 8897 1 CLAUSSEN, NR WIEMAN, CE\n", | |
"80421 16725 16724 1 CLAUSSEN, NR CORNISH, SL\n", | |
"41244 16726 8897 1 ROBERTS, JL WIEMAN, CE\n", | |
"80426 16726 16725 1 ROBERTS, JL CLAUSSEN, NR\n", | |
"80422 16726 16724 1 ROBERTS, JL CORNISH, SL\n", | |
"41178 16726 5350 1 ROBERTS, JL CORNELL, EA\n", | |
"\n", | |
"[95188 rows x 5 columns]" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"a = authors.assign(CO_AUTHOR_ID=authors['AUTHOR_ID']).assign(CO_AUTHOR_NAME=authors['AUTHOR_NAME'])\n", | |
"b = occurrence.merge(authors, how='inner', on='AUTHOR_ID')\n", | |
"#.merge(a, how='inner', on='AUTHOR_ID')\n", | |
"df = b.merge(a, how='inner', on='CO_AUTHOR_ID') \\\n", | |
" .sort_values(by='AUTHOR_ID_x') \\\n", | |
" .drop('AUTHOR_ID_y', axis=1) \\\n", | |
" .drop('AUTHOR_NAME_y', axis=1)\n", | |
"df.columns = ['AUTHOR_ID', 'CO-AUTHOR_ID', 'NO_OF_BOOKS', 'AUTHOR', 'CO-AUTHOR']\n", | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"95188\n", | |
"Printing nodes over\n", | |
"95188\n", | |
"Printing nodes and edges over\n" | |
] | |
} | |
], | |
"source": [ | |
"f = open(FILE_NAME, \"w\")\n", | |
"#helpers\n", | |
"s = \" \"\n", | |
"ss = s+s\n", | |
"sss = s+s+s\n", | |
"ssss = s+s+s+s\n", | |
"nl = \"\\n\"\n", | |
"\n", | |
"#loop helpers\n", | |
"added = []\n", | |
"ind = 0\n", | |
"\n", | |
"#Root node\n", | |
"f.write(\"graph\"+nl)\n", | |
"f.write(\"[\"+nl)\n", | |
"\n", | |
"#Write an edge\n", | |
"def write_edge(r):\n", | |
" f.write( ss + \"edge\" + nl)\n", | |
" f.write( ss + \"[\" + nl)\n", | |
" f.write( ssss + \"source\" + s + '\"' + str(r['AUTHOR_ID']) + '\"' + nl)\n", | |
" f.write( ssss + \"target\" + s + '\"' + str(r['CO-AUTHOR_ID']) + '\"' + nl)\n", | |
" f.write( ssss + \"value\" + s + str(r['NO_OF_BOOKS']) + nl)\n", | |
" f.write( ss + \"]\"+ nl)\n", | |
"\n", | |
"#Write a node\n", | |
"def write_node(r):\n", | |
" f.write( ss + \"node\" + nl)\n", | |
" f.write( ss + \"[\" + nl)\n", | |
" f.write( ssss + \"id\" + s + '\"' + str(r['AUTHOR_ID']) + '\"' + nl)\n", | |
" f.write( ssss + \"label\" + s + '\"' + str(r['AUTHOR']) + '\"' + nl)\n", | |
" f.write( ss + \"]\"+ nl)\n", | |
"\n", | |
"#Generate nodes\n", | |
"for i, r in df.iterrows():\n", | |
" #increment, as index not reliable\n", | |
" ind += 1\n", | |
" #Check for duplicates\n", | |
" if (r['AUTHOR_ID'] not in added):\n", | |
" #Add to list\n", | |
" added.append(r['AUTHOR_ID'])\n", | |
" write_node(r)\n", | |
" #print the progress \n", | |
" progress(ind)\n", | |
"\n", | |
"print(nl+\"Printing nodes over\")\n", | |
"\n", | |
"#flush index\n", | |
"ind = 0 \n", | |
"#Generate edges \n", | |
"for i, r in df.iterrows():\n", | |
" #increment, as index not reliable\n", | |
" ind += 1\n", | |
" if(r['AUTHOR_ID'] < r['CO-AUTHOR_ID']):\n", | |
" write_edge(r)\n", | |
" #print the progress \n", | |
" progress(ind)\n", | |
"\n", | |
"print(nl+\"Printing nodes and edges over\")\n", | |
"\n", | |
"#closing node\n", | |
"f.write(\"]\"+nl)\n", | |
"f.close()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.1" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Can I have the dataset for the CSV files? I would like to see the structure of the CSV files.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
xml2csv
import sys, time
import pandas as pd
import datetime as dt
from IPython.display import display
import csv
import xml.etree.ElementTree as ET
def xml_to_csv(file_path, csv_name) -> None:
    """Parse the XML document at ``file_path`` and obtain its root element.

    Args:
        file_path: Path (or file object) of the XML document, passed
            straight to ``ET.parse``.
        csv_name: Name of the CSV file to produce. It is not used by the
            statements visible in this snippet.

    Raises:
        xml.etree.ElementTree.ParseError: If the document is not
            well-formed XML (raised by ``ET.parse``).
        OSError: If ``file_path`` cannot be opened.

    NOTE(review): the posted snippet stops after fetching the root element,
    so no CSV is written yet -- the remainder of the function appears to be
    missing and needs to be supplied by the author.
    """
    tree = ET.parse(file_path)
    # Kept from the original snippet; presumably the starting point for the
    # (missing) row extraction -- TODO confirm with the author.
    root = tree.getroot()
if name = 'main':