Skip to content

Instantly share code, notes, and snippets.

@restrepo
Created November 11, 2016 21:24
Show Gist options
  • Save restrepo/6047a1bc4da689fe22d653a76c1e16ed to your computer and use it in GitHub Desktop.
Save restrepo/6047a1bc4da689fe22d653a76c1e16ed to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Add DOI info to productos UdeA"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import publications as p\n",
"from unidecode import unidecode"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import re\n",
"import sys\n",
"import numpy as np\n",
"import time\n",
"from publications import *\n",
"from unidecode import unidecode\n",
"import difflib\n",
"def get_doi(\n",
" surname=None,#'florez',\n",
" title=r'Baryonic violation of R-parity from anomalous $U(1)_H$',\n",
" other='',\n",
" DOI=None,\n",
" check_text=None,\n",
" check_mesagges_key=None,\n",
" JSON=False):\n",
" import re\n",
" import requests\n",
" import time\n",
" import sys\n",
" '''\n",
" Search for a DOI and check against the full DOI info. If JSON is set True, the full\n",
" info is returned as a python dictionary\n",
" \n",
" Implementations:\n",
" \n",
" 1) Search and check for a DOI by surname and title or if only DOI is given, just check the DOI.\n",
" \n",
" The checking is doing by comparing check_text with the check_mesagges_key from the full info.\n",
" By default the given 'title' is used for the check.\n",
" \n",
" For other possible check_mesagges_key's, see in a browser the several keys of the 'mesagges' \n",
" dictionary at:\n",
" https://api.crossref.org/v1/works/DOI, \n",
" for example:\n",
" https://api.crossref.org/v1/works/10.1103/physrevd.87.095010\n",
" \n",
" \n",
" 2) if only DOI is given, just get full info aboit the DOI\n",
" \n",
" Examples:\n",
" 2) get_doi(surname='Florez',title='Baryonic violation of R-parity from anomalous U(1)H',JSON=True)\n",
" 3) get_doi(DOI)\n",
" '''\n",
" similarity=0.6\n",
" if not check_text:\n",
" check_text=title.lower()\n",
" if not check_mesagges_key:\n",
" check_mesagges_key='title' #'container-title'\n",
" if not surname and DOI:\n",
" if not check_text and not check_mesagges_key:\n",
" similarity=0.1\n",
" check_text='http://dx.doi.org/'\n",
" check_mesagges_key='DOI'\n",
" JSON=True\n",
" \n",
" doi=''\n",
" if JSON:\n",
" doi={}\n",
" if not DOI:\n",
" search=''\n",
" if surname:\n",
" search=surname\n",
" if title:\n",
" if len(search)>0:\n",
" search=search+', '+title\n",
" if other:\n",
" if len(search)>0:\n",
" search=search+', '+other\n",
"\n",
" r=requests.get('http://search.crossref.org/?q=%s' %search)\n",
" urldoi='http://dx.doi.org/'\n",
"\n",
" DOI=''\n",
" if len(r.text.split(urldoi))>1:\n",
" DOI=r.text.split(urldoi)[1].split(\"\\',\")[0].split('>\\n')[0]\n",
" else:\n",
" DOI=''\n",
" \n",
" \n",
" if DOI:\n",
" json='https://api.crossref.org/v1/works/'\n",
" rr=requests.get( json+DOI )\n",
" if rr.status_code==200:\n",
" if 'message' in rr.json():\n",
" if check_mesagges_key in rr.json()['message']:\n",
" chk=get_close_matches_Levenshtein(check_text,\\\n",
" rr.json()[\"message\"][check_mesagges_key][0].lower(),\\\n",
" n=1,cutoff=similarity)\n",
" if chk:\n",
" if \"DOI\" in rr.json()[\"message\"]:\n",
" doi=rr.json()[\"message\"][\"DOI\"]\n",
" if JSON:\n",
" doi=rr.json()[\"message\"]\n",
" \n",
" \n",
" time.sleep(1)\n",
" return doi\n",
"def get_close_matches_Levenshtein(word, possibilities,n=3,cutoff=0.6,full=False):\n",
" '''Replaces difflib.get_close_matches with faster algortihm based on\n",
" Levenshtein.ratio.\n",
" HINT: Similarity increase significatively after lower() and unidecode()\n",
" '''\n",
" import pandas as pd\n",
" import Levenshtein\n",
" if type(possibilities)==str:\n",
" possibilities=[possibilities]\n",
" rl=['']\n",
" rs=pd.DataFrame()\n",
" MATCH=False\n",
" for p in possibilities:\n",
" similarity=Levenshtein.ratio(word,p)\n",
" if similarity>=cutoff:\n",
" MATCH=True\n",
" rs=rs.append({'similarity':similarity,'match':p},ignore_index=True)\n",
"\n",
" if MATCH:\n",
" rs=rs.sort_values('similarity',ascending=False).reset_index(drop=True)\n",
" if full:\n",
" return list(rs['match'][:n].values),list(rs['similarity'][:n].values)\n",
" else:\n",
" return list(rs['match'][:n].values)\n",
" else:\n",
" if full:\n",
" return ([],0)\n",
" else:\n",
" return []\n",
" \n",
"def check_hash(df,hashseries,in_hash,min_match=10):\n",
" ''' hashseries obtained from dataframe df, e.g\n",
" hashseris=df.some_column.str.replace('\\W+','').str.lower().map(unicode)\n",
" within which in_hash will be searched for match at least min_match characters \n",
" '''\n",
" comparision=True\n",
" for si in reversed(range(0,len(in_hash)+1)):\n",
" chk=df[hashseries.str.match(in_hash[:si])]\n",
" if chk.shape[0]>0:\n",
" return comparision,chk\n",
" break \n",
" if si<min_match:\n",
" comparision=False\n",
" return comparision,pd.DataFrame()\n",
" \n",
" \n",
"def columns_add_prefix(df,prefix):\n",
" return df.rename_axis( dict( (key,prefix+'_'+key) for key in df.columns.values) , axis=1)\n",
"\n",
"def fill_NaN(df):\n",
" '''Fill NaN entries with proper empty values\n",
" Type : dtype: Fill with\n",
" string: \"0\" : ''\n",
" float : \"float64\" \n",
" '''\n",
" for key in df.columns:\n",
" if df[key].dtype=='O':\n",
" df[key]=df[key].str.strip()\n",
" df[key]=df[key].fillna('')\n",
" elif df[key].dtype=='float64':\n",
" df[key]=df[key].fillna(0.0)\n",
" return df\n",
"def read_excel_fill_NaN(*args, **kwargs):\n",
" '''Fill NaN entries with proper empty values\n",
" Type : dtype: Fill with\n",
" string: \"0\" : ''\n",
" float : \"float64\" \n",
" '''\n",
" df=pd.read_excel(*args, **kwargs)\n",
" df=fill_NaN(df)\n",
" return df\n",
" \n",
"#To add to main publications object:\n",
"def add_sjr_info_from_issn(self,SJR,column_issn='SN',SJR_column_journal='SJR_Title',SJR_column_issn='SJR_Issn'):\n",
" '''self is an publication object and SJR is the info for a journal in column SJR_Issn'''\n",
" if not SJR_column_journal in self.articles.columns:\n",
" sys.exit(\"Run first the the more exact and fast add_sjr_info\")\n",
" \n",
" self.articles=fill_NaN(self.articles)\n",
" kk=self.articles[self.articles[SJR_column_journal]=='']\n",
" for issn in kk[column_issn].str.replace('-','').unique():\n",
" mtch=SJR[SJR[SJR_column_issn].str.contains(issn)].reset_index(drop=True)\n",
" if mtch.shape[0]==1:\n",
" moa=kk[ kk[column_issn].str.replace('-','')==issn ]\n",
" if moa.shape[0]>=1:\n",
" #DEBUG: more filters if \n",
" for key in SJR.columns.values:\n",
" self.articles.loc[moa.index.values,key]=mtch.ix[0][key]\n",
" \n",
" return self\n",
"\n",
"def add_sjr_info_from_journal(self,SJR,column_journal='SO',SJR_column_journal='SJR_Title'):\n",
" '''self is an publication object and SJR is the info for a journal in column SJR_Issn'''\n",
" if not SJR_column_journal in self.articles.columns:\n",
" sys.exit(\"Run first the more exact and fast add_sjr_info\")\n",
" \n",
" self.articles=fill_NaN(self.articles)\n",
" kk=self.articles[self.articles[SJR_column_journal]=='']\n",
" #kk_hash_SO=kk[column_journal].str.replace('\\W+','').str.lower().str.strip().map(unidecode)\n",
" SJR_hash_Title=SJR[SJR_column_journal].str.replace('\\W+','').str.lower().str.strip().map(unidecode)\n",
" for title in kk[column_journal].str.lower().str.strip().unique():\n",
" hash_match,mtch=check_hash(SJR,SJR_hash_Title,re.sub('\\W+','',title).lower().strip() )\n",
" if hash_match:\n",
" mtch=mtch.reset_index(drop=True)\n",
" if mtch.shape[0]>1:\n",
" newtitle=re.sub('\\W+',' ',title)\n",
" mtch=SJR[SJR[SJR_column_journal].str.lower().str.strip().str.match('%s ' %newtitle)]\n",
" if mtch.shape[0]:\n",
" mtch=mtch.reset_index(drop=True)\n",
" \n",
" if mtch.shape[0]==1:\n",
" moa=kk[ kk[column_journal].str.lower().str.strip()==title ]\n",
" if moa.shape[0]>=1:\n",
" for key in SJR.columns.values:\n",
" self.articles.loc[moa.index.values,key]=mtch.ix[0][key]\n",
" return self\n",
"\n",
"def add_sjr_info(self,SJR,column_journal='SO',SJR_column_journal='SJR_Title'):\n",
" '''self is an publication object and SJR is the info for a journal in column SJR_Title'''\n",
" self.articles=self.articles.reset_index(drop=True)\n",
" for joa in np.intersect1d( self.articles[column_journal].str.lower().str.strip().unique(),\\\n",
" SJR[SJR_column_journal].str.lower().str.strip().unique() ):\n",
" moa=self.articles[ self.articles[column_journal].str.lower() == joa ]\n",
" if moa.shape[0]:\n",
" mtch=SJR[SJR[SJR_column_journal].str.lower().str.strip()==joa].reset_index(drop=True)\n",
" if mtch.shape[0]==1:\n",
" #DEBUG: filter by ISSN if >1:\n",
" for key in SJR.columns.values:\n",
" self.articles.loc[moa.index.values,key]=mtch.ix[0][key]\n",
" \n",
" return self\n",
" \n",
"def merge_with_close_matches(left,right,left_on,right_on,left_extra_on,right_extra_on,how='inner',\\\n",
" n=1,cutoff=0.6,full=True):\n",
" '''For each entry of the column: left_on of DataFrame left (cannot have empty fields), \n",
" try to find the close match inside each row of right DataFrame, by comparing with \n",
" the right_on entry of the row. When a row match is found, the full right row is appended \n",
" to the matched row in the left DataFrame. \n",
" If the similarity between the entries at left_on and right_on is less than 0.8, \n",
" an additional check is performed between the entries left_extra_on and right_extra_on\n",
" of the matched row.\n",
" \n",
" how implemented: inner and left (Default: left)\n",
" '''\n",
" from unidecode import unidecode\n",
" import pandas as pd\n",
" words=left[left_on].str.lower().map(unidecode)\n",
" possibilities=right[right_on].str.lower().map(unidecode)\n",
" \n",
" joined=pd.DataFrame()\n",
" for i in left.index:\n",
" if i%100==0:\n",
" print('.', end=\"\")\n",
" joined_series=left.loc[i]\n",
" #joined_series=joined_series.append(pd.Series( {similarity_column:0} ))\n",
" title,similarity=get_close_matches_Levenshtein(words[i],possibilities,n=n,cutoff=cutoff,full=full)\n",
" if title:\n",
" mtch=right[possibilities==title[0]]\n",
" chk_cutoff=similarity[0]\n",
" crosscheck=0.8\n",
" if chk_cutoff<crosscheck:\n",
" if get_close_matches_Levenshtein( unidecode(left[left_extra_on][i].lower()),\\\n",
" [unidecode(mtch[right_extra_on][mtch.index[0]].lower())] ):\n",
" chk_cutoff=crosscheck+0.1\n",
" if chk_cutoff>=crosscheck:\n",
" joined_series=joined_series.append(mtch.loc[mtch.index[0]])\n",
" #joined_series[similarity_column]=similarity[0]\n",
" if how=='inner':\n",
" joined=joined.append(joined_series,ignore_index=True)\n",
" if how=='left':\n",
" joined=joined.append(joined_series,ignore_index=True)\n",
" return joined"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import io\n",
"import requests\n",
"import pandas as pd\n",
"import numpy as np\n",
"#key='125rkGaqMdlUotxZSVTNHmZ3DCh41jgTwaQ127z8_gGs'\n",
"key='1jTuRWgzwCbFBMzK_8qiTDWx_O66Qw3BaYT_5e5-V3IY'\n",
"#key='1OaTFwLu6KmJGPYdoKBpLJuXIO2eROOMl-hUSGHIdRAA'\n",
"url='https://docs.google.com/spreadsheets/d/%s/gviz/tq?tqx=out:csv' %key\n",
"r=requests.get(url)\n",
"if r.status_code==200:\n",
" csv_file=io.StringIO(r.text) # or directly with: urllib.request.urlopen(url)\n",
" udea=pd.read_csv( csv_file,keep_default_na=False)\n",
" udea=udea.rename_axis({'material ':'material'},axis='columns')"
]
},
{
"cell_type": "code",
"execution_count": 331,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"udea=pd.read_excel('producción_reconocida_2002_2016_doi.xlsx')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(51346, 11)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"udea.shape"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"udea=udea.fillna('')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(11598, 11)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"udea_rp=udea[udea['nombre revista o premio']!='']\n",
"udea_rp=udea_rp.drop_duplicates(subset=['título']).reset_index(drop=True)\n",
"udea_rp.shape"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(11598, 11)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"u=udea_rp.reset_index(drop=True).fillna('')\n",
"u.shape"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"u['simple_title']=u.título.str.lower().map(unidecode).str.split('(').str[0]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nombre</th>\n",
" <th>material</th>\n",
" <th>nro autores</th>\n",
" <th>nombre revista o premio</th>\n",
" <th>año realiz</th>\n",
" <th>título</th>\n",
" <th>país</th>\n",
" <th>idioma</th>\n",
" <th>item adic</th>\n",
" <th>valor item</th>\n",
" <th>simple_title</th>\n",
" <th>apellido</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>QUINTERO RIAZA VICTOR MANUEL</td>\n",
" <td>Articulo en revista Tipo A2</td>\n",
" <td>3</td>\n",
" <td>REVISTA COLOMBIANA DE CIRUGIA</td>\n",
" <td>2015</td>\n",
" <td>MANEJO NO QUIRURGICO DE PACIENTES SINTOMATICOS CON CAPSULA ENDOSCOPICA RETENIDA EN ENFERMEDAD DE CROHN.</td>\n",
" <td>46</td>\n",
" <td>ESPA?</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>manejo no quirurgico de pacientes sintomaticos con capsula endoscopica retenida en enfermedad de crohn.</td>\n",
" <td>quintero</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" nombre material nro autores \\\n",
"0 QUINTERO RIAZA VICTOR MANUEL Articulo en revista Tipo A2 3 \n",
"\n",
" nombre revista o premio año realiz \\\n",
"0 REVISTA COLOMBIANA DE CIRUGIA 2015 \n",
"\n",
" título \\\n",
"0 MANEJO NO QUIRURGICO DE PACIENTES SINTOMATICOS CON CAPSULA ENDOSCOPICA RETENIDA EN ENFERMEDAD DE CROHN. \n",
"\n",
" país idioma item adic valor item \\\n",
"0 46 ESPA? \n",
"\n",
" simple_title \\\n",
"0 manejo no quirurgico de pacientes sintomaticos con capsula endoscopica retenida en enfermedad de crohn. \n",
"\n",
" apellido \n",
"0 quintero "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"u['apellido']=u.nombre.str.lower().map(unidecode).str.split(' ').str[0]\n",
"u[:1]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Obtain DOI comparing titles"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"udea=pd.read_excel('producción_reconocida_2002_2016_doi.xlsx')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(51425, 11)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"udea.shape"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"udea=udea.fillna('')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(11598, 11)"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"udea_rp=udea[udea['nombre revista o premio']!='']\n",
"udea_rp=udea_rp.drop_duplicates(subset=['título']).reset_index(drop=True)\n",
"udea_rp.shape"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"udea_old_doi=udea[udea.doi!=''].reset_index(drop=True)\n",
"udea_not_doi=udea[udea.doi==''].reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"((3053, 11), (48372, 11))"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"udea_old_doi.shape,udea_not_doi.shape"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"WOS_SCP=pd.read_excel('WOS_SCP.xlsx')"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"WOS_SCP=fill_NaN(WOS_SCP)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(1589, 11)"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"uu=udea_rp[udea_rp.doi==''].reset_index(drop=True)\n",
"udea_rp[udea_rp.doi!=''].shape"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(10009, 11)"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"uu.shape"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"uu['simple_title']=uu.título.str.lower().map(unidecode).str.split('(').str[0]"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(4131, 96)"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"WOS_SCP[WOS_SCP.DI!=''].shape\n"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nombre</th>\n",
" <th>material</th>\n",
" <th>nro autores</th>\n",
" <th>nombre revista o premio</th>\n",
" <th>año realiz</th>\n",
" <th>título</th>\n",
" <th>país</th>\n",
" <th>idioma</th>\n",
" <th>item adic</th>\n",
" <th>valor item</th>\n",
" <th>doi</th>\n",
" <th>simple_title</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>QUINTERO RIAZA VICTOR MANUEL</td>\n",
" <td>Articulo en revista Tipo A2</td>\n",
" <td>3</td>\n",
" <td>REVISTA COLOMBIANA DE CIRUGIA</td>\n",
" <td>2015</td>\n",
" <td>MANEJO NO QUIRURGICO DE PACIENTES SINTOMATICOS CON CAPSULA ENDOSCOPICA RETENIDA EN ENFERMEDAD DE CROHN.</td>\n",
" <td>46</td>\n",
" <td>ESPA?</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>manejo no quirurgico de pacientes sintomaticos con capsula endoscopica retenida en enfermedad de crohn.</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" nombre material nro autores \\\n",
"0 QUINTERO RIAZA VICTOR MANUEL Articulo en revista Tipo A2 3 \n",
"\n",
" nombre revista o premio año realiz \\\n",
"0 REVISTA COLOMBIANA DE CIRUGIA 2015 \n",
"\n",
" título \\\n",
"0 MANEJO NO QUIRURGICO DE PACIENTES SINTOMATICOS CON CAPSULA ENDOSCOPICA RETENIDA EN ENFERMEDAD DE CROHN. \n",
"\n",
" país idioma item adic valor item doi \\\n",
"0 46 ESPA? \n",
"\n",
" simple_title \n",
"0 manejo no quirurgico de pacientes sintomaticos con capsula endoscopica retenida en enfermedad de crohn. "
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"uu[:1]"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"....................................................................................................."
]
}
],
"source": [
"uuu=merge_with_close_matches(uu,WOS_SCP[WOS_SCP.DI!=''][['TI','DI','SO']],\\\n",
" left_on='título',right_on='TI',left_extra_on='nombre revista o premio',\\\n",
" right_extra_on='SO',how='left')"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(1944, 15)"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"uuu=uuu.fillna('')\n",
"uuu[uuu.DI!=''].shape"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'10.3305/nh.2015.32.6.9859'"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"uuu[uuu.DI!=''].loc[uuu[uuu.DI!=''].index.values[0],'DI']"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"for i in uuu[uuu.DI!=''].index:\n",
" for j in udea[udea.título==uuu.loc[i,'título']].index:\n",
" udea.loc[j,'doi']=uuu.loc[i,'DI']"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(6253, 11)"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"udea[udea.doi!=''].shape"
]
},
{
"cell_type": "code",
"execution_count": 329,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"udea.to_excel('producción_reconocida_2002_2016_doi.xlsx',index=False)"
]
},
{
"cell_type": "code",
"execution_count": 330,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nombre</th>\n",
" <th>material</th>\n",
" <th>nro autores</th>\n",
" <th>nombre revista o premio</th>\n",
" <th>año realiz</th>\n",
" <th>título</th>\n",
" <th>país</th>\n",
" <th>idioma</th>\n",
" <th>item adic</th>\n",
" <th>valor item</th>\n",
" <th>doi</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>URREA DUQUE JUAN PABLO</td>\n",
" <td>Ponencia en extenso en evento internacional</td>\n",
" <td>2</td>\n",
" <td></td>\n",
" <td>2015</td>\n",
" <td>STATISTICAL PERFORMANCE EVALUATION OF P2P VIDEO STREAMING ON MULTI-HOP WIRELESS NETWORKS.</td>\n",
" <td>46</td>\n",
" <td>INGLE</td>\n",
" <td>EDITORIAL</td>\n",
" <td>IEEE</td>\n",
" <td></td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" nombre material \\\n",
"0 URREA DUQUE JUAN PABLO Ponencia en extenso en evento internacional \n",
"\n",
" nro autores nombre revista o premio año realiz \\\n",
"0 2 2015 \n",
"\n",
" título \\\n",
"0 STATISTICAL PERFORMANCE EVALUATION OF P2P VIDEO STREAMING ON MULTI-HOP WIRELESS NETWORKS. \n",
"\n",
" país idioma item adic valor item doi \n",
"0 46 INGLE EDITORIAL IEEE "
]
},
"execution_count": 330,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"udea[:1]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Obtain DOI comparing simple titles"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#udea=pd.read_excel('producción_reconocida_2002_2016_doi.xlsx')\n",
"if 'doi' not in udea:\n",
" udea['doi']=''\n",
"udea=udea.fillna('')"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def obtain_doi_from_title(udea,WOS_SCP,\\\n",
" title_left_on='simple_title',\\\n",
" title_right_on='TI'):\n",
" udea_rp=udea[udea['nombre revista o premio']!='']\n",
" udea_rp=udea_rp.drop_duplicates(subset=['título']).reset_index(drop=True)\n",
" \n",
" #udea_old_doi=udea[udea.doi!=''].reset_index(drop=True)\n",
" #udea_not_doi=udea[udea.doi==''].reset_index(drop=True)\n",
"\n",
" WOS_SCP=pd.read_excel('WOS_SCP.xlsx')\n",
" WOS_SCP=fill_NaN(WOS_SCP)\n",
"\n",
" uu=udea_rp[udea_rp.doi==''].reset_index(drop=True)\n",
" if title_left_on=='simple_title':\n",
" uu['simple_title']=uu.título.str.lower().map(unidecode).str.split('(').str[0]\n",
"\n",
" uuu=merge_with_close_matches(uu,WOS_SCP[WOS_SCP.DI!=''][[title_right_on,'DI','SO']],\\\n",
" left_on=title_left_on,right_on=title_right_on,left_extra_on='nombre revista o premio',\\\n",
" right_extra_on='SO',how='left')\n",
"\n",
" uuu=uuu.fillna('')\n",
" if 'DI' in uuu:\n",
" uuu=uuu[uuu.DI!='']\n",
" for i in uuu.index:\n",
" for j in udea[udea.título==uuu.loc[i,'título']].index:\n",
" udea.loc[j,'doi']=uuu.loc[i,'DI']\n",
"\n",
"\n",
" return udea"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(0, 12)"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"udea[udea.doi!=''].shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"udea=pd.read_excel('producción_reconocida_2002_2016_doi.xlsx')\n",
"udea=udea.fillna('')\n",
"udea=obtain_doi_from_title(udea,WOS_SCP,\\\n",
" title_left_on='título',\\\n",
" title_right_on='TI')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"udea.to_excel('producción_reconocida_2002_2016_doi.xlsx',index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"udea=pd.read_excel('producción_reconocida_2002_2016_doi.xlsx')\n",
"udea=udea.fillna('')\n",
"udea=obtain_doi_from_title(udea,WOS_SCP,\\\n",
" title_left_on='simple_title',\\\n",
" title_right_on='TI')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"udea.to_excel('producción_reconocida_2002_2016_doi.xlsx',index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"..............."
]
}
],
"source": [
"udea=pd.read_excel('producción_reconocida_2002_2016_doi.xlsx')\n",
"udea=udea.fillna('')\n",
"udea=obtain_doi_from_title(udea,WOS_SCP,\\\n",
" title_left_on='título',\\\n",
" title_right_on='SCP_Title')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"udea.to_excel('producción_reconocida_2002_2016_doi.xlsx',index=False)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"................................................................................."
]
}
],
"source": [
"udea=pd.read_excel('producción_reconocida_2002_2016_doi.xlsx')\n",
"udea=udea.fillna('')\n",
"udea=obtain_doi_from_title(udea,WOS_SCP,\\\n",
" title_left_on='simple_title',\\\n",
" title_right_on='SCP_Title')"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(6299, 11)"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"udea[udea.doi!=''].shape"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"udea.to_excel('producción_reconocida_2002_2016_doi.xlsx',index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"udea[udea.doi!=''].shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Find DOIS online from crossref API \n",
"Move to the end"
]
},
{
"cell_type": "code",
"execution_count": 311,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"u=u[u.doi=='']"
]
},
{
"cell_type": "code",
"execution_count": 312,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(10010, 13)"
]
},
"execution_count": 312,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"u.shape"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"u['simple_title']=u.título.str.lower().map(unidecode).str.split('(').str[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"u['apellido']=u.nombre.str.lower().map(unidecode).str.split(' ').str[0]\n",
"u[:1]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"."
]
}
],
"source": [
"import subprocess\n",
"#u['doi']=''\n",
"TITLE='simple_title'\n",
"#TITLE='título'\n",
"u['title']=u[TITLE].str.lower().map(unidecode)\n",
"\n",
"for i in u.index:\n",
" if i%100==0:\n",
" print('.',end='')\n",
" kk=subprocess.getstatusoutput('echo %d > doi.log' %i)\n",
" #f=open('doi.log','w')\n",
" #f.write('%d' %i)\n",
" #f.close()\n",
" u.loc[i,'doi']=get_doi(u.apellido[i],u['title'][i])"
]
},
{
"cell_type": "code",
"execution_count": 313,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(0, 13)"
]
},
"execution_count": 313,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"u[u.doi!=''].shape"
]
},
{
"cell_type": "code",
"execution_count": 236,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"uu=u[u.doi!=''][['título','doi']].reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 237,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>título</th>\n",
" <th>doi</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>OPTIMIZACION DEL TRATAMIENTO DE AGUAS RESIDUALES DE CULTIVOS DE FLORES USANDO HUMEDALES CONTRUIDOS DE FLUJO SUBSUPERFICIAL HORIZONTAL. (OPTIMIZATION OF WASTEWATER TREATMENT FROM A FLOWER CROPS USI...</td>\n",
" <td>10.17533/udea.rfnsp.v34n1a03</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" título \\\n",
"0 OPTIMIZACION DEL TRATAMIENTO DE AGUAS RESIDUALES DE CULTIVOS DE FLORES USANDO HUMEDALES CONTRUIDOS DE FLUJO SUBSUPERFICIAL HORIZONTAL. (OPTIMIZATION OF WASTEWATER TREATMENT FROM A FLOWER CROPS USI... \n",
"\n",
" doi \n",
"0 10.17533/udea.rfnsp.v34n1a03 "
]
},
"execution_count": 237,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"uu[:1]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.4.2"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment