restrepo · November 11, 2016 21:24
diff --git a/puntaje.ipynb b/puntaje.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Add DOI info to productos UdeA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import publications as p\n",
    "from unidecode import unidecode"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import re\n",
    "import sys\n",
    "import numpy as np\n",
    "import time\n",
    "from publications import *\n",
    "from unidecode import unidecode\n",
    "import difflib\n",
    "def get_doi(\n",
    "        surname=None,#'florez',\n",
    "        title=r'Baryonic violation of R-parity from anomalous $U(1)_H$',\n",
    "        other='',\n",
    "        DOI=None,\n",
    "        check_text=None,\n",
    "        check_mesagges_key=None,\n",
    "        JSON=False):\n",
    "        import re\n",
    "        import requests\n",
    "        import time\n",
    "        import sys\n",
    "        '''\n",
    "        Search for a DOI and check against the full DOI info. If JSON is set True, the full\n",
    "        info is returned as a python dictionary\n",
    "        \n",
    "        Implementations:\n",
    "        \n",
    "        1) Search and check for a DOI by surname and title or if only DOI is given, just check the DOI.\n",
    "        \n",
    "          The checking is doing by comparing check_text with the check_mesagges_key from the full info.\n",
    "          By default the given 'title' is used for the check.\n",
    "          \n",
    "          For other possible check_mesagges_key's, see in a browser the several keys of the 'mesagges' \n",
    "          dictionary at:\n",
    "             https://api.crossref.org/v1/works/DOI, \n",
    "          for example:\n",
    "             https://api.crossref.org/v1/works/10.1103/physrevd.87.095010\n",
    "          \n",
    "        \n",
    "        2) if only DOI is given, just get full info aboit the DOI\n",
    "        \n",
    "        Examples:\n",
    "        2) get_doi(surname='Florez',title='Baryonic violation of R-parity from anomalous U(1)H',JSON=True)\n",
    "        3) get_doi(DOI)\n",
    "        '''\n",
    "        similarity=0.6\n",
    "        if not check_text:\n",
    "            check_text=title.lower()\n",
    "        if not check_mesagges_key:\n",
    "            check_mesagges_key='title' #'container-title'\n",
    "        if not surname and DOI:\n",
    "            if not check_text and not check_mesagges_key:\n",
    "                similarity=0.1\n",
    "                check_text='http://dx.doi.org/'\n",
    "                check_mesagges_key='DOI'\n",
    "                JSON=True\n",
    "            \n",
    "        doi=''\n",
    "        if JSON:\n",
    "            doi={}\n",
    "        if not DOI:\n",
    "            search=''\n",
    "            if surname:\n",
    "                search=surname\n",
    "            if title:\n",
    "                if len(search)>0:\n",
    "                    search=search+', '+title\n",
    "            if other:\n",
    "                if len(search)>0:\n",
    "                    search=search+', '+other\n",
    "\n",
    "            r=requests.get('http://search.crossref.org/?q=%s' %search)\n",
    "            urldoi='http://dx.doi.org/'\n",
    "\n",
    "            DOI=''\n",
    "            if len(r.text.split(urldoi))>1:\n",
    "                DOI=r.text.split(urldoi)[1].split(\"\\',\")[0].split('>\\n')[0]\n",
    "            else:\n",
    "                DOI=''\n",
    "            \n",
    "            \n",
    "        if DOI:\n",
    "            json='https://api.crossref.org/v1/works/'\n",
    "            rr=requests.get( json+DOI )\n",
    "            if rr.status_code==200:\n",
    "                if 'message' in rr.json():\n",
    "                    if check_mesagges_key in rr.json()['message']:\n",
    "                        chk=get_close_matches_Levenshtein(check_text,\\\n",
    "                                                          rr.json()[\"message\"][check_mesagges_key][0].lower(),\\\n",
    "                                                          n=1,cutoff=similarity)\n",
    "                        if chk:\n",
    "                            if \"DOI\" in rr.json()[\"message\"]:\n",
    "                                doi=rr.json()[\"message\"][\"DOI\"]\n",
    "                                if JSON:\n",
    "                                    doi=rr.json()[\"message\"]\n",
    "                                \n",
    "                                    \n",
    "        time.sleep(1)\n",
    "        return doi\n",
    "def get_close_matches_Levenshtein(word, possibilities,n=3,cutoff=0.6,full=False):\n",
    "    '''Replaces difflib.get_close_matches with faster algortihm based on\n",
    "       Levenshtein.ratio.\n",
    "       HINT: Similarity increase significatively after lower() and unidecode()\n",
    "    '''\n",
    "    import pandas as pd\n",
    "    import Levenshtein\n",
    "    if type(possibilities)==str:\n",
    "        possibilities=[possibilities]\n",
    "    rl=['']\n",
    "    rs=pd.DataFrame()\n",
    "    MATCH=False\n",
    "    for p in possibilities:\n",
    "        similarity=Levenshtein.ratio(word,p)\n",
    "        if similarity>=cutoff:\n",
    "            MATCH=True\n",
    "            rs=rs.append({'similarity':similarity,'match':p},ignore_index=True)\n",
    "\n",
    "    if MATCH:\n",
    "        rs=rs.sort_values('similarity',ascending=False).reset_index(drop=True)\n",
    "        if full:\n",
    "            return list(rs['match'][:n].values),list(rs['similarity'][:n].values)\n",
    "        else:\n",
    "            return list(rs['match'][:n].values)\n",
    "    else:\n",
    "        if full:\n",
    "            return ([],0)\n",
    "        else:\n",
    "            return []\n",
    "    \n",
    "def check_hash(df,hashseries,in_hash,min_match=10):\n",
    "    ''' hashseries obtained from dataframe df, e.g\n",
    "          hashseris=df.some_column.str.replace('\\W+','').str.lower().map(unicode)\n",
    "        within which in_hash will be searched for match at least min_match characters  \n",
    "    '''\n",
    "    comparision=True\n",
    "    for si in reversed(range(0,len(in_hash)+1)):\n",
    "        chk=df[hashseries.str.match(in_hash[:si])]\n",
    "        if chk.shape[0]>0:\n",
    "            return comparision,chk\n",
    "            break    \n",
    "        if si<min_match:\n",
    "            comparision=False\n",
    "            return comparision,pd.DataFrame()\n",
    "        \n",
    "            \n",
    "def columns_add_prefix(df,prefix):\n",
    "    return df.rename_axis( dict( (key,prefix+'_'+key) for key in df.columns.values) , axis=1)\n",
    "\n",
    "def fill_NaN(df):\n",
    "    '''Fill NaN entries with proper empty values\n",
    "     Type  : dtype: Fill with\n",
    "     string: \"0\"  : ''\n",
    "     float : \"float64\" \n",
    "    '''\n",
    "    for key in df.columns:\n",
    "        if df[key].dtype=='O':\n",
    "            df[key]=df[key].str.strip()\n",
    "            df[key]=df[key].fillna('')\n",
    "        elif df[key].dtype=='float64':\n",
    "            df[key]=df[key].fillna(0.0)\n",
    "    return df\n",
    "def read_excel_fill_NaN(*args, **kwargs):\n",
    "    '''Fill NaN entries with proper empty values\n",
    "     Type  : dtype: Fill with\n",
    "     string: \"0\"  : ''\n",
    "     float : \"float64\" \n",
    "    '''\n",
    "    df=pd.read_excel(*args, **kwargs)\n",
    "    df=fill_NaN(df)\n",
    "    return df\n",
    "    \n",
    "#To add to main publications object:\n",
    "def add_sjr_info_from_issn(self,SJR,column_issn='SN',SJR_column_journal='SJR_Title',SJR_column_issn='SJR_Issn'):\n",
    "    '''self is an publication object and SJR is the info for a journal in column SJR_Issn'''\n",
    "    if not SJR_column_journal in self.articles.columns:\n",
    "        sys.exit(\"Run first the the more exact and fast add_sjr_info\")\n",
    "            \n",
    "    self.articles=fill_NaN(self.articles)\n",
    "    kk=self.articles[self.articles[SJR_column_journal]=='']\n",
    "    for issn in kk[column_issn].str.replace('-','').unique():\n",
    "        mtch=SJR[SJR[SJR_column_issn].str.contains(issn)].reset_index(drop=True)\n",
    "        if mtch.shape[0]==1:\n",
    "            moa=kk[ kk[column_issn].str.replace('-','')==issn ]\n",
    "            if moa.shape[0]>=1:\n",
    "                #DEBUG: more filters if \n",
    "                for key in SJR.columns.values:\n",
    "                    self.articles.loc[moa.index.values,key]=mtch.ix[0][key]\n",
    "                    \n",
    "    return self\n",
    "\n",
    "def add_sjr_info_from_journal(self,SJR,column_journal='SO',SJR_column_journal='SJR_Title'):\n",
    "    '''self is an publication object and SJR is the info for a journal in column SJR_Issn'''\n",
    "    if not SJR_column_journal in self.articles.columns:\n",
    "        sys.exit(\"Run first the more exact and fast add_sjr_info\")\n",
    "        \n",
    "    self.articles=fill_NaN(self.articles)\n",
    "    kk=self.articles[self.articles[SJR_column_journal]=='']\n",
    "    #kk_hash_SO=kk[column_journal].str.replace('\\W+','').str.lower().str.strip().map(unidecode)\n",
    "    SJR_hash_Title=SJR[SJR_column_journal].str.replace('\\W+','').str.lower().str.strip().map(unidecode)\n",
    "    for title in kk[column_journal].str.lower().str.strip().unique():\n",
    "        hash_match,mtch=check_hash(SJR,SJR_hash_Title,re.sub('\\W+','',title).lower().strip() )\n",
    "        if hash_match:\n",
    "            mtch=mtch.reset_index(drop=True)\n",
    "            if mtch.shape[0]>1:\n",
    "                newtitle=re.sub('\\W+',' ',title)\n",
    "                mtch=SJR[SJR[SJR_column_journal].str.lower().str.strip().str.match('%s ' %newtitle)]\n",
    "                if mtch.shape[0]:\n",
    "                    mtch=mtch.reset_index(drop=True)\n",
    "             \n",
    "            if mtch.shape[0]==1:\n",
    "                moa=kk[ kk[column_journal].str.lower().str.strip()==title ]\n",
    "                if moa.shape[0]>=1:\n",
    "                    for key in SJR.columns.values:\n",
    "                        self.articles.loc[moa.index.values,key]=mtch.ix[0][key]\n",
    "    return self\n",
    "\n",
    "def add_sjr_info(self,SJR,column_journal='SO',SJR_column_journal='SJR_Title'):\n",
    "    '''self is an publication object and SJR is the info for a journal in column SJR_Title'''\n",
    "    self.articles=self.articles.reset_index(drop=True)\n",
    "    for joa in np.intersect1d( self.articles[column_journal].str.lower().str.strip().unique(),\\\n",
    "                                   SJR[SJR_column_journal].str.lower().str.strip().unique() ):\n",
    "        moa=self.articles[ self.articles[column_journal].str.lower() == joa ]\n",
    "        if moa.shape[0]:\n",
    "            mtch=SJR[SJR[SJR_column_journal].str.lower().str.strip()==joa].reset_index(drop=True)\n",
    "            if mtch.shape[0]==1:\n",
    "                #DEBUG: filter by ISSN if >1:\n",
    "                for key in SJR.columns.values:\n",
    "                    self.articles.loc[moa.index.values,key]=mtch.ix[0][key]\n",
    "    \n",
    "    return self\n",
    "        \n",
    "def merge_with_close_matches(left,right,left_on,right_on,left_extra_on,right_extra_on,how='inner',\\\n",
    "                             n=1,cutoff=0.6,full=True):\n",
    "    '''For each entry of the column: left_on of DataFrame left (cannot have empty fields), \n",
    "       try to find the close match inside each row of right DataFrame, by comparing with \n",
    "       the right_on entry of the row. When a row match is found, the full right row is appended \n",
    "       to the matched row in the left DataFrame. \n",
    "       If the similarity between the entries at left_on and right_on is less than 0.8,  \n",
    "       an additional check is performed between the entries left_extra_on and right_extra_on\n",
    "       of the matched row.\n",
    "       \n",
    "       how implemented: inner and left (Default: left)\n",
    "    '''\n",
    "    from unidecode import unidecode\n",
    "    import pandas as pd\n",
    "    words=left[left_on].str.lower().map(unidecode)\n",
    "    possibilities=right[right_on].str.lower().map(unidecode)\n",
    "    \n",
    "    joined=pd.DataFrame()\n",
    "    for i in left.index:\n",
    "        if i%100==0:\n",
    "            print('.', end=\"\")\n",
    "        joined_series=left.loc[i]\n",
    "        #joined_series=joined_series.append(pd.Series( {similarity_column:0} ))\n",
    "        title,similarity=get_close_matches_Levenshtein(words[i],possibilities,n=n,cutoff=cutoff,full=full)\n",
    "        if title:\n",
    "            mtch=right[possibilities==title[0]]\n",
    "            chk_cutoff=similarity[0]\n",
    "            crosscheck=0.8\n",
    "            if chk_cutoff<crosscheck:\n",
    "                if get_close_matches_Levenshtein( unidecode(left[left_extra_on][i].lower()),\\\n",
    "                                                 [unidecode(mtch[right_extra_on][mtch.index[0]].lower())] ):\n",
    "                    chk_cutoff=crosscheck+0.1\n",
    "            if chk_cutoff>=crosscheck:\n",
    "                joined_series=joined_series.append(mtch.loc[mtch.index[0]])\n",
    "                #joined_series[similarity_column]=similarity[0]\n",
    "            if how=='inner':\n",
    "                joined=joined.append(joined_series,ignore_index=True)\n",
    "        if how=='left':\n",
    "            joined=joined.append(joined_series,ignore_index=True)\n",
    "    return joined"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import io\n",
    "import requests\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "#key='125rkGaqMdlUotxZSVTNHmZ3DCh41jgTwaQ127z8_gGs'\n",
    "key='1jTuRWgzwCbFBMzK_8qiTDWx_O66Qw3BaYT_5e5-V3IY'\n",
    "#key='1OaTFwLu6KmJGPYdoKBpLJuXIO2eROOMl-hUSGHIdRAA'\n",
    "url='https://docs.google.com/spreadsheets/d/%s/gviz/tq?tqx=out:csv' %key\n",
    "r=requests.get(url)\n",
    "if r.status_code==200:\n",
    "    csv_file=io.StringIO(r.text) # or directly  with: urllib.request.urlopen(url)\n",
    "    udea=pd.read_csv( csv_file,keep_default_na=False)\n",
    "    udea=udea.rename_axis({'material  ':'material'},axis='columns')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 331,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "udea=pd.read_excel('producción_reconocida_2002_2016_doi.xlsx')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(51346, 11)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "udea.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "udea=udea.fillna('')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(11598, 11)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "udea_rp=udea[udea['nombre revista o premio']!='']\n",
    "udea_rp=udea_rp.drop_duplicates(subset=['título']).reset_index(drop=True)\n",
    "udea_rp.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(11598, 11)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "u=udea_rp.reset_index(drop=True).fillna('')\n",
    "u.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "u['simple_title']=u.título.str.lower().map(unidecode).str.split('(').str[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>nombre</th>\n",
       "      <th>material</th>\n",
       "      <th>nro autores</th>\n",
       "      <th>nombre revista o premio</th>\n",
       "      <th>año realiz</th>\n",
       "      <th>título</th>\n",
       "      <th>país</th>\n",
       "      <th>idioma</th>\n",
       "      <th>item adic</th>\n",
       "      <th>valor item</th>\n",
       "      <th>simple_title</th>\n",
       "      <th>apellido</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>QUINTERO RIAZA VICTOR MANUEL</td>\n",
       "      <td>Articulo en revista Tipo A2</td>\n",
       "      <td>3</td>\n",
       "      <td>REVISTA COLOMBIANA DE CIRUGIA</td>\n",
       "      <td>2015</td>\n",
       "      <td>MANEJO NO QUIRURGICO DE PACIENTES SINTOMATICOS CON CAPSULA ENDOSCOPICA RETENIDA EN ENFERMEDAD DE CROHN.</td>\n",
       "      <td>46</td>\n",
       "      <td>ESPA?</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>manejo no quirurgico de pacientes sintomaticos con capsula endoscopica retenida en enfermedad de crohn.</td>\n",
       "      <td>quintero</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                         nombre                     material  nro autores  \\\n",
       "0  QUINTERO RIAZA VICTOR MANUEL  Articulo en revista Tipo A2            3   \n",
       "\n",
       "         nombre revista o premio  año realiz  \\\n",
       "0  REVISTA COLOMBIANA DE CIRUGIA        2015   \n",
       "\n",
       "                                                                                                    título  \\\n",
       "0  MANEJO NO QUIRURGICO DE PACIENTES SINTOMATICOS CON CAPSULA ENDOSCOPICA RETENIDA EN ENFERMEDAD DE CROHN.   \n",
       "\n",
       "   país idioma item adic valor item  \\\n",
       "0    46  ESPA?                        \n",
       "\n",
       "                                                                                              simple_title  \\\n",
       "0  manejo no quirurgico de pacientes sintomaticos con capsula endoscopica retenida en enfermedad de crohn.   \n",
       "\n",
       "   apellido  \n",
       "0  quintero  "
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "u['apellido']=u.nombre.str.lower().map(unidecode).str.split(' ').str[0]\n",
    "u[:1]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  Obtain DOI comparing titles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "udea=pd.read_excel('producción_reconocida_2002_2016_doi.xlsx')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(51425, 11)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "udea.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "udea=udea.fillna('')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(11598, 11)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "udea_rp=udea[udea['nombre revista o premio']!='']\n",
    "udea_rp=udea_rp.drop_duplicates(subset=['título']).reset_index(drop=True)\n",
    "udea_rp.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "udea_old_doi=udea[udea.doi!=''].reset_index(drop=True)\n",
    "udea_not_doi=udea[udea.doi==''].reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((3053, 11), (48372, 11))"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "udea_old_doi.shape,udea_not_doi.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "WOS_SCP=pd.read_excel('WOS_SCP.xlsx')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "WOS_SCP=fill_NaN(WOS_SCP)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1589, 11)"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "uu=udea_rp[udea_rp.doi==''].reset_index(drop=True)\n",
    "udea_rp[udea_rp.doi!=''].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10009, 11)"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "uu.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "uu['simple_title']=uu.título.str.lower().map(unidecode).str.split('(').str[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4131, 96)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "WOS_SCP[WOS_SCP.DI!=''].shape\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>nombre</th>\n",
       "      <th>material</th>\n",
       "      <th>nro autores</th>\n",
       "      <th>nombre revista o premio</th>\n",
       "      <th>año realiz</th>\n",
       "      <th>título</th>\n",
       "      <th>país</th>\n",
       "      <th>idioma</th>\n",
       "      <th>item adic</th>\n",
       "      <th>valor item</th>\n",
       "      <th>doi</th>\n",
       "      <th>simple_title</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>QUINTERO RIAZA VICTOR MANUEL</td>\n",
       "      <td>Articulo en revista Tipo A2</td>\n",
       "      <td>3</td>\n",
       "      <td>REVISTA COLOMBIANA DE CIRUGIA</td>\n",
       "      <td>2015</td>\n",
       "      <td>MANEJO NO QUIRURGICO DE PACIENTES SINTOMATICOS CON CAPSULA ENDOSCOPICA RETENIDA EN ENFERMEDAD DE CROHN.</td>\n",
       "      <td>46</td>\n",
       "      <td>ESPA?</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>manejo no quirurgico de pacientes sintomaticos con capsula endoscopica retenida en enfermedad de crohn.</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                         nombre                     material  nro autores  \\\n",
       "0  QUINTERO RIAZA VICTOR MANUEL  Articulo en revista Tipo A2            3   \n",
       "\n",
       "         nombre revista o premio  año realiz  \\\n",
       "0  REVISTA COLOMBIANA DE CIRUGIA        2015   \n",
       "\n",
       "                                                                                                    título  \\\n",
       "0  MANEJO NO QUIRURGICO DE PACIENTES SINTOMATICOS CON CAPSULA ENDOSCOPICA RETENIDA EN ENFERMEDAD DE CROHN.   \n",
       "\n",
       "   país idioma item adic valor item doi  \\\n",
       "0    46  ESPA?                            \n",
       "\n",
       "                                                                                              simple_title  \n",
       "0  manejo no quirurgico de pacientes sintomaticos con capsula endoscopica retenida en enfermedad de crohn.  "
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "uu[:1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "....................................................................................................."
     ]
    }
   ],
   "source": [
    "uuu=merge_with_close_matches(uu,WOS_SCP[WOS_SCP.DI!=''][['TI','DI','SO']],\\\n",
    "                         left_on='título',right_on='TI',left_extra_on='nombre revista o premio',\\\n",
    "                            right_extra_on='SO',how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1944, 15)"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "uuu=uuu.fillna('')\n",
    "uuu[uuu.DI!=''].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'10.3305/nh.2015.32.6.9859'"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "uuu[uuu.DI!=''].loc[uuu[uuu.DI!=''].index.values[0],'DI']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "for i in uuu[uuu.DI!=''].index:\n",
    "    for j in udea[udea.título==uuu.loc[i,'título']].index:\n",
    "        udea.loc[j,'doi']=uuu.loc[i,'DI']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(6253, 11)"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "udea[udea.doi!=''].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 329,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "udea.to_excel('producción_reconocida_2002_2016_doi.xlsx',index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 330,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>nombre</th>\n",
       "      <th>material</th>\n",
       "      <th>nro autores</th>\n",
       "      <th>nombre revista o premio</th>\n",
       "      <th>año realiz</th>\n",
       "      <th>título</th>\n",
       "      <th>país</th>\n",
       "      <th>idioma</th>\n",
       "      <th>item adic</th>\n",
       "      <th>valor item</th>\n",
       "      <th>doi</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>URREA DUQUE JUAN PABLO</td>\n",
       "      <td>Ponencia en extenso en evento internacional</td>\n",
       "      <td>2</td>\n",
       "      <td></td>\n",
       "      <td>2015</td>\n",
       "      <td>STATISTICAL PERFORMANCE EVALUATION OF P2P VIDEO STREAMING ON MULTI-HOP WIRELESS NETWORKS.</td>\n",
       "      <td>46</td>\n",
       "      <td>INGLE</td>\n",
       "      <td>EDITORIAL</td>\n",
       "      <td>IEEE</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   nombre                                     material  \\\n",
       "0  URREA DUQUE JUAN PABLO  Ponencia en extenso en evento internacional   \n",
       "\n",
       "   nro autores nombre revista o premio  año realiz  \\\n",
       "0            2                                2015   \n",
       "\n",
       "                                                                                      título  \\\n",
       "0  STATISTICAL PERFORMANCE EVALUATION OF P2P VIDEO STREAMING ON MULTI-HOP WIRELESS NETWORKS.   \n",
       "\n",
       "   país idioma  item adic valor item doi  \n",
       "0    46  INGLE  EDITORIAL       IEEE      "
      ]
     },
     "execution_count": 330,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "udea[:1]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Obtain DOI comparing simple titles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#udea=pd.read_excel('producción_reconocida_2002_2016_doi.xlsx')\n",
    "if 'doi' not in udea:\n",
    "    udea['doi']=''\n",
    "udea=udea.fillna('')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def obtain_doi_from_title(udea,WOS_SCP,\\\n",
    "    title_left_on='simple_title',\\\n",
    "    title_right_on='TI'):\n",
    "    udea_rp=udea[udea['nombre revista o premio']!='']\n",
    "    udea_rp=udea_rp.drop_duplicates(subset=['título']).reset_index(drop=True)\n",
    "    \n",
    "    #udea_old_doi=udea[udea.doi!=''].reset_index(drop=True)\n",
    "    #udea_not_doi=udea[udea.doi==''].reset_index(drop=True)\n",
    "\n",
    "    WOS_SCP=pd.read_excel('WOS_SCP.xlsx')\n",
    "    WOS_SCP=fill_NaN(WOS_SCP)\n",
    "\n",
    "    uu=udea_rp[udea_rp.doi==''].reset_index(drop=True)\n",
    "    if title_left_on=='simple_title':\n",
    "        uu['simple_title']=uu.título.str.lower().map(unidecode).str.split('(').str[0]\n",
    "\n",
    "    uuu=merge_with_close_matches(uu,WOS_SCP[WOS_SCP.DI!=''][[title_right_on,'DI','SO']],\\\n",
    "                             left_on=title_left_on,right_on=title_right_on,left_extra_on='nombre revista o premio',\\\n",
    "                                right_extra_on='SO',how='left')\n",
    "\n",
    "    uuu=uuu.fillna('')\n",
    "    if 'DI' in uuu:\n",
    "        uuu=uuu[uuu.DI!='']\n",
    "        for i in uuu.index:\n",
    "            for j in udea[udea.título==uuu.loc[i,'título']].index:\n",
    "                udea.loc[j,'doi']=uuu.loc[i,'DI']\n",
    "\n",
    "\n",
    "    return udea"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0, 12)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "udea[udea.doi!=''].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "udea=pd.read_excel('producción_reconocida_2002_2016_doi.xlsx')\n",
    "udea=udea.fillna('')\n",
    "udea=obtain_doi_from_title(udea,WOS_SCP,\\\n",
    "    title_left_on='título',\\\n",
    "    title_right_on='TI')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "udea.to_excel('producción_reconocida_2002_2016_doi.xlsx',index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "udea=pd.read_excel('producción_reconocida_2002_2016_doi.xlsx')\n",
    "udea=udea.fillna('')\n",
    "udea=obtain_doi_from_title(udea,WOS_SCP,\\\n",
    "    title_left_on='simple_title',\\\n",
    "    title_right_on='TI')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "udea.to_excel('producción_reconocida_2002_2016_doi.xlsx',index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "..............."
     ]
    }
   ],
   "source": [
    "udea=pd.read_excel('producción_reconocida_2002_2016_doi.xlsx')\n",
    "udea=udea.fillna('')\n",
    "udea=obtain_doi_from_title(udea,WOS_SCP,\\\n",
    "    title_left_on='título',\\\n",
    "    title_right_on='SCP_Title')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "udea.to_excel('producción_reconocida_2002_2016_doi.xlsx',index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "................................................................................."
     ]
    }
   ],
   "source": [
    "udea=pd.read_excel('producción_reconocida_2002_2016_doi.xlsx')\n",
    "udea=udea.fillna('')\n",
    "udea=obtain_doi_from_title(udea,WOS_SCP,\\\n",
    "    title_left_on='simple_title',\\\n",
    "    title_right_on='SCP_Title')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(6299, 11)"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "udea[udea.doi!=''].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "udea.to_excel('producción_reconocida_2002_2016_doi.xlsx',index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "udea[udea.doi!=''].shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Find  DOIS online from crossref API \n",
    "Move to the end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 311,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "u=u[u.doi=='']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 312,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10010, 13)"
      ]
     },
     "execution_count": 312,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "u.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "u['simple_title']=u.título.str.lower().map(unidecode).str.split('(').str[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "u['apellido']=u.nombre.str.lower().map(unidecode).str.split(' ').str[0]\n",
    "u[:1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "."
     ]
    }
   ],
   "source": [
    "import subprocess\n",
    "#u['doi']=''\n",
    "TITLE='simple_title'\n",
    "#TITLE='título'\n",
    "u['title']=u[TITLE].str.lower().map(unidecode)\n",
    "\n",
    "for i in u.index:\n",
    "    if i%100==0:\n",
    "        print('.',end='')\n",
    "    kk=subprocess.getstatusoutput('echo %d > doi.log' %i)\n",
    "    #f=open('doi.log','w')\n",
    "    #f.write('%d' %i)\n",
    "    #f.close()\n",
    "    u.loc[i,'doi']=get_doi(u.apellido[i],u['title'][i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 313,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0, 13)"
      ]
     },
     "execution_count": 313,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "u[u.doi!=''].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 236,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "uu=u[u.doi!=''][['título','doi']].reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 237,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>título</th>\n",
       "      <th>doi</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>OPTIMIZACION DEL TRATAMIENTO DE AGUAS RESIDUALES DE CULTIVOS DE FLORES USANDO HUMEDALES CONTRUIDOS DE FLUJO SUBSUPERFICIAL HORIZONTAL. (OPTIMIZATION OF WASTEWATER TREATMENT FROM A FLOWER CROPS USI...</td>\n",
       "      <td>10.17533/udea.rfnsp.v34n1a03</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                                                                                                                                                                    título  \\\n",
       "0  OPTIMIZACION DEL TRATAMIENTO DE AGUAS RESIDUALES DE CULTIVOS DE FLORES USANDO HUMEDALES CONTRUIDOS DE FLUJO SUBSUPERFICIAL HORIZONTAL. (OPTIMIZATION OF WASTEWATER TREATMENT FROM A FLOWER CROPS USI...   \n",
       "\n",
       "                            doi  \n",
       "0  10.17533/udea.rfnsp.v34n1a03  "
      ]
     },
     "execution_count": 237,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "uu[:1]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.4.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
 }