Last active
February 9, 2018 13:23
-
-
Save ericfourrier/824676a9976a5f2330782325f6d509b6 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Citation Mining\n", | |
"====" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Python standard\n", | |
"import random\n", | |
"import numpy as np\n", | |
"import csv\n", | |
"\n", | |
"# Machine learning libs\n", | |
"from sklearn import svm\n", | |
"from sklearn.linear_model import SGDClassifier\n", | |
"from sklearn.metrics.pairwise import linear_kernel\n", | |
"from sklearn import preprocessing\n", | |
"from sklearn import metrics\n", | |
"from sklearn.ensemble import RandomForestClassifier\n", | |
"from sklearn.linear_model import LogisticRegression\n", | |
"import networkx as nx\n", | |
"import re\n", | |
"from gensim.models import Word2Vec\n", | |
"# User scripts\n", | |
"from utils import load_data\n", | |
"from features import (TitleAuthorYear, \n", | |
" TextOnly, \n", | |
" GraphFeatureExtractor, \n", | |
" GraphOnly)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Look at the pandas DataFrame" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"colnames_nodes = [\"id\", \"publication_year\", \"title\", \"authors\", \"journal_name\", \"abstract\"]\n", | |
"df_nodes = pd.read_csv(\"../dataset/node_information.csv\",header=None, names=colnames_nodes)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>id</th>\n", | |
" <th>publication_year</th>\n", | |
" <th>title</th>\n", | |
" <th>authors</th>\n", | |
" <th>journal_name</th>\n", | |
" <th>abstract</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1001</td>\n", | |
" <td>2000</td>\n", | |
" <td>compactification geometry and duality</td>\n", | |
" <td>Paul S. Aspinwall</td>\n", | |
" <td>NaN</td>\n", | |
" <td>these are notes based on lectures given at tas...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1002</td>\n", | |
" <td>2000</td>\n", | |
" <td>domain walls and massive gauged supergravity p...</td>\n", | |
" <td>M. Cvetic, H. Lu, C.N. Pope</td>\n", | |
" <td>Class.Quant.Grav.</td>\n", | |
" <td>we point out that massive gauged supergravity ...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1003</td>\n", | |
" <td>2000</td>\n", | |
" <td>comment on metric fluctuations in brane worlds</td>\n", | |
" <td>Y.S. Myung, Gungwon Kang</td>\n", | |
" <td>NaN</td>\n", | |
" <td>recently ivanov and volovich hep-th 9912242 cl...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>1004</td>\n", | |
" <td>2000</td>\n", | |
" <td>moving mirrors and thermodynamic paradoxes</td>\n", | |
" <td>Adam D. Helfer</td>\n", | |
" <td>Phys.Rev.</td>\n", | |
" <td>quantum fields responding to moving mirrors ha...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>1005</td>\n", | |
" <td>2000</td>\n", | |
" <td>bundles of chiral blocks and boundary conditio...</td>\n", | |
" <td>J. Fuchs, C. Schweigert</td>\n", | |
" <td>NaN</td>\n", | |
" <td>proceedings of lie iii clausthal july 1999 var...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>1006</td>\n", | |
" <td>2000</td>\n", | |
" <td>questions in quantum physics</td>\n", | |
" <td>Rudolf Haag</td>\n", | |
" <td>NaN</td>\n", | |
" <td>an assessment of the present status of the the...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>1007</td>\n", | |
" <td>2000</td>\n", | |
" <td>topological defects in 3-d euclidean gravity</td>\n", | |
" <td>Sheng Li, Yong Zhang, Zhongyuan Zhu</td>\n", | |
" <td>NaN</td>\n", | |
" <td>by making use of the complete decomposition of...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>1008</td>\n", | |
" <td>2000</td>\n", | |
" <td>n 0 supersymmetry and the non-relativistic mon...</td>\n", | |
" <td>Donald Spector</td>\n", | |
" <td>Phys.Lett.</td>\n", | |
" <td>we study some of the algebraic properties of t...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>1009</td>\n", | |
" <td>2000</td>\n", | |
" <td>gluon pair production from space-time dependen...</td>\n", | |
" <td>Gouranga C. Nayak, Walter Greiner</td>\n", | |
" <td>NaN</td>\n", | |
" <td>we compute the probabilty for the processes a ...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>1010</td>\n", | |
" <td>2000</td>\n", | |
" <td>instantons euclidean supersymmetry and wick ro...</td>\n", | |
" <td>A.V. Belitsky, S. V, oren, P. van Nieuwenhuizen</td>\n", | |
" <td>Phys.Lett.</td>\n", | |
" <td>we discuss the reality properties of the fermi...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td>1011</td>\n", | |
" <td>2000</td>\n", | |
" <td>noncommutativities of d-branes and theta chang...</td>\n", | |
" <td>Tsunehide Kuroki</td>\n", | |
" <td>Phys.Lett.</td>\n", | |
" <td>in d-brane matrix models improved it is known ...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td>1012</td>\n", | |
" <td>2000</td>\n", | |
" <td>boundary liouville field theory i boundary sta...</td>\n", | |
" <td>V. Fateev (Montpellier), A. Zamolodchikov (Rut...</td>\n", | |
" <td>NaN</td>\n", | |
" <td>function montpellier liouville conformal field...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12</th>\n", | |
" <td>1013</td>\n", | |
" <td>2000</td>\n", | |
" <td>gravity and the newtonian limit in the randall...</td>\n", | |
" <td>Rainer Dick, Dzo Mikulovicz</td>\n", | |
" <td>Phys.Lett.</td>\n", | |
" <td>appended a remark on the compatibility of the ...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>13</th>\n", | |
" <td>1014</td>\n", | |
" <td>2000</td>\n", | |
" <td>gauge theories in local causal perturbation th...</td>\n", | |
" <td>Franz-Marc Boas</td>\n", | |
" <td>NaN</td>\n", | |
" <td>in this thesis quantum gauge theories are cons...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14</th>\n", | |
" <td>1015</td>\n", | |
" <td>2000</td>\n", | |
" <td>canonical quantization and topological theories</td>\n", | |
" <td>Alice Rogers</td>\n", | |
" <td>Nucl.Phys.Proc.Suppl.</td>\n", | |
" <td>dynamics and quantum gravity villasimius sardi...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>15</th>\n", | |
" <td>1016</td>\n", | |
" <td>2000</td>\n", | |
" <td>a comment on the holographic renormalization g...</td>\n", | |
" <td>Enrique Alvarez, Cesar Gomez</td>\n", | |
" <td>Phys.Lett.</td>\n", | |
" <td>theorem the equivalence between the holographi...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>16</th>\n", | |
" <td>1017</td>\n", | |
" <td>2000</td>\n", | |
" <td>the interaction of two hopf solitons</td>\n", | |
" <td>R. S. Ward</td>\n", | |
" <td>Phys.Lett.</td>\n", | |
" <td>this letter deals with topological solitons in...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>17</th>\n", | |
" <td>1018</td>\n", | |
" <td>2000</td>\n", | |
" <td>solitons in brane worlds ii</td>\n", | |
" <td>NaN</td>\n", | |
" <td>Nucl.Phys.</td>\n", | |
" <td>added minor errors corrected we study the solu...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>18</th>\n", | |
" <td>1019</td>\n", | |
" <td>2000</td>\n", | |
" <td>casimir energy of a dilute dielectric ball wit...</td>\n", | |
" <td>I. Klich, J. Feinberg, A. Mann, M. Revzen</td>\n", | |
" <td>Phys.Rev.</td>\n", | |
" <td>light at finite temperature the casimir energy...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>19</th>\n", | |
" <td>1020</td>\n", | |
" <td>2000</td>\n", | |
" <td>some uses of moduli spaces in particle and fie...</td>\n", | |
" <td>ST Tsou (Oxford)</td>\n", | |
" <td>NaN</td>\n", | |
" <td>women in mathematics workshop on moduli spaces...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>20</th>\n", | |
" <td>1021</td>\n", | |
" <td>2000</td>\n", | |
" <td>a two-loop test of buscher's t-duality i</td>\n", | |
" <td>Zalan Horvath, Robert L. Karp, Laszlo Palla</td>\n", | |
" <td>Phys.Rev.</td>\n", | |
" <td>we study the two loop quantum equivalence of s...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>21</th>\n", | |
" <td>1022</td>\n", | |
" <td>2000</td>\n", | |
" <td>supersymmetry and bogomol'nyi equations in the...</td>\n", | |
" <td>Bogdan Damski (Jagiellonian University)</td>\n", | |
" <td>Acta</td>\n", | |
" <td>systems we take advantage of the superspace fo...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>22</th>\n", | |
" <td>1023</td>\n", | |
" <td>2000</td>\n", | |
" <td>the string uncertainty relations follow from t...</td>\n", | |
" <td>NaN</td>\n", | |
" <td>Found.Phys.</td>\n", | |
" <td>principle polymomenta coordinates has been mad...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>23</th>\n", | |
" <td>1024</td>\n", | |
" <td>2000</td>\n", | |
" <td>bps states of d 4 n 1 supersymmetry</td>\n", | |
" <td>Jerome P. Gauntlett, Gary W. Gibbons, Christop...</td>\n", | |
" <td>Commun.Math.Phys.</td>\n", | |
" <td>townsend expanded discussion on bps states in ...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>24</th>\n", | |
" <td>1025</td>\n", | |
" <td>2000</td>\n", | |
" <td>collapsing d-branes in calabi-yau moduli space</td>\n", | |
" <td>Brian R. Greene, C. I. Lazaroiu</td>\n", | |
" <td>Nucl.Phys.</td>\n", | |
" <td>we study the quantum volume of d-branes wrappe...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>25</th>\n", | |
" <td>1026</td>\n", | |
" <td>2000</td>\n", | |
" <td>poisson-sigma models</td>\n", | |
" <td>Allen C. Hirshfeld (University of Dortmund), T...</td>\n", | |
" <td>NaN</td>\n", | |
" <td>university of dortmund physical variables in g...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>26</th>\n", | |
" <td>1027</td>\n", | |
" <td>2000</td>\n", | |
" <td>bi-local fields in noncommutative field theory</td>\n", | |
" <td>Satoshi Iso, Hikaru Kawai, Yoshihisa Kitazawa</td>\n", | |
" <td>Nucl.Phys.</td>\n", | |
" <td>we propose a bi-local representation in noncom...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>27</th>\n", | |
" <td>1028</td>\n", | |
" <td>2000</td>\n", | |
" <td>tree amplitudes and linearized susy invariants...</td>\n", | |
" <td>Domenico Seminara (LPT-ENS, Paris)</td>\n", | |
" <td>Nucl.Phys.Proc.Suppl.</td>\n", | |
" <td>gravity villasimius sept 13-17 1999 we exploit...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>28</th>\n", | |
" <td>1029</td>\n", | |
" <td>2000</td>\n", | |
" <td>type i and real algebraic geometry</td>\n", | |
" <td>F.A. Cachazo, C. Vafa</td>\n", | |
" <td>NaN</td>\n", | |
" <td>we revisit the duality between type i and hete...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>29</th>\n", | |
" <td>1030</td>\n", | |
" <td>2000</td>\n", | |
" <td>a geometric discretisation scheme applied to t...</td>\n", | |
" <td>Samik Sen, Siddhartha Sen, James C. Sexton, Da...</td>\n", | |
" <td>Phys.Rev.</td>\n", | |
" <td>theory we give a detailed general description ...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>27740</th>\n", | |
" <td>9912264</td>\n", | |
" <td>1999</td>\n", | |
" <td>the osp 1 4 superparticle and exotic bps states</td>\n", | |
" <td>Igor B, os, Jerzy Lukierski, Dmitri Sorokin</td>\n", | |
" <td>NaN</td>\n", | |
" <td>symposium karpacz poland september 21 25 1999 ...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>27741</th>\n", | |
" <td>9912265</td>\n", | |
" <td>1999</td>\n", | |
" <td>symmetrization of berezin star product and pat...</td>\n", | |
" <td>Satoru Saito, Kazunori Wakatsuki</td>\n", | |
" <td>Prog.Theor.Phys.</td>\n", | |
" <td>we propose a new star pruduct which interpolat...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>27742</th>\n", | |
" <td>9912266</td>\n", | |
" <td>1999</td>\n", | |
" <td>deformation of conifold and intersecting branes</td>\n", | |
" <td>Kazutoshi Ohta, Takashi Yokono</td>\n", | |
" <td>JHEP</td>\n", | |
" <td>we study the relation between intersecting ns5...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>27743</th>\n", | |
" <td>9912267</td>\n", | |
" <td>1999</td>\n", | |
" <td>on the relation between euclidean and lorentzi...</td>\n", | |
" <td>J. Ambjorn, J. Correia, C. Kristjansen (NBI), ...</td>\n", | |
" <td>Phys.Lett.</td>\n", | |
" <td>starting from 2d euclidean quantum gravity we ...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>27744</th>\n", | |
" <td>9912268</td>\n", | |
" <td>2000</td>\n", | |
" <td>yang-mills theory in three dimensions as quant...</td>\n", | |
" <td>Dmitri Diakonov, Victor Petrov</td>\n", | |
" <td>J.Exp.Theor.Phys.</td>\n", | |
" <td>1012-1035 we perform the dual transformation o...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>27745</th>\n", | |
" <td>9912269</td>\n", | |
" <td>1999</td>\n", | |
" <td>quadratic effective action for qed in d 2 3 di...</td>\n", | |
" <td>D. Dalmazi, A. de Souza Dutra, Marcelo Hott</td>\n", | |
" <td>Phys.Rev.</td>\n", | |
" <td>we calculate the effective action for quantum ...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>27746</th>\n", | |
" <td>9912270</td>\n", | |
" <td>2000</td>\n", | |
" <td>yang-lee zeros of the ising model on random gr...</td>\n", | |
" <td>Luiz C. de Albuquerque, Nelson A. Alves, D. Da...</td>\n", | |
" <td>Nucl.Phys.</td>\n", | |
" <td>topology nucl phys b we obtain in a closed for...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>27747</th>\n", | |
" <td>9912271</td>\n", | |
" <td>1999</td>\n", | |
" <td>lectures on supersymmetric yang-mills theory a...</td>\n", | |
" <td>Eric D'Hoker (UCLA), D. H. Phong (Columbia Uni...</td>\n", | |
" <td>NaN</td>\n", | |
" <td>we present a series of four self-contained lec...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>27748</th>\n", | |
" <td>9912272</td>\n", | |
" <td>2000</td>\n", | |
" <td>twisted bundles on noncommutative t 4 and d-br...</td>\n", | |
" <td>Eunsang Kim, Hoil Kim, Nakwoo Kim, Bum-Hoon Le...</td>\n", | |
" <td>Phys.Rev.</td>\n", | |
" <td>hyun seok yang added we construct twisted quan...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>27749</th>\n", | |
" <td>9912273</td>\n", | |
" <td>1999</td>\n", | |
" <td>brane cube realization of three-dimensional no...</td>\n", | |
" <td>Tomomi Muto</td>\n", | |
" <td>JHEP</td>\n", | |
" <td>we study d-branes on three-dimensional orbifol...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>27750</th>\n", | |
" <td>9912274</td>\n", | |
" <td>2000</td>\n", | |
" <td>open string field theory on noncommutative space</td>\n", | |
" <td>Teruhiko Kawano, Tomohiko Takahashi (Univ. of ...</td>\n", | |
" <td>Prog.Theor.Phys.</td>\n", | |
" <td>we study witten's open string field theory in ...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>27751</th>\n", | |
" <td>9912275</td>\n", | |
" <td>1999</td>\n", | |
" <td>d branes in string theory ii</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>in these lectures we review the properties of ...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>27752</th>\n", | |
" <td>9912276</td>\n", | |
" <td>2000</td>\n", | |
" <td>four-dimensional planck scale is not universal...</td>\n", | |
" <td>Takaaki Ozeki, Noriyuki Shimoyama</td>\n", | |
" <td>Prog.Theor.Phys.</td>\n", | |
" <td>randall-sundrum scenario it has recently been ...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>27753</th>\n", | |
" <td>9912277</td>\n", | |
" <td>1999</td>\n", | |
" <td>on g h geometry and its use in m-theory compac...</td>\n", | |
" <td>Leonardo Castellani</td>\n", | |
" <td>Annals</td>\n", | |
" <td>the riemannian geometry of coset spaces is rev...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>27754</th>\n", | |
" <td>9912278</td>\n", | |
" <td>1999</td>\n", | |
" <td>semi-naive dimensional renormalization</td>\n", | |
" <td>M. Pernici</td>\n", | |
" <td>Nucl.Phys.</td>\n", | |
" <td>we propose a treatment of gamma 5 in dimension...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>27755</th>\n", | |
" <td>9912279</td>\n", | |
" <td>1999</td>\n", | |
" <td>self-duality ramond-ramond fields and k-theory</td>\n", | |
" <td>Gregory Moore, Edward Witten</td>\n", | |
" <td>JHEP</td>\n", | |
" <td>just as d-brane charge of type iia and type ii...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>27756</th>\n", | |
" <td>9912280</td>\n", | |
" <td>2000</td>\n", | |
" <td>coordinate-free action for ads3 higher-spin-ma...</td>\n", | |
" <td>Sergey Prokushkin (1), Arkady Segal (2), , Mik...</td>\n", | |
" <td>Phys.Lett.</td>\n", | |
" <td>stanford university 2 lebedev physics institut...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>27757</th>\n", | |
" <td>9912281</td>\n", | |
" <td>1999</td>\n", | |
" <td>mathematics and physics of n 2 strings</td>\n", | |
" <td>Olaf Lechtenfeld</td>\n", | |
" <td>NaN</td>\n", | |
" <td>open and closed strings with two worldsheet su...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>27758</th>\n", | |
" <td>9912282</td>\n", | |
" <td>1999</td>\n", | |
" <td>reparametrization invariance and the schr odin...</td>\n", | |
" <td>V.I. Tkach, A. Pashnev, J.J. Rosales</td>\n", | |
" <td>NaN</td>\n", | |
" <td>in the present work we consider a time-depende...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>27759</th>\n", | |
" <td>9912283</td>\n", | |
" <td>2000</td>\n", | |
" <td>a heavy fermion can create a soliton</td>\n", | |
" <td>E. Farhi, N. Graham, R. L. Jaffe, H. Weigel</td>\n", | |
" <td>Phys.Lett.</td>\n", | |
" <td>minor errors added reference v3 corrected refe...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>27760</th>\n", | |
" <td>9912284</td>\n", | |
" <td>2000</td>\n", | |
" <td>on the gauge fixing of the k symmetry on ads a...</td>\n", | |
" <td>Igor Pes, o</td>\n", | |
" <td>Phys.Lett.</td>\n", | |
" <td>lightcone action for the type iib string on ad...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>27761</th>\n", | |
" <td>9912285</td>\n", | |
" <td>1999</td>\n", | |
" <td>dimensional reduction magnetic flux strings an...</td>\n", | |
" <td>C. D. Fosco, A. Lopez, F. A. Schaposnik</td>\n", | |
" <td>Nucl.Phys.</td>\n", | |
" <td>we study some consequences of dimensionally re...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>27762</th>\n", | |
" <td>9912286</td>\n", | |
" <td>2000</td>\n", | |
" <td>exact renormalized one-loop quantum correction...</td>\n", | |
" <td>N. Graham</td>\n", | |
" <td>NaN</td>\n", | |
" <td>field configurations error in eq b 22 v3 fixed...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>27763</th>\n", | |
" <td>9912287</td>\n", | |
" <td>2000</td>\n", | |
" <td>global structure of exact cosmological solutio...</td>\n", | |
" <td>Shinji Mukohyama, Tetsuya Shiromizu, Kei-ichi ...</td>\n", | |
" <td>Phys.Rev.</td>\n", | |
" <td>publication in physical review d we find the e...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>27764</th>\n", | |
" <td>9912288</td>\n", | |
" <td>2000</td>\n", | |
" <td>s-wave absorption of scalars by noncommutative...</td>\n", | |
" <td>Y.S. Myung, Gungwon Kang, H.W. Lee (Inje Unive...</td>\n", | |
" <td>Phys.Rev.</td>\n", | |
" <td>on the supergravity side we study the propagat...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>27765</th>\n", | |
" <td>9912289</td>\n", | |
" <td>2002</td>\n", | |
" <td>gauge fixing in the chain by chain method</td>\n", | |
" <td>A Shirzad, F Loran</td>\n", | |
" <td>NaN</td>\n", | |
" <td>in a recent work we showed that for a hamilton...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>27766</th>\n", | |
" <td>9912290</td>\n", | |
" <td>2000</td>\n", | |
" <td>shuffling quantum field theory</td>\n", | |
" <td>Dirk Kreimer</td>\n", | |
" <td>Lett.Math.Phys.</td>\n", | |
" <td>we discuss shuffle identities between feynman ...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>27767</th>\n", | |
" <td>9912291</td>\n", | |
" <td>1999</td>\n", | |
" <td>small object limit of casimir effect and the s...</td>\n", | |
" <td>O. Kenneth, S. Nussinov</td>\n", | |
" <td>Phys.Rev.</td>\n", | |
" <td>we show a simple way of deriving the casimir p...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>27768</th>\n", | |
" <td>9912292</td>\n", | |
" <td>1999</td>\n", | |
" <td>1 4 pbgs and superparticle actions</td>\n", | |
" <td>F.Delduc, E. Ivanov, S. Krivonos</td>\n", | |
" <td>NaN</td>\n", | |
" <td>karpacz poland september 21-25 1999 we constru...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>27769</th>\n", | |
" <td>9912293</td>\n", | |
" <td>2000</td>\n", | |
" <td>corrections to the abelian born-infeld action ...</td>\n", | |
" <td>L. Cornalba (I.H.E.S.)</td>\n", | |
" <td>JHEP</td>\n", | |
" <td>noncommutative geometry in a recent paper seib...</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>27770 rows × 6 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" id publication_year \\\n", | |
"0 1001 2000 \n", | |
"1 1002 2000 \n", | |
"2 1003 2000 \n", | |
"3 1004 2000 \n", | |
"4 1005 2000 \n", | |
"5 1006 2000 \n", | |
"6 1007 2000 \n", | |
"7 1008 2000 \n", | |
"8 1009 2000 \n", | |
"9 1010 2000 \n", | |
"10 1011 2000 \n", | |
"11 1012 2000 \n", | |
"12 1013 2000 \n", | |
"13 1014 2000 \n", | |
"14 1015 2000 \n", | |
"15 1016 2000 \n", | |
"16 1017 2000 \n", | |
"17 1018 2000 \n", | |
"18 1019 2000 \n", | |
"19 1020 2000 \n", | |
"20 1021 2000 \n", | |
"21 1022 2000 \n", | |
"22 1023 2000 \n", | |
"23 1024 2000 \n", | |
"24 1025 2000 \n", | |
"25 1026 2000 \n", | |
"26 1027 2000 \n", | |
"27 1028 2000 \n", | |
"28 1029 2000 \n", | |
"29 1030 2000 \n", | |
"... ... ... \n", | |
"27740 9912264 1999 \n", | |
"27741 9912265 1999 \n", | |
"27742 9912266 1999 \n", | |
"27743 9912267 1999 \n", | |
"27744 9912268 2000 \n", | |
"27745 9912269 1999 \n", | |
"27746 9912270 2000 \n", | |
"27747 9912271 1999 \n", | |
"27748 9912272 2000 \n", | |
"27749 9912273 1999 \n", | |
"27750 9912274 2000 \n", | |
"27751 9912275 1999 \n", | |
"27752 9912276 2000 \n", | |
"27753 9912277 1999 \n", | |
"27754 9912278 1999 \n", | |
"27755 9912279 1999 \n", | |
"27756 9912280 2000 \n", | |
"27757 9912281 1999 \n", | |
"27758 9912282 1999 \n", | |
"27759 9912283 2000 \n", | |
"27760 9912284 2000 \n", | |
"27761 9912285 1999 \n", | |
"27762 9912286 2000 \n", | |
"27763 9912287 2000 \n", | |
"27764 9912288 2000 \n", | |
"27765 9912289 2002 \n", | |
"27766 9912290 2000 \n", | |
"27767 9912291 1999 \n", | |
"27768 9912292 1999 \n", | |
"27769 9912293 2000 \n", | |
"\n", | |
" title \\\n", | |
"0 compactification geometry and duality \n", | |
"1 domain walls and massive gauged supergravity p... \n", | |
"2 comment on metric fluctuations in brane worlds \n", | |
"3 moving mirrors and thermodynamic paradoxes \n", | |
"4 bundles of chiral blocks and boundary conditio... \n", | |
"5 questions in quantum physics \n", | |
"6 topological defects in 3-d euclidean gravity \n", | |
"7 n 0 supersymmetry and the non-relativistic mon... \n", | |
"8 gluon pair production from space-time dependen... \n", | |
"9 instantons euclidean supersymmetry and wick ro... \n", | |
"10 noncommutativities of d-branes and theta chang... \n", | |
"11 boundary liouville field theory i boundary sta... \n", | |
"12 gravity and the newtonian limit in the randall... \n", | |
"13 gauge theories in local causal perturbation th... \n", | |
"14 canonical quantization and topological theories \n", | |
"15 a comment on the holographic renormalization g... \n", | |
"16 the interaction of two hopf solitons \n", | |
"17 solitons in brane worlds ii \n", | |
"18 casimir energy of a dilute dielectric ball wit... \n", | |
"19 some uses of moduli spaces in particle and fie... \n", | |
"20 a two-loop test of buscher's t-duality i \n", | |
"21 supersymmetry and bogomol'nyi equations in the... \n", | |
"22 the string uncertainty relations follow from t... \n", | |
"23 bps states of d 4 n 1 supersymmetry \n", | |
"24 collapsing d-branes in calabi-yau moduli space \n", | |
"25 poisson-sigma models \n", | |
"26 bi-local fields in noncommutative field theory \n", | |
"27 tree amplitudes and linearized susy invariants... \n", | |
"28 type i and real algebraic geometry \n", | |
"29 a geometric discretisation scheme applied to t... \n", | |
"... ... \n", | |
"27740 the osp 1 4 superparticle and exotic bps states \n", | |
"27741 symmetrization of berezin star product and pat... \n", | |
"27742 deformation of conifold and intersecting branes \n", | |
"27743 on the relation between euclidean and lorentzi... \n", | |
"27744 yang-mills theory in three dimensions as quant... \n", | |
"27745 quadratic effective action for qed in d 2 3 di... \n", | |
"27746 yang-lee zeros of the ising model on random gr... \n", | |
"27747 lectures on supersymmetric yang-mills theory a... \n", | |
"27748 twisted bundles on noncommutative t 4 and d-br... \n", | |
"27749 brane cube realization of three-dimensional no... \n", | |
"27750 open string field theory on noncommutative space \n", | |
"27751 d branes in string theory ii \n", | |
"27752 four-dimensional planck scale is not universal... \n", | |
"27753 on g h geometry and its use in m-theory compac... \n", | |
"27754 semi-naive dimensional renormalization \n", | |
"27755 self-duality ramond-ramond fields and k-theory \n", | |
"27756 coordinate-free action for ads3 higher-spin-ma... \n", | |
"27757 mathematics and physics of n 2 strings \n", | |
"27758 reparametrization invariance and the schr odin... \n", | |
"27759 a heavy fermion can create a soliton \n", | |
"27760 on the gauge fixing of the k symmetry on ads a... \n", | |
"27761 dimensional reduction magnetic flux strings an... \n", | |
"27762 exact renormalized one-loop quantum correction... \n", | |
"27763 global structure of exact cosmological solutio... \n", | |
"27764 s-wave absorption of scalars by noncommutative... \n", | |
"27765 gauge fixing in the chain by chain method \n", | |
"27766 shuffling quantum field theory \n", | |
"27767 small object limit of casimir effect and the s... \n", | |
"27768 1 4 pbgs and superparticle actions \n", | |
"27769 corrections to the abelian born-infeld action ... \n", | |
"\n", | |
" authors \\\n", | |
"0 Paul S. Aspinwall \n", | |
"1 M. Cvetic, H. Lu, C.N. Pope \n", | |
"2 Y.S. Myung, Gungwon Kang \n", | |
"3 Adam D. Helfer \n", | |
"4 J. Fuchs, C. Schweigert \n", | |
"5 Rudolf Haag \n", | |
"6 Sheng Li, Yong Zhang, Zhongyuan Zhu \n", | |
"7 Donald Spector \n", | |
"8 Gouranga C. Nayak, Walter Greiner \n", | |
"9 A.V. Belitsky, S. V, oren, P. van Nieuwenhuizen \n", | |
"10 Tsunehide Kuroki \n", | |
"11 V. Fateev (Montpellier), A. Zamolodchikov (Rut... \n", | |
"12 Rainer Dick, Dzo Mikulovicz \n", | |
"13 Franz-Marc Boas \n", | |
"14 Alice Rogers \n", | |
"15 Enrique Alvarez, Cesar Gomez \n", | |
"16 R. S. Ward \n", | |
"17 NaN \n", | |
"18 I. Klich, J. Feinberg, A. Mann, M. Revzen \n", | |
"19 ST Tsou (Oxford) \n", | |
"20 Zalan Horvath, Robert L. Karp, Laszlo Palla \n", | |
"21 Bogdan Damski (Jagiellonian University) \n", | |
"22 NaN \n", | |
"23 Jerome P. Gauntlett, Gary W. Gibbons, Christop... \n", | |
"24 Brian R. Greene, C. I. Lazaroiu \n", | |
"25 Allen C. Hirshfeld (University of Dortmund), T... \n", | |
"26 Satoshi Iso, Hikaru Kawai, Yoshihisa Kitazawa \n", | |
"27 Domenico Seminara (LPT-ENS, Paris) \n", | |
"28 F.A. Cachazo, C. Vafa \n", | |
"29 Samik Sen, Siddhartha Sen, James C. Sexton, Da... \n", | |
"... ... \n", | |
"27740 Igor B, os, Jerzy Lukierski, Dmitri Sorokin \n", | |
"27741 Satoru Saito, Kazunori Wakatsuki \n", | |
"27742 Kazutoshi Ohta, Takashi Yokono \n", | |
"27743 J. Ambjorn, J. Correia, C. Kristjansen (NBI), ... \n", | |
"27744 Dmitri Diakonov, Victor Petrov \n", | |
"27745 D. Dalmazi, A. de Souza Dutra, Marcelo Hott \n", | |
"27746 Luiz C. de Albuquerque, Nelson A. Alves, D. Da... \n", | |
"27747 Eric D'Hoker (UCLA), D. H. Phong (Columbia Uni... \n", | |
"27748 Eunsang Kim, Hoil Kim, Nakwoo Kim, Bum-Hoon Le... \n", | |
"27749 Tomomi Muto \n", | |
"27750 Teruhiko Kawano, Tomohiko Takahashi (Univ. of ... \n", | |
"27751 NaN \n", | |
"27752 Takaaki Ozeki, Noriyuki Shimoyama \n", | |
"27753 Leonardo Castellani \n", | |
"27754 M. Pernici \n", | |
"27755 Gregory Moore, Edward Witten \n", | |
"27756 Sergey Prokushkin (1), Arkady Segal (2), , Mik... \n", | |
"27757 Olaf Lechtenfeld \n", | |
"27758 V.I. Tkach, A. Pashnev, J.J. Rosales \n", | |
"27759 E. Farhi, N. Graham, R. L. Jaffe, H. Weigel \n", | |
"27760 Igor Pes, o \n", | |
"27761 C. D. Fosco, A. Lopez, F. A. Schaposnik \n", | |
"27762 N. Graham \n", | |
"27763 Shinji Mukohyama, Tetsuya Shiromizu, Kei-ichi ... \n", | |
"27764 Y.S. Myung, Gungwon Kang, H.W. Lee (Inje Unive... \n", | |
"27765 A Shirzad, F Loran \n", | |
"27766 Dirk Kreimer \n", | |
"27767 O. Kenneth, S. Nussinov \n", | |
"27768 F.Delduc, E. Ivanov, S. Krivonos \n", | |
"27769 L. Cornalba (I.H.E.S.) \n", | |
"\n", | |
" journal_name \\\n", | |
"0 NaN \n", | |
"1 Class.Quant.Grav. \n", | |
"2 NaN \n", | |
"3 Phys.Rev. \n", | |
"4 NaN \n", | |
"5 NaN \n", | |
"6 NaN \n", | |
"7 Phys.Lett. \n", | |
"8 NaN \n", | |
"9 Phys.Lett. \n", | |
"10 Phys.Lett. \n", | |
"11 NaN \n", | |
"12 Phys.Lett. \n", | |
"13 NaN \n", | |
"14 Nucl.Phys.Proc.Suppl. \n", | |
"15 Phys.Lett. \n", | |
"16 Phys.Lett. \n", | |
"17 Nucl.Phys. \n", | |
"18 Phys.Rev. \n", | |
"19 NaN \n", | |
"20 Phys.Rev. \n", | |
"21 Acta \n", | |
"22 Found.Phys. \n", | |
"23 Commun.Math.Phys. \n", | |
"24 Nucl.Phys. \n", | |
"25 NaN \n", | |
"26 Nucl.Phys. \n", | |
"27 Nucl.Phys.Proc.Suppl. \n", | |
"28 NaN \n", | |
"29 Phys.Rev. \n", | |
"... ... \n", | |
"27740 NaN \n", | |
"27741 Prog.Theor.Phys. \n", | |
"27742 JHEP \n", | |
"27743 Phys.Lett. \n", | |
"27744 J.Exp.Theor.Phys. \n", | |
"27745 Phys.Rev. \n", | |
"27746 Nucl.Phys. \n", | |
"27747 NaN \n", | |
"27748 Phys.Rev. \n", | |
"27749 JHEP \n", | |
"27750 Prog.Theor.Phys. \n", | |
"27751 NaN \n", | |
"27752 Prog.Theor.Phys. \n", | |
"27753 Annals \n", | |
"27754 Nucl.Phys. \n", | |
"27755 JHEP \n", | |
"27756 Phys.Lett. \n", | |
"27757 NaN \n", | |
"27758 NaN \n", | |
"27759 Phys.Lett. \n", | |
"27760 Phys.Lett. \n", | |
"27761 Nucl.Phys. \n", | |
"27762 NaN \n", | |
"27763 Phys.Rev. \n", | |
"27764 Phys.Rev. \n", | |
"27765 NaN \n", | |
"27766 Lett.Math.Phys. \n", | |
"27767 Phys.Rev. \n", | |
"27768 NaN \n", | |
"27769 JHEP \n", | |
"\n", | |
" abstract \n", | |
"0 these are notes based on lectures given at tas... \n", | |
"1 we point out that massive gauged supergravity ... \n", | |
"2 recently ivanov and volovich hep-th 9912242 cl... \n", | |
"3 quantum fields responding to moving mirrors ha... \n", | |
"4 proceedings of lie iii clausthal july 1999 var... \n", | |
"5 an assessment of the present status of the the... \n", | |
"6 by making use of the complete decomposition of... \n", | |
"7 we study some of the algebraic properties of t... \n", | |
"8 we compute the probabilty for the processes a ... \n", | |
"9 we discuss the reality properties of the fermi... \n", | |
"10 in d-brane matrix models improved it is known ... \n", | |
"11 function montpellier liouville conformal field... \n", | |
"12 appended a remark on the compatibility of the ... \n", | |
"13 in this thesis quantum gauge theories are cons... \n", | |
"14 dynamics and quantum gravity villasimius sardi... \n", | |
"15 theorem the equivalence between the holographi... \n", | |
"16 this letter deals with topological solitons in... \n", | |
"17 added minor errors corrected we study the solu... \n", | |
"18 light at finite temperature the casimir energy... \n", | |
"19 women in mathematics workshop on moduli spaces... \n", | |
"20 we study the two loop quantum equivalence of s... \n", | |
"21 systems we take advantage of the superspace fo... \n", | |
"22 principle polymomenta coordinates has been mad... \n", | |
"23 townsend expanded discussion on bps states in ... \n", | |
"24 we study the quantum volume of d-branes wrappe... \n", | |
"25 university of dortmund physical variables in g... \n", | |
"26 we propose a bi-local representation in noncom... \n", | |
"27 gravity villasimius sept 13-17 1999 we exploit... \n", | |
"28 we revisit the duality between type i and hete... \n", | |
"29 theory we give a detailed general description ... \n", | |
"... ... \n", | |
"27740 symposium karpacz poland september 21 25 1999 ... \n", | |
"27741 we propose a new star pruduct which interpolat... \n", | |
"27742 we study the relation between intersecting ns5... \n", | |
"27743 starting from 2d euclidean quantum gravity we ... \n", | |
"27744 1012-1035 we perform the dual transformation o... \n", | |
"27745 we calculate the effective action for quantum ... \n", | |
"27746 topology nucl phys b we obtain in a closed for... \n", | |
"27747 we present a series of four self-contained lec... \n", | |
"27748 hyun seok yang added we construct twisted quan... \n", | |
"27749 we study d-branes on three-dimensional orbifol... \n", | |
"27750 we study witten's open string field theory in ... \n", | |
"27751 in these lectures we review the properties of ... \n", | |
"27752 randall-sundrum scenario it has recently been ... \n", | |
"27753 the riemannian geometry of coset spaces is rev... \n", | |
"27754 we propose a treatment of gamma 5 in dimension... \n", | |
"27755 just as d-brane charge of type iia and type ii... \n", | |
"27756 stanford university 2 lebedev physics institut... \n", | |
"27757 open and closed strings with two worldsheet su... \n", | |
"27758 in the present work we consider a time-depende... \n", | |
"27759 minor errors added reference v3 corrected refe... \n", | |
"27760 lightcone action for the type iib string on ad... \n", | |
"27761 we study some consequences of dimensionally re... \n", | |
"27762 field configurations error in eq b 22 v3 fixed... \n", | |
"27763 publication in physical review d we find the e... \n", | |
"27764 on the supergravity side we study the propagat... \n", | |
"27765 in a recent work we showed that for a hamilton... \n", | |
"27766 we discuss shuffle identities between feynman ... \n", | |
"27767 we show a simple way of deriving the casimir p... \n", | |
"27768 karpacz poland september 21-25 1999 we constru... \n", | |
"27769 noncommutative geometry in a recent paper seib... \n", | |
"\n", | |
"[27770 rows x 6 columns]" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df_nodes" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Load stemmer\n", | |
"from abc import abstractmethod\n", | |
"import numpy as np\n", | |
"import networkx as nx\n", | |
"import nltk\n", | |
"import scipy.sparse\n", | |
"from process_authors import process_authors, split_authors\n", | |
"from sklearn.feature_extraction.text import TfidfVectorizer\n", | |
"from fuzzywuzzy import fuzz\n", | |
"PRINT_EVERY = 10000\n", | |
"# Load stemmer\n", | |
"stemmer = nltk.PorterStemmer()\n", | |
"\n", | |
"# Load stopwords\n", | |
"nltk.download('stopwords') # uncomment if running for first time\n", | |
"stopwords = set(nltk.corpus.stopwords.words('english'))\n", | |
"class BaseFeatureExtractor(object):\n", | |
" def __init__(self, node_info):\n", | |
" self.IDs = {int(element[0]): idx for idx, element in enumerate(node_info)}\n", | |
" self.node_info = node_info\n", | |
"\n", | |
" def get_features(self, data, msg='dataset?'):\n", | |
" features = []\n", | |
" for i, row in enumerate(data):\n", | |
" source_id, target_id = row[0:2]\n", | |
" source_idx, target_idx = self.IDs[source_id], self.IDs[target_id]\n", | |
" source_info, target_info = self.node_info[source_idx], self.node_info[target_idx]\n", | |
" feature = self.get_feature(source_idx, source_info, target_idx, target_info)\n", | |
" features.append(feature)\n", | |
" if i % PRINT_EVERY == True:\n", | |
" print '[{}] ({:.0f}%) processed (#{})'.format(msg, 100.*i/len(data), i)\n", | |
" if isinstance(features[0], scipy.sparse.coo.coo_matrix):\n", | |
" return scipy.sparse.vstack(features)\n", | |
" else:\n", | |
" return np.asarray(features)\n", | |
"\n", | |
" @abstractmethod\n", | |
" def get_feature(source_idx, source_info, target_idx, target_info):\n", | |
" pass" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 131, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"class TitleAuthorYear(BaseFeatureExtractor):\n", | |
" \n", | |
" def __init__(self, *args, **kwargs):\n", | |
" super(TitleAuthorYear, self).__init__(*args, **kwargs)\n", | |
" self.colnames = ['n_title', 'n_years', 'n_authors', 'is_same_journal',\n", | |
" 'is_author_missing','is_journal_missing','n_authors_source','n_authors_target']\n", | |
" def clean_info(self, info):\n", | |
" '''common authors, title words, and temporal difference'''\n", | |
" id, year, title, authors, journal, abstract = info\n", | |
" # convert to lowercase and tokenize, remove stopwords\n", | |
" title_ = [w for w in title.lower().split() if w not in stopwords]\n", | |
" title_ = [stemmer.stem(token) for token in title_]\n", | |
" # return year\n", | |
" year_ = (float(year) - 1992.) / (2000. - 1992.)\n", | |
" # return authors\n", | |
" authors_ = split_authors(process_authors(info[3]))\n", | |
" journal_ = journal.lower()\n", | |
" return title_, year_, authors_, journal_\n", | |
"\n", | |
" def get_feature(self, source_idx, source_info, target_idx, target_info):\n", | |
" source_title, source_year, source_authors, source_journal = self.clean_info(source_info)\n", | |
" target_title, target_year, target_authors, target_journal = self.clean_info(target_info)\n", | |
" n_title = fuzz.token_set_ratio(source_title, target_title)\n", | |
" n_years = source_year - target_year\n", | |
" n_authors = len(set(source_authors).intersection(set(target_authors)))\n", | |
" n_authors_source = len(source_authors) if source_authors[0] != \"\" else 0\n", | |
" n_authors_target = len(target_authors)if target_authors[0] != \"\" else 0\n", | |
" is_same_journal = (source_journal == target_journal) # 1 if published is same journal 0 either\n", | |
" is_author_missing = 1 if (source_authors[0] == \"\" or target_authors[0] == \"\") else 0\n", | |
" is_journal_missing = 1 if (source_journal == \"\" or target_journal == \"\") else 0\n", | |
" return np.asarray((n_title, n_years, n_authors, is_same_journal,\n", | |
" is_author_missing, is_journal_missing, n_authors_source, n_authors_target))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Limit size of training set (None for no limit)\n", | |
"MAX_DATA_SIZE = None\n", | |
"test_size = 0.05\n", | |
"\n", | |
"# Load data\n", | |
"itrain, ivalid, ytrain, yvalid, itest, node_info, graph_info = load_data(\n", | |
" '../dataset', max_train_size=MAX_DATA_SIZE, test_size=test_size)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#nx.approximate_current_flow_betweenness_centrality(graph_info)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Total edges 584287\n", | |
"Removed 0-weight edges: 266385\n" | |
] | |
} | |
], | |
"source": [ | |
"# Remove edges with zero weight\n", | |
"connected_graph = graph_info.copy()\n", | |
"print 'Total edges {}'.format(connected_graph.number_of_edges())\n", | |
"to_be_removed = []\n", | |
"for i, j in connected_graph.edges():\n", | |
" if connected_graph[i][j]['WGH'] < 0.5:\n", | |
" to_be_removed.append((i, j))\n", | |
"print 'Removed 0-weight edges: {}'.format(len(to_be_removed))\n", | |
"connected_graph.remove_edges_from(to_be_removed)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"#%% Text Features\n", | |
"ft = TextOnly(node_info)\n", | |
"tmp_xtrain = ft.get_features(itrain, 'train')\n", | |
"tmp_xvalid = ft.get_features(ivalid, 'valid')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 81, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[test] (0%) processed (#1)\n", | |
"[test] (31%) processed (#10001)\n", | |
"[test] (61%) processed (#20001)\n", | |
"[test] (92%) processed (#30001)\n" | |
] | |
} | |
], | |
"source": [ | |
"tmp_xtest = ft.get_features(itest, 'test')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 82, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(32648, 6)" | |
] | |
}, | |
"execution_count": 82, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"tmp_xtest.shape" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Trying cosine similarity" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 134, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.metrics.pairwise import cosine_similarity, linear_kernel\n", | |
"class TextOnlyCosine(BaseFeatureExtractor):\n", | |
" def __init__(self, node_info):\n", | |
" self.colnames = ['abstract_cosim']\n", | |
" BaseFeatureExtractor.__init__(self, node_info)\n", | |
" # initialize TFIDF vectorizer\n", | |
" self.abstracts = [node[5] for node in node_info]\n", | |
" self.tv = TfidfVectorizer(stop_words='english')\n", | |
" print 'Fitting tf.idf vectorizer'\n", | |
" self.transformed_abstracts = self.tv.fit_transform(self.abstracts)\n", | |
"\n", | |
" def get_feature(self, source_idx, source_info, target_idx, target_info):\n", | |
" sft = self.transformed_abstracts[source_idx]\n", | |
" tft = self.transformed_abstracts[target_idx]\n", | |
" feature = linear_kernel(sft, tft)[0] # linear kernel faster (X.T*Y) than cosine_similarity (already normalized)\n", | |
" return feature\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Fitting tf.idf vectorizer\n", | |
"[train] (0%) processed (#1)\n", | |
"[train] (2%) processed (#10001)\n", | |
"[train] (3%) processed (#20001)\n", | |
"[train] (5%) processed (#30001)\n", | |
"[train] (7%) processed (#40001)\n", | |
"[train] (9%) processed (#50001)\n", | |
"[train] (10%) processed (#60001)\n", | |
"[train] (12%) processed (#70001)\n", | |
"[train] (14%) processed (#80001)\n", | |
"[train] (15%) processed (#90001)\n", | |
"[train] (17%) processed (#100001)\n", | |
"[train] (19%) processed (#110001)\n", | |
"[train] (21%) processed (#120001)\n", | |
"[train] (22%) processed (#130001)\n", | |
"[train] (24%) processed (#140001)\n", | |
"[train] (26%) processed (#150001)\n", | |
"[train] (27%) processed (#160001)\n", | |
"[train] (29%) processed (#170001)\n", | |
"[train] (31%) processed (#180001)\n", | |
"[train] (32%) processed (#190001)\n", | |
"[train] (34%) processed (#200001)\n", | |
"[train] (36%) processed (#210001)\n", | |
"[train] (38%) processed (#220001)\n", | |
"[train] (39%) processed (#230001)\n", | |
"[train] (41%) processed (#240001)\n", | |
"[train] (43%) processed (#250001)\n", | |
"[train] (44%) processed (#260001)\n", | |
"[train] (46%) processed (#270001)\n", | |
"[train] (48%) processed (#280001)\n", | |
"[train] (50%) processed (#290001)\n", | |
"[train] (51%) processed (#300001)\n", | |
"[train] (53%) processed (#310001)\n", | |
"[train] (55%) processed (#320001)\n", | |
"[train] (56%) processed (#330001)\n", | |
"[train] (58%) processed (#340001)\n", | |
"[train] (60%) processed (#350001)\n", | |
"[train] (62%) processed (#360001)\n", | |
"[train] (63%) processed (#370001)\n", | |
"[train] (65%) processed (#380001)\n", | |
"[train] (67%) processed (#390001)\n", | |
"[train] (68%) processed (#400001)\n", | |
"[train] (70%) processed (#410001)\n", | |
"[train] (72%) processed (#420001)\n", | |
"[train] (74%) processed (#430001)\n", | |
"[train] (75%) processed (#440001)\n", | |
"[train] (77%) processed (#450001)\n", | |
"[train] (79%) processed (#460001)\n", | |
"[train] (80%) processed (#470001)\n", | |
"[train] (82%) processed (#480001)\n", | |
"[train] (84%) processed (#490001)\n", | |
"[train] (86%) processed (#500001)\n", | |
"[train] (87%) processed (#510001)\n", | |
"[train] (89%) processed (#520001)\n", | |
"[train] (91%) processed (#530001)\n", | |
"[train] (92%) processed (#540001)\n", | |
"[train] (94%) processed (#550001)\n", | |
"[train] (96%) processed (#560001)\n", | |
"[train] (97%) processed (#570001)\n", | |
"[train] (99%) processed (#580001)\n", | |
"[valid] (0%) processed (#1)\n", | |
"[valid] (32%) processed (#10001)\n", | |
"[valid] (65%) processed (#20001)\n", | |
"[valid] (97%) processed (#30001)\n", | |
"[test] (0%) processed (#1)\n", | |
"[test] (31%) processed (#10001)\n", | |
"[test] (61%) processed (#20001)\n", | |
"[test] (92%) processed (#30001)\n" | |
] | |
} | |
], | |
"source": [ | |
"ft_cos = TextOnlyCosine(node_info)\n", | |
"tmp_xtrain_cos = ft_cos.get_features(itrain, 'train')\n", | |
"tmp_xvalid_cos = ft_cos.get_features(ivalid, 'valid')\n", | |
"tmp_xtest_cos = ft_cos.get_features(itest, 'test')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Fitting Linear SVM for ABSTRACTS\n", | |
"Accuracy 0.785319729659\n", | |
"F1 0.807201844232\n" | |
] | |
} | |
], | |
"source": [ | |
"print 'Fitting Linear SVM for ABSTRACTS'\n", | |
"classifier = svm.LinearSVC(C=1)\n", | |
"#classifier = LogisticRegression()\n", | |
"classifier.fit(tmp_xtrain, ytrain)\n", | |
"pvalid = classifier.predict(tmp_xvalid)\n", | |
"accuracy = metrics.accuracy_score(yvalid, pvalid)\n", | |
"f1 = metrics.f1_score(yvalid, pvalid)\n", | |
"print 'Accuracy', accuracy\n", | |
"print 'F1', f1\n", | |
"\n", | |
"# compute features\n", | |
"text_xtrain = classifier.decision_function(tmp_xtrain)[:, np.newaxis]\n", | |
"text_xvalid = classifier.decision_function(tmp_xvalid)[:, np.newaxis]\n", | |
"text_xtest = classifier.decision_function(tmp_xtest)[:, np.newaxis]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 129, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"class TextOnlyGensim(BaseFeatureExtractor):\n", | |
" \"\"\" Abstract similarity using word2vec\"\"\"\n", | |
" def __init__(self, node_info):\n", | |
" BaseFeatureExtractor.__init__(self, node_info)\n", | |
" self.colnames = ['abstract_similarity', 'title_similarity']\n", | |
" self.abstracts = [self.clean_text(node[5]) for node in node_info]\n", | |
" self.titles = [self.clean_text(node[2]) for node in node_info]\n", | |
" self.vocabulary = [t+a for t,a in zip(self.abstracts, self.titles)]\n", | |
" print('Building word2net neural network')\n", | |
" self.model_nn = Word2Vec(self.vocabulary, size=200, window=5, min_count=1, workers=4)\n", | |
" \n", | |
" @staticmethod\n", | |
" def clean_text(text,remove_stopwords=True ,stopwords=stopwords):\n", | |
" # Remove non-letters\n", | |
" text = re.sub(\"[^a-zA-Z]\",\" \", text)\n", | |
" # Convert words to lower case and split them\n", | |
" words = text.lower().split()\n", | |
" # Optionally remove stop words (false by default)\n", | |
" if remove_stopwords:\n", | |
" stops = set(stopwords)\n", | |
" words = [w for w in words if not w in stops]\n", | |
" # Return a list of words\n", | |
" return words\n", | |
"\n", | |
" def get_feature(self, source_idx, source_info, target_idx, target_info):\n", | |
" source_abstract = self.clean_text(self.node_info[source_idx][5])\n", | |
" target_abstract = self.clean_text(self.node_info[target_idx][5])\n", | |
" source_title = self.clean_text(self.node_info[source_idx][2])\n", | |
" target_title = self.clean_text(self.node_info[target_idx][2])\n", | |
" abstract_similarity = self.model_nn.n_similarity(source_abstract, target_abstract)\n", | |
" title_similarity = self.model_nn.n_similarity(source_title, target_title)\n", | |
" title_similarity = title_similarity if isinstance(title_similarity,float) else 0\n", | |
" return np.asarray([abstract_similarity, title_similarity])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 118, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Building word2net neural network\n" | |
] | |
} | |
], | |
"source": [ | |
"# Gensim abstract similarity \n", | |
"word2vec_ft = TextOnlyGensim(node_info)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 119, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[train] (0%) processed (#1)\n", | |
"[train] (2%) processed (#10001)\n", | |
"[train] (3%) processed (#20001)\n", | |
"[train] (5%) processed (#30001)\n", | |
"[train] (7%) processed (#40001)\n", | |
"[train] (9%) processed (#50001)\n", | |
"[train] (10%) processed (#60001)\n", | |
"[train] (12%) processed (#70001)\n", | |
"[train] (14%) processed (#80001)\n", | |
"[train] (15%) processed (#90001)\n", | |
"[train] (17%) processed (#100001)\n", | |
"[train] (19%) processed (#110001)\n", | |
"[train] (21%) processed (#120001)\n", | |
"[train] (22%) processed (#130001)\n", | |
"[train] (24%) processed (#140001)\n", | |
"[train] (26%) processed (#150001)\n", | |
"[train] (27%) processed (#160001)\n", | |
"[train] (29%) processed (#170001)\n", | |
"[train] (31%) processed (#180001)\n", | |
"[train] (32%) processed (#190001)\n", | |
"[train] (34%) processed (#200001)\n", | |
"[train] (36%) processed (#210001)\n", | |
"[train] (38%) processed (#220001)\n", | |
"[train] (39%) processed (#230001)\n", | |
"[train] (41%) processed (#240001)\n", | |
"[train] (43%) processed (#250001)\n", | |
"[train] (44%) processed (#260001)\n", | |
"[train] (46%) processed (#270001)\n", | |
"[train] (48%) processed (#280001)\n", | |
"[train] (50%) processed (#290001)\n", | |
"[train] (51%) processed (#300001)\n", | |
"[train] (53%) processed (#310001)\n", | |
"[train] (55%) processed (#320001)\n", | |
"[train] (56%) processed (#330001)\n", | |
"[train] (58%) processed (#340001)\n", | |
"[train] (60%) processed (#350001)\n", | |
"[train] (62%) processed (#360001)\n", | |
"[train] (63%) processed (#370001)\n", | |
"[train] (65%) processed (#380001)\n", | |
"[train] (67%) processed (#390001)\n", | |
"[train] (68%) processed (#400001)\n", | |
"[train] (70%) processed (#410001)\n", | |
"[train] (72%) processed (#420001)\n", | |
"[train] (74%) processed (#430001)\n", | |
"[train] (75%) processed (#440001)\n", | |
"[train] (77%) processed (#450001)\n", | |
"[train] (79%) processed (#460001)\n", | |
"[train] (80%) processed (#470001)\n", | |
"[train] (82%) processed (#480001)\n", | |
"[train] (84%) processed (#490001)\n", | |
"[train] (86%) processed (#500001)\n", | |
"[train] (87%) processed (#510001)\n", | |
"[train] (89%) processed (#520001)\n", | |
"[train] (91%) processed (#530001)\n", | |
"[train] (92%) processed (#540001)\n", | |
"[train] (94%) processed (#550001)\n", | |
"[train] (96%) processed (#560001)\n", | |
"[train] (97%) processed (#570001)\n", | |
"[train] (99%) processed (#580001)\n", | |
"[valid] (0%) processed (#1)\n", | |
"[valid] (32%) processed (#10001)\n", | |
"[valid] (65%) processed (#20001)\n", | |
"[valid] (97%) processed (#30001)\n", | |
"[test] (0%) processed (#1)\n", | |
"[test] (31%) processed (#10001)\n", | |
"[test] (61%) processed (#20001)\n", | |
"[test] (92%) processed (#30001)\n" | |
] | |
} | |
], | |
"source": [ | |
"word2vec_xtrain = word2vec_ft.get_features(itrain, 'train')\n", | |
"word2vec_xvalid = word2vec_ft.get_features(ivalid, 'valid')\n", | |
"word2vec_xtest = word2vec_ft.get_features(itest, 'test')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 96, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[train] (0%) processed (#1)\n", | |
"[train] (2%) processed (#10001)\n", | |
"[train] (3%) processed (#20001)\n", | |
"[train] (5%) processed (#30001)\n", | |
"[train] (7%) processed (#40001)\n", | |
"[train] (9%) processed (#50001)\n", | |
"[train] (10%) processed (#60001)\n", | |
"[train] (12%) processed (#70001)\n", | |
"[train] (14%) processed (#80001)\n", | |
"[train] (15%) processed (#90001)\n", | |
"[train] (17%) processed (#100001)\n", | |
"[train] (19%) processed (#110001)\n", | |
"[train] (21%) processed (#120001)\n", | |
"[train] (22%) processed (#130001)\n", | |
"[train] (24%) processed (#140001)\n", | |
"[train] (26%) processed (#150001)\n", | |
"[train] (27%) processed (#160001)\n", | |
"[train] (29%) processed (#170001)\n", | |
"[train] (31%) processed (#180001)\n", | |
"[train] (32%) processed (#190001)\n", | |
"[train] (34%) processed (#200001)\n", | |
"[train] (36%) processed (#210001)\n", | |
"[train] (38%) processed (#220001)\n", | |
"[train] (39%) processed (#230001)\n", | |
"[train] (41%) processed (#240001)\n", | |
"[train] (43%) processed (#250001)\n", | |
"[train] (44%) processed (#260001)\n", | |
"[train] (46%) processed (#270001)\n", | |
"[train] (48%) processed (#280001)\n", | |
"[train] (50%) processed (#290001)\n", | |
"[train] (51%) processed (#300001)\n", | |
"[train] (53%) processed (#310001)\n", | |
"[train] (55%) processed (#320001)\n", | |
"[train] (56%) processed (#330001)\n", | |
"[train] (58%) processed (#340001)\n", | |
"[train] (60%) processed (#350001)\n", | |
"[train] (62%) processed (#360001)\n", | |
"[train] (63%) processed (#370001)\n", | |
"[train] (65%) processed (#380001)\n", | |
"[train] (67%) processed (#390001)\n", | |
"[train] (68%) processed (#400001)\n", | |
"[train] (70%) processed (#410001)\n", | |
"[train] (72%) processed (#420001)\n", | |
"[train] (74%) processed (#430001)\n", | |
"[train] (75%) processed (#440001)\n", | |
"[train] (77%) processed (#450001)\n", | |
"[train] (79%) processed (#460001)\n", | |
"[train] (80%) processed (#470001)\n", | |
"[train] (82%) processed (#480001)\n", | |
"[train] (84%) processed (#490001)\n", | |
"[train] (86%) processed (#500001)\n", | |
"[train] (87%) processed (#510001)\n", | |
"[train] (89%) processed (#520001)\n", | |
"[train] (91%) processed (#530001)\n", | |
"[train] (92%) processed (#540001)\n", | |
"[train] (94%) processed (#550001)\n", | |
"[train] (96%) processed (#560001)\n", | |
"[train] (97%) processed (#570001)\n", | |
"[train] (99%) processed (#580001)\n", | |
"[valid] (0%) processed (#1)\n", | |
"[valid] (32%) processed (#10001)\n", | |
"[valid] (65%) processed (#20001)\n", | |
"[valid] (97%) processed (#30001)\n" | |
] | |
} | |
], | |
"source": [ | |
"# Title/Author/Year features\n", | |
"ft = TitleAuthorYear(node_info)\n", | |
"meta_xtrain = ft.get_features(itrain, 'train')\n", | |
"meta_xvalid = ft.get_features(ivalid, 'valid')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 97, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[test] (0%) processed (#1)\n", | |
"[test] (31%) processed (#10001)\n", | |
"[test] (61%) processed (#20001)\n", | |
"[test] (92%) processed (#30001)\n" | |
] | |
} | |
], | |
"source": [ | |
"meta_xtest = ft.get_features(itest, 'test')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 133, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"class GraphOnlyMore(GraphOnly):\n", | |
" \"\"\" Class using Graph Features\n", | |
" # (1) common(SRC/TGT) number of common neighbors (undirected)\n", | |
" # (2) d_out(SRC) total number of outgoing edges from SRC\n", | |
" # (3) d_in(TGT) total number of incoming edges to TGT\n", | |
" \"\"\" \n", | |
" def __init__(self, full, connected):\n", | |
" GraphFeatureExtractor.__init__(self, full)\n", | |
" self.colnames = [\"common\", \"d_out\", \"d_in\",\"common2\", \"d_out2\", \"d_in2\",\"diff_in2\", \"jacccard_coeffs\"]\n", | |
" self.full = full\n", | |
" self.connected = connected\n", | |
"# print('Computing Flow Centrality....')\n", | |
"# self.closeness_centrality = nx.betweenness_centrality(graph_info)\n", | |
" \n", | |
" def get_feature(self, source_id, target_id):\n", | |
" \"\"\"Graph Features\"\"\"\n", | |
" common = len(list(nx.common_neighbors(self.graph_info, source_id, target_id)))\n", | |
" d_out = self.graph_info.degree(source_id) \n", | |
" d_in = self.graph_info.degree(target_id) \n", | |
" \n", | |
" common2 = len(list(nx.common_neighbors(self.connected, source_id, target_id)))\n", | |
" d_out2 = self.connected.degree(source_id) \n", | |
" d_in2 = self.connected.degree(target_id) \n", | |
" \n", | |
"# s_close = self.closeness_centrality[source_id]\n", | |
"# t_close = self.closeness_centrality[target_id]\n", | |
" jacccard_coeffs = list(nx.jaccard_coefficient(self.connected, [(source_id, target_id)]))[0][2]\n", | |
" diff_in = d_in - d_out\n", | |
" diff_in2 = d_in2 - d_out2\n", | |
" return np.asarray((common, d_out, d_in,\n", | |
" common2, d_out2, d_in2,diff_in2, jacccard_coeffs), dtype=float)\n", | |
" \n", | |
"# class GraphExtended(GraphOnly):\n", | |
"# def __init__(self, graph_info):\n", | |
"# GraphOnly.__init__(self, graph_info)\n", | |
"# # do pagerank\n", | |
"# #print 'Computing Pagerank'\n", | |
"# #self.pagerank = nx.pagerank(graph_info)\n", | |
"# # other centralities\n", | |
"# print 'Computing Closeness'\n", | |
"# self.closeness_centrality = nx.betweenness_centrality(graph_info)\n", | |
"# # current_flow_closeness_centrality\n", | |
"# print 'Computing Flow Centrality'\n", | |
"# self.flow = nx.current_flow_closeness_centrality(graph_info)\n", | |
" \n", | |
"# def get_feature(self, source_id, target_id):\n", | |
"# \"\"\"Graph Features\"\"\"\n", | |
"# common = len(list(nx.common_neighbors(self.graph_info, source_id, target_id)))\n", | |
"# d_out = self.graph_info.degree(source_id) \n", | |
"# d_in = self.graph_info.degree(target_id) \n", | |
"# s_flow = self.flow[source_id]\n", | |
"# t_flow = self.flow[target_id]\n", | |
"# #s_pagerank = self.pagerank[source_id]\n", | |
"# #t_pagerank = self.pagerank[target_id]\n", | |
"# s_close = self.closeness_centrality[source_id]\n", | |
"# t_close = self.closeness_centrality[target_id]\n", | |
"# return np.asarray((common, \n", | |
"# d_out, d_in,\n", | |
"# s_flow, t_flow,\n", | |
"# s_close, t_close\n", | |
"# )) " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Get Graph features\n", | |
"#graph_ft = GraphOnly(graph_info)\n", | |
"graph_ft = GraphOnlyMore(graph_info, connected_graph)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[train] (0%) processed (#1)\n", | |
"[train] (2%) processed (#10001)\n", | |
"[train] (3%) processed (#20001)\n", | |
"[train] (5%) processed (#30001)\n", | |
"[train] (7%) processed (#40001)\n", | |
"[train] (9%) processed (#50001)\n", | |
"[train] (10%) processed (#60001)\n", | |
"[train] (12%) processed (#70001)\n", | |
"[train] (14%) processed (#80001)\n", | |
"[train] (15%) processed (#90001)\n", | |
"[train] (17%) processed (#100001)\n", | |
"[train] (19%) processed (#110001)\n", | |
"[train] (21%) processed (#120001)\n", | |
"[train] (22%) processed (#130001)\n", | |
"[train] (24%) processed (#140001)\n", | |
"[train] (26%) processed (#150001)\n", | |
"[train] (27%) processed (#160001)\n", | |
"[train] (29%) processed (#170001)\n", | |
"[train] (31%) processed (#180001)\n", | |
"[train] (32%) processed (#190001)\n", | |
"[train] (34%) processed (#200001)\n", | |
"[train] (36%) processed (#210001)\n", | |
"[train] (38%) processed (#220001)\n", | |
"[train] (39%) processed (#230001)\n", | |
"[train] (41%) processed (#240001)\n", | |
"[train] (43%) processed (#250001)\n", | |
"[train] (44%) processed (#260001)\n", | |
"[train] (46%) processed (#270001)\n", | |
"[train] (48%) processed (#280001)\n", | |
"[train] (50%) processed (#290001)\n", | |
"[train] (51%) processed (#300001)\n", | |
"[train] (53%) processed (#310001)\n", | |
"[train] (55%) processed (#320001)\n", | |
"[train] (56%) processed (#330001)\n", | |
"[train] (58%) processed (#340001)\n", | |
"[train] (60%) processed (#350001)\n", | |
"[train] (62%) processed (#360001)\n", | |
"[train] (63%) processed (#370001)\n", | |
"[train] (65%) processed (#380001)\n", | |
"[train] (67%) processed (#390001)\n", | |
"[train] (68%) processed (#400001)\n", | |
"[train] (70%) processed (#410001)\n", | |
"[train] (72%) processed (#420001)\n", | |
"[train] (74%) processed (#430001)\n", | |
"[train] (75%) processed (#440001)\n", | |
"[train] (77%) processed (#450001)\n", | |
"[train] (79%) processed (#460001)\n", | |
"[train] (80%) processed (#470001)\n", | |
"[train] (82%) processed (#480001)\n", | |
"[train] (84%) processed (#490001)\n", | |
"[train] (86%) processed (#500001)\n", | |
"[train] (87%) processed (#510001)\n", | |
"[train] (89%) processed (#520001)\n", | |
"[train] (91%) processed (#530001)\n", | |
"[train] (92%) processed (#540001)\n", | |
"[train] (94%) processed (#550001)\n", | |
"[train] (96%) processed (#560001)\n", | |
"[train] (97%) processed (#570001)\n", | |
"[train] (99%) processed (#580001)\n", | |
"[valid] (0%) processed (#1)\n", | |
"[valid] (32%) processed (#10001)\n", | |
"[valid] (65%) processed (#20001)\n", | |
"[valid] (97%) processed (#30001)\n", | |
"[test] (0%) processed (#1)\n", | |
"[test] (31%) processed (#10001)\n", | |
"[test] (61%) processed (#20001)\n", | |
"[test] (92%) processed (#30001)\n" | |
] | |
} | |
], | |
"source": [ | |
"graph_xtrain = graph_ft.get_features(itrain, 'train')\n", | |
"graph_xvalid = graph_ft.get_features(ivalid, 'valid')\n", | |
"graph_xtest = graph_ft.get_features(itest, 'test')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 45, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[ 0.00000000e+00, 2.80000000e+01, 6.00000000e+01, ...,\n", | |
" 3.80000000e+01, 3.00000000e+01, 0.00000000e+00],\n", | |
" [ 0.00000000e+00, 2.60000000e+01, 1.50000000e+01, ...,\n", | |
" 2.00000000e+00, -5.00000000e+00, 0.00000000e+00],\n", | |
" [ 1.40000000e+01, 3.00000000e+01, 4.60000000e+01, ...,\n", | |
" 3.20000000e+01, 1.40000000e+01, 3.88888889e-01],\n", | |
" ..., \n", | |
" [ 0.00000000e+00, 2.60000000e+01, 2.70000000e+01, ...,\n", | |
" 4.00000000e+00, 1.00000000e+00, 0.00000000e+00],\n", | |
" [ 0.00000000e+00, 6.20000000e+01, 2.20000000e+01, ...,\n", | |
" 3.00000000e+00, -3.80000000e+01, 0.00000000e+00],\n", | |
" [ 5.00000000e+00, 3.90000000e+01, 1.15000000e+02, ...,\n", | |
" 9.10000000e+01, 6.80000000e+01, 4.58715596e-02]])" | |
] | |
}, | |
"execution_count": 45, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"graph_xtrain" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 59, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Concatenate Features\n", | |
"#xtrain = graph_xtrain\n", | |
"#xvalid = graph_xvalid\n", | |
"#xtest = graph_xtest\n", | |
"\n", | |
"#xtrain = np.hstack((meta_xtrain, graph_xtrain))\n", | |
"#xvalid = np.hstack((meta_xvalid, graph_xvalid))\n", | |
"#xtest = np.hstack((meta_xtest, graph_xtest))\n", | |
"# word2vec_xtrain = word2vec_xtrain.reshape(word2vec_xtrain.shape[0],1)\n", | |
"# word2vec_xvalid = word2vec_xvalid.reshape(word2vec_xvalid.shape[0], 1)\n", | |
"# word2vec_xtest = word2vec_xtest.reshape(word2vec_xtest.shape[0], 1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 103, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(32648, 1)" | |
] | |
}, | |
"execution_count": 103, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"word2vec_xtest.shape" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 66, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(32648, 1)" | |
] | |
}, | |
"execution_count": 66, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"tmp_xtest_cos.shape" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 72, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(32648, 6)" | |
] | |
}, | |
"execution_count": 72, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"meta_xtest.shape" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 125, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(584736, 2)" | |
] | |
}, | |
"execution_count": 125, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"word2vec_xtrain.shape" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 166, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Parameters\n", | |
"USE_MULTIGRAPH = True\n", | |
"USE_SURNAME_ONLY = True\n", | |
"\n", | |
"# Construct Author Multigraph\n", | |
"class AuthorGraph(BaseFeatureExtractor):\n", | |
" def __init__(self, node_info, graph_info,\n", | |
" use_multigraph=True, use_surname_only=True, count_self_loops=False):\n", | |
" BaseFeatureExtractor.__init__(self, node_info)\n", | |
" self.colnames = ['MaxACollaborateWithB', 'MeanACollaborateWithB', 'MaxACiteB',\n", | |
" 'MeanACiteB', 'AverageCitedNumberB', 'AverageCitedNumberA']\n", | |
" print 'Gathering authors for each paper'\n", | |
" self.paper_authors = {} # list authors for each papers\n", | |
" for idx, node in enumerate(node_info):\n", | |
" raw_authors = node[3]\n", | |
" authors = split_authors(process_authors(raw_authors))\n", | |
" if use_surname_only:\n", | |
" authors = [author.split()[-1] for author in authors if author.split()]\n", | |
" self.paper_authors[idx] = authors\n", | |
" \n", | |
" print 'Building author collaboration graph'\n", | |
" print 'i.e.: edge = co-authors of the same paper'\n", | |
" self.collab_graph = nx.MultiGraph() if use_multigraph else nx.Graph()\n", | |
" for idx, node in enumerate(node_info):\n", | |
" for i, author in enumerate(self.paper_authors[idx]):\n", | |
" # use [i:] to count self-loops\n", | |
" other_authors = authors[i:] if count_self_loops else authors[:i]\n", | |
" for author2 in other_authors:\n", | |
" self.collab_graph.add_node(author)\n", | |
" self.collab_graph.add_node(author2)\n", | |
" self.collab_graph.add_edge(author, author2)\n", | |
"\n", | |
" print 'Building author citation graph'\n", | |
" print 'i.e.: edge = authors who wrote papers which cite papers written by authors'\n", | |
" self.cite_graph = nx.MultiGraph() if use_multigraph else nx.Graph()\n", | |
" # graph_info says which paper cited which one\n", | |
" for source_id, target_id in graph_info.edges_iter():\n", | |
" A = self.paper_authors[self.IDs[source_id]] # source authors\n", | |
" B = self.paper_authors[self.IDs[target_id]] # target authors\n", | |
" for a in A:\n", | |
" for b in B:\n", | |
" self.cite_graph.add_node(a)\n", | |
" self.cite_graph.add_node(b)\n", | |
" self.cite_graph.add_edge(a, b)\n", | |
" \n", | |
" print '#papers', len(node_info)\n", | |
" print '#authors (collab nodes)', self.collab_graph.number_of_nodes()\n", | |
" print '#collaborations (collab edges)', self.collab_graph.number_of_edges()\n", | |
" print '#collab connected components', nx.number_connected_components(self.collab_graph)\n", | |
" print '#authors (cite nodes)', self.cite_graph.number_of_nodes()\n", | |
" print '#author-to-author citations (cite edges)', self.cite_graph.number_of_edges\n", | |
" print '#cite connected components', nx.number_connected_components(self.cite_graph)\n", | |
" \n", | |
" def get_feature(self, source_idx, source_info, target_idx, target_info):\n", | |
" A = self.paper_authors[source_idx] # source authors\n", | |
" B = self.paper_authors[target_idx] # target authors\n", | |
" # features from stanford cs224w final project\n", | |
" # http://snap.stanford.edu/class/cs224w-2010/proj2010/05_ProjectReport.pdf\n", | |
" # the maximum number of\n", | |
" # collaborations happened between each author of paper\n", | |
" # A and each author of paper B\n", | |
" collabs = [0] # in case A or B is empty\n", | |
" cites = [0]\n", | |
" for a in A:\n", | |
" for b in B:\n", | |
" collabs.append(self.collab_graph.number_of_edges(a, b))\n", | |
" cites.append(self.cite_graph.number_of_edges(a, b))\n", | |
" citesA = nx.degree(self.cite_graph, nbunch=A).values() + [0]\n", | |
" citesB = nx.degree(self.cite_graph, nbunch=B).values() + [0]\n", | |
" \n", | |
" collabs = np.log(np.asarray(collabs)+1)\n", | |
" cites = np.log(np.asarray(cites)+1)\n", | |
" citesA = np.log(np.asarray(citesA)+1)\n", | |
" citesB = np.log(np.asarray(citesB)+1)\n", | |
" \n", | |
" ft = {}\n", | |
" ft['MaxACollaborateWithB'] = np.max(collabs)\n", | |
" ft['MeanACollaborateWithB'] = np.mean(collabs)\n", | |
" ft['MaxACiteB'] = np.max(cites)\n", | |
" ft['MeanACiteB'] = np.mean(cites)\n", | |
" ft['AverageCitedNumberB'] = np.mean(citesB)\n", | |
" #ft['AverageReferenceNumberB'] =\n", | |
" ft['AverageCitedNumberA'] = np.mean(citesA)\n", | |
" #ft['AverageReferenceNumberA'] =\n", | |
" \n", | |
" # Concatenate all\n", | |
" return np.asarray(ft.values(), dtype=float)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 167, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Gathering authors for each paper\n", | |
"Building author collaboration graph\n", | |
"i.e.: edge = co-authors of the same paper\n", | |
"Building author citation graph\n", | |
"i.e.: edge = authors who wrote papers which cite papers written by authors\n", | |
"#papers 27770\n", | |
"#authors (collab nodes) 8959\n", | |
"#collaborations (collab edges) 28254\n", | |
"#collab connected components 1\n", | |
"#authors (cite nodes) 13458\n", | |
"#author-to-author citations (cite edges) <bound method MultiGraph.number_of_edges of <networkx.classes.multigraph.MultiGraph object at 0x1630f3c10>>\n", | |
"#cite connected components 45\n" | |
] | |
} | |
], | |
"source": [ | |
"author_ft = AuthorGraph(node_info,\n", | |
" connected_graph, #graph_info\n", | |
" use_multigraph=True, \n", | |
" use_surname_only=False, \n", | |
" count_self_loops=False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 159, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[train] (0%) processed (#1)\n", | |
"[train] (2%) processed (#10001)\n", | |
"[train] (3%) processed (#20001)\n", | |
"[train] (5%) processed (#30001)\n", | |
"[train] (7%) processed (#40001)\n", | |
"[train] (9%) processed (#50001)\n", | |
"[train] (10%) processed (#60001)\n", | |
"[train] (12%) processed (#70001)\n", | |
"[train] (14%) processed (#80001)\n", | |
"[train] (15%) processed (#90001)\n", | |
"[train] (17%) processed (#100001)\n", | |
"[train] (19%) processed (#110001)\n", | |
"[train] (21%) processed (#120001)\n", | |
"[train] (22%) processed (#130001)\n", | |
"[train] (24%) processed (#140001)\n", | |
"[train] (26%) processed (#150001)\n", | |
"[train] (27%) processed (#160001)\n", | |
"[train] (29%) processed (#170001)\n", | |
"[train] (31%) processed (#180001)\n", | |
"[train] (32%) processed (#190001)\n", | |
"[train] (34%) processed (#200001)\n", | |
"[train] (36%) processed (#210001)\n", | |
"[train] (38%) processed (#220001)\n", | |
"[train] (39%) processed (#230001)\n", | |
"[train] (41%) processed (#240001)\n", | |
"[train] (43%) processed (#250001)\n", | |
"[train] (44%) processed (#260001)\n", | |
"[train] (46%) processed (#270001)\n", | |
"[train] (48%) processed (#280001)\n", | |
"[train] (50%) processed (#290001)\n", | |
"[train] (51%) processed (#300001)\n", | |
"[train] (53%) processed (#310001)\n", | |
"[train] (55%) processed (#320001)\n", | |
"[train] (56%) processed (#330001)\n", | |
"[train] (58%) processed (#340001)\n", | |
"[train] (60%) processed (#350001)\n", | |
"[train] (62%) processed (#360001)\n", | |
"[train] (63%) processed (#370001)\n", | |
"[train] (65%) processed (#380001)\n", | |
"[train] (67%) processed (#390001)\n", | |
"[train] (68%) processed (#400001)\n", | |
"[train] (70%) processed (#410001)\n", | |
"[train] (72%) processed (#420001)\n", | |
"[train] (74%) processed (#430001)\n", | |
"[train] (75%) processed (#440001)\n", | |
"[train] (77%) processed (#450001)\n", | |
"[train] (79%) processed (#460001)\n", | |
"[train] (80%) processed (#470001)\n", | |
"[train] (82%) processed (#480001)\n", | |
"[train] (84%) processed (#490001)\n", | |
"[train] (86%) processed (#500001)\n", | |
"[train] (87%) processed (#510001)\n", | |
"[train] (89%) processed (#520001)\n", | |
"[train] (91%) processed (#530001)\n", | |
"[train] (92%) processed (#540001)\n", | |
"[train] (94%) processed (#550001)\n", | |
"[train] (96%) processed (#560001)\n", | |
"[train] (97%) processed (#570001)\n", | |
"[train] (99%) processed (#580001)\n", | |
"[valid] (0%) processed (#1)\n", | |
"[valid] (32%) processed (#10001)\n", | |
"[valid] (65%) processed (#20001)\n", | |
"[valid] (97%) processed (#30001)\n", | |
"[test] (0%) processed (#1)\n", | |
"[test] (31%) processed (#10001)\n", | |
"[test] (61%) processed (#20001)\n", | |
"[test] (92%) processed (#30001)\n" | |
] | |
} | |
], | |
"source": [ | |
"# Get author collab and citation graph features\n", | |
"author_xtrain = author_ft.get_features(itrain, 'train')\n", | |
"author_xvalid = author_ft.get_features(ivalid, 'valid')\n", | |
"author_xtest = author_ft.get_features(itest, 'test')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 158, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<__main__.AuthorGraph at 0x16916f7d0>" | |
] | |
}, | |
"execution_count": 158, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"author_ft." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 165, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[ 0. , 0.69314718, 0.55451774, 4.47341344, 0. ,\n", | |
" 4.13091159],\n", | |
" [ 0. , 0. , 0. , 4.70625923, 0. ,\n", | |
" 3.40064152],\n", | |
" [ 0. , 0. , 0. , 5.46442935, 0. ,\n", | |
" 0.54930614],\n", | |
" ..., \n", | |
" [ 0. , 5.85793315, 3.51026152, 6.32071517, 0. ,\n", | |
" 7.65962654],\n", | |
" [ 0. , 6.94408721, 3.4720436 , 4.59649537, 0. ,\n", | |
" 6.2086042 ],\n", | |
" [ 0. , 2.63905733, 1.63210365, 4.49416731, 0. ,\n", | |
" 5.24360709]])" | |
] | |
}, | |
"execution_count": 165, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"author_xtrain" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 253, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"xtrain = np.hstack((tmp_xtrain_cos, text_xtrain, meta_xtrain, graph_xtrain,word2vec_xtrain))\n", | |
"xvalid = np.hstack((tmp_xvalid_cos, text_xvalid, meta_xvalid, graph_xvalid,word2vec_xvalid))\n", | |
"xtest = np.hstack((tmp_xtest_cos, text_xtest, meta_xtest, graph_xtest,word2vec_xtest))\n", | |
"\n", | |
"# xtrain = np.hstack((tmp_xtrain, meta_xtrain, graph_xtrain))\n", | |
"# xvalid = np.hstack((tmp_xvalid, meta_xvalid, graph_xvalid))\n", | |
"# xtest = np.hstack((tmp_xtest, meta_xtest, graph_xtest))\n", | |
"\n", | |
"\n", | |
"normalize = False\n", | |
"if normalize:\n", | |
" #norm = 0.000001 + xtrain.max(axis=0, keepdims=True)\n", | |
" norm = 0.000001 + np.percentile(xtrain, 95, axis=0, keepdims=True)\n", | |
" xtrain /= norm\n", | |
" xvalid /= norm\n", | |
" xtest /= norm" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 176, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"test = pd.DataFrame(xtrain, columns=colnames)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 180, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"abstract_cosim 0\n", | |
"abstract_svm 0\n", | |
"n_title 0\n", | |
"n_years 0\n", | |
"n_authors 0\n", | |
"is_same_journal 0\n", | |
"is_author_missing 0\n", | |
"is_journal_missing 0\n", | |
"n_authors_source 0\n", | |
"n_authors_target 0\n", | |
"common 0\n", | |
"d_out 0\n", | |
"d_in 0\n", | |
"common2 0\n", | |
"d_out2 0\n", | |
"d_in2 0\n", | |
"diff_in2 0\n", | |
"jacccard_coeffs 0\n", | |
"abstract_similarity 0\n", | |
"title_similarity 0\n", | |
"MaxACollaborateWithB 0\n", | |
"MeanACollaborateWithB 0\n", | |
"MaxACiteB 0\n", | |
"MeanACiteB 0\n", | |
"AverageCitedNumberB 0\n", | |
"AverageCitedNumberA 0\n", | |
"dtype: int64" | |
] | |
}, | |
"execution_count": 180, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"test.isnull().sum()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 161, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(584736, 26)" | |
] | |
}, | |
"execution_count": 161, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"xtrain.shape" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 100, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(584736, 1)" | |
] | |
}, | |
"execution_count": 100, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"word2vec_xtrain.shape" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 53, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(584736,)" | |
] | |
}, | |
"execution_count": 53, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"word2vec_xtrain" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 58, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[ 0.59887377],\n", | |
" [-0.04610019],\n", | |
" [-1.46748028],\n", | |
" ..., \n", | |
" [ 1.18366304],\n", | |
" [ 0.85202786],\n", | |
" [ 1.04686267]])" | |
] | |
}, | |
"execution_count": 58, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"text_xtrain" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 47, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([ 0., 1.])" | |
] | |
}, | |
"execution_count": 47, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"np.append(np.zeros(1),1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 48, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([ 1.])" | |
] | |
}, | |
"execution_count": 48, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"np.append(np.empty(0), 1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 49, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[ 38. , -0.375, 0. , 0. , 0. , 1. ],\n", | |
" [ 41. , 0. , 0. , 0. , 0. , 1. ],\n", | |
" [ 67. , 0.5 , 0. , 0. , 1. , 1. ],\n", | |
" ..., \n", | |
" [ 46. , 0.875, 0. , 0. , 1. , 1. ],\n", | |
" [ 38. , 0.875, 0. , 0. , 0. , 1. ],\n", | |
" [ 52. , 0.5 , 0. , 0. , 0. , 0. ]])" | |
] | |
}, | |
"execution_count": 49, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"meta_xtrain" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 254, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Fitting tf.idf vectorizer\n", | |
"Building word2net neural network\n" | |
] | |
} | |
], | |
"source": [ | |
"# colnames \n", | |
"\n", | |
"colnames = TextOnlyCosine(node_info).colnames + ['abstract_svm'] + TitleAuthorYear(node_info).colnames + \\\n", | |
"GraphOnlyMore(graph_info, connected_graph).colnames + TextOnlyGensim(node_info).colnames " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Logistic regression " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 246, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Accuracy 0.954087600728\n", | |
"F1 0.957089495581\n" | |
] | |
} | |
], | |
"source": [ | |
"logreg = LogisticRegression(C=1)\n", | |
"logreg.fit(xtrain, ytrain)\n", | |
"pvalid_logreg = logreg.predict(xvalid)\n", | |
"\n", | |
"# Evaluation metrics\n", | |
"accuracy = metrics.accuracy_score(yvalid, pvalid_logreg)\n", | |
"f1 = metrics.f1_score(yvalid, pvalid_logreg)\n", | |
"print 'Accuracy', accuracy\n", | |
"print 'F1', f1" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 247, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA7kAAAHaCAYAAADFWI4dAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzs3XucZVV95/3PV5pLRBC8pMqgVmviJUYbaC5qQDjR0Uli\nJCpBH7wxRjPmMfES1InRaFcbJ4i3PESjRiWt4+1RBlDACBqGIxfBxu6G5mKYGMFMTLqZeURoiXaA\n/j1/nF3NoaiqPqeqmlPs+rxfr3qdtddZa+3fPuf068WPtfbaqSokSZIkSWqDB4w6AEmSJEmSFotJ\nriRJkiSpNUxyJUmSJEmtYZIrSZIkSWoNk1xJkiRJUmusGHUAml0St76WJEmStKxVVYZpb5K7xPmI\nJw1icnKSycnJUYeh+wl/LxqUvxUNw9+LBuVvRcNIhspvAZcrS5IkSZJaxCRXkiRJktQaJrlSC3Q6\nnVGHoPsRfy8alL8VDcPfiwblb0W7W7znc+lKUn4/kiRJkparJENvPOVMriRJkiSpNUxypSVufHwl\nSeb8Gx9fOeowJUmSpCXB5cpLmMuVBVPbpu/qdxAfNyVJkqTW2a3LlZNcOnxIw0lybJKn9x2vS/LC\n3X3eWWJ5XpL/Mo9+u/1zkiRJkiTNbMWgDavq6N0ZSKMD/AS4fKEDJdmjqu6ab/+qOhc4dx797ovP\nSZIkSZI0g2Fmcrc1r+NJvplkY5LNSY6ao89HkqxPck2SNX31NyZ5SFM+LMlFSSaA3wfe2Iw9Ne6x\nSS5L8r3+Wd0k72vGvTrJi5q6Y5NcnOQrwHWzxDSR5LvNLPENST6b5FlJLm2OD2/anZTkQ035hOZc\nm5J0m7onJfl2E+tVSX5x2ud0bHNdZzTn+0xfDL/Z1F2Z5LQkQyfTkiRJkqR7G3gml7tvCnwJcH5V\nnZLezYIPnKPP26rqx0keAFyY5MyqupZ732BYVfWDJB8DtlXVBwGSvBoYr6qjkvwycA5wVpLjgVVV\n9ZQkPw9cmeSbzViHAr9SVf80R1y/CBxfVdcn+Q5wYlUdneQ44O3AC6Zd8zuA51TVvybZv6n7feD/\nqaovJFkB7DGtD8AhwJOALcBlSX4V2AB8DDi6qv4pyedn+DwkSZIkSfMwTJI75Urg9CR7Al+pqqvn\naPt/Jfm95jzj9BK+a4Fhbhz+MkBVfbdJaAGOAr7Q1N/czK4eAWwD1u8iwQW4saqub8rXARc25WuA\niRnaXwp8OsmXgLOausuBtyd5JHB2VX1vhn7rq+pfAZJcBawEbgf+sS/GLwC/N1ugk5OTO8udTseH\nZ0uSJElqrW63S7fbXdAYQye5VXVJkmOA5wKfSvKBqvrs9HZJVgJvAg6rqtuSrAP2ad6+k7uXSu8z\nve802/uHnaVNf/3tuxhv+pg7+o53MMNnUlWvTXIE8FvAhiSrmxncK5q6v03yn6uqO8d57uobe+Ak\nvz/JlSRJkqQ2mz6xt3bt2qHHGOY5uQFI8mjg5qo6HfgksHqW9vvT20RqW5Ix4Df63rsROKwpH99X\nv63pN2cMwCXAi5M8IMnDgWcA64e9loEbJ4+tqiurag1wM/CoJI+pqhur6kPAV4BVA459A/CY5nME\nePEwsUiSJEmSZjefe3I7wFuS3EEvKX3FjI2rNjdLdL8L/C96S36nvIvekudbgW5f/bnAf2/ujX0d\nM9y724x9dpKnAVfTm319S7Ns+ZeHvJbp5dm8L8njmvLfNdf2x0leDtwB/CvwX3cx3lTsP0vyWuCC\nJD+ht/zbe3IlSZIkaRGkyvzqvpZk36q6vSn/FfA/q+q0GdqV3496+7vt6ncQ/K1IkiSpbZJQVUOt\nxB1mubIWz+81jyO6jt7y7L8edUCSJEmS1AaLMpPbbMC019QhvWmnl1fVjM+qvS80z+G9kLunwKbi\nelZV3TKquIbhTK4AxsdXsnXrD+ZsMzY2wZYt
N903AUmSJEn3kfnM5LpceQkzyZUkSZK0nLlcWZIk\nSZK0rJnkSpIkSZJawyRXkiRJktQaJrmSJEmSpNYwyZUkSZIktYZJriRJkiSpNUxyJUmSJEmtYZIr\nSZIkSWoNk1xJkiRJUmuY5EqSJEmSWsMkV7ofGh9fSZKdf+PjK0cdkiRJkrQkpKpGHYNmkaT8fjST\nJED/byP4W5EkSVLbJKGqMkwfZ3IlSZIkSa2xpJLcJBNJTuw7PinJh0YZkyRJkiTp/mNJJbnAY4CX\nTKub9xrMJCO9vlGfX5IkSZKWm6GTsGa29fokH09ybZLzk+w9S9tXJ1mfZFOSM5Ls09SvS/LCvnbb\nmuIpwNFJNiZ5Q1N3UJKvJbkhyal9fU5Msrn5e0//WEnen2QT8LQkpyS5LslVSd47x3WdkOSaJtZu\nU7d3kr9pzrEhSaepv8cMc5Jzkxwzy/kPT3JZc/4rkuyb5AFJ3pvk20397w3xFUiSJEmSZjHfmcZf\nAj5UVU8GbgWOn6XdmVV1ZFUdCvw98KpZ2k3N1r4VuKSqVlfVaU3dwcAJwCrgxUkOSvII4D1ABzgE\nOCLJcU37fYHL+875gqr6lao6BHj3HNf0DuA5Tb+psf4A2FFVq+jNMH86yV7TYp6u//xXAl8EXtec\n/z8AP2s+hx9X1VOBI4H/nGRijtgkSZIkSQNYMc9+N1bVNU15A7BylnZPSfJu4AB6yd8F8zjXhVX1\nE4Ak1wETwMOAi6rqR03954BjgHOAu4Czmr63Aj9N8kngq8B5c5znUnpJ7Jf6+h8N/CVAVd2Q5Cbg\n8buI986+/k8A/qWqNjZjTF3Hc+h9Nic07fYHHgf8YPpgk5OTO8udTodOp7OL00uSJEnS/VO326Xb\n7S5ojPkmudv7yncB+8zS7lPAcVV1bZKTgGOb+jtpZpHTexbKXjN3v9e5dnB3zLNtI/3TqefuVNVd\nSY4EnkVvNvgPm/K9VNVrkxwB/BawIclhMzSbOufO+Bv91/+zac/9mSnO0Jvd/cYs17BTf5IrSZIk\nSW02fWJv7dq1Q48x3+XKgz6n6EHAliR7Ai/tq78JOLwp/zawZ1PeBuw3wLjrgWOSPCTJHsCJQHd6\nbEn2BQ6oqvOBk+kteZ5RksdW1ZVVtQa4GXgkcAnwsub9xwOPAm5o4j8kPY+it+R451B95RuA8amE\nOcmDmngvAF6bZEVT/7gkPzfAdUuSJEmS5jDfmdxBdzx+B72E9Gbg29ydwH4C+EqzOdMFwO1N/WZg\nR1P/KeCWmc5bVVuSvJW7E9uvVtV5/W0a+zXnmZpp/aM5Yn1fksc15QuranOSG4CPJtkM3AGcVFV3\nAJc1S5evA75Lb8n2PWJs4rwjyYuBDzdJ7L/Ruy/3k/SWeG9sZrJvBp4/R2ySJEmSpAHknitrtZQk\nKb8fzaT3/0buuSre34okSZLaJglVNehKYmDpPSdXkiRJkqR5m+9y5XtI8mHgKHpTS1NTTKdV1acX\nY/zFlORt9Dah6o/1jKo6ZaSBSUMYG5tg69bc41iSJEmSy5WXNJcrS5IkSVrOXK4sSZIkSVrWTHIl\nSZIkSa1hkitJkiRJag2TXEmSJElSa5jkSpIkSZJawyRXkiRJktQaJrmSJEmSpNYwyZUkSZIktYZJ\nriRJkiSpNUxyJUmSJEmtYZIr3Y+Nj68kCePjK0cdiiRJkrQkpKpGHYNmkaT8fjSXJEABwd+KJEmS\n2iYJVZVh+jiTK0mSJElqDZPcISVZk+TkRRrr4CS/sRhjSZIkSZJMckftEOA3Rx2EJEmSJLWFSe4A\nkrw9yQ1JLgaeMEe7g5NcnuSqJGcmeXBTf1GS1U35oUluTLICeBfwoiQbk5xwn1yMJEmSJLXYilEH\nsNQ1yemLgFXAXsBG4DuzNP9vwB9U1aVJ1gJrgJmWNldV3ZnkncBhVfX62c4/OTm5s9zpdOh0OvO5\nDEmSJEla
8rrdLt1ud0FjuLvyLiR5A3BgVU02xx8AflhVH5zWbn9gc1WtbI4fC3ypqg5PchHwpqra\nmOShwJVV9dgkJzFHkuvuytoVd1eWJElSm7m78tJ1J3d/1vuMMhBJkiRJajOT3F27GHh+kr2T7Ac8\nb6ZGVXUbcEuSo5qqlwPfbMo3AYc35f57b7cB+y96xJIkSZK0TJnk7kJVbQK+CGwGvgqsn6P5ScD7\nk1wFHExvYymA9wP/d5INwEP62l8EPMmNpyRJkiRpcXhP7hLmPbnaFe/JlSRJUpt5T64kSZIkaVkz\nyZ2HJB9OsqlZZjz1etKo49LyMzY2AaR5lSRJkuRy5SXM5cqSJEmSljOXK0uSJEmSljWTXEmSJElS\na5jkSpIkSZJawyRXkiRJktQaJrmSJEmSpNYwyZUkSZIktYZJriRJkiSpNUxyJUmSJEmtYZIrSZIk\nSWoNk1xJkiRJUmuY5Er3Y+PjK0nC+PjKUYciSZIkLQmpqlHHoFkkKb8fzSUJUEDwtyJJkqS2SUJV\nZZg+zuQuoiRrkpw8y3trkzzzvo5JkiRJkpaTFaMOYLmoqjWjjkGSJEmS2s6Z3AVK8vYkNyS5GHjC\nHO3WJXlhU74xyWSSDUmuTvL4+yxgSZIkSWoxk9wFSLIaeBGwCngucMQQ3W+uqsOAjwFv2Q3hSZIk\nSdKy43LlhXkGcHZVbQe2JzlniL5nN68bgBfM1mhycnJnudPp0Ol0ho9SkiRJku4Hut0u3W53QWO4\nu/ICJHkDcGBVTTbHHwB+WFUfnKHtOuDcqjoryY3AYVX1oySHAe+rqnttSuXuytoVd1eWJElSm7m7\n8n3vYuD5SfZOsh/wvFEHJEmSJEnLmcuVF6CqNiX5IrAZ2Aqsn6v5LGVJkiRJ0iJxufIS5nJl7YrL\nlSVJktRmLleWJEmSJC1rJrmLLMmHk2xKsrHv9aRRx6V2GhubANK8SpIkSXK58hLmcmVJkiRJy5nL\nlSVJkiRJy5pJriRJkiSpNUxyJUmSJEmtYZIrSZIkSWoNk1xJkiRJUmuY5EqSJEmSWsMkV5IkSZLU\nGia5kiRJkqTWMMmVJEmSJLWGSa4kSZIkqTVMciVJkiRJrWGSK92PjY+vJAnj4ytHHYokSZK0JKSq\nRh2DZpGk/H40lyRAAcHfiiRJktomCVWVYfrsciY3yaXzCOS8JPsP22+xJNm2CGO8JsnLhuzziCRf\nWui5JUmSJEnzs6RncpPsUVV3zaPfbVU1siR7sTiTq11xJleSJElttrtmcrc1r+NJvplkY5LNSY6a\no8+NSR7SlE9Ock3T5w1N3USSa/ravynJO5vyRUn+Isl64PVJ1iU5LcllSb6X5IVNu32T/F2S7yS5\nOslxg1xwkmOTdJN8uRnvlCQvSfLtZpzHNO3WJDm5Kb8+yXVJrkry+b5xNjWfx4Ymnp3XleSkJGcm\n+VqSG5Kc2hfDq5q6K5J8PMlfDhK7JEmSJGluKwZoMzU99BLg/Ko6Jb3powfuqk+S1cBJwBHAHsC3\nk3SBH/eNO5M9q+rIZox1wHhVHZXkl4FzgLOAnwHPr6qfJHkocEXz3iBWAU9s4vg+8ImqemqS1wOv\nA06e1v6PgZVVdUffMuw3Aa+tqsuTPLCJZ+e1Nw4GDgHuAG5oktkdwJ829T8BLgKuGjBuSZIkSdIc\nBklyp1wJnJ5kT+ArVXX1AH2OBs6uqp8BJDkLeAZw7i76fXHa8ZcBquq7SX6+qQtwSpJj6CWOv5Dk\n56vq5kGuZapdkn8Evt7UXwN0Zmh/NfD5JF+eigW4DPiLJJ8DzqqqH/Zy/3u4sKp+0pznOmACeDjQ\nrapbm/ozgMfNFujk5OTOcqfTodOZKTxJkiRJuv/rdrt0u90FjTFwkltVlzQJ5XOBTyX5QFV9dp7n\nvZPezO6Ufaa9f/u04+195alM8qXAw4BDq2pHkhtnGGc2/ePt6DvewcyfyX
OBY4DjgLcneXJVnZrk\nvOa9y5I8Z9q4M51nauyB15T3J7mSJEmS1GbTJ/bWrl079BiDPCc3AEkeDdxcVacDnwRW76oPcAnw\n/CT7JNkXeAFwMbAVeHiSA5PsDfzWEDFPjf3gJp4dSX6N3izp9DaL5dFV9U3grcD+wIOSPLaqrquq\n99Kb5X7igOe+EjgmyYOTrACOX+RYJUmSJGnZGuae3A7wliR3ANuAV+yqX1VtSvIpeoldAR+vqs0A\nSd7V1P8z8N0Zzrer488B5ya5GvjOLsaYM8653mwS0c829+IGOK2qbkvy7ia5vgu4Dvga8AtzjFcA\nVfUvSf4cWA/8CPh74NYh4pUkSZIkzWLRHyGUZA9gC73NooZ+/M9ykGTfqrq9+azOBk6vqq/M0M5H\nCGlOPkJIkiRJbbZbHiE0D9fS263YBHd2k0k20dvo6vszJbiSJEmSpOENs7vyvSS5Athr6pDelNLx\nVXX9QgNbqCRPBj7D3cuHA/ysqp4+uqh6quoto45B7TA2NsHWrWFsbGLXjSVJkqRlYNGXK2vxuFxZ\nkiRJ0nK2VJYrS5IkSZI0Eia5kiRJkqTWMMmVJEmSJLWGSa4kSZIkqTVMciVJkiRJrWGSK0mSJElq\nDZNcSZIkSVJrmORKkiRJklrDJFeSJEmS1BomuZIkSZKk1jDJlSRJkiS1hkmu1AKTk5OjDkGSJEla\nElJVo45Bs0hSfj8aRBL8rUiSJKltmv/OzTB9nMltJFmT5E1JJpM8s6k7Osm1STYm2TvJ+5Jck+TU\nWcZ4TZKX7eI8/yHJd5JcneTKJL+2O65HkiRJkpYjZ3IbSdYA26rqg311HwUuqarPN8c/Bg5cyPRq\nkoOBrVW1JcmvABdU1SNnaetMrgbiTK4kSZLaaD4zucs6yU3yduAVwFbgn4ENwJOBc4EDgfcCPwa+\nBewPPBfYDJxSVWfMMN7ORDnJRcC3gV8DHgy8qqoum6HP/wEeUVV3zPCeSa4GYpIrSZKkNppPkrti\ndwWz1CVZDbwIWAXsBWwEvgMUQFWdnuRo4NyqOqvpc1tVrR7iNHtU1VOT/AYwCTx7Wgy/A2ycKcGV\nJEmSJA1v2Sa5wDOAs6tqO7A9yVeANH+L5azmdQMw0f9Gs1T5FKYlvtP175rb6XTodDqLGJ4kSZIk\nLR3dbpdut7ugMZZzkjvdVHK7mGs+tzevd9H3WSd5JL0E+OVVddNcA/hoGEmSJEnLxfSJvbVr1w49\nxnLeXfli4PnNrsn7Ac+jl+DONZO7kFneACQ5ADgP+OOqumIB40mSJEmSplm2SW5VbQK+SG8jqa8C\n66fe6m82vdswp5jl+A+AXwTemWRT83iihw0xriRJkiRpFst6d+Wlzt2VNSh3V5YkSVIbzWd35WU7\nkytJkiRJah83npqHJG8DTuDue3gLOKOqThlpYFq21qxZM+oQJEmSpCXB5cpLmMuVJUmSJC1nLleW\nJEmSJC1rJrmSJEmSpNYwyZUkSZIktYZJriRJkiSpNUxyJUmSJEmtYZIrSZIkSWoNk1xJkiRJUmuY\n5EqSJEmSWsMkV5IkSZLUGia5kiRJkqTWMMmVWmB8fCXj4ytHHYYkSZI0cqmqUcegWSQpvx8NIgkA\n/l4kSZLUJkmoqgzTx5lcSZIkSVJrmORKkiRJklpjWSS5SV6R5Ookm5J8OslEkguTXJXkG0ke2bRb\nl+QjSS5P8r0kxyY5Pcn1Sf6mb7xtSd6b5NokX09yRJKLmj6/1bTZO8nfJNmcZEOSTlN/UpIzk3wt\nyQ1JTh3JhyJJkiRJLdT6JDfJk4C3AZ2qOhR4I/AhYF1VHQJ8vjmeckBVPR04GTgH+EBVPQlYlWRV\n02Zf4O+q6snAT4A/A54FvLApA/wBsKOqVgEvAT6dZK/mvYOBE4BVwIuTHLQbLl2SJEmSlp3WJ7nA\nM4EzquoWgOb16cAXmvc/AxzV1/7c5v
UaYEtVXd8cXwesbMrbq+rrfe2+WVU7mvJEU3808NnmnDcA\nNwGPb967sKp+UlXbgev7+kiSJEmSFmDFqAMYkbm2oN3evO7oK08dT31ed0yr3w5QVZVkts+0f0ew\n/nHvYo7vYXJycme50+nQ6XRmj1ySJEmS7se63S7dbndBYyyHJPd/AGcl+Yuq+lGShwDfAk6kN9P6\nMuCSWfrOtlX1XFtYT713CfBSoJvk8cCjgBuAw4YJvj/JlSRJkqQ2mz6xt3bt2qHHaH2SW1XXJ/mv\nwDeT3AlsAl4HfCrJm4H/Dbxyqvn07gOU73XK5vUjwEeTbKY383tSVd0x9TzTAceSJEmSJA0hVeZY\nS1WS8vvRIKb+54m/F0mSJLVJEqpqrpW097IcNp6SJEmSJC0TJrmSJEmSpNZo/T250nIwNuZTqCRJ\nkiTwntwlzXtyJUmSJC1n3pMrSZIkSVrWTHIlSZIkSa1hkitJkiRJag2TXEmSJElSa5jkSpIkSZJa\nwyRXkiRJktQaJrmSJEmSpNYwyZUkSZIktYZJriRJkiSpNUxyJUmSJEmtYZIrtcD4+EqSMD6+ctSh\nSJIkSSOVqhp1DJpFkvL70SCSAAUEfzOSJElqiyRUVYbp40yuJEmSJKk1lmySm2QiyYl9xycl+dB9\ndO7fTvLE++hcf3JfnEeSJEmSloMlm+QCjwFeMq1u3uswkwxzrc8HfmXI8fcYLqKd3jbPfpIkSZKk\naRaU5Dazrdcn+XiSa5Ocn2TvWdq+Osn6JJuSnJFkn6Z+XZIX9rXb1hRPAY5OsjHJG5q6g5J8LckN\nSU7t63Niks3N33v6x0ry/iSbgKclOSXJdUmuSvLeWeJ8OnAc8N7m3I/ZRewfTXIFcGqShyX5epJr\nknwiyU1JHtK0fWmSbzdjfjTJA5KcAvxcU/eZ+X0LkiRJkqQpC9p4KskE8A/AYVV1TZIvAl+pqs/P\n0PbAqrqlKf8ZsKWq/irJOuDcqjqree+2qto/ybHAm6rquKb+JOAdwCHAHcANwFHADuAK4FDgx8A3\ngNOq6pwkO4ATqurMJtn8VlU9sRlv/6q6bZbrmh7TXLE/tC/GDwH/XFWnJvmPwN8CDwd+Hngv8IKq\nuivJXwGXV9Vnp653ljhqzZo1O487nQ6dTmfO70TLkxtPSZIkqQ263S7dbnfn8dq1a4feeGrFIsRx\nY1Vd05Q3ACtnafeUJO8GDgD2BS6Yx7kurKqfACS5DpgAHgZcVFU/auo/BxwDnAPcBZzV9L0V+GmS\nTwJfBc4b4rxzxX5GX/loekudqaoLktzS1D8LWA1cmV42sg+wpXlvzi9scnJyiDAlSZIk6f5r+sTe\n2rVrhx5jMZLc7X3lu+glcDP5FHBcVV3bzMoe29TfSbNsukkA9xrwXDu4O/7ZEsWfTj2Dp5lBPZJe\nwnkC8IdNeRCzxQ5we195+hRa+l4/XVVvH/B8kiRJkqR5WIyNpwadOn4QsCXJnsBL++pvAg5vyr8N\n7NmUtwH7DTDueuCYJA9pNn86EehOjy3JvsABVXU+cDKwao4xtwH9S4hni326y4AXN+d7Dr2ZX4AL\ngd9J8vDmvQOTPKp5798XsGmVJEmSJKnPYiS5g94A+A56CeklwHf76j8BHDu1ORR3z4xuBnY0mz29\nYYbzTM3QbgHeSi+x3QR8p6rO62/T2A84L8nVwMXAH80R6/8LvCXJhiSPmSP26TG9C3h2ks3A8fSW\nJG+rqu8Cfwp8vTn/14FHNH0+DlzjxlOSJEmStHAL2nhK95RkL+CuZmn004CPVNXqBYxXfj8ahBtP\nSZIkqY2SjGTjKd3t0cCXmmfybgd+b8TxSJIkSdKysuhJbpIP03u0T29aqfd6WlV9erHPtVBJ3kZv\nE6r+WM+oqlPmM15VfY/eLsrSfWpsbIKtW8PY2MSoQ5EkSZJGyuXKS5jLlSVJkiQtZ/NZrrwYG09J\nki
RJkrQkmORKkiRJklrDJFeSJEmS1BomuZIkSZKk1jDJlSRJkiS1hkmuJEmSJKk1THIlSZIkSa1h\nkitJkiRJag2TXEmSJElSa5jkSpIkSZJawyRXaoHx8ZUkYXx85ahDkSRJkkYqVTXqGDSLJOX3o0Ek\nAQoI/mYkSZLUFkmoqgzTx5ncGSRZk+TkRRrr4CS/0Xf8kiRXN3+XJnnKYpxHkiRJkmSSe184BPjN\nvuPvA8dU1cHAu4FPjCQqSZIkSWohk9xGkrcnuSHJxcAT5mh3cJLLk1yV5MwkD27qL0qyuik/NMmN\nSVYA7wJelGRjkhOq6oqqurUZ7grgoN18aZIkSZK0bJjkAk1y+iJgFfBc4Ig5mv834C1VdQhwLbBm\nlnZVVXcC7wS+WFWrq+qMaW1eDXxtQcFLkiRJknZaMeoAlohnAGdX1XZge5JzZmqUZH/gwVV1aVP1\naeBL8zlhkl8DXgkcPVe7ycnJneVOp0On05nP6SRJkiRpyet2u3S73QWNYZK7eO7k7pnxfeZqmGQV\n8HHg16vqlrna9ie5kiRJktRm0yf21q5dO/QYLlfuuRh4fpK9k+wHPG+mRlV1G3BLkqOaqpcD32zK\nNwGHN+UT+rptA/afOkjyaOBM4OVV9Y+LdgWSJEmSJJ+TOyXJnwD/CdgK/BOwsao+OEO7VcBfAz9H\nb6fkV1bVrUmeQG/p8p3AV4GXVdVjkxwIXEBv1vwU4DnAC4EfAAHuqKojZ4nJ5+RqID4nV5IkSW00\nn+fkmuQuYSa5GpRJriRJktpoPkmuy5UlSZIkSa1hkjuLJB9Osql5vu3U60mjjkuaydjYBJDmVZIk\nSVq+XK68hLlcWZIkSdJy5nJlSZIkSdKyZpIrSZIkSWoNk1xJkiRJUmuY5EqSJEmSWsMkV5IkSZLU\nGia5kiRJkqTWMMmVJEmSJLWGSa4kSZIkqTVMciVJkiRJrWGSK0mSJElqDZNcSZIkSVJrmORKLTA+\nvpIkjI+vHHUokiRJ0kilqkYdg2aRpPx+NIgkQAHB34wkSZLaIglVlWH6OJM7pCRrkpw8y3trkzxz\nF/1fkuTq5u/SJE/ZPZFKkiRJ0vKzYtQBtElVrRmg2feBY6rq1iS/DnwCeNrujUySJEmSlgdncgeQ\n5O1JbkhyMfCEOdqtS/LCpnxjkskkG5pZ28cDVNUVVXVr0+UK4KDdfgGSJEmStEyY5O5CktXAi4BV\nwHOBI4bofnNVHQZ8DHjLDO+/GvjagoOUJEmSJAEuVx7EM4Czq2o7sD3JOUP0Pbt53QC8oP+NJL8G\nvBI4eq5n+Q1VAAAgAElEQVQBJicnd5Y7nQ6dTmeI00uSJEnS/Ue326Xb7S5oDHdX3oUkbwAOrKrJ\n5vgDwA+r6oMztF0HnFtVZyW5ETisqn6U5DDgfVX1zKbdKuBM4Ner6h/nOLe7K2sg7q4sSZKkNnJ3\n5d3jYuD5SfZOsh/wvIUMluTR9BLcl8+V4EqSJEmShudy5V2oqk1JvghsBrYC6+dqPku53zuAhwAf\nSW/67Y6qOnJRgpUkSZKkZc7lykuYy5U1KJcrS5IkqY1crixJkiRJWtZMcuchyYeTbEqyse/1pFHH\npeVrbGwCSPMqSZIkLV8uV17CXK4sSZIkaTlzubIkSZIkaVkzyZUkSZIktYZJriRJkiSpNUxyJUmS\nJEmtYZIrSZIkSWoNk1xJkiRJUmuY5EqSJEmSWsMkV5IkSZLUGia5kiRJkqTWMMmVJEmSJLWGSa4k\nSZIkqTVMcqUWGB9fSZKdf+PjK0cdkiRJkjQSqapRx6BZJCm/Hw0iCdD/Wwn+diRJknR/l4SqyjB9\nnMldoCQnJRnvO/54kic25T+Z1nbbfR2fJEmSJC0nzuQuUJKLgDdX1YYZ3ttWVfv1Hd9WVfsPMbYz\nuRqIM7mSJElqI2dyF0mSiSTXN7Oy1yY5P8neM7Q7Hjgc+GySjUn2
SXJRktVJTgF+rqn/zFSXvr5v\nTrI+yVVJ1tw3VyZJkiRJ7WaSO7tfAj5UVU8GbgWOn96gqs4ErgReUlWrq+pnfe/9CfBvTf3Lp6oB\nkjwbeFxVHQkcChye5OjdezmSJEmS1H4rRh3AEnZjVV3TlDcAK2dpF/pmaAf0HODZSTY2ffcFHgdc\nOr3h5OTkznKn06HT6Qx5KkmSJEm6f+h2u3S73QWN4T25M0gyAZxbVaua4zcB+1bVu2ZoexHwpqra\nOP14tntyk7wfuKGqPrGLOLwnVwPxnlxJkiS1kffkLq5BP8jbgNk2k/r3JP2z5VNjXgD8bpJ9AZL8\nQpKHzy9MSZIkSdIUlyvPbtBpsE8DH0vyb8CvTuv3cWBzkg3NfbkFUFXfaB4zdHlvBo5twMuA/71Y\nwUuSJEnScuRy5SXM5coalMuVJUmS1EYuV5YkSZIkLWsmuQNK8uEkm5rn3k69njTquCSAsbEJ7t7o\nO82xJEmStPy4XHkJc7myJEmSpOXM5cqSJEmSpGXNJFeSJEmS1BomuZIkSZKk1jDJlSRJkiS1hkmu\nJEmSJKk1THIlSZIkSa1hkitJkiRJag2TXEmSJElSa5jkSpIkSZJawyRXkiRJktQaJrlSC4yPryTJ\nvP/Gx1eO+hIkSZKkRZGqGnUMmkWS8vvRIJIAC/mtBH9rkiRJWmqSUFUZpo8zuZIkSZKk1lj0JDfJ\npYs95u6W5ONJnjjC89+Y5CGjOr8kSZIktYXLlRdRkj2q6q559Ps+cHhV/WhavcuVNRCXK0uSJKmN\nlsRy5STbmtfxJN9MsjHJ5iRHzdL+AUnWNW2uTvKGpv7VSdYn2ZTkjCT7NPXrknwkyeVJvpfk2CSn\nJ7k+yd/0jfvsJN9K8p0kX0zywDlivijJ6qZ8YhPL5iTvmX5dTfn4JOv64vloksuBU5OsaeK5qInv\ndX39zk5yZZJrkry6P4ThPmVJkiRJ0kx2xz25U9NBLwHOr6rVwMHAVbO0PwQ4qKpWVdXBwLqm/syq\nOrKqDgX+HnhVX58DqurpwMnAOcAHqupJwKokq5I8FPhT4FlVdTiwAXjTrgJP8gjgPUCnieuIJMdN\nu67p10kT/9Or6s3N8ROAZwNPBdYk2aOpf2VVHQEcAbwhyYG7ikmSJEmSNLgVu3HsK4HTk+wJfKWq\nrp6l3feBxyQ5Dfhb4OtN/VOSvBs4ANgXuKCvz7nN6zXAlqq6vjm+DlgJPAp4EnBZeus49wQuHyDm\nI4CLppYNJ/kccAy9RHqu2dYzph1/taruBP6/JFuBMeBfgDcmeX7T5pHA44D1cwU0OTm5s9zpdOh0\nOgNchiRJkiTd/3S7Xbrd7oLG2G1JblVdkuQY4LnAp5J8oKo+O0O7Hyc5GPiPwO8DJwCvBj4FHFdV\n1yY5CTi2r9v25nVHX3nqeEXz+vWqeuk8Qp8tme2fud1n2nu3Tzu+V0xJjgWeCTy1qrYnuWiGce6l\nP8mVJEmSpDabPrG3du3aocfYHcuVA5Dk0cDNVXU68Elg9YyNe0uL96iqs+ktMZ5q9yBgSzMTPFey\nOlNSegVwVJJfbM7xwCSPGyD29cAxSR7SLDE+Eeg2721J8oQkDwBeMMBY0z0YuKVJcJ8IPG0eY0iS\nJEmS5rA7ZnKnZjw7wFuS3AFsA14xS/uDgHVN8ljAW5v6d9JLOm8Gvg3sN2386efbWa6q/5PkPwFf\nSLJ3U/+nwD/MFXdVbUnyVu5ObM+rqvOa8p8AX23i+Q69JHymeO41bvN6PvD7Sa4DbuCey6fd1laS\nJEmSFoGPEAKSbAaeV1U/GHUs/XyEkAblI4QkSZLURkviEUL3N0m+Dly91BJcSZIkSdLwdufuyveS\n5Apgr6lDelNPL6+q6+6j859Fb/fl/vP/l6r6u/vi/NLuMjY2wdat83/c8tjYxCJGI0mSJI2Oy5WX\nMJcrS5IkSVrOXK4sSZIkSVrW
THIlSZIkSa1hkitJkiRJag2TXEmSJElSa5jkSpIkSZJawyRXkiRJ\nktQaJrmSJEmSpNYwyZUkSZIktYZJriRJkiSpNUxyJUmSJEmtsWLUAUhauPHxlWzd+oNRh3G/NzY2\nwZYtN406DEmSJC1AqmrUMWgWScrvR4NIAvhbWbjgvzlJkqSlIwlVlWH6uFxZkiRJktQaSzbJTbJt\niLbHJnn6Ip77DUn2WazxJEmSJEn3jSWb5DLc2ssO8KszvZFkj3mc+43AA+fRT5IkSZI0QksiyU1y\ndpIrk1yT5NV3V+eDSa5N8o0kD20qX5/kuiRXJfl8kgng94E3JtmY5Kgk65J8NMkVwKlJjkjyrSQb\nklya5HHNWA9I8r7mvFcl+YMkrwN+AbgoyYWzxPuA5hybk1zdzPw+Icm3+9pMJNnclG9M8udJNiVZ\nn+TQJOcn+Yckr9l9n6wkSZIkLS9LZXflV1bVj5slwlcmOQvYF1hfVScneQewBng98MfAyqq6I8n+\nVXVbko8B26rqgwBNonxQVT2tOX4QcHRV7UjyLOAU4HeA1wATwKqqqiQHNHH8EdCpqltmifeQZvxV\nzfhTceyZZKKqfgC8GPhCX5+bqurQJB8E1tGbeX4gcC3w17N9MJOTkzvLnU6HTqcz4EcqSZIkSfcv\n3W6Xbre7oDGWxO7KSSaB5zeHE8CvA5cCezeJ6WOAM6tqdZK/BW4Hvgx8uapuT7KGeya564D/UVWf\naY4fCfwl8Dh6y6BXVNWTkvx34KNVdY8Z2yQ3AodV1Y9mifcA4Ergb5u/rzdJ8luBHVX13iQbgBOq\n6vvNeL9aVf+a5JXA06rqNc1YN9FLsm+b4TzurqyBuLvyYnF3ZUmSpKXkfrm7cpJjgWcCT62qQ4Cr\ngJk2fZr6L8/nAh8GVtOb9Z3tGm7vK/8ZvaT3KcDzZhl/YFX1Y+BgoEtvNviTzVtfAl7cLIfeUVXf\n7+u2vXnd0VeGJuleSDySJEmSpJ6RJ7nAg4Fbqmp7kicCT2vq96C3pBjgpfRmdgEeXVXfBN4K7A88\nCNjWlGezP/DDpvzKvvpvAK+Z2pwqyYFN/W1zjdfcH7xHVZ0NvAM4FKBJau9q6r44RzySJEmSpN1g\nKSS55wN7JrkO+HPgW039T4Ajk1xDb/fkdyVZAXw2ydXABuC0ZpnvucALpjae4t7rNt8HvKdZQtx/\nzZ8E/hewOckm4MSm/hPA+bNtPAUcBHSbPp+hl3BP+SK9pPxLfXVzrX90baQkSZIkLZIlcU+uZuY9\nuRqU9+QuFu/JlSRJWkrul/fkSpIkSZK0WNzwaBeaZ+3uNXVIb7rs5VV13eiiku5pbGyCrVuH+h9c\nmsHY2MSoQ5AkSdICuVx5CXO5siRJkqTlzOXKkiRJkqRlzSRXkiRJktQaJrmSJEmSpNYwyZUkSZIk\ntYZJriRJkiSpNUxyJUmSJEmtYZIrSZIkSWoNk1xJkiRJUmuY5EqSJEmSWsMkV5IkSZLUGitGHYCk\nhRsfX8nWrT8YdRitNDY2wZYtN406DEmSJA0oVTXqGDSLJOX3o0EkAfyt7B7Bf4eSJEmjkYSqyjB9\nXK48Akn2GHUMkiRJktRGyzLJTTKR5PokH09ybZLzk+w9Q7vHJtnQd/xLU8dJDkvSTXJlkq8lGWvq\nX51kfZJNSc5Isk9Tvy7JR5NcAZya5JimzcYkG5Lsex9dviRJkiS11rJMchu/BHyoqp4M3AocP71B\nVX0f+HGSVU3VK4HTk6wA/hI4vqqOANYBf960ObOqjqyqQ4G/B17VN+RBVfW0qnoz8GbgtVW1GngG\n8NPFv0RJkiRJWl6W88ZTN1bVNU15A7BylnanA69M8ibgxcARwBOAJwPfSO9myAcA/9K0X5Xkz4AD\ngH2BC/rGOqOvfBnwF0k+B5xVVT9c+CVJkiRJ0vK2nJPc7X3lu4B9Zml3JrAGuAj4TlXdkuQg4N
qq\nOmqG9uuA46rq2iQnAcf2vXf7VKGqTk1yHvBc4LIkz6mq/zl9sMnJyZ3lTqdDp9MZ5NokSZIk6X6n\n2+3S7XYXNMay3F05yQRwXlU9pTl+E7BvVb1rlvZ/SW858+9W1QVJ9gSuA15RVVc0y5cfX1XXJ7kZ\neBK9JdBfBf65qn43yTrg3Ko6qxnzsc1yaJKcAXymqs6Zdl53V9ZA3F15d3J3ZUmSpFFxd+XhDPNf\nrZ+jN9v7dYCqugP4HXobSF0FbAKe3rR9J7AeuAT47hzne2OSa5r+/w58begrkCRJkiTdw7KcyR1W\nM9O7f1WtuY/P60yuBuJM7u7kTK4kSdKozGcmdznfkzuQJGcBjwWeOepYJEmSJElzM8ltJPkwcBS9\n6bCpabHTquqFIw1MkiRJkjQwk9xGVf3hqGOQ5mtsbIKtW4daxaEBjY1NjDoESZIkDcF7cpcw78mV\nJEmStJy5u7IkSZIkaVkzyZUkSZIktYZJriRJkiSpNUxyJUmSJEmtYZIrSZIkSWoNk1xJkiRJUmuY\n5EqSJEmSWsMkV5IkSZLUGia5kiRJkqTWMMmVJEmSJLWGSa4kSZIkqTVWjDoASQs3Pr6SrVt/MOow\n1GdsbIItW24adRiSJEnLTqpq1DFoFknK70eDSAL4W1lagv9+JUmSFiYJVZVh+rhceQBJJpKc2Hd8\nUpIPjTImSZIkSdK9meQO5jHAS6bVzXuKJomfuyRJkiTtBq1JtprZ1uuTfDzJtUnOT7L3LG1fnWR9\nkk1JzkiyT1O/LskL+9pta4qnAEcn2ZjkDU3dQUm+luSGJKf29Tkxyebm7z39YyV5f5JNwNOSnJLk\nuiRXJXnvYn8ekiRJkrQctSbJbfwS8KGqejJwK3D8LO3OrKojq+pQ4O+BV83Sbmq29q3AJVW1uqpO\na+oOBk4AVgEvTnJQkkcA7wE6wCHAEUmOa9rvC1zed84XVNWvVNUhwLvneb2SJEmSpD5t2135xqq6\npilvAFbO0u4pSd4NHEAv+bxgHue6sKp+ApDkOmACeBhwUVX9qKn/HHAMcA5wF3BW0/dW4KdJPgl8\nFThvtpNMTk7uLHc6HTqdzjxClSRJkqSlr9vt0u12FzRG25Lc7X3lu4B9Zmn3KeC4qro2yUnAsU39\nnTSz2+ltV7vXgOfawd2f5Ww7f/10aqvkqroryZHAs+jNBv9hU76X/iRXkiRJktps+sTe2rVrhx6j\nbcuVB91a+kHAliR7Ai/tq78JOLwp/zawZ1PeBuw3wLjrgWOSPCTJHsCJQHd6bEn2BQ6oqvOBk+kt\neZYkSZIkLVDbZnIH3fH4HfQS0puBb3N3AvsJ4CvN5lAXALc39ZuBHU39p4BbZjpvVW1J8lbuTmy/\nWlXn9bdp7NecZ2qm+Y8GjFuSJEmSNIc0K2i1BCUpvx8Nore63t/K0hL89ytJkrQwSaiqQVfsAu1b\nrixJkiRJWsbatlz5HpJ8GDiK3hTX1FTXaVX16ZEGJi2ysbEJtm4d6n9waTcbG5sYdQiSJEnLksuV\nlzCXK0uSJElazlyuLEmSJEla1kxyJUmSJEmtYZIrSZIkSWoNk1xJkiRJUmuY5EqSJEmSWsMkV5Ik\nSZLUGia5kiRJkqTWMMmVJEmSJLWGSa70/7d371F2leUdx78/iAiIAdEyUSITXYoXBIG2CgIygBcW\nCgIVqyIg6tKlVi62VsXVklRFEW+o1S4UKCjUCoJFrMp1KIpRIYHEBCkuhSCaUWpYEFQI5OkfZ49O\nJjmTGZKZc3Lm+1lr1tn73e9+93PO2ZnMc97LkSRJktQzTHIlSZIkST3DJFeSJEmS1DNmdDoASRtu\n1qw5DA3d2ekwtAH6+vpZvvyOTochSZK0yUtVdToGtZGkfH80HkkA75VNW/DfuyRJ0pqSUFWZyDkO\nV55iSbZKcnmSW5MsTnJap2OSJEmSpF5hktsZZ1TVc4A9gH
2TvLzTAUmSJElSL+ipJDfJsUluSbIw\nyXlJ+pNcneTmJFcmmd3UOzfJ55P8IMnPkuyf5OwkS5OcM6K9+5N8LMlPklyR5K+TXNuc88qmzmOT\nnJNkUZKbkgw05ccl+XqSbye5LcnpAFX1h6q6rtl+GFgAzJ7il0qSJEmSelLPJLlJngucAgxU1R7A\nScBngXOranfgwmZ/2HZVtTfwbuAy4BNV9VxgtyS7NXUeB1xVVc8DVgIfBA4Cjmy2Ad4JrK6q3YDX\nA+cl2aI59nzgKGA34G+T7Dgq5u2AQ4GrN9LLIEmSJEnTWi+trnwgcFFVrQCoqhVJ9gaOaI5/GTh9\nRP1vNo+LgeVVtbTZXwLMARYBD1bVFSPq/bGqVidZDPQ35fsCn2mueVuSO4Cdm2NXV9VKgCRLm3Pu\nbvY3p5V4f7qq7mj3pObOnfun7YGBAQYGBtb/SkiSJEnSJmhwcJDBwcENaqOXktx1GWup0gebx9Uj\ntof3h1+XVaPKHwSoqkrS7rUbufLXyHYfYc3X+yzgtqoa2bu8lpFJriRJkiT1stEde/PmzZtwGz0z\nXBm4BjgqyfYAzeMNwOua428Arm9zbrslqcdaqnr42PXA0c01dwaeCtw2VqBJPgTMrKqTx6onSZIk\nSZqYnunJraqlST4MXJfkYWAh8C7g35P8A/Bb4Pjh6qNPH8f2WpdsHj8PfCHJIlo9v8dV1arW95au\nXb+Zl3sKcGuShU3556rqnNEnSJIkSZImJlVj5XHqpCTl+6PxaH2o4r2yaQv+e5ckSVpTEqpqrBG2\na+ml4cqSJEmSpGmuZ4YrS9NZX18/Q0MT+oBLXaavr3/9lSRJkrReDlfuYg5XliRJkjSdOVxZkiRJ\nkjStmeRKkiRJknqGSa4kSZIkqWeY5EqSJEmSeoZJriRJkiSpZ5jkSpIkSZJ6hkmuJEmSJKlnmORK\nkiRJknqGSa4kSZIkqWeY5EqSJEmSesaMTgcgacPNmjWHoaE7Ox2GprG+vn6WL7+j02FIkiSRqup0\nDGojSfn+aDySAN4r6qTg7ytJkrSxJaGqMpFzHK4sSZIkSeoZk5rkJvneZLb/aCW5NsmeU3Cd/0hy\nc5ITkzwrycIkNyV52mRfW5IkSZKmo0mdk1tV+05m++ORZPOqeqQD150F/FVVPbPZfy9wUVWdNtWx\nSJIkSdJ0Mdk9ufcn2TrJVUluTHJLksNGHD+2KVuY5LymbIcklzQ9oAuT7DVG3Vcmmd/0jl6R5C+a\n8lOTnN/0JJ+fZMskX02yJMklwJbrifvgps2FSa5syp6Q5NImhhuS7NqUb53k7BFxHNo0813gKUkW\nJPln4CTg7Umubs65vGl/UZKjNubrLkmSJEnT1WSvrlzAH4HDq2plkicC84HLkuwCnALsXVUrkmzX\nnPMZYLCqjkxrNZ1tkjy3Td3rq2o4CX4z8I/Ae5pjzwH2qaqHkpwMrKyqXZrkdEG7gJM8CTgL2Leq\nlo241jxgQVUdkeQA4HxgD+ADwNVV9eYk2wI/SnIVcBjwzaras2k3wP1V9ckkRwJ3V9Urm2OPf5Sv\nryRJkiRphKn4CqEAH02yH7CaVu/mDsABtIbvrgCoqnub+gcCxzRlBdyf5MA2dZ+a5GvAk4HHAL8Y\ncd3LquqhZvvFwJnNuYuT3DJGvHsB11XVslHX2hc4sim7Nsn2SbYBXgYcmmQ4ud4C2IlWct/OYuDj\nST4CfKuq2s5dnjt37p+2BwYGGBgYGKNZSZIkSdp0DQ4OMjg4uEFtTHaSG+ANwBOBPapqdZJf8Ofh\nwutaCnoi30HxWeDjVfWtJPsDp4449sB64hrLROIK8DdVdfsahUl/u8ar6vZm4atDgA8luaqqPrSu\nuiOTXEmSJEnqZaM79ubNmzfhNqbiK4RmAr9pEtwDgOHk7xrg1Um2h9ac16b8auAdTdlmSWY2dY9a\nR92ZwK+a7ePGiOF/gK
Obc58H7DZG3fnAfsNJ6ohrXU8rYSfJAHBPVa2kNff2hOGTk+w+oq11JtNJ\nngz8oaouBM4AJn2lZ0mSJEmaDia7J3c1cAFweTNE+EbgVoCqWprkw8B1SR4GFgJvorVA01nNHNuH\ngbdX1Q/b1J0HXJzkd7QS4Tlt4vgCcG6SJc31b2wXcFXdk+StwKXNPNrfAC9vrnVO8zwe4M9J9QeB\nTydZRCup/QWt+bjQvvd3V+CMJKuBh4C3t4tHkiRJkjR+aU17nYSGW4tM3VhVfifso5SkJuv9UW9p\nfR7jvaJOCv6+kiRJG1sSqmp9003XMCnDlZvhuDfQGoorSZIkSdKUmJThylX1a+BZk9H2xpRkPq3V\nkKE11LiAY6pqSeeikiaur6+foaEJfcAlbVR9fW3X2pMkSZpSkzZcWRvO4cqSJEmSprOuGa4sSZIk\nSVInmORKkiRJknqGSa4kSZIkqWeY5EqSJEmSeoZJriRJkiSpZ5jkSpIkSZJ6hkmuJEmSJKlnmORK\nkiRJknqGSa4kSZIkqWeY5EqSJEmSesaMTgcgacPNmjWHoaE7Ox2GJEmS9Kj19fWzfPkdG9xOqmrD\no9GkSFK+PxqPJID3iiRJkjZlYXT+k4SqykRacbiyJEmSJKlndCzJTXL/BOrun2TvjXjtE5NsubHa\na3ONJyf52mReQ5IkSZK0pk725E5kbOUA8KJ1HUiy+aO49knA1o/ivHGrql9X1Wsm8xqSJEmSpDVN\nyZzcJJcCs4EtgTOr6ktNT+4XgZcBvwZeW1X/l+QE4G3AKmAp8H5gPvAw8FvgXcBbgD8CewDfA/4T\nOBN4LPAH4Piquj3JZsDpwMHAI831NgM+DvwUuKeqDmoT88HAh5v691TVS5M8ATgHeDrwAPC2qlqc\nZH/g07QS9wJeDDwJuLyqdk1yHHA48DjgGcAngC2AY5rncUhV3buOGJyTq3FxTq4kSZI2fRtnTu5U\nra58fFXd2wwR/nGSS2glfD+qqncn+SfgVOAE4L3AnKpalWRmVd2X5N+A+6vqkwBJ3gLsWFV7Nfvb\nAPtW1eokBwEfAV5NK1nuB3arqkqyXRPHycBAVa1YV7BJngSc1bS5LMl2zaF5wIKqOiLJAcD5tBLt\nvwfeUVU/SLI1rcQV1sw6dgF2p9WD/DPgPVW1Z5JPAscCn1lXLHPnzv3T9sDAAAMDA+t7rSVJkiRp\nkzQ4OMjg4OAGtTFVSe5JSQ5vtmcDz6TVszo8Z/UrwNeb7VuAC5N8A/jGGG1eNGJ7O+D8JM+klVgO\nP6+DgC8Md4eO6C1N89POXsB1VbVs1Hn7Akc2Zdcm2b5JsL8PfCrJBcAlVXV3q2dtDddW1e+B3ye5\nF7i8KV8M7NoukJFJriRJkiT1stEde/PmzZtwG5M+J7cZynsg8MKq2h24mdaw5dGGez1fAXwO2JNW\nr2+7GB8Ysf1B4Jqq2hU4tE37Ew59jBjXqFNVpwNvBrYCvp9k53Wc++Codob3V+P3FUuSJEnSRjEV\nC09tC6yoqgeTPJtWLynA5rSGFAMcTWtuLcBOVXUd8D5gJrANcH+z3c5M4O5m+/gR5VcCbxtenKqZ\nUwtw33ramw/sl6R/1HnXA29oygaA31bVyiRPr6olVfUx4MfAs5v6Exo7LkmSJEnaMFOR5H4HeEyS\nJcBpwA1N+UrgBUkW01o9+V+SzAC+kuQW4CZai1TdB3wTOCLJgiT7sHaP6hnAR5PcxJrP6UvAXcCi\nJAuB1zXlXwS+k+TqdQVcVfcAbwUubc77anNoHvCXTXyn0ZpLC63h2IuT3Aw8BHx7uKk2r4krBEmS\nJEnSJJiS1ZX16Li6ssbL1ZUlSZK06ds4qyt38ntyJUmSJEnaqKb9gkdJ5tP6zlpozaEt4JiqWtK5\nqKSJ6evrZ2jIKeCSJEnadPX19W+Udhyu3MUcrixJkiRpOnO4sjRNbegXZmt68X7ReHmv
aCK8XzRe\n3iuabCa5Ug/wPwtNhPeLxst7RRPh/aLx8l7RZDPJlSRJkiT1DJNcSZIkSVLPcOGpLpbEN0eSJEnS\ntDbRhadMciVJkiRJPcPhypIkSZKknmGSK0mSJEnqGSa5kiRJkqSeYZLbxZJ8LMmtSW5O8vUkMzsd\nk7pPkoOT/DTJ/yZ5b6fjUXdKMjvJNUmWJFmc5IROx6TulmSzJAuSXNbpWNTdkmyb5KLmb5YlSV7Y\n6ZjUvZKcnOQnSRYluSDJFp2OSd0hydlJhpIsGlH2hCRXJLktyXeTbDuetkxyu9sVwC5VtTtwO/D+\nDsejLpNkM+BzwMuBXYDXJXl2Z6NSl3oYeHdV7QLsDbzTe0XrcSKwtNNBaJNwJvDfVfUc4PnArR2O\nR10qyVOAdwF7VtVuwAzgtZ2NSl3kXFp/0470PuCqqnoWcA3jzIdMcrtYVV1VVaub3fnA7E7Go670\nAuD2qrqzqlYBXwVe1eGY1IWqanlV3dxsr6T1R+iOnY1K3SrJbOAQ4EudjkXdrRlltl9VnQtQVQ9X\n1ZaACqgAAAJqSURBVH0dDkvdbXPgcUlmAFsDv+pwPOoSVfU9YMWo4lcB5zXb5wGHj6ctk9xNx5uA\nb3c6CHWdHYG7Ruz/EhMXrUeSOcDuwA87G4m62KeA9wB+z6DW52nAPUnObYa3n5Vkq04Hpe5UVb8C\nPgEsA+4G7q2qqzoblbrcDlU1BK0P7IEdxnOSSW6HJbmymZMw/LO4eTx0RJ0PAKuq6sIOhiqpByTZ\nBrgYOLHp0ZXWkOQVwFDT85/mR2pnBrAn8K9VtSfwe1rDC6W1JNmOVs9cP/AUYJskr+9sVNrEjOvD\n1xmTHYXGVlUvHet4kjfSGjJ24JQEpE3N3cBOI/ZnN2XSWpqhYRcDX66q/+p0POpa+wCHJTkE2Ap4\nfJLzq+rYDsel7vRL4K6qurHZvxhwEUS18xLg51X1O4AklwAvAuzIUTtDSfqqaijJLOA34znJntwu\nluRgWsPFDquqBzsdj7rSj4FnJOlvVid8LeBKqGrnHGBpVZ3Z6UDUvarqlKraqaqeTut3yjUmuGqn\nGUZ4V5Kdm6KDcMEytbcM2CvJlklC635xoTKNNHoE0WXAG5vt44BxfUhvT253+yywBXBl6/cA86vq\nHZ0NSd2kqh5J8ne0VuLeDDi7qvzPQmtJsg9wNLA4yUJaw31OqarvdDYyST3gBOCCJI8Bfg4c3+F4\n1KWq6kdJLgYWAquax7M6G5W6RZILgQHgiUmWAacCHwUuSvIm4E7gNeNqq8o1JSRJkiRJvcHhypIk\nSZKknmGSK0mSJEnqGSa5kiRJkqSeYZIrSZIkSeoZJrmSJEmSpJ5hkitJkiRJ6hkmuZIkSZKknvH/\np0/AAOUL/SoAAAAASUVORK5CYII=\n", | |
"text/plain": [ | |
"<matplotlib.figure.Figure at 0x1ad4413d0>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"weights = logreg.coef_.flatten()\n", | |
"plot_simple_imp(weights,colnames )" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Random Forest " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 248, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Create and fit Random Forest\n", | |
"clf = RandomForestClassifier(n_estimators=300, n_jobs=4)\n", | |
"#clf = SGDClassifier(alpha=0.00001)\n", | |
"clf.fit(xtrain, ytrain)\n", | |
"\n", | |
"# Predict on validation set\n", | |
"pvalid = clf.predict(xvalid)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 249, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Accuracy 0.970139069405\n", | |
"F1 0.972508899459\n" | |
] | |
} | |
], | |
"source": [ | |
"# Evaluation metrics\n", | |
"accuracy = metrics.accuracy_score(yvalid, pvalid)\n", | |
"f1 = metrics.f1_score(yvalid, pvalid)\n", | |
"print 'Accuracy', accuracy\n", | |
"print 'F1', f1" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 237, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def plot_simple_imp(imp, feature_names,sort = True, absolute=False,figsize=(15,8)):\n", | |
" serie = pd.Series(index=feature_names, data=imp)\n", | |
" if absolute : \n", | |
" serie = np.abs(serie)\n", | |
" if sort :\n", | |
" serie.sort_values(inplace=True, ascending=False)\n", | |
" serie.plot(kind='barh',figsize=figsize)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 238, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(584736, 20)" | |
] | |
}, | |
"execution_count": 238, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"xtrain.shape" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 239, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"20" | |
] | |
}, | |
"execution_count": 239, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"len(colnames)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 240, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([ 0.07887705, 0.0617638 , 0.01286454, 0.05279704, 0.00294262,\n", | |
" 0.00104028, 0.00143772, 0.00143998, 0.00376257, 0.00385741,\n", | |
" 0.15862533, 0.00948146, 0.01670675, 0.23143251, 0.01111798,\n", | |
" 0.01483788, 0.00890416, 0.24373993, 0.05527093, 0.02910003])" | |
] | |
}, | |
"execution_count": 240, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"clf.feature_importances_" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### Correlation matrix" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 255, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"df_var = pd.DataFrame(xtrain, columns=colnames)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 256, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>abstract_cosim</th>\n", | |
" <th>abstract_svm</th>\n", | |
" <th>n_title</th>\n", | |
" <th>n_years</th>\n", | |
" <th>n_authors</th>\n", | |
" <th>is_same_journal</th>\n", | |
" <th>is_author_missing</th>\n", | |
" <th>is_journal_missing</th>\n", | |
" <th>n_authors_source</th>\n", | |
" <th>n_authors_target</th>\n", | |
" <th>common</th>\n", | |
" <th>d_out</th>\n", | |
" <th>d_in</th>\n", | |
" <th>common2</th>\n", | |
" <th>d_out2</th>\n", | |
" <th>d_in2</th>\n", | |
" <th>diff_in2</th>\n", | |
" <th>jacccard_coeffs</th>\n", | |
" <th>abstract_similarity</th>\n", | |
" <th>title_similarity</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>abstract_cosim</th>\n", | |
" <td>1.000000</td>\n", | |
" <td>0.289916</td>\n", | |
" <td>0.447992</td>\n", | |
" <td>0.114286</td>\n", | |
" <td>0.268721</td>\n", | |
" <td>0.089228</td>\n", | |
" <td>-0.032675</td>\n", | |
" <td>-0.105115</td>\n", | |
" <td>0.028062</td>\n", | |
" <td>0.061969</td>\n", | |
" <td>0.342311</td>\n", | |
" <td>0.063155</td>\n", | |
" <td>0.047936</td>\n", | |
" <td>0.343858</td>\n", | |
" <td>0.067649</td>\n", | |
" <td>0.049062</td>\n", | |
" <td>0.028904</td>\n", | |
" <td>0.559951</td>\n", | |
" <td>0.686317</td>\n", | |
" <td>0.537044</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>abstract_svm</th>\n", | |
" <td>0.289916</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>0.228710</td>\n", | |
" <td>0.361239</td>\n", | |
" <td>0.031596</td>\n", | |
" <td>0.038243</td>\n", | |
" <td>-0.000753</td>\n", | |
" <td>-0.163160</td>\n", | |
" <td>0.054817</td>\n", | |
" <td>0.087508</td>\n", | |
" <td>0.503606</td>\n", | |
" <td>0.299110</td>\n", | |
" <td>0.424660</td>\n", | |
" <td>0.499599</td>\n", | |
" <td>0.308073</td>\n", | |
" <td>0.426876</td>\n", | |
" <td>0.331437</td>\n", | |
" <td>0.291461</td>\n", | |
" <td>0.408335</td>\n", | |
" <td>0.353542</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>n_title</th>\n", | |
" <td>0.447992</td>\n", | |
" <td>0.228710</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>0.086868</td>\n", | |
" <td>0.138866</td>\n", | |
" <td>0.053240</td>\n", | |
" <td>-0.010525</td>\n", | |
" <td>-0.056332</td>\n", | |
" <td>0.006408</td>\n", | |
" <td>0.037526</td>\n", | |
" <td>0.266146</td>\n", | |
" <td>0.066952</td>\n", | |
" <td>0.043393</td>\n", | |
" <td>0.267238</td>\n", | |
" <td>0.070061</td>\n", | |
" <td>0.044052</td>\n", | |
" <td>0.023297</td>\n", | |
" <td>0.380768</td>\n", | |
" <td>0.374084</td>\n", | |
" <td>0.699251</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>n_years</th>\n", | |
" <td>0.114286</td>\n", | |
" <td>0.361239</td>\n", | |
" <td>0.086868</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>0.016845</td>\n", | |
" <td>-0.004103</td>\n", | |
" <td>-0.004907</td>\n", | |
" <td>-0.017396</td>\n", | |
" <td>0.047985</td>\n", | |
" <td>-0.025112</td>\n", | |
" <td>0.081092</td>\n", | |
" <td>0.045646</td>\n", | |
" <td>0.109205</td>\n", | |
" <td>0.079834</td>\n", | |
" <td>0.047775</td>\n", | |
" <td>0.109863</td>\n", | |
" <td>0.094281</td>\n", | |
" <td>0.083536</td>\n", | |
" <td>0.169046</td>\n", | |
" <td>0.145372</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>n_authors</th>\n", | |
" <td>0.268721</td>\n", | |
" <td>0.031596</td>\n", | |
" <td>0.138866</td>\n", | |
" <td>0.016845</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>0.054219</td>\n", | |
" <td>0.033844</td>\n", | |
" <td>-0.029957</td>\n", | |
" <td>0.025718</td>\n", | |
" <td>0.027953</td>\n", | |
" <td>0.101562</td>\n", | |
" <td>0.010127</td>\n", | |
" <td>-0.026385</td>\n", | |
" <td>0.102744</td>\n", | |
" <td>0.011661</td>\n", | |
" <td>-0.025913</td>\n", | |
" <td>-0.028772</td>\n", | |
" <td>0.271323</td>\n", | |
" <td>0.183107</td>\n", | |
" <td>0.154396</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>is_same_journal</th>\n", | |
" <td>0.089228</td>\n", | |
" <td>0.038243</td>\n", | |
" <td>0.053240</td>\n", | |
" <td>-0.004103</td>\n", | |
" <td>0.054219</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>-0.013772</td>\n", | |
" <td>-0.057073</td>\n", | |
" <td>0.024203</td>\n", | |
" <td>-0.000082</td>\n", | |
" <td>0.082868</td>\n", | |
" <td>0.006311</td>\n", | |
" <td>-0.013593</td>\n", | |
" <td>0.083644</td>\n", | |
" <td>0.007006</td>\n", | |
" <td>-0.013369</td>\n", | |
" <td>-0.015126</td>\n", | |
" <td>0.109770</td>\n", | |
" <td>0.104771</td>\n", | |
" <td>0.077082</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>is_author_missing</th>\n", | |
" <td>-0.032675</td>\n", | |
" <td>-0.000753</td>\n", | |
" <td>-0.010525</td>\n", | |
" <td>-0.004907</td>\n", | |
" <td>0.033844</td>\n", | |
" <td>-0.013772</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>0.042303</td>\n", | |
" <td>-0.401889</td>\n", | |
" <td>-0.452177</td>\n", | |
" <td>0.002519</td>\n", | |
" <td>0.019025</td>\n", | |
" <td>0.044783</td>\n", | |
" <td>0.001670</td>\n", | |
" <td>0.018943</td>\n", | |
" <td>0.044855</td>\n", | |
" <td>0.038654</td>\n", | |
" <td>-0.050447</td>\n", | |
" <td>-0.032516</td>\n", | |
" <td>-0.010804</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>is_journal_missing</th>\n", | |
" <td>-0.105115</td>\n", | |
" <td>-0.163160</td>\n", | |
" <td>-0.056332</td>\n", | |
" <td>-0.017396</td>\n", | |
" <td>-0.029957</td>\n", | |
" <td>-0.057073</td>\n", | |
" <td>0.042303</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>-0.127054</td>\n", | |
" <td>-0.085241</td>\n", | |
" <td>-0.122779</td>\n", | |
" <td>-0.071352</td>\n", | |
" <td>-0.071640</td>\n", | |
" <td>-0.122251</td>\n", | |
" <td>-0.072497</td>\n", | |
" <td>-0.071918</td>\n", | |
" <td>-0.049970</td>\n", | |
" <td>-0.091337</td>\n", | |
" <td>-0.160398</td>\n", | |
" <td>-0.102017</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>n_authors_source</th>\n", | |
" <td>0.028062</td>\n", | |
" <td>0.054817</td>\n", | |
" <td>0.006408</td>\n", | |
" <td>0.047985</td>\n", | |
" <td>0.025718</td>\n", | |
" <td>0.024203</td>\n", | |
" <td>-0.401889</td>\n", | |
" <td>-0.127054</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>0.049267</td>\n", | |
" <td>0.054195</td>\n", | |
" <td>0.073153</td>\n", | |
" <td>0.006936</td>\n", | |
" <td>0.054178</td>\n", | |
" <td>0.074194</td>\n", | |
" <td>0.007002</td>\n", | |
" <td>-0.014269</td>\n", | |
" <td>0.044715</td>\n", | |
" <td>0.057341</td>\n", | |
" <td>0.021092</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>n_authors_target</th>\n", | |
" <td>0.061969</td>\n", | |
" <td>0.087508</td>\n", | |
" <td>0.037526</td>\n", | |
" <td>-0.025112</td>\n", | |
" <td>0.027953</td>\n", | |
" <td>-0.000082</td>\n", | |
" <td>-0.452177</td>\n", | |
" <td>-0.085241</td>\n", | |
" <td>0.049267</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>0.042629</td>\n", | |
" <td>0.001673</td>\n", | |
" <td>-0.027893</td>\n", | |
" <td>0.043522</td>\n", | |
" <td>0.002088</td>\n", | |
" <td>-0.027569</td>\n", | |
" <td>-0.027671</td>\n", | |
" <td>0.067663</td>\n", | |
" <td>0.077189</td>\n", | |
" <td>0.057620</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>common</th>\n", | |
" <td>0.342311</td>\n", | |
" <td>0.503606</td>\n", | |
" <td>0.266146</td>\n", | |
" <td>0.081092</td>\n", | |
" <td>0.101562</td>\n", | |
" <td>0.082868</td>\n", | |
" <td>0.002519</td>\n", | |
" <td>-0.122779</td>\n", | |
" <td>0.054195</td>\n", | |
" <td>0.042629</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>0.398350</td>\n", | |
" <td>0.347228</td>\n", | |
" <td>0.999450</td>\n", | |
" <td>0.402228</td>\n", | |
" <td>0.348114</td>\n", | |
" <td>0.227251</td>\n", | |
" <td>0.565578</td>\n", | |
" <td>0.376272</td>\n", | |
" <td>0.346464</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>d_out</th>\n", | |
" <td>0.063155</td>\n", | |
" <td>0.299110</td>\n", | |
" <td>0.066952</td>\n", | |
" <td>0.045646</td>\n", | |
" <td>0.010127</td>\n", | |
" <td>0.006311</td>\n", | |
" <td>0.019025</td>\n", | |
" <td>-0.071352</td>\n", | |
" <td>0.073153</td>\n", | |
" <td>0.001673</td>\n", | |
" <td>0.398350</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>0.079206</td>\n", | |
" <td>0.395478</td>\n", | |
" <td>0.997728</td>\n", | |
" <td>0.079586</td>\n", | |
" <td>-0.206185</td>\n", | |
" <td>0.077958</td>\n", | |
" <td>0.099688</td>\n", | |
" <td>0.099873</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>d_in</th>\n", | |
" <td>0.047936</td>\n", | |
" <td>0.424660</td>\n", | |
" <td>0.043393</td>\n", | |
" <td>0.109205</td>\n", | |
" <td>-0.026385</td>\n", | |
" <td>-0.013593</td>\n", | |
" <td>0.044783</td>\n", | |
" <td>-0.071640</td>\n", | |
" <td>0.006936</td>\n", | |
" <td>-0.027893</td>\n", | |
" <td>0.347228</td>\n", | |
" <td>0.079206</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>0.334093</td>\n", | |
" <td>0.081327</td>\n", | |
" <td>0.999809</td>\n", | |
" <td>0.958737</td>\n", | |
" <td>-0.048726</td>\n", | |
" <td>0.116739</td>\n", | |
" <td>0.074677</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>common2</th>\n", | |
" <td>0.343858</td>\n", | |
" <td>0.499599</td>\n", | |
" <td>0.267238</td>\n", | |
" <td>0.079834</td>\n", | |
" <td>0.102744</td>\n", | |
" <td>0.083644</td>\n", | |
" <td>0.001670</td>\n", | |
" <td>-0.122251</td>\n", | |
" <td>0.054178</td>\n", | |
" <td>0.043522</td>\n", | |
" <td>0.999450</td>\n", | |
" <td>0.395478</td>\n", | |
" <td>0.334093</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>0.399447</td>\n", | |
" <td>0.334996</td>\n", | |
" <td>0.215160</td>\n", | |
" <td>0.570262</td>\n", | |
" <td>0.376888</td>\n", | |
" <td>0.347508</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>d_out2</th>\n", | |
" <td>0.067649</td>\n", | |
" <td>0.308073</td>\n", | |
" <td>0.070061</td>\n", | |
" <td>0.047775</td>\n", | |
" <td>0.011661</td>\n", | |
" <td>0.007006</td>\n", | |
" <td>0.018943</td>\n", | |
" <td>-0.072497</td>\n", | |
" <td>0.074194</td>\n", | |
" <td>0.002088</td>\n", | |
" <td>0.402228</td>\n", | |
" <td>0.997728</td>\n", | |
" <td>0.081327</td>\n", | |
" <td>0.399447</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>0.081722</td>\n", | |
" <td>-0.204735</td>\n", | |
" <td>0.082246</td>\n", | |
" <td>0.104550</td>\n", | |
" <td>0.104493</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>d_in2</th>\n", | |
" <td>0.049062</td>\n", | |
" <td>0.426876</td>\n", | |
" <td>0.044052</td>\n", | |
" <td>0.109863</td>\n", | |
" <td>-0.025913</td>\n", | |
" <td>-0.013369</td>\n", | |
" <td>0.044855</td>\n", | |
" <td>-0.071918</td>\n", | |
" <td>0.007002</td>\n", | |
" <td>-0.027569</td>\n", | |
" <td>0.348114</td>\n", | |
" <td>0.079586</td>\n", | |
" <td>0.999809</td>\n", | |
" <td>0.334996</td>\n", | |
" <td>0.081722</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>0.958812</td>\n", | |
" <td>-0.047477</td>\n", | |
" <td>0.118002</td>\n", | |
" <td>0.075758</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>diff_in2</th>\n", | |
" <td>0.028904</td>\n", | |
" <td>0.331437</td>\n", | |
" <td>0.023297</td>\n", | |
" <td>0.094281</td>\n", | |
" <td>-0.028772</td>\n", | |
" <td>-0.015126</td>\n", | |
" <td>0.038654</td>\n", | |
" <td>-0.049970</td>\n", | |
" <td>-0.014269</td>\n", | |
" <td>-0.027671</td>\n", | |
" <td>0.227251</td>\n", | |
" <td>-0.206185</td>\n", | |
" <td>0.958737</td>\n", | |
" <td>0.215160</td>\n", | |
" <td>-0.204735</td>\n", | |
" <td>0.958812</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>-0.070066</td>\n", | |
" <td>0.086094</td>\n", | |
" <td>0.044622</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>jacccard_coeffs</th>\n", | |
" <td>0.559951</td>\n", | |
" <td>0.291461</td>\n", | |
" <td>0.380768</td>\n", | |
" <td>0.083536</td>\n", | |
" <td>0.271323</td>\n", | |
" <td>0.109770</td>\n", | |
" <td>-0.050447</td>\n", | |
" <td>-0.091337</td>\n", | |
" <td>0.044715</td>\n", | |
" <td>0.067663</td>\n", | |
" <td>0.565578</td>\n", | |
" <td>0.077958</td>\n", | |
" <td>-0.048726</td>\n", | |
" <td>0.570262</td>\n", | |
" <td>0.082246</td>\n", | |
" <td>-0.047477</td>\n", | |
" <td>-0.070066</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>0.490212</td>\n", | |
" <td>0.460570</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>abstract_similarity</th>\n", | |
" <td>0.686317</td>\n", | |
" <td>0.408335</td>\n", | |
" <td>0.374084</td>\n", | |
" <td>0.169046</td>\n", | |
" <td>0.183107</td>\n", | |
" <td>0.104771</td>\n", | |
" <td>-0.032516</td>\n", | |
" <td>-0.160398</td>\n", | |
" <td>0.057341</td>\n", | |
" <td>0.077189</td>\n", | |
" <td>0.376272</td>\n", | |
" <td>0.099688</td>\n", | |
" <td>0.116739</td>\n", | |
" <td>0.376888</td>\n", | |
" <td>0.104550</td>\n", | |
" <td>0.118002</td>\n", | |
" <td>0.086094</td>\n", | |
" <td>0.490212</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>0.613475</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>title_similarity</th>\n", | |
" <td>0.537044</td>\n", | |
" <td>0.353542</td>\n", | |
" <td>0.699251</td>\n", | |
" <td>0.145372</td>\n", | |
" <td>0.154396</td>\n", | |
" <td>0.077082</td>\n", | |
" <td>-0.010804</td>\n", | |
" <td>-0.102017</td>\n", | |
" <td>0.021092</td>\n", | |
" <td>0.057620</td>\n", | |
" <td>0.346464</td>\n", | |
" <td>0.099873</td>\n", | |
" <td>0.074677</td>\n", | |
" <td>0.347508</td>\n", | |
" <td>0.104493</td>\n", | |
" <td>0.075758</td>\n", | |
" <td>0.044622</td>\n", | |
" <td>0.460570</td>\n", | |
" <td>0.613475</td>\n", | |
" <td>1.000000</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" abstract_cosim abstract_svm n_title n_years \\\n", | |
"abstract_cosim 1.000000 0.289916 0.447992 0.114286 \n", | |
"abstract_svm 0.289916 1.000000 0.228710 0.361239 \n", | |
"n_title 0.447992 0.228710 1.000000 0.086868 \n", | |
"n_years 0.114286 0.361239 0.086868 1.000000 \n", | |
"n_authors 0.268721 0.031596 0.138866 0.016845 \n", | |
"is_same_journal 0.089228 0.038243 0.053240 -0.004103 \n", | |
"is_author_missing -0.032675 -0.000753 -0.010525 -0.004907 \n", | |
"is_journal_missing -0.105115 -0.163160 -0.056332 -0.017396 \n", | |
"n_authors_source 0.028062 0.054817 0.006408 0.047985 \n", | |
"n_authors_target 0.061969 0.087508 0.037526 -0.025112 \n", | |
"common 0.342311 0.503606 0.266146 0.081092 \n", | |
"d_out 0.063155 0.299110 0.066952 0.045646 \n", | |
"d_in 0.047936 0.424660 0.043393 0.109205 \n", | |
"common2 0.343858 0.499599 0.267238 0.079834 \n", | |
"d_out2 0.067649 0.308073 0.070061 0.047775 \n", | |
"d_in2 0.049062 0.426876 0.044052 0.109863 \n", | |
"diff_in2 0.028904 0.331437 0.023297 0.094281 \n", | |
"jacccard_coeffs 0.559951 0.291461 0.380768 0.083536 \n", | |
"abstract_similarity 0.686317 0.408335 0.374084 0.169046 \n", | |
"title_similarity 0.537044 0.353542 0.699251 0.145372 \n", | |
"\n", | |
" n_authors is_same_journal is_author_missing \\\n", | |
"abstract_cosim 0.268721 0.089228 -0.032675 \n", | |
"abstract_svm 0.031596 0.038243 -0.000753 \n", | |
"n_title 0.138866 0.053240 -0.010525 \n", | |
"n_years 0.016845 -0.004103 -0.004907 \n", | |
"n_authors 1.000000 0.054219 0.033844 \n", | |
"is_same_journal 0.054219 1.000000 -0.013772 \n", | |
"is_author_missing 0.033844 -0.013772 1.000000 \n", | |
"is_journal_missing -0.029957 -0.057073 0.042303 \n", | |
"n_authors_source 0.025718 0.024203 -0.401889 \n", | |
"n_authors_target 0.027953 -0.000082 -0.452177 \n", | |
"common 0.101562 0.082868 0.002519 \n", | |
"d_out 0.010127 0.006311 0.019025 \n", | |
"d_in -0.026385 -0.013593 0.044783 \n", | |
"common2 0.102744 0.083644 0.001670 \n", | |
"d_out2 0.011661 0.007006 0.018943 \n", | |
"d_in2 -0.025913 -0.013369 0.044855 \n", | |
"diff_in2 -0.028772 -0.015126 0.038654 \n", | |
"jacccard_coeffs 0.271323 0.109770 -0.050447 \n", | |
"abstract_similarity 0.183107 0.104771 -0.032516 \n", | |
"title_similarity 0.154396 0.077082 -0.010804 \n", | |
"\n", | |
" is_journal_missing n_authors_source n_authors_target \\\n", | |
"abstract_cosim -0.105115 0.028062 0.061969 \n", | |
"abstract_svm -0.163160 0.054817 0.087508 \n", | |
"n_title -0.056332 0.006408 0.037526 \n", | |
"n_years -0.017396 0.047985 -0.025112 \n", | |
"n_authors -0.029957 0.025718 0.027953 \n", | |
"is_same_journal -0.057073 0.024203 -0.000082 \n", | |
"is_author_missing 0.042303 -0.401889 -0.452177 \n", | |
"is_journal_missing 1.000000 -0.127054 -0.085241 \n", | |
"n_authors_source -0.127054 1.000000 0.049267 \n", | |
"n_authors_target -0.085241 0.049267 1.000000 \n", | |
"common -0.122779 0.054195 0.042629 \n", | |
"d_out -0.071352 0.073153 0.001673 \n", | |
"d_in -0.071640 0.006936 -0.027893 \n", | |
"common2 -0.122251 0.054178 0.043522 \n", | |
"d_out2 -0.072497 0.074194 0.002088 \n", | |
"d_in2 -0.071918 0.007002 -0.027569 \n", | |
"diff_in2 -0.049970 -0.014269 -0.027671 \n", | |
"jacccard_coeffs -0.091337 0.044715 0.067663 \n", | |
"abstract_similarity -0.160398 0.057341 0.077189 \n", | |
"title_similarity -0.102017 0.021092 0.057620 \n", | |
"\n", | |
" common d_out d_in common2 d_out2 \\\n", | |
"abstract_cosim 0.342311 0.063155 0.047936 0.343858 0.067649 \n", | |
"abstract_svm 0.503606 0.299110 0.424660 0.499599 0.308073 \n", | |
"n_title 0.266146 0.066952 0.043393 0.267238 0.070061 \n", | |
"n_years 0.081092 0.045646 0.109205 0.079834 0.047775 \n", | |
"n_authors 0.101562 0.010127 -0.026385 0.102744 0.011661 \n", | |
"is_same_journal 0.082868 0.006311 -0.013593 0.083644 0.007006 \n", | |
"is_author_missing 0.002519 0.019025 0.044783 0.001670 0.018943 \n", | |
"is_journal_missing -0.122779 -0.071352 -0.071640 -0.122251 -0.072497 \n", | |
"n_authors_source 0.054195 0.073153 0.006936 0.054178 0.074194 \n", | |
"n_authors_target 0.042629 0.001673 -0.027893 0.043522 0.002088 \n", | |
"common 1.000000 0.398350 0.347228 0.999450 0.402228 \n", | |
"d_out 0.398350 1.000000 0.079206 0.395478 0.997728 \n", | |
"d_in 0.347228 0.079206 1.000000 0.334093 0.081327 \n", | |
"common2 0.999450 0.395478 0.334093 1.000000 0.399447 \n", | |
"d_out2 0.402228 0.997728 0.081327 0.399447 1.000000 \n", | |
"d_in2 0.348114 0.079586 0.999809 0.334996 0.081722 \n", | |
"diff_in2 0.227251 -0.206185 0.958737 0.215160 -0.204735 \n", | |
"jacccard_coeffs 0.565578 0.077958 -0.048726 0.570262 0.082246 \n", | |
"abstract_similarity 0.376272 0.099688 0.116739 0.376888 0.104550 \n", | |
"title_similarity 0.346464 0.099873 0.074677 0.347508 0.104493 \n", | |
"\n", | |
" d_in2 diff_in2 jacccard_coeffs abstract_similarity \\\n", | |
"abstract_cosim 0.049062 0.028904 0.559951 0.686317 \n", | |
"abstract_svm 0.426876 0.331437 0.291461 0.408335 \n", | |
"n_title 0.044052 0.023297 0.380768 0.374084 \n", | |
"n_years 0.109863 0.094281 0.083536 0.169046 \n", | |
"n_authors -0.025913 -0.028772 0.271323 0.183107 \n", | |
"is_same_journal -0.013369 -0.015126 0.109770 0.104771 \n", | |
"is_author_missing 0.044855 0.038654 -0.050447 -0.032516 \n", | |
"is_journal_missing -0.071918 -0.049970 -0.091337 -0.160398 \n", | |
"n_authors_source 0.007002 -0.014269 0.044715 0.057341 \n", | |
"n_authors_target -0.027569 -0.027671 0.067663 0.077189 \n", | |
"common 0.348114 0.227251 0.565578 0.376272 \n", | |
"d_out 0.079586 -0.206185 0.077958 0.099688 \n", | |
"d_in 0.999809 0.958737 -0.048726 0.116739 \n", | |
"common2 0.334996 0.215160 0.570262 0.376888 \n", | |
"d_out2 0.081722 -0.204735 0.082246 0.104550 \n", | |
"d_in2 1.000000 0.958812 -0.047477 0.118002 \n", | |
"diff_in2 0.958812 1.000000 -0.070066 0.086094 \n", | |
"jacccard_coeffs -0.047477 -0.070066 1.000000 0.490212 \n", | |
"abstract_similarity 0.118002 0.086094 0.490212 1.000000 \n", | |
"title_similarity 0.075758 0.044622 0.460570 0.613475 \n", | |
"\n", | |
" title_similarity \n", | |
"abstract_cosim 0.537044 \n", | |
"abstract_svm 0.353542 \n", | |
"n_title 0.699251 \n", | |
"n_years 0.145372 \n", | |
"n_authors 0.154396 \n", | |
"is_same_journal 0.077082 \n", | |
"is_author_missing -0.010804 \n", | |
"is_journal_missing -0.102017 \n", | |
"n_authors_source 0.021092 \n", | |
"n_authors_target 0.057620 \n", | |
"common 0.346464 \n", | |
"d_out 0.099873 \n", | |
"d_in 0.074677 \n", | |
"common2 0.347508 \n", | |
"d_out2 0.104493 \n", | |
"d_in2 0.075758 \n", | |
"diff_in2 0.044622 \n", | |
"jacccard_coeffs 0.460570 \n", | |
"abstract_similarity 0.613475 \n", | |
"title_similarity 1.000000 " | |
] | |
}, | |
"execution_count": 256, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df_var.corr()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### Variable importances (RandomForest)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 241, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA78AAAHaCAYAAADIRv5aAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzs3Xu43GV97/33B4JQkIOnLixq0Go9B4yAWBDG465SKUrB\njUqpFavbVlHUXQ/VZLl9imi1T6pPtSg70qp9kAIKWEFLM3IQDIZAQlS6rWB3D4GnW4SImnL4Pn/M\nbyXDYs3KrLUmmZVZ79d1zTX375778J35LXJdX+577klVIUmSJEnSKNtl2AFIkiRJkrS9mfxKkiRJ\nkkaeya8kSZIkaeSZ/EqSJEmSRp7JryRJkiRp5C0adgCaXhKP45YkSZK0oFVV5jqGye9OwJ+j0ny0\nfPlyli9fPuwwpCn596n5yr9NzVf+bWo+S+ac9wJue5YkSZIkLQAmv5IkSZKkkWfyK2lWWq3WsEOQ\nevLvU/OVf5uar/zb1EIQv086vyUp75EkSZKkhSrJQA68cuV3J5CEJOy//4HDDkWSJEmSdkqu/M5z\nnZ86mrhH8eRnSZIkSQvKvF75TXLV9hh3e0pyVpKnDHH+W5I8fFjzS5IkSdIoc+V3wJLsWlX3zaLf\nD4FDqurHk+pd+ZUkSZK0YM33ld9NzfP+Sb6Z5Pok65Ic0aP9LklWNm1uTHJaU39qktVJ1iY5L8ke\nTf3KJH+R5JokP0hydJKzk3w3yf/sGvfFSb6V5DtJzk2y5zQxr0qytCmf1MSyLsmHJ7+vpnx8kpVd\n8XwqyTXAmUmWNfGsauJ7S1e/C5Ncl2R9klO7Q5jZpyxJkiRJ6tf2OvBqYnny1cClVbUUOAi4oUf7\ng4EDqmpJVR0ErGzqz6+qw6rqWcD3gdd39dmvqp4LnA5cBHysqp4GLEmyJMkjgD8GXlhVhwBrgHds\nK/AkjwY+DLSauA5Ncuyk9zX5fdLE/9yqemdz/WTgxcBzgGVJdm3qX1dVhwKHAqcledi2YpIkSZIk\nzc2i7Tz+dcDZSXYDvlJVN/Zo90Pg8UlWAH8HfL2pf2aSDwH7AXsBl3X1ubh5Xg9srKrvNtcbgAOB\nxwJPA65OEmA34Jo+Yj4UWDWx/TjJF4Cj6CTY063Onjfp+qtVdS/wf5LcBowB/wa8LclxTZvHAE8C\nVk8f0vItpXa77e+wSZIkSRpZ7Xabdrs98HG3a/JbVVcmOQo4Bvhcko9V1eenaPeTJAcB/wV4E3AC\ncCrwOeDYqropySnA0V3dNjfP93eVJ64XNc9fr6rXzCL0Xklu90rvHpNeu3vS9YNiSnI08ALgOVW1\nOcmqKcaZwvLmedzEV5IkSdJIa7VaD8h7xsfHBzLu9tr2HIAkjwNur6qzgc8CS6ds3NmivGtVXUhn\nq/JEu4cCG5uV4+mS2KmS1WuBI5L8ajPHnkme1Efsq4Gjkjy82ap8EtBuXtuY5MlJdgFe0cdYk+0L\n3NEkvk8BDp/FGJIkSZKkGdpeK78TK6Qt4F1J7gE2Ab/To/0BwMomqSzg3U39B+gko7cD3wb2njT+\n5Pm2lKvqP5L8LvA3SXZv6v8Y+F/TxV1VG5O8m60J7yVVdUlTfg/w1Sae79BJzqeK50HjNs+XAm9K\nsgG4mQduw/YYZ0mSJEnaTvypo0aSdcDLq+pHw46lmz91JEmSJGkhG9RPHW3vA692Ckm+Dtw43xLf\nrTr3eWxs8ZDjkCRJkqSd0w5PfpNcCzxk4pLOsubJVbVhB81/AZ3ToLvn/+9V9fc7Yv7ZcLVXkiRJ\nkubGbc/zXJLyHkmSJElaqAa17Xl7nfYsSZIkSdK8YfIrSZIkSRp5Jr+SJEmSpJFn8itJkiRJGnkm\nv5IkSZKkkWfyK0mSJEkaeSa/kiRJkqSRZ/IrSZIkSRp5Jr87gSQPeOy//4HDDkmSJEmSdiqpqmHH\noGkkKZh8j4L3TZIkSdJCkISqylzHmdHKb5Kr
5jphH3McneS5Xdcrk7xye8/bI5aXJ/nvs+i33T8n\nSZIkSVL/Fs2kcVUdub0C6dICfgpcM9eBkuxaVffNtn9VXQxcPIt+O+JzkiRJkiT1aaYrv5ua5/2T\nfDPJ9UnWJTlimj5/kWR1kvVJlnXV35Lk4U352UlWJVkMvAl4WzP2xLhHJ7k6yQ+6V4GTfLQZ98Yk\nJzZ1Rye5IslXgA09Ylqc5HvNqvLNST6f5IVJrmquD2nanZLkE035hGautUnaTd3Tkny7ifWGJL86\n6XM6unlf5zXz/XVXDC9r6q5LsiLJjJNsSZIkSVJ/ZrTyy9Yvn74auLSqzkgSYM9p+ry3qn6SZBfg\n8iTnV9VNPPiLrFVVP0ryaWBTVX0cIMmpwP5VdUSSpwIXARckOR5YUlXPTPLLwHVJvtmM9Szg6VX1\nz9PE9avA8VX13STfAU6qqiOTHAu8D3jFpPf8fuAlVfXvSfZp6t4E/N9V9TdJFgG7TuoDcDDwNGAj\ncHWSXwfWAJ8Gjqyqf07yxSk+D0mSJEnSgMw0+Z1wHXB2kt2Ar1TVjdO0/a9J3tDMtT+dRPAmYCZf\nWP4yQFV9r0l0AY4A/qapv71ZjT0U2ASs3kbiC3BLVX23KW8ALm/K64HFU7S/CjgnyZeAC5q6a4D3\nJXkMcGFV/WCKfqur6t8BktwAHAjcDfxTV4x/A7yhd6jLu8qt6d6TJEmSJO3U2u027XZ74OPOKvmt\nqiuTHAUcA3wuyceq6vOT2yU5EHgH8OyquivJSmCP5uV72brteo/JfSfZ3D1sjzbd9XdvY7zJY97f\ndX0/U3wuVfXmJIcCvwmsSbK0WfG9tqn7uyS/X1Xtaea5r2vsGST/y/tvKkmSJEk7sVarRavV2nI9\nPj4+kHFn+ju/AUjyOOD2qjob+CywtEf7fegcXrUpyRjw0q7XbgGe3ZSP76rf1PSbNgbgSuBVSXZJ\n8ijgecDqmb6XvhsnT6iq66pqGXA78Ngkj6+qW6rqE8BXgCV9jn0z8PjmcwR41UxikSRJkiTNzGy/\n89sC3pXkHjrJ6u9M2bhqXbPV93vA/6azdXjCB+lsnb4TaHfVXwz8bfPd27cwxXeDm7EvTHI4cCOd\n1dp3NdufnzrD9zK53MtHkzypKf99897+KMnJwD3AvwP/1zbGm4j9F0neDFyW5Kd0tpH7nV9JkiRJ\n2k5SZc41DEn2qqq7m/L/A/xjVa2Yol09OC8O3jdJkiRJC0ESqmpGO3enMtNtzxqcNzQ/m7SBzjbv\nv+zdNA94jI1NdR6XJEmSJKmXga38Ngc/PWTiks5y5clVNeVv7e4Ize8IX87WpdOJuF5YVXcMK66Z\nSFKu8kqSJElaqAa18uu253nO5FeSJEnSQua2Z0mSJEmS+mTyK0mSJEkaeSa/kiRJkqSRZ/IrSZIk\nSRp5Jr+SJEmSpJFn8itJkiRJGnkmv5IkSZKkkWfyK0mSJEkaeSa/kiRJkqSRZ/K7E0jygMf++x84\n7JAkSZIkaafSV/Kb5KqZDpzkkiT7zDykwUiyaQBjvDHJa2fY59FJvjTXuR+oHvC47bYfDXZ4SZIk\nSRpxqaphxzCtJLtW1X2z6HdXVQ0t+R6UJNVJeh9Qy3y/b5IkSZI0CEmoqsx1nH5Xfjc1z/sn+WaS\n65OsS3LENH1uSfLwpnx6kvVNn9OausVJ1ne1f0eSDzTlVUn+LMlq4K1JViZZkeTqJD9I8sqm3V5J\n/j7Jd5LcmOTYPt/P0UnaSb7cjHdGklcn+XYzzuObdsuSnN6U35pkQ5Ibknyxa5y1zeexpolny/tK\nckqS85N8LcnNSc7siuH1Td21Sc5K8uf9xC5JkiRJmrlFfbabWGZ8NXBpVZ2RJMCe2+qTZClwCnAo\nsCvw7SRt4Cc8eEmz225VdVgzxkpg/6o6IslTgYuAC4BfAMdV1U+TPAK4tnmtH0uApzRx/BD4TFU9\nJ8lbgbcA
p09q/0fAgVV1T9d27ncAb66qa5Ls2cSz5b03DgIOBu4Bbm6S3PuBP27qfwqsAm7oM25J\nkiRJ0gz1m/xOuA44O8luwFeq6sY++hwJXFhVvwBIcgHwPODibfQ7d9L1lwGq6ntJfrmpC3BGkqPo\nJJS/kuSXq+r2ft7LRLsk/wR8valfD7SmaH8j8MUkX56IBbga+LMkXwAuqKp/7fw/gQe4vKp+2syz\nAVgMPApoV9WdTf15wJN6h7q8qzxVaJIkSZI0GtrtNu12e+Djzij5raorm0TzGOBzST5WVZ+f5dz3\n0lkJnrDHpNfvnnS9uas8kWG+Bngk8Kyquj/JLVOM00v3ePd3Xd/P1J/LMcBRwLHA+5I8o6rOTHJJ\n89rVSV4yadyp5pkYewZ71pf331SSJEmSdmKtVotWq7Xlenx8fCDj9vtTRwFI8jjg9qo6G/gssHRb\nfYArgeOS7JFkL+AVwBXAbcCjkjwsye7Ab84g7omx923iuT/J8+msqk5uMyiPq6pvAu8G9gEemuQJ\nVbWhqj5CZ1X8KX3OfR1wVJJ9kywCjh9wrJIkSZKkLjP9zm8LeFeSe4BNwO9sq19VrU3yOToJXwFn\nVdU6gCQfbOr/BfjeFPNt6/oLwMVJbgS+s40xpo1zuhebBPXzzXd9A6yoqruSfKhJuu8DNgBfA35l\nmvEKoKr+LcmfAKuBHwPfB+6cQbySJEmSpBnYLj91lGRXYCOdQ6pm/DNFC0GSvarq7uazuhA4u6q+\nMkU7f+pIkiRJ0oK1Q3/qaBZuonN6solvb8uTrKVzwNYPp0p8JUmSJEmDMeeV3yTXAg+ZuKSzTPna\nqvruHGObsyTPAP6arUunAX5RVc8dXlQz01n5faCxscVs3HjrEKKRJEmSpB1rUCu/22XbswYnSXmP\nJEmSJC1U833bsyRJkiRJ84bJryRJkiRp5Jn8SpIkSZJGnsmvJEmSJGnkmfxKkiRJkkaeya8kSZIk\naeSZ/EqSJEmSRp7JryRJkiRp5Jn8SpIkSZJGnsnvTiDJNh/773/gsMOUJEmSpHkrVTXsGDSNJAX9\n3KPgvZQkSZI0apJQVZnrOK789inJ4iQndV2fkuQTw4xJkiRJktQfk9/+PR549aS6WS+1JvGzlyRJ\nkqQdZKQSsGZ19rtJzkpyU5JLk+zeo+2pSVYnWZvkvCR7NPUrk7yyq92mpngGcGSS65Oc1tQdkORr\nSW5OcmZXn5OSrGseH+4eK8mfJlkLHJ7kjCQbktyQ5COD/jwkSZIkSR0jlfw2ngh8oqqeAdwJHN+j\n3flVdVhVPQv4PvD6Hu0mVnffDVxZVUurakVTdxBwArAEeFWSA5I8Gvgw0AIOBg5NcmzTfi/gmq45\nX1FVT6+qg4EPzfL9SpIkSZK2YdGwA9gObqmq9U15DXBgj3bPTPIhYD86Sells5jr8qr6KUCSDcBi\n4JHAqqr6cVP/BeAo4CLgPuCCpu+dwM+TfBb4KnBJ72mWd5VbzUOSJEmSRk+73abdbg983FFMfjd3\nle8D9ujR7nPAsVV1U5JTgKOb+ntpVsSTBHhIn3Pdz9bPs9dJZD+v5kjmqrovyWHAC+msHv9hU57C\n8mlCkCRJkqTR0Wq1aLVaW67Hx8cHMu4obnvu9wjshwIbk+wGvKar/lbgkKb8W8BuTXkTsHcf464G\njkry8CS7AicB7cmxJdkL2K+qLgVOp7N1WpIkSZK0HYziym+/JzC/n06iejvwbbYmtp8BvtIcSnUZ\ncHdTvw64v6n/HHDHVPNW1cYk72ZrwvvVqrqku01j72aeiZXpt/cZtyRJkiRphtLswtU8laT6y+eD\n91KSJEnSqElCVfW7w7enUVz5HUHbvs9jY4t3QBySJEmStHMa+eQ3ySeBI+gsn6Z5XlFV5ww1sBlw\nRVeSJEmS5sZtz/NckvIeSZIkSVqoBrXteRRPe5YkSZIk6QFMfiVJkiRJI8
/kV5IkSZI08kx+JUmS\nJEkjz+RXkiRJkjTyTH4lSZIkSSPP5FeSJEmSNPJMfiVJkiRJI8/kdyeQpO/H/vsfOOxwJUmSJGne\nSVUNOwZNI0nBTO5R8J5KkiRJGhVJqKrMdZx5t/KbZHGSk7quT0nyiWHGJEmSJEnauc275Bd4PPDq\nSXWzXspMMtT3OOz5JUmSJEmzTH6b1dnvJjkryU1JLk2ye4+2pyZZnWRtkvOS7NHUr0zyyq52m5ri\nGcCRSa5PclpTd0CSryW5OcmZXX1OSrKueXy4e6wkf5pkLXB4kjOSbEhyQ5KPTPO+Tkiyvom13dTt\nnuR/NnOsSdJq6h+wIp3k4iRH9Zj/kCRXN/Nfm2SvJLsk+UiSbzf1b5jBLZAkSZIkzcBcViWfCHyi\nqp4B3Akc36Pd+VV1WFU9C/g+8Poe7SZWd98NXFlVS6tqRVN3EHACsAR4VZIDkjwa+DDQAg4GDk1y\nbNN+L+CarjlfUVVPr6qDgQ9N857eD7yk6Tcx1h8A91fVEjor0uckecikmCfrnv864FzgLc38LwJ+\n0XwOP6mq5wCHAb+fZPE0sUmSJEmSZmnRHPreUlXrm/Ia4MAe7Z6Z5EPAfnSSwstmMdflVfVTgCQb\ngMXAI4FVVfXjpv4LwFHARcB9wAVN3zuBnyf5LPBV4JJp5rmKTnL7pa7+RwJ/DlBVNye5Ffi1bcR7\nb1f/JwP/VlXXN2NMvI+X0PlsTmja7QM8CfjRg4db3lVuNQ9JkiRJGj3tdpt2uz3wceeS/G7uKt8H\n7NGj3eeAY6vqpiSnAEc39ffSrDwnCfCQqbs/aK772Rp3rxO/fl7NkcdVdV+Sw4AX0lk9/sOm/CBV\n9eYkhwK/CaxJ8uwpmk3MuSX+Rvf7/0U98MjlqeIMndXgb/R4D12Wb7uJJEmSJI2AVqtFq9Xacj0+\nPj6Qceey7bnfo6YfCmxMshvwmq76W4FDmvJvAbs15U3A3n2Muxo4KsnDk+wKnAS0J8eWZC9gv6q6\nFDidztbpKSV5QlVdV1XLgNuBxwBXAq9tXv814LHAzU38B6fjsXS2Lm8Zqqt8M7D/RCKd5KFNvJcB\nb06yqKl/UpJf6uN9S5IkSZJmaC4rv/2ewPx+Oonq7cC32ZrYfgb4SnMo1GXA3U39OuD+pv5zwB1T\nzVtVG5O8m60J71er6pLuNo29m3kmVmbfPk2sH03ypKZ8eVWtS3Iz8Kkk64B7gFOq6h7g6mYL9Abg\ne3S2fj8gxibOe5K8Cvhkk9z+jM73fj9LZ6v49c3K9+3AcdPEJkmSJEmapTxwd67mmyQ1s196Ct5T\nSZIkSaMiCVXV787jnuay8qsdpv/7PDbmgdGSJEmSNNnAkt8knwSOoLNMmeZ5RVWdM6g5BiXJe+kc\nftUd63lVdcZQA+vBlVxJkiRJmhu3Pc9zScp7JEmSJGmhGtS257mc9ixJkiRJ0k7B5FeSJEmSNPJM\nfiVJkiRJI8/kV5IkSZI08kx+JUmSJEkjz+RXkiRJkjTyTH4lSZIkSSPP5FeSJEmSNPJMfiVJkiRJ\nI8/kdyeQZEaP/fc/cNghS5IkSdK8Mq+T3ySLk5zUdX1Kkk/soLl/K8lTdtBc75m+Rc3ocdttP9qO\n0UqSJEnSzmdeJ7/A44FXT6qr2Q6WZCbv9zjg6TMcf9eZRbTFe2fZT5IkSZLUhzknv83q7HeTnJXk\npiSXJtm9R9tTk6xOsjbJeUn2aOpXJnllV7tNTfEM4Mgk1yc5rak7IMnXktyc5MyuPiclWdc8Ptw9\nVpI/TbIWODzJGUk2JLkhyUd6xPlc4FjgI83cj99G7J9Kci1wZpJHJvl6kvVJPpPk1iQPb9q+Jsm3\nmzE/lWSXJGcAv9TU/fXs7oIkSZIkaTqDWvl9IvCJqnoGcCdwfI9251fVYVX1LOD7wOt7tJtY3X03\ncGVVLa2qFU3dQcAJwBLgVUkOSPJo4M
NACzgYODTJsU37vYBruuZ8RVU9vaoOBj405eRV1wAXAe9q\n5r5lG7EfUFWHV9U7gWXA5VX1TOBvgccCNFuoXwX8elUtBe4HXl1V7wF+1sxzco/PQ5IkSZI0B4sG\nNM4tVbW+Ka8BDuzR7plJPgTsRycpvWwWc11eVT8FSLIBWAw8ElhVVT9u6r8AHEUngb0PuKDpeyfw\n8ySfBb4KXDKDeaeL/byu8pF0tkxTVZcluaOpfyGwFLguSYA9gI3Na5l+6uVd5VbzkCRJkqTR0263\nabfbAx93UMnv5q7yfXQSu6l8Dji2qm5KcgpwdFN/L80qdJMYPqTPue5n63volUD+vKoKoKruS3IY\nnUT0BOAPm3I/esUOcHdXefJ3ktP1fE5Vva/P+bosn3kXSZIkSdoJtVotWq3Wluvx8fGBjDuobc/b\nWLnc4qHAxiS7Aa/pqr8VOKQp/xawW1PeBOzdx7irgaOSPLw5dOokoD05tiR7AftV1aXA6XS2Tvey\nCdinj9gnu5rO9maSvITOSjHA5cBvJ3lU89rDkjy2ee0/53BYliRJkiRpGwaV/PZ7AvP76SSqVwLf\n66r/DHD0xKFUbF1JXQfc3xwyddoU80ys6G6k8/3gNrAW+E5VXdLdprE3cEmSG4ErgLdPE+v/C7wr\nyZokj58m9skxfRB4cZJ1dL77vBHYVFXfA/4Y+Hoz/9eBRzd9zgLWe+CVJEmSJG0faXYEa0CSPAS4\nr9lifTjwF80BV7Mdr2b+607B+ypJkiRpFCShqvrdbdzToL7zq60eB3yp+U3hzcAbhhyPJEmSJC14\n2yX5TfJJ4Ag6S5ZpnldU1TnbY765SPJeOodfdcd6XlWdMZvxquoHdE51HqCZ/U+OsbHFg51ekiRJ\nknZybnue55KU90iSJEnSQjWobc+DOvBKkiRJkqR5y+RXkiRJkjTyTH4lSZIkSSPP5FeSJEmSNPJM\nfiVJkiRJI8/kV5IkSZI08kx+JUmSJEkjz+RXkiRJkjTyTH4lSZIkSSNv0bAD0LYl2eFzjo0tZuPG\nW3f4vJIkSZK0PaSqhh2DppGkYBj3KPi3IUmSJGnYklBVc14RdNtzlyTLkrwjyfIkL2jqjkxyU5Lr\nk+ye5KNJ1ic5s8cYb0zy2m3M86Ik30lyY5Lrkjx/e7wfSZIkSVKHK79dkiwDNlXVx7vqPgVcWVVf\nbK5/Ajys5vDBJTkIuK2qNiZ5OnBZVT2mR1tXfiVJkiQtWINa+V3wyW+S9wG/A9wG/AuwBngGcDHw\nMOAjwE+AbwH7AMcA64Azquq8KcbbkkAnWQV8G3g+sC/w+qq6eoo+/wE8uqrumeI1k19JkiRJC9ag\nkt8FfeBVkqXAicAS4CHA9cB3aLLNqjo7yZHAxVV1QdPnrqpaOoNpdq2q5yR5KbAcePGkGH4buH6q\nxHer5V3lVvOQJEmSpNHTbrdpt9sDH3dBJ7/A84ALq2ozsDnJV4A0j0G5oHleAyzufqHZ8nwGkxLi\nB1s+wHAkSZIkaf5qtVq0Wq0t1+Pj4wMZ1wOvHmgi6R3kft/NzfN9dP3PhiSPoZMYn1xVtw5wPkmS\nJEnSJAs9+b0COK45xXlv4OV0Et/pVn7nsiocgCT7AZcAf1RV185hPEmSJElSHxZ08ltVa4Fz6Rxg\n9VVg9cRL3c0md5vJFD2u/wD4VeADSdY2P6P0yBmMK0mSJEmagQV/2vN852nPkiRJkhYyT3teUAZ5\n/lZ/xsYWb7uRJEmSJO0kTH5nKcl7gRPY+h3hAs6rqjMGPZcrsJIkSZI0N257nueSlPdIkiRJ0kI1\nqG3PC/rAK0mSJEnSwmDyK0mSJEkaeSa/kiRJkqSRZ/IrSZIkSRp5Jr+SJEmSpJFn8itJkiRJGnkm\nv5IkSZKkkWfyK0mSJEkaeYuGHYC2LZnz7znPytjYYjZuvHUoc0uSJEnSIKWqhh2DppGkYFj3KPj3\nIU
mSJGmYklBVc14RdNvzLCRZluT0AY11UJKXDmIsSZIkSdLUTH6H72DgZcMOQpIkSZJGmclvn5K8\nL8nNSa4AnjxNu4OSXJPkhiTnJ9m3qV+VZGlTfkSSW5IsAj4InJjk+iQn7JA3I0mSJEkLjMlvH5qk\n9URgCXAMcOg0zf8KeFdVHQzcBCzr0a6q6l7gA8C5VbW0qs4bYNiSJEmSpIanPffnecCFVbUZ2Jzk\noqkaJdkH2LeqrmqqzgG+NPfpl3eVW81DkiRJkkZPu92m3W4PfFyT3x3nXrautO8xs67LBxyKJEmS\nJM1PrVaLVqu15Xp8fHwg47rtuT9XAMcl2T3J3sDLp2pUVXcBdyQ5oqk6GfhmU74VOKQpd3+3dxOw\nz8AjliRJkiRtYfLbh6paC5wLrAO+CqyepvkpwJ8muQE4iM6BVgB/Cvy3JGuAh3e1XwU8zQOvJEmS\nJGn7SVUNOwZNI0nBsO5R8O9DkiRJ0jAloaoy13H8zu9OYc73eVbGxhYPZV5JkiRJGjST31lK8kng\nCDrLsmmeV1TVOYOey9VXSZIkSZobtz3Pc0nKeyRJkiRpoRrUtmcPvJIkSZIkjTyTX0mSJEnSyDP5\nlSRJkiSNPJNfSZIkSdLIM/mVJEmSJI08k19JkiRJ0sgz+ZUkSZIkjTyTX0mSJEnSyDP5lSRJkiSN\nvEXDDkDblmTYITzA2NhiNm68ddhhSJIkSVLfXPntIcmyJKcPaKyDkry06/rVSW5sHlcleeb0I9S8\netx224/m9HlIkiRJ0o5m8rtjHAy8rOv6h8BRVXUQ8CHgM0OJSpIkSZIWCJPfLknel+TmJFcAT56m\n3UFJrklyQ5Lzk+zb1K9KsrQpPyLJLUkWAR8ETkxyfZITquraqrqzGe5a4IDt/NYkSZIkaUEz+W00\nSeuJwBLgGODQaZr/FfCuqjoYuAlY1qNdVdW9wAeAc6tqaVWdN6nNqcDX5hS8JEmSJGlaHni11fOA\nC6tqM7A5yUVTNUqyD7BvVV3VVJ0DfGk2EyZ5PvA64MjpWy7vKreahyRJkiSNnna7TbvdHvi4Jr+D\ndS9bV9P3mK5hkiXAWcBvVNUd0w+7fAChSZIkSdL812q1aLVaW67Hx8cHMq7bnre6Ajguye5J9gZe\nPlWjqroLuCPJEU3VycA3m/KtwCFN+YSubpuAfSYukjwOOB84uar+aWDvQJIkSZI0pVTVsGOYN5K8\nB/hd4Dbgn4Hrq+rjU7RbAvwl8Et0Tm5+XVXdmeTJdLZA3wt8FXhtVT0hycOAy+istJ8BvAR4JfAj\nIMA9VXWACdLaAAAgAElEQVRYj5iq8xND80nw70aSJEnSjpCEqsqcxzGJmd9MfiVJkiQtZINKft32\nLEmSJEkaeR54NY0knwSOoLP0muZ5RVWds4Mj2bHTbcPY2OJhhyBJkiRJM+K253kuSXmPJEmSJC1U\nbnuWJEmSJKlPJr+SJEmSpJFn8itJkiRJGnkmv5IkSZKkkWfyK0mSJEkaeSa/kiRJkqSRZ/IrSZIk\nSRp5Jr+SJEmSpJFn8itJkiRJGnmLhh2Ati3JsEPoy9jYYjZuvHXYYUiSJEnSg6Sqhh2DppGkYGe5\nR8G/J0mSJEmDlISqmvOKoNueByDJKUn277o+K8lTmvJ7JrXdtKPjkyRJkqSFzpXfAUiyCnhnVa2Z\n4rVNVbV31/VdVbXPDMZ25VeSJEnSguXK73aWZHGS7zaruDcluTTJ7lO0Ox44BPh8kuuT7JFkVZKl\nSc4Afqmp/+uJLl1935lkdZIbkizbMe9MkiRJkhYeD7ya3hOBV1XV7yc5Fzge+GJ3g6o6P8kfAO+o\nqrWw9YCqqnpPkj+oqqXdXZo2LwaeVFWHpdPhoiRHVtVVDw5jeVe51TwkSZIkafS0223a7fbAxzX5\nnd4tVbW+Ka8BDuzRLnSt6PbpJcCLk1zf9N0LeBKwjeRXkiRJkkZX
q9Wi1WptuR4fHx/IuCa/09vc\nVb4P2GMWY/RKigOcUVWfmcWYkiRJkqQZ8Du/0+t3NfcuoNchVv+ZpPt/MkyMeRnwe0n2AkjyK0ke\nNbswJUmSJEnTceV3ev0eXXwO8OkkPwN+fVK/s4B1SdZU1ckTr1XVN5qfQ7qm+Y7wJuC1wP83qOAl\nSZIkSR3+1NE8508dSZIkSVrIBvVTR6787hTmfJ93iLGxxcMOQZIkSZKmZPI7A0k+CRxBZyk2zfOK\nqjpne87raqokSZIkzY3bnue5JOU9kiRJkrRQDWrbs6c9S5IkSZJGnsmvJEmSJGnkmfxKkiRJkkae\nya8kSZIkaeSZ/EqSJEmSRp7JryRJkiRp5Jn8SpIkSZJGnsmvJEmSJGnkLRp2ANq2ZM6/5zxUY2OL\n2bjx1mGHIUmSJGkBS1UNOwZNI0nBzn6Pgn9nkiRJkmYjCVU15xVBtz3PQpJlSU7v8dp4khdso/+r\nk9zYPK5K8sztE6kkSZIkCdz2PHBVtayPZj8EjqqqO5P8BvAZ4PDtG5kkSZIkLVyu/PYpyfuS3Jzk\nCuDJ07RbmeSVTfmWJMuTrGlWeX8NoKqurao7my7XAgds9zcgSZIkSQuYyW8fkiwFTgSWAMcAh86g\n++1V9Wzg08C7pnj9VOBrcw5SkiRJktST25778zzgwqraDGxOctEM+l7YPK8BXtH9QpLnA68Djpx+\niOVd5VbzkCRJkqTR0263abfbAx/X5Hf729w830fX551kCXAW8BtVdcf0QyzfTqFJkiRJ0vzSarVo\ntVpbrsfHxwcyrtue+3MFcFyS3ZPsDbx8LoMleRxwPnByVf3TIAKUJEmSJPXmym8fqmptknOBdcBt\nwOrpmvcod3s/8HDgL5IEuKeqDhtIsJIkSZKkB0lVr/xM80GS6p1D7yyCf2eSJEmSZiMJVZW5juPK\n705hzvd5qMbGFg87BEmSJEkLnMnvLCX5JHAEnWXZNM8rquqcQc/lqqkkSZIkzY3bnue5JOU9kiRJ\nkrRQDWrbs6c9S5IkSZJGnsmvJEmSJGnkmfxKkiRJkkaeya8kSZIkaeSZ/EqSJEmSRp7JryRJkiRp\n5Jn8SpIkSZJGnsmvJEmSJGnkmfxKkiRJkkbeomEHoG1LMuwQtpuxscVs3HjrsMOQJEmSNOJSVcOO\nYaQkWQZsqqqPT/HaOPDNqvqHGYxXMMr3KPg3KEmSJKmXJFTVnFcEXfndgapq2bBjkCRJkqSFyO/8\nDkCS9yW5OckVwJOnabcyySub8i1JlidZk+TGJL+2wwKWJEmSpAXG5HeOkiwFTgSWAMcAh86g++1V\n9Wzg08C7tkN4kiRJkiTc9jwIzwMurKrNwOYkF82g74XN8xrgFb2bLe8qt5qHJEmSJI2edrtNu90e\n+Lgmv8O1uXm+j2nvxfIdEIokSZIkDV+r1aLVam25Hh8fH8i4bnueuyuA45LsnmRv4OXDDkiSJEmS\n9ECu/M5RVa1Nci6wDrgNWD1d8x5lSZIkSdJ25O/8znP+zq8kSZKkhWxQv/PrtmdJkiRJ0shz2/N2\nkOSTwBF0lmzTPK+oqnNmOeLAYptvxsYWDzsESZIkSQuA257nuSTlPZIkSZK0ULntWZIkSZKkPpn8\nSpIkSZJGnsmvJEmSJGnkmfxKkiRJkkaeya8kSZIkaeSZ/EqSJEmSRp7JryRJkiRp5Jn8SpIkSZJG\nnsmvJEmSJGnkLRp2ANq2JMMOYUEbG1vMxo23DjsMSZIkSXOQqhp2DJpGkgLv0XAF/zuRJEmShiMJ\nVTXnFcGhbHtOsm+S/9aUH53kS035oCQv7Wp3SpJPDGjOLfPMoM94khc05VVJls6h/2lJ9phJf0mS\nJEnSYAzrO78PA94MUFX/XlUnNvUHAy+b1HYgS26T5um3z7Kq+ofZzJdkl0n93wbsOZuxJEmSJElz\nM6zv/J4BPCHJ9cAPgKcCS4EP
AnskOaJps0WSRwKfBh7bVL29qr411eBJjgJW0EmcCzgKeCRwSVU9\nM8kpwHHAXsATgY8BDwFOBn4BvKyqfpJkJXBxVV0wafy/AA4Bfgn426oab+pvAc4FXgR8pFnFvhg4\nAPgVYFWS/wA+Dyypqrc3/U4FnlpV75jZxyhJkiRJ6sewkt93A0+vqqVJFtNJMO9J8gHg2VX1Vuhs\ne+7qswL4eFV9K8ljgcuAp/UY/53Am6vqmiR70klo4YGryE+ns9K8J50E/F1NPB8Hfgf482nif2+T\nHO8CXJ7k/Kq6qXntP6rqkCb+lwJU1SeSnA60quqOJHsB703yzqq6D3gd8Pu9p1veVW41D0mSJEka\nPe12m3a7PfBxd6bTnl8EPDVbjz5+aJI9q+pnU7S9GvizJF8ALqiqf53ixORVTd+fJfkJcElTvx54\n5jZi+a9J3kDn89ufThI+kfyeO02/AFTV3Un+AfjNJN8HFlXVht7dlm8jHEmSJEkaDa1Wi1arteV6\nfHx8IOPuTMlvgOdU1T3balhVZya5BDgGuDrJS4DNk5p1X1fX9f1M87kkORB4B50V6ruardHdB1nd\nva34GmcD7wW+D6zss48kSZIkaRaGdeDVJmDvppxJ9fv06PN14LSJiyQH9Ro8yROqakNVfQS4DnjK\nFHPN1j7AT4FNScaAl26j/YS76HpvVbWazveXTwL+ZgBxSZIkSZJ6GEryW1U/prMiuw74SNdLq4Cn\nJbk+yQmTup0GHJLkxiQ3AW+cZoq3JVmf5AbgP4GvTUzdK6Q+6quJfR1wA/A9OgdXXTXNON3XnwEu\nTXJ5V92XgKur6s5eb0SSJEmSNHepGsgvCWkWklxM5xCvVdO0qQH92pNmLfjfiSRJkjQcSaiqOe/i\n3Zm+8zsykuwLrAbWTpf4dvXY3iFpGmNji4cdgiRJkqQ52qlXfpP8Lp3t0N1v4uqqestwIhq8JLUz\n3yNJkiRJmotBrfzu1MnvQmDyK0mSJGkhG1TyO6zTniVJkiRJ2mFMfiVJkiRJI8/kV5IkSZI08kx+\nJUmSJEkjz+RXkiRJkjTyTH4lSZIkSSPP5FeSJEmSNPJMfiVJkiRJI2/RsAPQtiVz/j1naSjGxhaz\nceOtww5DkiRJIlU17Bg0jSQF3iPtrIL/xkiSJGkuklBVc14RdNvzkCTZddgxSJIkSdJCsWCT3ySL\nk3w3yVlJbkpyaZLdp2j3hCRruq6fOHGd5NlJ2kmuS/K1JGNN/alJVidZm+S8JHs09SuTfCrJtcCZ\nSY5q2lyfZE2SvXbQ25ckSZKkBWXBJr+NJwKfqKpnAHcCx09uUFU/BH6SZElT9Trg7CSLgD8Hjq+q\nQ4GVwJ80bc6vqsOq6lnA94HXdw15QFUdXlXvBN4JvLmqlgLPA34++LcoSZIkSVroB17dUlXrm/Ia\n4MAe7c4GXpfkHcCrgEOBJwPPAL6RzolUuwD/1rRfkuR/APsBewGXdY11Xlf5auDPknwBuKCq/nXu\nb0mSJEmSNNlCT343d5XvA/bo0e58YBmwCvhOVd2R5ADgpqo6Yor2K4Fjq+qmJKcAR3e9dvdEoarO\nTHIJcAxwdZKXVNU/Pni45V3lVvOQJEmSpNHTbrdpt9sDH3ehJ799nRhWVZuTXAZ8Cvi9pvpm4FFJ\nDq+qa5tt0L9WVd8FHgpsTLIb8BrgX6acPHlCVW0ANiQ5FHgKsI3kV5IkSZJGV6vVotVqbbkeHx8f\nyLgL/Tu/M/kNli/QWR3+OkBV3QP8Np2Dq24A1gLPbdp+AFgNXAl8b5r53pZkfdP/P4GvzfgdSJIk\nSZK2yd/57VPzfd99qmrZDp7X3/nVTszf+ZUkSdLcDOp3fhf6tue+JLkAeALwgmHHIkmSJEmaOZPf\nLkk+CRxBZ6k1zfOKqnrlUAPr76vJ0rwzNrZ42CFIkiRJgNue570k5T2SJEmStFANatvzQj/wSp
Ik\nSZK0AJj8SpIkSZJGnsmvJEmSJGnkmfxKkiRJkkaeya8kSZIkaeSZ/EqSJEmSRp7JryRJkiRp5Jn8\nSpIkSZJGnsmvJEmSJGnkLRp2ANq2JMMOQdpuxsYWs3HjrcMOQ5IkSSMuVTXsGDSNJAXeI42y4L9D\nkiRJ6iUJVTXnFcFZb3tOsmkGbY9O8tzZzjXFeKcl2WMW/caTvGAG7R+d5EtN+egkF89wvu7+ByV5\n6cwiliRJkiQNwly+8zuTpZoW8OtTvZBk11nM/TZgz5l2qqplVfUPM2j/71V1YndVv32T7Dqp/8HA\ny/rtL0mSJEkanL6S3yQXJrkuyfokp26tzseT3JTkG0ke0VS+NcmGJDck+WKSxcCbgLcluT7JEUlW\nJvlUkmuBM5McmuRbSdYkuSrJk5qxdkny0WbeG5L8QZK3AL8CrEpyeY94d2nmWJfkxiSnNfUrk7yy\nKd+S5E+SrE2yOsmzklya5H8leWPTZnGS9VOM3yveU5J8pYnr7yf6J1kEfBA4sfkMTkzyj12fWZp5\nH9HXXZMkSZIkzUi/B169rqp+0mw1vi7JBcBewOqqOj3J+4FlwFuBPwIOrKp7kuxTVXcl+TSwqao+\nDtAk0AdU1eHN9UOBI6vq/iQvBM4Afht4I7AYWFJVlWS/Jo63A62quqNHvAc34y9pxt+nR7tbq+pZ\nST4OrKSzOr0ncBPwl02bqVZ7v9cjXoBnAc+sqjubxL+q6t4kHwCeXVVvbWJ6MvBaYAXwIuCGqvo/\nPeKUJEmSJM1Bv8nv25Ic15QfAzwJuA/4UlP3eeD8pnwj8MUkXwa+PM2Y53WV9wP+qllBra64Xgh8\nqprTcKrqJ019mkcvPwQen2QF8HfA13u0m/gO73pgr6r6GfCzJL+YJmGeLl6Ab1TVndP0nbCSzuez\nAvi95rqH5V3lVvOQJEmSpNHTbrdpt9sDH3ebyW+So4EXAM+pqs1JVgFTHTY1sUJ6DHAUcCzwviTP\n6DH03V3l/wH8Q1W9slktXdXvG5hKszp8EPBf6Gy5PgE4dYqmm5vn+7vK8OCEdrLp4r27R5/JMf5L\nktuSPB84FHh179bL+xlSkiRJknZ6rVaLVqu15Xp8fHwg4/bznd99gTuaxPcpwOFN/a5s3er7GuCq\npvy4qvom8G5gH+ChwKam3Ms+wL825dd11X8DeOPEoVhJHtbU3zXdeM13Z3etqguBPwaWbutNTmOq\nFeZ9e8Q7nak+g7PprJp/aWJ1W5IkSZI0eP0kv5cCuyXZAPwJ8K2m/qfAYc2BUC3gg83BTp9PciOw\nBlhRVXfR2V78iokDr3jw92g/Cnw4yZpJMX0W+N/AuiRrgZOa+s8Al/Y68Ao4AGg3ff6aTiLOpHmn\nSza31e4jPeKdzirgac1ncEJTdxGd705/rs8xJEmSJEmzEBcchyfJIcDHquroadrUzH5VStrZBP8d\nkiRJUi9JqKrpznzqS78HXmnAkvwRne8jT/NdX0mSJEnSIOz0K7/NbwU/ZOKSzjLpyVW1YXhRDU5n\n5VcaXWNji9m48dZhhyFJkqR5alArvzt98jvqkngWliRJkqQFa1DJb7+HNUmSJEmStNMy+ZUkSZIk\njTyTX0mSJEnSyDP5lSRJkiSNPJNfSZIkSdLIM/mVJEmSJI08k19JkiRJ0sgz+ZUkSZIkjTyTX0mS\nJEnSyFs07AC0bUmGHYK0UxkbW8zGjbcOOwxJkiTNI6mqYcegaSQp8B5JMxP8t02SJGk0JKGq5rwi\nOK+3PSfZNIO2Ryd57gDnPi3JHoMaT5IkSZI0PPM6+WVmS54t4NeneiHJrrOY+23AnrPoJ0mSJEma\nZ+ZN8pvkwiTXJVmf5NSt1fl4kpuSfCPJI5rKtybZkOSGJF9Mshh4E/C2JNcnOSLJyiSfSnItcGaS\nQ5N8K/n/27v/aEur+r7j7w8gWiCIRDM1oEPSoAZlZEijUM
dwgFVD/RUgUn8gsGhsWMaoSJOakmWd\nCYoiSgWzIEGFgEoUCriARAoMXApB5NcwM84gwSrREmmgHQKDgsB8+8fZFw7DnDt37rn3njvnvl9r\nnXWeZz9772efc/Z65n5n72c/uT3JjUn2bHVtk+TUdt47k3wgyQeBXwauS7K8T3u3aedYlWRlGyl+\nZZLv9ORZmGRV2/5hkpOTrEhyS5LFSa5Mck+S42bum5UkSZIkzaUFr46tqofaVONbk1wC7AjcUlUn\nJPkY8HHgQ8BHgT2q6okkO1fVw0n+Anikqk4DaAH0blW1X9vfCVhSVRuSHAx8CngHcBywEFhUVZVk\nl9aOjwCdqlrXp737tPoXtfrH2/G8JAur6h+AdwJ/3VPm3qpanOQ04Fy6I9U7AN8F/rL/V7O0Z7vT\nXpIkSZI0esbGxhgbG5v2eudS8Ht8kkPb9u7AnsBTwIUt7avAxW17JXBBkm8C35ygzot6tncBzm8j\nvsUzn/1g4Kxqq+NU1UMtPe3Vzw+AX0lyOvC3wFUt/UK6Qe9n2vsRPWUub++rgR2r6qfAT5M8Nh48\nb/pUSydohiRJkiSNjk6nQ6fTeXp/2bJl01LvnJj2nOQA4CDg9VW1D3AnsKnFpsbvAX4L8OfAvnRH\nift9jkd7tk8Crq2qvYG39al/0lqQ/FpgjO7o8ZfaoQuBd7Yge0NV/aCn2OPtfUPPNjw7GJckSZIk\nTbM5EfwCLwTWVdXjSV4F7NfSt6U7NRngSODGtv3yqroe+BNgZ2An4JG23c/OwH1t+9ie9KuB48YX\nxUryopb+8ET1tfuPt62qS4GPAYsBWrD7VEv7xgTtkSRJkiTNkrkS/F4JPC/JGuBk4KaWvh54XZLV\ndG90/bMk2wFfTbISuB04vU0Xvhw4bHzBK567UvSpwKeT3M6zP/eXgB8Dq5KsAN7d0r8IXNlvwStg\nN2CslfkK3UB83DfoBusX9qRNtHK1DySVJEmSpBmUdqur5qgkZWwsbangtU2SJGk0JKGqJlqPaVK8\nz3SrMPDvLM0rCxYsHHYTJEmSNMcY/E5Ce1bw9uO7dIdij6qqNbNxfkewJEmSJGkwTnue45KUv5Ek\nSZKk+Wq6pj3PlQWvJEmSJEmaMQa/kiRJkqSRZ/ArSZIkSRp5Br+SJEmSpJFn8CtJkiRJGnkGv5Ik\nSZKkkWfwK0mSJEkaeQa/kiRJkqSRt92wG6DNSwZ+nrOkEbRgwULuv//eYTdDkiRpq5CqGnYbNIEk\nBf5GkjYleA2XJEmjLglVNfCI4FCnPSd5ZAvyHpBk/2k894eTvGC66utzjpcmuXAmzyFJkiRJ2rxh\n3/O7JUMWHeDfbOpAkm2ncO7jgR2mUG7SquonVfXvZ/IckiRJkqTNm7Vpz0kuBXYHXgCcXlVfaiO/\nXwTeBPwEeFdV/d8kHwKOA54A1gL/BbgZeBJ4APgg8D7gMWAxcCPwDeB04PnAz4Bjq+qeJNsApwCH\nAE+1820DfBb4HvBgVR3cp82HAJ9s+R+sqn+b5EXAOcCvAo8Cx1XV6iQHAJ+nG9AX8FvAi4Erqmrv\nJMcAhwI7Ar8GfA7YHjiqfY43V9VDm2iD054l9eG0Z0mSNPqma9rzbC54dWxVPdSmGt+a5BK6geAt\nVXVCko8BHwc+BHwU2KOqnkiyc1U9nOQvgEeq6jSAJO8Ddquq/dr+TsCSqtqQ5GDgU8A76AbRC4FF\nVVVJdmnt+AjQqap1m2pskhcDZ7c6f5Rkl3ZoGXBHVR2W5EDgfLoB+H8C/qCqvp1kB7oBLTw7cn01\nsA/dEefvA39cVfsmOQ04GjhjgO9XkiRJktTHbAa/xyc5tG3vDuxJdyR2/J7YrwIXt+2VwAVJvgl8\nc4I6L+rZ3gU4P8medAPO8c92MHBWteGRntHVtFc/+wHXV9WPNiq3BDi8pV2XZNcWeP8d8N+SfA24\npKru28QqzddV1U+Bny
Z5CLiipa8G9u7flKU92532kiRJkqTRMzY2xtjY2LTXOyvBb5sSfBDw+qp6\nPMl1dKc/b2x8lPQtdKcNvx340ySv6VP1oz3bJwHXVtXhSRYC101H0ydo47PyVNUpSa6g2/a/S/Im\n4PGN8vbuV8/+Bib8LZZOusGSJEmStDXrdDp0Op2n95ctWzYt9c7WglcvBNa1wPdVdEdVAbalOzUZ\n4Ei69+4CvLyqrgf+BNgZ2Al4pG33szNwX9s+tif9auC48UWx2j27AA9vpr6bgTe2QLq33A3Ae1ta\nB3igqtYn+dWqWlNVnwFuBV7V8vuQXkmSJEkastkKfq8EnpdkDXAycFNLXw+8LslqunN5/yzJdsBX\nk6wEbqe7ONbDwOXAYUnuSPIGnjsCeyrw6SS38+zP9SXgx8CqJCuAd7f0LwJXJlm+qQZX1YPA7wOX\ntnJfb4eWAb/R2ncy3Xt1oTute3WSO4GfA98ar6rPd+IqNZIkSZI0S2ZttWdNjas9S+rP1Z4lSdLo\n2xpXe9aUOXNa0nMtWLBw2E2QJEnaahj8AklupvvMXehGmgUcVVVrhteqZziyI0mSJEmDcdrzHJek\n/I0kSZIkzVfTNe15tha8kiRJkiRpaAx+JUmSJEkjz+BXkiRJkjTyDH4lSZIkSSPP4FeSJEmSNPIM\nfiVJkiRJI8/gV5IkSZI08gx+JUmSJEkjb7thN0Cblwz8PGdJkjRPLViwkPvvv3fYzZCkoUtVDbsN\nmkCSAn8jSZI0VcG/9yRtzZJQVQOPCDrtWZIkSZI08gx+JUmSJEkjb94Ev0mOTrIyyYok5yVZmGR5\nkjuTXJ1k95bv3CRnJvl2ku8nOSDJl5OsTXJOT32PJPlMku8muSrJbya5rpV5a8vz/CTnJFmV5PYk\nnZZ+TJKLk3wryd1JThnKlyJJkiRJ88S8CH6T7AWcCHSqajFwPPAF4Nyq2ge4oO2P26Wq9gdOAC4D\nPldVewGLkixqeXYErqmq1wDrgZOAg4HD2zbAB4ANVbUIeA9wXpLt27HXAkcAi4B3JtltBj66JEmS\nJIn5s9rzQcBFVbUOoKrWJdkfOKwd/wrQO/p6eXtfDdxfVWvb/hpgD2AV8HhVXdWT77Gq2pBkNbCw\npS8BzmjnvDvJvcAr2rHlVbUeIMnaVua+TTd/ac92p70kSZIkafSMjY0xNjY27fXOl+B3UyZa9vDx\n9r6hZ3t8f/w7e2Kj9McBqqqS9Ptee1co6633KSb8LZZO0FRJkiRJGh2dTodOp/P0/rJly6al3nkx\n7Rm4Fjgiya4A7f0m4N3t+HuBG/qU7bek9kRLbY8fuwE4sp3zFcDLgLsn32xJkiRJ0nSYFyO/VbU2\nySeB65M8CawAPgj8VZI/Ah4Ajh3PvnHxSWw/55Tt/UzgrCSr6I4UH1NVTyTPiZt9+J4kSZIkzaD4\n0PO5LUkZG0uSpKkL/r0naWuWhKqaaObtpMyXac+SJEmSpHlsXkx73voN/J8ckiRpnlqwYOHmM0nS\nPGDwuxVwqpIkSZIkDcZpz5IkSZKkkWfwK0mSJEkaeQa/kiRJkqSRZ/ArSZIkSRp5Br+SJEmSpJFn\n8CtJkiRJGnkGv5IkSZKkkWfwK0mSJEkaeQa/kiRJkqSRt92wG6DNSzLsJkiSJEkasgULFnL//fcO\nuxlbrVTVsNsw7yT5F8BFwL8CngQur6oT++Qt8DeSJEmSFOZj/JaEqhp4RNBpz8NzalX9OrAYWJLk\nt4fdIEmSJEkaVSMX/CY5OsnKJCuSnJdkYZLlSe5McnWS3Vu+c5OcmeTbSb6f5IAkX06yNsk5PfU9\nkuQzSb6b5Kokv5nkulbmrS3P85Ock2RVktuTdFr6MUkuTvKtJHcnOQWgqn5WVde37SeBO4DdZ/mr\nkiRJkqR5Y6SC3yR7AScCnapaDBwPfAE4t6r2AS5o++N2qar9gROAy4DPVdVewKIki1qe
HYFrquo1\nwHrgJOBg4PC2DfABYENVLQLeA5yXZPt27LXAEcAi4J1JdtuozbsAbwOWT9PXIEmSJEnayKgteHUQ\ncFFVrQOoqnVJ9gcOa8e/ApzSk//y9r4auL+q1rb9NcAewCrg8aq6qiffY1W1IclqYGFLXwKc0c55\nd5J7gVe0Y8uraj1AkrWtzH1tf1u6Afnnq+re/h9rac92p70kSZIkafSMjY0xNjY27fWOWvC7KRPd\nEf54e9/Qsz2+P/7dPLFR+uMAVVVJ+n1/vTdj99b7FM/+zs8G7q6q3tHoTVg68WFJkiRJGhGdTodO\np/P0/rJly6al3pGa9gxcCxyRZFeA9n4T8O52/L3ADX3K9ls9bKJVxcaP3QAc2c75CuBlwN0TNTTJ\nJ4Cdq+ojE+WTJEmSJA1upEZ+q2ptkk8C1yd5ElgBfBD4qyR/BDwAHDuefePik9h+zinb+5nAWUlW\n0R0pPqaqntjE83kLoN33eyJwV5IVLf3Pq+qcjQtIkiRJkgbnc37nOJ/zK0mSJKnL5/wOYtSmPUuS\nJDPgXRoAAAgFSURBVEmS9BwjNe15dA38nxySJEmStnILFizcfCb1ZfC7FZiPUxskSZIkaTo57VmS\nJEmSNPIMfiVJkiRJI8/gV5IkSZI08gx+JUmSJEkjz+BXkiRJkjTyDH4lSZIkSSPP4FeSJEmSNPIM\nfiVJkiRJI8/gV5IkSZI08rYbdgO0eUmG3QRJkiRJ2qwFCxZy//33DrsZm5SqGnYbNIEkBf5GkiRJ\nkrYGYbpjzCRU1cAjgjM+7TnJjTN9jqlIcl2SfWfhPH+d5M4kH07yyiQrktye5Fdm+tySJEmSpK4Z\nn/ZcVUtm+hybk2TbqnpqCOf9l8C/rqo92/5HgYuq6uTZboskSZIkzWezMfL7SJIdklyT5LYkK5O8\nvef40S1tRZLzWtovJbmkjZiuSLLfBHnfmuTmNpp6VZKXtPSPJzm/jTyfn+QFSb6eZE2SS4AXbKbd\nh7Q6VyS5uqW9KMmlrQ03Jdm7pe+Q5Ms97Xhbq+Z/AL+c5I4k/xU4Hnh/kuWtzBWt/lVJjpjO712S\nJEmS9IzZWPCqgMeAQ6tqfZJfBG4GLkvyauBEYP+qWpdkl1bmDGCsqg5Pd7WnnZLs1SfvDVU1Hhz/\nHvCfgT9ux34deENV/TzJR4D1VfXqFrTe0a/BSV4MnA0sqaof9ZxrGXBHVR2W5EDgfGAx8KfA8qr6\nvSQvBG5Jcg3wduDyqtq31Rvgkao6LcnhwH1V9dZ27Bem+P1KkiRJkjZjtlZ7DvDpJG8ENtAdDf0l\n4EC604DXAVTVQy3/QcBRLa2AR5Ic1Cfvy5JcCLwUeB7ww57zXlZVP2/bvwWc3squTrJygvbuB1xf\nVT/a6FxLgMNb2nVJdk2yE/Am4G1JxoPu7YGX0w36+1kNfDbJp4C/qaoJ7o1e2rPdaS9JkiRJGj1j\nY2OMjY1Ne72zEfwGeC/wi8DiqtqQ5Ic8M+14U6t2bcnyYF8APltVf5PkAODjPcce3Uy7JrIl7Qrw\nu1V1z7MSk4X9Kq+qe9qCW28GPpHkmqr6xKZzL91MUyVJkiRpNHQ6HTqdztP7y5Ytm5Z6Z/ye32Zn\n4J9a4HsgMB4UXgu8I8mu0L2ntqUvB/6gpW2TZOeW94hN5N0Z+Me2fcwEbfifwJGt7GuARRPkvRl4\n43jw2nOuG+gG8iTpAA9W1Xq69/Z+aLxwkn166tpkkJ3kpcDPquoC4FRgxleeliRJkqT5ajZGfjcA\nXwOuaFONbwPuAqiqtUk+CVyf5ElgBfAf6C4MdXa7h/dJ4P1V9Z0+eZcB/z3J/6MbIO/Rpx1nAecm\nWdPOf1u/BlfVg0l+H7i03af7T8Bvt3Od0z7HozwTbJ8EfD7JKrrB7g/p3u8L/UeL9wZOTbIB+Dnw\n/n7tkSRJkiQNJtP9AOJnVd5d3Oq2qvKZtlOUpLZs
FrgkSZIkDUuY7hgzCVW1udtWN2vGRn7btN4x\nulN6NZCBf2dJkiRJmnELFvRd9mjoZnTkd2uQ5Ga6qzNDN8os4KiqWjO8Vj0jSc3330iSJEnS/DVd\nI7+zteDVnFVV+1XVvu21uL3PicBXmstmYvl5abrYPzVX2Tc1V9k3NR/M++BX0tT4j6TmMvun5ir7\npuYq+6bmA4NfSZIkSdLIM/iVJEmSJI28eb/g1VzXfdSRJEmSJM1f07HglcGvJEmSJGnkOe1ZkiRJ\nkjTyDH4lSZIkSSPP4FeSJEmSNPIMfockySFJvpfk75N8tE+eM5Lck+TOJPtsSVlpEFPon4t70u9N\nsjLJiiS3zF6rNR9srm8meWWSm5I8luSELSkrDWLAvul1UzNqEv3zPa0PrkxyY5JFky0rDWLAvrnF\n104XvBqCJNsAfw8cDPwjcCvwrqr6Xk+efwf8YVW9JcnrgdOrar/JlJUGMUj/bMd+APxGVa2b/dZr\nlE2yb74YWAgcCqyrqtMmW1aaqkH6ZjvmdVMzZpL9cz/grqr65ySHAEv9u1MzbZC+2Y5t8bXTkd/h\neB1wT1X9Q1U9AXwd+J2N8vwOcD5AVX0HeGGSBZMsKw1ikP4JELy2aGZstm9W1YNVdTvw5JaWlQYw\nSN8Er5uaWZPpnzdX1T+33ZuB3SZbVhrAIH0TpnDt9EI7HLsBP+7Z/988+4ecKM9kykqDmEr/vK8n\nTwFXJ7k1yX+csVZqPhrk+ue1UzNp0P7ldVMzaUv75/uAb02xrLQlBumbMIVr53Zb3EQNy8APdZZm\nyRuq6idJXkL3gnRXVd047EZJ0hzmdVNzQpIDgWOBJcNui9SrT9/c4munI7/DcR/w8p793Vvaxnle\ntok8kykrDWKQ/klV/aS9PwBcSndKizQdBrn+ee3UTBqof3nd1AybVP9sCwmdDby95x5Kr52aSYP0\nzSldOw1+h+NW4NeSLEyyPfAu4LKN8lwGHA1P3+j9UFX9n0mWlQYx5f6ZZIckO7X0HYE3Ad+dvaZr\nxG3p9a93xozXTs2kKfdNr5uaBZvtn0leDlwMHFVV/2tLykoDmHLfnOq102nPQ1BVTyX5Q+Aquv8B\n8eWquivJcd3DdXZV/W2SNyf5PvAo3WH+vmWH9FE0ggbpn8AC4NIkRff68rWqumoYn0OjZzJ9sy28\ndhvwC8CGJB8G9qqq9V47NVMG6ZvAS/C6qRk0mf4JfAzYFTgzSYAnqup1/t2pmTRI32SKf3P6qCNJ\nkiRJ0shz2rMkSZIkaeQZ/EqSJEmSRp7BryRJkiRp5Bn8SpIkSZJGnsGvJEmSJGnkGfxKkiRJkkae\nwa8kSZIkaeT9f1EicgMsxNDuAAAAAElFTkSuQmCC\n", | |
"text/plain": [ | |
"<matplotlib.figure.Figure at 0x1692e3750>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"%matplotlib inline\n", | |
"plot_simple_imp(clf.feature_importances_, feature_names=colnames)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### XGBoost" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 51, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import xgboost" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### Basic train/validation split" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 187, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Accuracy 0.912009357941\n", | |
"F1 0.91365897207\n" | |
] | |
} | |
], | |
"source": [ | |
"# Create a gradient-boosted trees model with XGBoost\n", | |
"gbm = xgboost.XGBClassifier(max_depth=3, learning_rate=0.05, n_estimators=50, nthread=4)\n", | |
"\n", | |
"# train model\n", | |
"gbm.fit(xtrain, ytrain)\n", | |
"\n", | |
"# Predict on validation set\n", | |
"pvalid = gbm.predict(xvalid)\n", | |
"\n", | |
"# Evaluation metrics\n", | |
"accuracy = metrics.accuracy_score(yvalid, pvalid)\n", | |
"f1 = metrics.f1_score(yvalid, pvalid)\n", | |
"print 'Accuracy', accuracy\n", | |
"print 'F1', f1" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### Grid search " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 53, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.grid_search import GridSearchCV" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 54, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Fitting 3 folds for each of 18 candidates, totalling 54 fits\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"[Parallel(n_jobs=-1)]: Done 25 tasks | elapsed: 10.6min\n", | |
"[Parallel(n_jobs=-1)]: Done 54 out of 54 | elapsed: 31.3min finished\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[CV] n_estimators=100, learning_rate=0.05, max_depth=2 ...............\n", | |
"[CV] n_estimators=100, learning_rate=0.05, max_depth=2 ...............\n", | |
"[CV] n_estimators=100, learning_rate=0.05, max_depth=2 ...............\n", | |
"[CV] n_estimators=300, learning_rate=0.05, max_depth=2 ...............\n", | |
"[CV] n_estimators=300, learning_rate=0.05, max_depth=2 ...............\n", | |
"[CV] n_estimators=300, learning_rate=0.05, max_depth=2 ...............\n", | |
"[CV] n_estimators=500, learning_rate=0.05, max_depth=2 ...............\n", | |
"[CV] n_estimators=500, learning_rate=0.05, max_depth=2 ...............\n", | |
"[CV] ...... n_estimators=100, learning_rate=0.05, max_depth=2 - 52.6s[CV] ...... n_estimators=100, learning_rate=0.05, max_depth=2 - 53.3s[CV] ...... n_estimators=100, learning_rate=0.05, max_depth=2 - 53.8s[CV] ...... n_estimators=300, learning_rate=0.05, max_depth=2 - 2.6min[CV] ...... n_estimators=300, learning_rate=0.05, max_depth=2 - 2.6min[CV] ...... n_estimators=300, learning_rate=0.05, max_depth=2 - 2.6min[CV] ...... n_estimators=500, learning_rate=0.05, max_depth=2 - 4.3min[CV] ...... n_estimators=500, learning_rate=0.05, max_depth=2 - 4.3min\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"[CV] n_estimators=500, learning_rate=0.05, max_depth=2 ...............[CV] n_estimators=100, learning_rate=0.05, max_depth=4 ...............[CV] n_estimators=100, learning_rate=0.05, max_depth=4 ...............[CV] n_estimators=300, learning_rate=0.05, max_depth=4 ...............[CV] n_estimators=300, learning_rate=0.05, max_depth=4 ...............[CV] n_estimators=500, learning_rate=0.05, max_depth=4 ...............[CV] n_estimators=500, learning_rate=0.05, max_depth=4 ...............[CV] n_estimators=100, learning_rate=0.05, max_depth=6 ...............\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"[CV] ...... n_estimators=100, learning_rate=0.05, max_depth=6 - 1.2min[CV] ...... n_estimators=500, learning_rate=0.05, max_depth=2 - 4.3min[CV] ...... n_estimators=100, learning_rate=0.05, max_depth=4 - 1.6min[CV] ...... n_estimators=100, learning_rate=0.05, max_depth=4 - 1.6min[CV] ...... n_estimators=300, learning_rate=0.05, max_depth=4 - 4.8min[CV] ...... n_estimators=300, learning_rate=0.05, max_depth=4 - 4.8min[CV] ...... n_estimators=500, learning_rate=0.05, max_depth=4 - 8.0min[CV] ...... n_estimators=500, learning_rate=0.05, max_depth=4 - 4.0min\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"[CV] n_estimators=100, learning_rate=0.05, max_depth=6 ...............[CV] n_estimators=100, learning_rate=0.05, max_depth=6 ...............[CV] n_estimators=100, learning_rate=0.05, max_depth=4 ...............[CV] n_estimators=300, learning_rate=0.05, max_depth=4 ...............[CV] n_estimators=500, learning_rate=0.05, max_depth=6 ...............\n", | |
"[CV] n_estimators=300, learning_rate=0.05, max_depth=6 ...............\n", | |
"[CV] n_estimators=300, learning_rate=0.1, max_depth=2 ................[CV] n_estimators=100, learning_rate=0.1, max_depth=2 ................\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"[CV] ...... n_estimators=500, learning_rate=0.05, max_depth=6 - 6.0min[CV] ...... n_estimators=300, learning_rate=0.05, max_depth=6 - 7.3min\n", | |
"[CV] ....... n_estimators=100, learning_rate=0.1, max_depth=2 - 54.1s[CV] ...... n_estimators=100, learning_rate=0.05, max_depth=6 - 2.5min[CV] ...... n_estimators=100, learning_rate=0.05, max_depth=6 - 1.2min[CV] ...... n_estimators=100, learning_rate=0.05, max_depth=4 - 1.7min[CV] ...... n_estimators=300, learning_rate=0.05, max_depth=4 - 4.9min\n", | |
"\n", | |
"[CV] ....... n_estimators=300, learning_rate=0.1, max_depth=2 - 1.3min\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"[CV] n_estimators=100, learning_rate=0.1, max_depth=4 ................[CV] n_estimators=500, learning_rate=0.1, max_depth=4 ................\n", | |
"[CV] n_estimators=100, learning_rate=0.1, max_depth=2 ................[CV] n_estimators=500, learning_rate=0.05, max_depth=6 ...............[CV] n_estimators=300, learning_rate=0.05, max_depth=6 ...............[CV] n_estimators=500, learning_rate=0.05, max_depth=4 ...............[CV] n_estimators=300, learning_rate=0.05, max_depth=6 ...............\n", | |
"\n", | |
"[CV] n_estimators=500, learning_rate=0.1, max_depth=2 ................\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"[CV] ....... n_estimators=100, learning_rate=0.1, max_depth=4 - 1.6min[CV] ....... n_estimators=500, learning_rate=0.1, max_depth=4 - 3.8min\n", | |
"[CV] ....... n_estimators=100, learning_rate=0.1, max_depth=2 - 25.3s[CV] ...... n_estimators=500, learning_rate=0.05, max_depth=6 - 6.0min[CV] ...... n_estimators=300, learning_rate=0.05, max_depth=6 - 3.5min[CV] ...... n_estimators=500, learning_rate=0.05, max_depth=4 - 3.9min[CV] ...... n_estimators=300, learning_rate=0.05, max_depth=6 - 7.5min\n", | |
"\n", | |
"[CV] ....... n_estimators=500, learning_rate=0.1, max_depth=2 - 4.2min\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"[CV] n_estimators=500, learning_rate=0.1, max_depth=4 ................[CV] n_estimators=500, learning_rate=0.1, max_depth=6 ................\n", | |
"[CV] n_estimators=100, learning_rate=0.1, max_depth=2 ................[CV] n_estimators=300, learning_rate=0.1, max_depth=4 ................[CV] n_estimators=300, learning_rate=0.1, max_depth=2 ................\n", | |
"[CV] n_estimators=500, learning_rate=0.05, max_depth=6 ...............[CV] n_estimators=500, learning_rate=0.1, max_depth=4 ................\n", | |
"\n", | |
"[CV] n_estimators=100, learning_rate=0.1, max_depth=6 ................\n", | |
"\n", | |
"[CV] ....... n_estimators=300, learning_rate=0.1, max_depth=2 - 2.5min\n", | |
"\n", | |
"[CV] ....... n_estimators=500, learning_rate=0.1, max_depth=4 -13.6min[CV] ....... n_estimators=500, learning_rate=0.1, max_depth=6 -11.1min\n", | |
"[CV] ....... n_estimators=100, learning_rate=0.1, max_depth=2 - 52.4s[CV] ....... n_estimators=300, learning_rate=0.1, max_depth=4 - 2.3min\n", | |
"[CV] ...... n_estimators=500, learning_rate=0.05, max_depth=6 - 5.8min[CV] ....... n_estimators=500, learning_rate=0.1, max_depth=4 -13.8min\n", | |
"\n", | |
"[CV] ....... n_estimators=100, learning_rate=0.1, max_depth=6 - 1.2min\n", | |
"\n", | |
"[CV] n_estimators=500, learning_rate=0.1, max_depth=2 ................\n", | |
"\n", | |
"[CV] n_estimators=100, learning_rate=0.1, max_depth=6 ................[CV] n_estimators=300, learning_rate=0.1, max_depth=2 ................\n", | |
"\n", | |
"[CV] n_estimators=100, learning_rate=0.1, max_depth=4 ................[CV] n_estimators=300, learning_rate=0.1, max_depth=6 ................\n", | |
"\n", | |
"[CV] ....... n_estimators=500, learning_rate=0.1, max_depth=2 - 2.0min\n", | |
"\n", | |
"[CV] ....... n_estimators=100, learning_rate=0.1, max_depth=6 - 1.2min[CV] ....... n_estimators=300, learning_rate=0.1, max_depth=2 - 1.3min\n", | |
"[CV] ....... n_estimators=100, learning_rate=0.1, max_depth=4 - 47.8s[CV] ....... n_estimators=300, learning_rate=0.1, max_depth=6 -12.4min\n", | |
"\n", | |
"[CV] n_estimators=300, learning_rate=0.1, max_depth=4 ................\n", | |
"\n", | |
"[CV] n_estimators=500, learning_rate=0.1, max_depth=6 ................[CV] n_estimators=500, learning_rate=0.1, max_depth=2 ................\n", | |
"[CV] n_estimators=300, learning_rate=0.1, max_depth=4 ................\n", | |
"\n", | |
"\n", | |
"[CV] ....... n_estimators=300, learning_rate=0.1, max_depth=4 - 2.3min[CV] ....... n_estimators=300, learning_rate=0.1, max_depth=4 -10.8min[CV] ....... n_estimators=500, learning_rate=0.1, max_depth=2 - 2.0min[CV] ....... n_estimators=500, learning_rate=0.1, max_depth=6 -13.8min\n", | |
"\n", | |
"\n", | |
"\n", | |
"[CV] n_estimators=300, learning_rate=0.1, max_depth=6 ................[CV] n_estimators=500, learning_rate=0.1, max_depth=6 ................[CV] n_estimators=100, learning_rate=0.1, max_depth=4 ................\n", | |
"\n", | |
"\n", | |
"[CV] ....... n_estimators=100, learning_rate=0.1, max_depth=4 - 1.6min[CV] ....... n_estimators=300, learning_rate=0.1, max_depth=6 - 9.4min[CV] ....... n_estimators=500, learning_rate=0.1, max_depth=6 - 4.5min\n", | |
"\n", | |
"\n", | |
"[CV] n_estimators=100, learning_rate=0.1, max_depth=6 ................\n", | |
"[CV] ....... n_estimators=100, learning_rate=0.1, max_depth=6 - 1.2min\n", | |
"[CV] n_estimators=300, learning_rate=0.1, max_depth=6 ................\n", | |
"[CV] ....... n_estimators=300, learning_rate=0.1, max_depth=6 -12.6min\n" | |
] | |
}, | |
{ | |
"ename": "IndexError", | |
"evalue": "tuple index out of range", | |
"output_type": "error", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
"\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", | |
"\u001b[0;32m<ipython-input-54-444fc4f76be0>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0mclf_cv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mxtrain_all\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mytrain_all\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 12\u001b[0;31m \u001b[0;32mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"best model : {} , best score : {}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", | |
"\u001b[0;31mIndexError\u001b[0m: tuple index out of range" | |
] | |
} | |
], | |
"source": [ | |
"# select all data \n", | |
"xtrain_all = np.vstack((xtrain, xvalid))\n", | |
"ytrain_all = np.concatenate((ytrain, yvalid))\n", | |
"xgb_model = xgboost.XGBClassifier()\n", | |
"# -1 for nb_jobs will use all processors (usefull on a aws spot instance) put 4 on a mac \n", | |
"clf_cv = GridSearchCV(xgb_model,\n", | |
" {'max_depth': [2,4,6],\n", | |
" 'learning_rate': [0.05, 0.1],\n", | |
" 'n_estimators': [100,300,500]}, verbose=2,n_jobs=-1) # -1 is using all processo\n", | |
"clf_cv.fit(xtrain_all,ytrain_all)\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 55, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"best model : {'n_estimators': 300, 'learning_rate': 0.1, 'max_depth': 6} , best score : 0.972214572758\n" | |
] | |
} | |
], | |
"source": [ | |
"print(\"best model : {} , best score : {}\".format(clf_cv.best_params_ , clf_cv.best_score_))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 56, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"grid_score : [mean: 0.96650, std: 0.00040, params: {'n_estimators': 100, 'learning_rate': 0.05, 'max_depth': 2}, mean: 0.96847, std: 0.00025, params: {'n_estimators': 300, 'learning_rate': 0.05, 'max_depth': 2}, mean: 0.96928, std: 0.00017, params: {'n_estimators': 500, 'learning_rate': 0.05, 'max_depth': 2}, mean: 0.96799, std: 0.00026, params: {'n_estimators': 100, 'learning_rate': 0.05, 'max_depth': 4}, mean: 0.96981, std: 0.00011, params: {'n_estimators': 300, 'learning_rate': 0.05, 'max_depth': 4}, mean: 0.97016, std: 0.00025, params: {'n_estimators': 500, 'learning_rate': 0.05, 'max_depth': 4}, mean: 0.96926, std: 0.00036, params: {'n_estimators': 100, 'learning_rate': 0.05, 'max_depth': 6}, mean: 0.97172, std: 0.00038, params: {'n_estimators': 300, 'learning_rate': 0.05, 'max_depth': 6}, mean: 0.97056, std: 0.00088, params: {'n_estimators': 500, 'learning_rate': 0.05, 'max_depth': 6}, mean: 0.96789, std: 0.00043, params: {'n_estimators': 100, 'learning_rate': 0.1, 'max_depth': 2}, mean: 0.96965, std: 0.00003, params: {'n_estimators': 300, 'learning_rate': 0.1, 'max_depth': 2}, mean: 0.96979, std: 0.00038, params: {'n_estimators': 500, 'learning_rate': 0.1, 'max_depth': 2}, mean: 0.96983, std: 0.00019, params: {'n_estimators': 100, 'learning_rate': 0.1, 'max_depth': 4}, mean: 0.97030, std: 0.00031, params: {'n_estimators': 300, 'learning_rate': 0.1, 'max_depth': 4}, mean: 0.97170, std: 0.00035, params: {'n_estimators': 500, 'learning_rate': 0.1, 'max_depth': 4}, mean: 0.97009, std: 0.00039, params: {'n_estimators': 100, 'learning_rate': 0.1, 'max_depth': 6}, mean: 0.97221, std: 0.00152, params: {'n_estimators': 300, 'learning_rate': 0.1, 'max_depth': 6}, mean: 0.97192, std: 0.00189, params: {'n_estimators': 500, 'learning_rate': 0.1, 'max_depth': 6}]\n" | |
] | |
} | |
], | |
"source": [ | |
"print(\"grid_score : {}\".format(clf_cv.grid_scores_))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"clf_cv.best_params_" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# How many connected components in training set? (cited)\n", | |
"print 'All edges', len(list(nx.connected_components(graph_info)))\n", | |
"print '1-edges', len(list(nx.connected_components(connected_graph)))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"??xgboost.cv" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"xtrain.shape" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"xvalid.shape" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"yvalid.shape" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 197, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"xtrain_all = np.vstack((xtrain, xvalid))\n", | |
"ytrain_all = np.concatenate((ytrain, yvalid))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 153, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"df_all = pd.DataFrame(xtrain_all, columns=colnames)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 155, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"df_all.to_csv('train_all.csv', index=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 156, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"u'/Users/ericfourrier/Documents/Programmation et machine learning /Altergrad/citation-mining/scripts'" | |
] | |
}, | |
"execution_count": 156, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"pwd" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"ytrain_all.shape" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"xtrain_all.shape" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 77, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import xgboost" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 198, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,\n", | |
" gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=5,\n", | |
" min_child_weight=1, missing=None, n_estimators=400, nthread=-1,\n", | |
" objective='binary:logistic', reg_alpha=0, reg_lambda=1,\n", | |
" scale_pos_weight=1, seed=0, silent=True, subsample=1)" | |
] | |
}, | |
"execution_count": 198, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Create and fit Random Forest for valid and train \n", | |
"#clf = RandomForestClassifier(n_estimators=300, n_jobs=4)\n", | |
"gbm = xgboost.XGBClassifier(max_depth=5, learning_rate=0.05, n_estimators=400)\n", | |
"#clf = SGDClassifier(alpha=0.00001)\n", | |
"gbm.fit(xtrain_all, ytrain_all)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"def plot_features_imp_gbm(gbm, features)\n", | |
" mapFeat = dict(zip([\"f\"+str(i) for i in range(len(features))],features))\n", | |
" ts = pd.Series(gbm.booster().get_fscore())\n", | |
" ts.index = ts.reset_index()['index'].map(mapFeat)\n", | |
" ts.order().plot(kind=\"barh\", title=(\"features importance\"))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 79, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Predict on test set\n", | |
"# ptest = clf.predict(xtest)\n", | |
"ptest_xgboost=gbm.predict(xtest)\n", | |
"with open('../results/improved_prediction_xgboost_gensim.csv', 'w') as pred1:\n", | |
" csv_out = csv.writer(pred1)\n", | |
" csv_out.writerow(['id', 'category']) # write header\n", | |
" for i, row in enumerate(ptest_xgboost):\n", | |
" csv_out.writerow([i, row])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Create and fit Random Forest\n", | |
"rf = RandomForestClassifier(n_estimators=400, n_jobs=4)\n", | |
"#clf = SGDClassifier(alpha=0.00001)\n", | |
"rf.fit(xtrain_all, ytrain_all)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Ensemble model " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 200, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[ 0.6625, 0.3375],\n", | |
" [ 0. , 1. ],\n", | |
" [ 0. , 1. ],\n", | |
" ..., \n", | |
" [ 0.8875, 0.1125],\n", | |
" [ 1. , 0. ],\n", | |
" [ 0.15 , 0.85 ]])" | |
] | |
}, | |
"execution_count": 200, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"probas_rf = rf.predict_proba(xtest)\n", | |
"probas_rf" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 203, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[ 6.90260291e-01, 3.09739709e-01],\n", | |
" [ 3.01003456e-04, 9.99698997e-01],\n", | |
" [ 1.14846230e-03, 9.98851538e-01],\n", | |
" ..., \n", | |
" [ 8.41670871e-01, 1.58329114e-01],\n", | |
" [ 9.99908447e-01, 9.15376295e-05],\n", | |
" [ 2.77833402e-01, 7.22166598e-01]], dtype=float32)" | |
] | |
}, | |
"execution_count": 203, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"probas_gbm = gbm.predict_proba(xtest)\n", | |
"probas_gbm" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 208, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"probas_ensemble = 0.6*probas_rf[:,1] + 0.4*probas_gbm[:,1]\n", | |
"pred_ensemble = (probas_ensemble > 0.5).astype(np.int64)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 209, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([0, 1, 1, ..., 0, 0, 1])" | |
] | |
}, | |
"execution_count": 209, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"pred_ensemble" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 207, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([0, 1, 1, ..., 0, 0, 1])" | |
] | |
}, | |
"execution_count": 207, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ptest_rf" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 211, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"32581" | |
] | |
}, | |
"execution_count": 211, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"(pred_ensemble == ptest_rf).sum()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 212, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(32648,)" | |
] | |
}, | |
"execution_count": 212, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"pred_ensemble.shape" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 213, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"ptest_rf=rf.predict(xtest)\n", | |
"with open('../results/improved_prediction_rf_ensemble.csv', 'w') as pred1:\n", | |
" csv_out = csv.writer(pred1)\n", | |
" csv_out.writerow(['id', 'category']) # write header\n", | |
" for i, row in enumerate(pred_ensemble):\n", | |
" csv_out.writerow([i, row])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Pagerank and friends take a lot of time\n", | |
"'''\n", | |
"class GraphExtended(GraphOnly):\n", | |
" def __init__(self, graph_info):\n", | |
" GraphOnly.__init__(self, graph_info)\n", | |
" # do pagerank\n", | |
" #print 'Computing Pagerank'\n", | |
" #self.pagerank = nx.pagerank(graph_info)\n", | |
" # other centralities\n", | |
" print 'Computing Closeness'\n", | |
" self.closeness_centrality = nx.betweenness_centrality(graph_info)\n", | |
" # current_flow_closeness_centrality\n", | |
" print 'Computing Flow Centrality'\n", | |
" self.flow = nx.current_flow_closeness_centrality(graph_info)\n", | |
" \n", | |
" def get_feature(self, source_id, target_id):\n", | |
" \"\"\"Graph Features\"\"\"\n", | |
" common = len(list(nx.common_neighbors(self.graph_info, source_id, target_id)))\n", | |
" d_out = self.graph_info.degree(source_id) \n", | |
" d_in = self.graph_info.degree(target_id) \n", | |
" s_flow = self.flow[source_id]\n", | |
" t_flow = self.flow[target_id]\n", | |
" #s_pagerank = self.pagerank[source_id]\n", | |
" #t_pagerank = self.pagerank[target_id]\n", | |
" s_close = self.closeness_centrality[source_id]\n", | |
" t_close = self.closeness_centrality[target_id]\n", | |
" return np.asarray((common, \n", | |
" d_out, d_in,\n", | |
" s_flow, t_flow,\n", | |
" #s_pagerank, t_pagerank,\n", | |
" s_close, t_close\n", | |
" )) \n", | |
"\n", | |
"class GraphOnlyPlus(GraphOnly):\n", | |
" def get_feature(self, source_id, target_id):\n", | |
" \"\"\"Graph Features\"\"\"\n", | |
" common = len(list(nx.common_neighbors(self.graph_info, source_id, target_id)))\n", | |
" d_out = self.graph_info.degree(source_id) \n", | |
" d_in = self.graph_info.degree(target_id) \n", | |
" jaccard = list(nx.jaccard_coefficient(\n", | |
" self.graph_info, [(source_id, target_id)]))[0][2]\n", | |
" try:\n", | |
" dst = nx.shortest_path_length(self.graph_info, \n", | |
" source=source_id, \n", | |
" target=target_id,\n", | |
" weight='WGH')\n", | |
" #dst = nx.maximum_flow_value(self.graph_info, \n", | |
" # s=source_id, \n", | |
" # t=target_id,\n", | |
" # capacity='WGH')\n", | |
" has_path = dst > 0\n", | |
" except nx.NetworkXNoPath:\n", | |
" dst = -10\n", | |
" has_path = 0\n", | |
"\n", | |
" print '{} to {} dst is {}'.format(source_id, target_id, dst )\n", | |
" \n", | |
" return np.asarray((common, \n", | |
" d_out, d_in,\n", | |
" jaccard,\n", | |
" dst,\n", | |
" has_path\n", | |
" ))\n", | |
"'''\n", | |
" " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.11" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment