|
""" |
|
OVERVIEW: This script will automatically export a list of all your co-authors and |
|
their institutions to an Excel file based on information already in the Scopus database. |
|
|
|
LIMITATIONS: |
|
1. Only up to 150 collaborators are supported by Scopus. |
|
2. Sometimes, you want to filter by collaborators for only the last 4 years. Unfortunately, |
|
there is no simple way to do this. |
|
|
|
INSTRUCTIONS: |
|
|
|
0. Make sure you have the "pandas" and "bs4" (BeautifulSoup) libraries installed |
|
1. Find your author profile on scopus.com, e.g. https://www.scopus.com/authid/detail.uri?authorId=7404463800 |
|
2. There should be some text above some other results that states how many co-authors you have, (i) click that text AND (ii) also click "View in Search Results Format" |
|
3. Click "exclude" to exclude institutions from the proposal (e.g., your home institution) or other exclusionary factors. Use the sidebar to click some attributes then click "Exclude". |
|
--->Remember you are limited to 150 results. So excluding results might be important!! |
|
4. Make sure you display 200 results per page so you get everything |
|
5. Export the HTML page as a file called "coauthors.html" and move it to your Desktop. |
|
|
|
RUN!! You should see a file called coauthors.xlsx appear on the Desktop.
|
""" |
|
|
|
|
|
def cond(x):
    """Return True when *x* is a non-empty id beginning with "resultDataRow".

    Used as the ``id=`` predicate for BeautifulSoup's ``find_all`` so that
    only table rows that actually carry co-author data are matched
    (rows with no id, or an unrelated id, are skipped).
    """
    return bool(x) and x.startswith("resultDataRow")
|
|
|
|
|
if __name__ == "__main__":

    import os

    import bs4 as bs

    from pandas import DataFrame

    in_path = os.path.expanduser("~/Desktop/coauthors.html")  # location containing the input HTML file from Scopus
    out_path = os.path.expanduser("~/Desktop/coauthors.xlsx")  # location to place the output Excel file
    sort_col = "institution"  # sort the data at the end by this column

    # Read the search-results page saved from Scopus.
    with open(in_path) as f:
        source = f.read()

    soup = bs.BeautifulSoup(source, features="lxml")

    # The exported page holds one table of results; each data row has an id
    # starting with "resultDataRow" (see cond above). Fail loudly if the table
    # is missing -- that usually means the wrong page was saved.
    table = soup.find('table', id='srchResultsList')
    if table is None:
        raise SystemExit(
            "Could not find the results table 'srchResultsList' in "
            + in_path
            + ". Did you export the 'Search Results Format' page?"
        )
    rows = table.find_all('tr', id=cond)

    # Keep every <td> cell's text, including empty cells, so that column
    # positions stay stable (the institution is always the 4th column).
    raw_data = []
    for row in rows:
        cols = row.find_all('td')
        raw_data.append([elem.text.strip() for elem in cols])

    processed_data = []

    for x in raw_data:
        # The first cell is "Lastname, Firstname" followed by extra lines
        # (profile links etc.); keep only the first line.
        author = x[0].split("\n")[0]
        # partition (unlike split(",")[1]) cannot raise when a name has no
        # comma -- firstname is then simply empty.
        lastname, _, firstname = author.partition(",")
        institution = x[3]
        processed_data.append([lastname.strip(), firstname.strip(), institution])

    df = DataFrame(processed_data, columns=["lastname", "firstname", "institution"])
    df.sort_values(by=[sort_col], inplace=True)
    df.to_excel(out_path, index=False)