Created
November 15, 2024 15:36
-
-
Save rossant/c61d25582fc9cdef9ac499f0b26cab81 to your computer and use it in GitHub Desktop.
Publications script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import pandas as pd | |
def get_author_id(author_name):
    """Retrieve the OpenAlex author ID of the first match for a name search.

    Args:
        author_name: Free-text author name to search for.

    Returns:
        The ``id`` field of the first search result, or ``None`` when the
        search returns no results.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not answer within the timeout.
    """
    # Pass the name through ``params`` so requests percent-encodes it; names
    # containing spaces, '&', '#' or non-ASCII characters would otherwise
    # corrupt the query string.
    response = requests.get(
        "https://api.openalex.org/authors",
        params={"search": author_name},
        timeout=30,  # never hang forever on a stalled connection
    )
    response.raise_for_status()
    results = response.json().get('results') or []
    if results:
        return results[0]['id']
    return None
def _format_authors(names, limit=3):
    """Join author names, adding ' et al.' only when some names are omitted."""
    shown = ', '.join(names[:limit])
    if len(names) > limit:
        return shown + ' et al.'
    return shown


def _work_to_row(work):
    """Convert one OpenAlex work record into a flat row for the spreadsheet."""
    # Any of these nested objects may be null in the API response, so fall
    # back to an empty container/string at every level.
    author_names = [
        ((authorship or {}).get('author') or {}).get('display_name') or ''
        for authorship in work.get('authorships') or []
    ]
    source = (work.get('primary_location') or {}).get('source') or {}
    return {
        'Date': work.get('publication_date', ''),
        'Authors': _format_authors(author_names),
        'Title': work['display_name'],
        'Publication Name': source.get('display_name', ''),
        'Cited By': work.get('cited_by_count', 0),
        # 'DOI': work.get('doi', ''),
    }


def get_author_metrics_and_publications(author_id):
    """Fetch author metrics and publications from the OpenAlex API.

    Args:
        author_id: OpenAlex author ID as returned by ``get_author_id``.

    Returns:
        A tuple ``(h_index, total_works, total_citations, publications)``
        where ``publications`` is a list of dicts with the keys 'Date',
        'Authors', 'Title', 'Publication Name' and 'Cited By', sorted by
        publication date (newest first).

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not answer within the timeout.
    """
    author_details_url = f"https://api.openalex.org/{author_id}"
    response = requests.get(author_details_url, timeout=30)
    response.raise_for_status()
    author_details = response.json()

    # Extract metrics
    h_index = (author_details.get('summary_stats') or {}).get('h_index', 0)
    total_works = author_details.get('works_count', 0)
    total_citations = author_details.get('cited_by_count', 0)

    # Fetch publications sorted by publication date (descending)
    works_url = (
        f"https://api.openalex.org/works?filter=author.id:{author_id}"
        "&per_page=200&sort=publication_date:desc"
    )
    publications = []
    # A single request returns at most one page of 200 works, so follow the
    # API's cursor until it runs out; otherwise prolific authors would get a
    # silently truncated list.
    cursor = '*'
    while cursor:
        response = requests.get(works_url, params={'cursor': cursor}, timeout=30)
        response.raise_for_status()
        works_data = response.json()
        results = works_data.get('results') or []
        if not results:
            break
        publications.extend(_work_to_row(work) for work in results)
        cursor = (works_data.get('meta') or {}).get('next_cursor')

    return h_index, total_works, total_citations, publications
def save_publications_to_excel(author_name, publications):
    """Write one author's publications to '<author_name> - publications.xlsx'.

    The rows are written without an index column, then the column widths of
    the resulting workbook are adjusted via ``adjust_excel``.
    """
    file_name = f"{author_name} - publications.xlsx"
    pd.DataFrame(publications).to_excel(file_name, index=False)
    adjust_excel(file_name)
    print(f"Saved publications to {file_name}")
def save_scores_to_excel(scores):
    """Write the per-author metric rows to 'scores.xlsx'.

    Each entry of ``scores`` supplies, in order, the values for the
    'Author', 'h-index', 'Articles' and 'Citations' columns.
    """
    file_name = "scores.xlsx"
    header = ['Author', 'h-index', 'Articles', 'Citations']
    table = pd.DataFrame(scores, columns=header)
    table.to_excel(file_name, index=False)
    adjust_excel(file_name)
    print("Saved metrics to scores.xlsx")
def adjust_excel(path):
    """Widen each column of 'Sheet1' in the workbook at *path* to fit its text.

    The workbook is reopened through ``pd.ExcelWriter`` in append mode with
    the openpyxl engine; every column width is set to the length of its
    longest cell value (as a string) plus two characters of padding.
    """
    with pd.ExcelWriter(path, engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
        sheet = writer.sheets['Sheet1']
        for column_cells in sheet.columns:
            text_lengths = [len(str(cell.value)) for cell in column_cells]
            letter = column_cells[0].column_letter
            sheet.column_dimensions[letter].width = max(text_lengths) + 2
def main(authors):
    """Export publications and metrics for every author name in *authors*.

    For each non-blank name, the author is looked up on OpenAlex, the
    publication list is saved to its own Excel file, and the metrics are
    collected. After all names are processed, the collected metrics are
    saved to a single Excel file.

    Args:
        authors: Iterable of author names; surrounding whitespace (such as
            trailing newlines) is stripped and blank entries are skipped.
    """
    scores = []
    for raw_name in authors:
        author_name = raw_name.strip()
        if not author_name:
            # A blank line is not a meaningful author lookup; skip it rather
            # than searching for an empty string.
            continue

        print(f"Processing author: {author_name}")
        author_id = get_author_id(author_name)
        if not author_id:
            print(f"Author '{author_name}' not found.")
            continue

        h_index, total_works, total_citations, publications = (
            get_author_metrics_and_publications(author_id)
        )

        # Save individual publication list to Excel
        save_publications_to_excel(author_name, publications)

        # Collect metrics for the scores file
        scores.append([author_name, h_index, total_works, total_citations])

    # Save all scores to a single Excel file
    save_scores_to_excel(scores)
    print("Data collection complete.")
# List of authors to process, one name per line.
# Read as UTF-8 explicitly so accented names decode the same way on every
# platform; the "-sig" variant also drops a leading byte-order mark, which
# would otherwise end up inside the first author's name.
with open('authors.txt', 'r', encoding='utf-8-sig') as f:
    author_list = f.readlines()

main(author_list)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment