Skip to content

Instantly share code, notes, and snippets.

@rossant
Created November 15, 2024 15:36
Show Gist options
  • Save rossant/c61d25582fc9cdef9ac499f0b26cab81 to your computer and use it in GitHub Desktop.
Save rossant/c61d25582fc9cdef9ac499f0b26cab81 to your computer and use it in GitHub Desktop.
Publications script
import requests
import pandas as pd
def get_author_id(author_name):
    """Retrieve the OpenAlex author ID of the first search hit for a name.

    Args:
        author_name: Free-text author name sent to the OpenAlex author search.

    Returns:
        The ``id`` field of the first search result, or ``None`` when the
        search yields no results.

    Raises:
        requests.HTTPError: If OpenAlex answers with an error status code.
        requests.Timeout: If the server does not respond within the timeout.
    """
    # Hand the name to requests via ``params`` so it is percent-encoded;
    # spaces, ``&`` or ``#`` in a name would otherwise corrupt the query.
    response = requests.get(
        "https://api.openalex.org/authors",
        params={'search': author_name},
        timeout=30,  # do not hang forever on a stalled connection
    )
    response.raise_for_status()
    results = response.json().get('results')
    if results:
        return results[0]['id']
    return None
def _format_authors(names):
    """Join author names, abbreviating to the first three plus 'et al.'."""
    shown = ', '.join(names[:3])
    # Only claim "et al." when some authors were actually left out.
    if len(names) > 3:
        return shown + ' et al.'
    return shown


def _summarize_work(work):
    """Reduce one OpenAlex work record to the columns written to Excel."""
    # ``author`` / ``display_name`` may be present but null; fall back to ''
    # so that the later ``join`` never sees a non-string.
    names = [
        (authorship.get('author') or {}).get('display_name') or ''
        for authorship in work.get('authorships') or []
    ]
    # ``primary_location`` and its ``source`` may each be null as well.
    source = (work.get('primary_location') or {}).get('source') or {}
    return {
        'Date': work.get('publication_date', ''),
        'Authors': _format_authors(names),
        'Title': work['display_name'],
        'Publication Name': source.get('display_name', ''),
        'Cited By': work.get('cited_by_count', 0),
    }


def get_author_metrics_and_publications(author_id):
    """Fetch author metrics and publications from the OpenAlex API.

    Args:
        author_id: Author identifier as returned by ``get_author_id``; it is
            appended to ``https://api.openalex.org/`` and used in the
            ``author.id`` works filter.

    Returns:
        A tuple ``(h_index, total_works, total_citations, publications)``.
        ``publications`` is a list of dicts with the keys ``'Date'``,
        ``'Authors'``, ``'Title'``, ``'Publication Name'`` and ``'Cited By'``,
        requested sorted by publication date, newest first.

    Raises:
        requests.HTTPError: If OpenAlex answers with an error status code.
        requests.Timeout: If the server does not respond within the timeout.
    """
    response = requests.get(f"https://api.openalex.org/{author_id}", timeout=30)
    response.raise_for_status()
    author_details = response.json()

    # Extract metrics
    h_index = (author_details.get('summary_stats') or {}).get('h_index', 0)
    total_works = author_details.get('works_count', 0)
    total_citations = author_details.get('cited_by_count', 0)

    # Fetch publications sorted by publication date (descending). A single
    # page holds at most 200 works, so follow the cursor until it runs out
    # instead of silently truncating long publication lists.
    publications = []
    cursor = '*'
    while cursor:
        response = requests.get(
            "https://api.openalex.org/works",
            params={
                'filter': f"author.id:{author_id}",
                'per_page': 200,
                'sort': 'publication_date:desc',
                'cursor': cursor,
            },
            timeout=30,
        )
        response.raise_for_status()
        page = response.json()
        results = page.get('results') or []
        if not results:
            break
        publications.extend(_summarize_work(work) for work in results)
        cursor = (page.get('meta') or {}).get('next_cursor')

    return h_index, total_works, total_citations, publications
def save_publications_to_excel(author_name, publications):
    """Write one author's publications to '<author_name> - publications.xlsx'.

    The rows are written without the DataFrame index, the workbook is then
    handed to ``adjust_excel``, and the output path is printed.
    """
    workbook_path = f"{author_name} - publications.xlsx"
    pd.DataFrame(publications).to_excel(workbook_path, index=False)
    adjust_excel(workbook_path)
    print(f"Saved publications to {workbook_path}")
def save_scores_to_excel(scores):
    """Write the per-author metric rows to 'scores.xlsx'.

    ``scores`` supplies one row per author, matching the four column
    headers below. The workbook is written without the DataFrame index and
    then handed to ``adjust_excel``.
    """
    workbook_path = "scores.xlsx"
    header = ['Author', 'h-index', 'Articles', 'Citations']
    table = pd.DataFrame(scores, columns=header)
    table.to_excel(workbook_path, index=False)
    adjust_excel(workbook_path)
    print("Saved metrics to scores.xlsx")
def adjust_excel(path):
    """Fit the column widths of 'Sheet1' in the workbook at ``path``.

    The workbook is reopened through pandas' openpyxl writer in append mode.
    Each column's width is set to the length of its longest cell value
    (rendered with ``str``) plus two.
    """
    writer_options = {
        'engine': 'openpyxl',
        'mode': 'a',
        'if_sheet_exists': 'overlay',
    }
    with pd.ExcelWriter(path, **writer_options) as writer:
        sheet = writer.sheets['Sheet1']
        for column_cells in sheet.columns:
            widest = max([len(str(cell.value)) for cell in column_cells])
            letter = column_cells[0].column_letter
            sheet.column_dimensions[letter].width = widest + 2
def main(authors):
    """Export publications and metrics for every author name given.

    For each name, the author is looked up, their publication list is saved
    to an individual Excel file, and their metrics are collected. After all
    names are processed, the collected metrics are saved to one Excel file.

    Args:
        authors: Iterable of author names. Surrounding whitespace (including
            trailing newlines) is stripped, and blank entries are skipped.
    """
    scores = []
    for raw_name in authors:
        author_name = raw_name.strip()
        if not author_name:
            # Blank lines (e.g. a trailing empty line in the input file) would
            # otherwise be looked up as an empty search and could be saved
            # under the file name " - publications.xlsx".
            continue
        print(f"Processing author: {author_name}")
        author_id = get_author_id(author_name)
        if not author_id:
            print(f"Author '{author_name}' not found.")
            continue
        h_index, total_works, total_citations, publications = (
            get_author_metrics_and_publications(author_id)
        )
        # Save individual publication list to Excel
        save_publications_to_excel(author_name, publications)
        # Collect metrics for the scores file
        scores.append([author_name, h_index, total_works, total_citations])
    # Save all scores to a single Excel file
    save_scores_to_excel(scores)
    print("Data collection complete.")
# List of authors to process: one name per line in authors.txt.
# The encoding is given explicitly so accented names are not decoded with the
# platform default; "utf-8-sig" also drops a leading byte-order mark, which
# would otherwise stick to the first author's name.
with open('authors.txt', 'r', encoding='utf-8-sig') as f:
    author_list = f.readlines()
main(author_list)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment