rossant · November 15, 2024 15:36
diff --git a/publications.py b/publications.py
 import requests
 import pandas as pd

 def get_author_id(author_name, timeout=30):
    """Retrieve the OpenAlex author ID for the first match of ``author_name``.

    The name is handed to ``requests`` through ``params`` so that it is
    URL-encoded; interpolating it straight into the URL would let characters
    such as ``&`` or ``#`` corrupt the query string.

    Args:
        author_name: Free-text author name to search for.
        timeout: Seconds to wait for the API before ``requests`` gives up,
            so that an unresponsive server cannot hang the script forever.

    Returns:
        The ``id`` field of the first search result, or ``None`` when the
        response contains no results.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    response = requests.get(
        "https://api.openalex.org/authors",
        params={'search': author_name},
        timeout=timeout,
    )
    response.raise_for_status()
    # Tolerate a missing or null 'results' entry instead of raising KeyError.
    results = response.json().get('results') or []
    if results:
        return results[0]['id']
    return None

 def _summarize_work(work):
    """Flatten one OpenAlex work record into a row for the publications sheet."""
    names = [
        (authorship.get('author') or {}).get('display_name') or ''
        for authorship in work.get('authorships') or []
    ]
    # Append 'et al.' only when authors were actually left out: a work with
    # exactly three authors lists all of them.
    authors = ', '.join(names[:3])
    if len(names) > 3:
        authors += ' et al.'
    source = (work.get('primary_location') or {}).get('source') or {}
    return {
        'Date': work.get('publication_date', ''),
        'Authors': authors,
        'Title': work.get('display_name', ''),
        'Publication Name': source.get('display_name', ''),
        'Cited By': work.get('cited_by_count', 0),
    }


 def get_author_metrics_and_publications(author_id, timeout=30):
    """Fetch an author's metrics and publication list from the OpenAlex API.

    The works endpoint is read page by page so that authors with more than
    one page of works (200 records) are not silently truncated.

    Args:
        author_id: Author identifier as returned by ``get_author_id``.
        timeout: Seconds to wait for each API call before ``requests`` gives up.

    Returns:
        A tuple ``(h_index, total_works, total_citations, publications)``.
        ``publications`` is a list of dicts with the keys ``'Date'``,
        ``'Authors'``, ``'Title'``, ``'Publication Name'`` and ``'Cited By'``,
        in the order returned by the API (requested newest first).

    Raises:
        requests.HTTPError: If any API call answers with an error status.
    """
    response = requests.get(f"https://api.openalex.org/{author_id}", timeout=timeout)
    response.raise_for_status()
    author_details = response.json()

    # ``or {}`` guards against the API sending an explicit null.
    h_index = (author_details.get('summary_stats') or {}).get('h_index', 0)
    total_works = author_details.get('works_count', 0)
    total_citations = author_details.get('cited_by_count', 0)

    per_page = 200
    # NOTE(review): page-based paging on OpenAlex is assumed to stop at
    # 10,000 records - confirm against the API docs. The cap keeps the loop
    # from requesting pages the API would reject.
    max_pages = 10000 // per_page
    publications = []
    for page in range(1, max_pages + 1):
        works_url = (
            "https://api.openalex.org/works"
            f"?filter=author.id:{author_id}&per_page={per_page}"
            f"&sort=publication_date:desc&page={page}"
        )
        response = requests.get(works_url, timeout=timeout)
        response.raise_for_status()
        works_data = response.json()
        results = works_data.get('results') or []
        publications.extend(_summarize_work(work) for work in results)

        # Stop once everything the API reports has been collected; a missing
        # count falls back to the single-page behaviour.
        reported_total = (works_data.get('meta') or {}).get('count') or 0
        if not results or len(publications) >= reported_total:
            break

    return h_index, total_works, total_citations, publications

 def save_publications_to_excel(author_name, publications):
    """Write one author's publications to "<author_name> - publications.xlsx".

    The rows are dumped without the index column, the column widths are then
    fitted by ``adjust_excel``, and the output path is reported on stdout.
    """
    target = f"{author_name} - publications.xlsx"
    pd.DataFrame(publications).to_excel(target, index=False)
    adjust_excel(target)
    print(f"Saved publications to {target}")

 def save_scores_to_excel(scores):
    """Write the per-author metric rows to ``scores.xlsx``.

    Each row of ``scores`` supplies, in order, the values for the columns
    'Author', 'h-index', 'Articles' and 'Citations'. The sheet is written
    without the index column and its widths are fitted by ``adjust_excel``.
    """
    headers = ['Author', 'h-index', 'Articles', 'Citations']
    target = "scores.xlsx"
    pd.DataFrame(scores, columns=headers).to_excel(target, index=False)
    adjust_excel(target)
    print("Saved metrics to scores.xlsx")

 def adjust_excel(path):
    """Fit the column widths of 'Sheet1' in the workbook at ``path``.

    The workbook is reopened in append/overlay mode with the openpyxl engine.
    Every column is widened to the length of its longest cell value (as
    rendered by ``str``) plus two characters of padding.
    """
    with pd.ExcelWriter(path, engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
        sheet = writer.sheets['Sheet1']
        for cells in sheet.columns:
            lengths = [len(str(cell.value)) for cell in cells]
            widest = max(lengths)
            letter = cells[0].column_letter
            sheet.column_dimensions[letter].width = widest + 2

 def main(authors):
    """Collect metrics and publication lists for every author and save them.

    For each name, the author is looked up, the publications are written to
    a per-author workbook, and the metrics are gathered into one scores
    workbook that is written once all names have been processed.

    Args:
        authors: Iterable of author names; surrounding whitespace (including
            line endings) is stripped and blank entries are ignored.
    """
    scores = []

    for raw_name in authors:
        author_name = raw_name.strip()
        if not author_name:
            # A blank line would otherwise be sent as an empty search and
            # could attach an unrelated author's data to an empty name.
            continue

        print(f"Processing author: {author_name}")
        author_id = get_author_id(author_name)
        if not author_id:
            print(f"Author '{author_name}' not found.")
            continue

        h_index, total_works, total_citations, publications = get_author_metrics_and_publications(author_id)

        # Save individual publication list to Excel
        save_publications_to_excel(author_name, publications)

        # Collect metrics for the scores file
        scores.append([author_name, h_index, total_works, total_citations])

    # Save all scores to a single Excel file
    save_scores_to_excel(scores)
    print("Data collection complete.")

 # Read the author names, one per line, from authors.txt and process them all.

 with open('authors.txt', 'r') as f:
    author_list = list(f)

 main(author_list)
	import requests
	import pandas as pd

	def get_author_id(author_name, timeout=30):
	    """Retrieve the OpenAlex author ID for the first match of ``author_name``.

	    The name is handed to ``requests`` through ``params`` so that it is
	    URL-encoded; interpolating it straight into the URL would let characters
	    such as ``&`` or ``#`` corrupt the query string.

	    Args:
	        author_name: Free-text author name to search for.
	        timeout: Seconds to wait for the API before ``requests`` gives up,
	            so that an unresponsive server cannot hang the script forever.

	    Returns:
	        The ``id`` field of the first search result, or ``None`` when the
	        response contains no results.

	    Raises:
	        requests.HTTPError: If the API answers with an error status.
	    """
	    response = requests.get(
	        "https://api.openalex.org/authors",
	        params={'search': author_name},
	        timeout=timeout,
	    )
	    response.raise_for_status()
	    # Tolerate a missing or null 'results' entry instead of raising KeyError.
	    results = response.json().get('results') or []
	    if results:
	        return results[0]['id']
	    return None

	def _summarize_work(work):
	    """Flatten one OpenAlex work record into a row for the publications sheet."""
	    names = [
	        (authorship.get('author') or {}).get('display_name') or ''
	        for authorship in work.get('authorships') or []
	    ]
	    # Append 'et al.' only when authors were actually left out: a work with
	    # exactly three authors lists all of them.
	    authors = ', '.join(names[:3])
	    if len(names) > 3:
	        authors += ' et al.'
	    source = (work.get('primary_location') or {}).get('source') or {}
	    return {
	        'Date': work.get('publication_date', ''),
	        'Authors': authors,
	        'Title': work.get('display_name', ''),
	        'Publication Name': source.get('display_name', ''),
	        'Cited By': work.get('cited_by_count', 0),
	    }


	def get_author_metrics_and_publications(author_id, timeout=30):
	    """Fetch an author's metrics and publication list from the OpenAlex API.

	    The works endpoint is read page by page so that authors with more than
	    one page of works (200 records) are not silently truncated.

	    Args:
	        author_id: Author identifier as returned by ``get_author_id``.
	        timeout: Seconds to wait for each API call before ``requests`` gives up.

	    Returns:
	        A tuple ``(h_index, total_works, total_citations, publications)``.
	        ``publications`` is a list of dicts with the keys ``'Date'``,
	        ``'Authors'``, ``'Title'``, ``'Publication Name'`` and ``'Cited By'``,
	        in the order returned by the API (requested newest first).

	    Raises:
	        requests.HTTPError: If any API call answers with an error status.
	    """
	    response = requests.get(f"https://api.openalex.org/{author_id}", timeout=timeout)
	    response.raise_for_status()
	    author_details = response.json()

	    # ``or {}`` guards against the API sending an explicit null.
	    h_index = (author_details.get('summary_stats') or {}).get('h_index', 0)
	    total_works = author_details.get('works_count', 0)
	    total_citations = author_details.get('cited_by_count', 0)

	    per_page = 200
	    # NOTE(review): page-based paging on OpenAlex is assumed to stop at
	    # 10,000 records - confirm against the API docs. The cap keeps the loop
	    # from requesting pages the API would reject.
	    max_pages = 10000 // per_page
	    publications = []
	    for page in range(1, max_pages + 1):
	        works_url = (
	            "https://api.openalex.org/works"
	            f"?filter=author.id:{author_id}&per_page={per_page}"
	            f"&sort=publication_date:desc&page={page}"
	        )
	        response = requests.get(works_url, timeout=timeout)
	        response.raise_for_status()
	        works_data = response.json()
	        results = works_data.get('results') or []
	        publications.extend(_summarize_work(work) for work in results)

	        # Stop once everything the API reports has been collected; a missing
	        # count falls back to the single-page behaviour.
	        reported_total = (works_data.get('meta') or {}).get('count') or 0
	        if not results or len(publications) >= reported_total:
	            break

	    return h_index, total_works, total_citations, publications

	def save_publications_to_excel(author_name, publications):
	    """Write one author's publications to "<author_name> - publications.xlsx".

	    The rows are dumped without the index column, the column widths are then
	    fitted by ``adjust_excel``, and the output path is reported on stdout.
	    """
	    file_name = f"{author_name} - publications.xlsx"
	    pd.DataFrame(publications).to_excel(file_name, index=False)
	    adjust_excel(file_name)
	    print(f"Saved publications to {file_name}")

	def save_scores_to_excel(scores):
	    """Write the per-author metric rows to ``scores.xlsx``.

	    Each row of ``scores`` supplies, in order, the values for the columns
	    'Author', 'h-index', 'Articles' and 'Citations'. The sheet is written
	    without the index column and its widths are fitted by ``adjust_excel``.
	    """
	    df_scores = pd.DataFrame(scores, columns=['Author', 'h-index', 'Articles', 'Citations'])
	    file_name = "scores.xlsx"
	    df_scores.to_excel(file_name, index=False)
	    adjust_excel(file_name)
	    print("Saved metrics to scores.xlsx")

	def adjust_excel(path):
	    """Fit the column widths of 'Sheet1' in the workbook at ``path``.

	    The workbook is reopened in append/overlay mode with the openpyxl engine.
	    Every column is widened to the length of its longest cell value (as
	    rendered by ``str``) plus two characters of padding.
	    """
	    with pd.ExcelWriter(path, engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
	        worksheet = writer.sheets['Sheet1']
	        for col in worksheet.columns:
	            max_length = max(len(str(cell.value)) for cell in col)
	            col_letter = col[0].column_letter
	            worksheet.column_dimensions[col_letter].width = max_length + 2

	def main(authors):
	    """Collect metrics and publication lists for every author and save them.

	    For each name, the author is looked up, the publications are written to
	    a per-author workbook, and the metrics are gathered into one scores
	    workbook that is written once all names have been processed.

	    Args:
	        authors: Iterable of author names; surrounding whitespace (including
	            line endings) is stripped and blank entries are ignored.
	    """
	    scores = []

	    for raw_name in authors:
	        author_name = raw_name.strip()
	        if not author_name:
	            # A blank line would otherwise be sent as an empty search and
	            # could attach an unrelated author's data to an empty name.
	            continue

	        print(f"Processing author: {author_name}")
	        author_id = get_author_id(author_name)
	        if not author_id:
	            print(f"Author '{author_name}' not found.")
	            continue

	        h_index, total_works, total_citations, publications = get_author_metrics_and_publications(author_id)

	        # Save individual publication list to Excel
	        save_publications_to_excel(author_name, publications)

	        # Collect metrics for the scores file
	        scores.append([author_name, h_index, total_works, total_citations])

	    # Save all scores to a single Excel file
	    save_scores_to_excel(scores)
	    print("Data collection complete.")

	# Read the author names, one per line, from authors.txt and process them all.

	with open('authors.txt', 'r') as f:
	    author_list = f.readlines()

	main(author_list)