In Zotero, export the collection (including its notes) to a CSV file, then run this script: it uses the PubMed API to fetch each paper's abstract, asks chatGPT to convert the abstract into bullet points, and finally assembles everything into a Marp markdown slide deck.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# title: getab
# date: "2023-09-02"
# author: Hsieh-Ting Lin, the Lizard 🦎
import re
import sys

import openai
import pandas as pd
import requests
from Bio import Entrez


def respond(prompt):
    # Send the prompt to the OpenAI completion API and return the generated text
    openai.api_key = "YOUR_API_KEY"
    completions = openai.Completion.create(
        engine="text-davinci-002",
        prompt=prompt,
        max_tokens=1000,
        n=1,
        stop=None,
        temperature=0.5,
    )
    message = completions.choices[0].text
    return message


def doi_to_pmid(doi):
    # Look up the PubMed ID for a DOI via the NCBI E-utilities esearch endpoint
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {"db": "pubmed", "term": f"{doi}[DOI]", "retmode": "json"}
    response = requests.get(base_url, params=params)
    response_json = response.json()
    if "esearchresult" in response_json and "idlist" in response_json["esearchresult"]:
        pmids = response_json["esearchresult"]["idlist"]
        if pmids:
            # Return the first PMID if there are multiple matches (which is rare)
            return pmids[0]
    return None


def get_abstract_from_pmid(pmid):
    # Fetch the article record from PubMed and pull out the abstract text
    Entrez.email = "[email protected]"
    handle = Entrez.efetch(db="pubmed", id=pmid, retmode="xml")
    records = Entrez.read(handle)
    try:
        abstract = records["PubmedArticle"][0]["MedlineCitation"]["Article"][
            "Abstract"]["AbstractText"][0]
    except (KeyError, IndexError):
        abstract = "Abstract not available"
    return abstract


def get_abstract(doi):
    pmid = doi_to_pmid(doi)
    print(pmid)
    if pmid:
        return get_abstract_from_pmid(pmid)
    else:
        print("No PMID found for this DOI")
        return None


def clean_text(string):
    # Split the string by lines
    lines = string.split("\n")
    # Filter out lines that don't start with "- "
    filtered_lines = [line for line in lines if line.startswith("- ")]
    # Add "\n\n---\n\n" separator every 3 bullet points
    grouped_lines = []
    for i in range(0, len(filtered_lines), 3):
        grouped_lines.extend(filtered_lines[i:i + 3])
        if i + 3 < len(filtered_lines):
            grouped_lines.append("\n\n---\n\n")
    # Join the lines back into a string and return
    return "\n".join(grouped_lines)


def extract_plain_text_line_by_line(string: str) -> str:
    # Split by closing tags
    lines = re.split(r"<\/[^>]+>", string)
    # Remove all other HTML tags and strip each line
    lines = [
        re.sub(r"<[^>]+>", "", line).strip() for line in lines
        if re.sub(r"<[^>]+>", "", line).strip()
    ]
    combined = "- " + "\n- ".join(lines)
    combined = clean_text(combined)
    return combined


if __name__ == "__main__":
    # Check if the correct number of arguments are passed
    if len(sys.argv) != 3:
        print("Usage: script_name.py -f filename.csv")
        sys.exit(1)
    flag, filename = sys.argv[1], sys.argv[2]
    # Check if the correct flag is used
    if flag != "-f":
        print("Usage: script_name.py -f filename.csv")
        sys.exit(1)
    # Read the CSV
    df = pd.read_csv(filename)
    doi_list = df["DOI"].dropna().unique().tolist()
    markdown_content = "# Title Page\n\n---\n\n"
    for doi in doi_list:
        # Fetch the title and note for the DOI from the DataFrame
        title = df[df["DOI"] == doi]["Title"].iloc[0]
        note = df[df["DOI"] == doi]["Notes"].iloc[0]
        if isinstance(note, str) and len(note) > 0:
            cleaned_note = (f"## {title}\n\n### Highlights\n\n" +
                            extract_plain_text_line_by_line(note) +
                            "\n\n---\n\n")
        else:
            cleaned_note = ""
        print(title)
        abstract = get_abstract(doi)  # Get the abstract using the DOI
        response = respond(
            f"Convert the following text into markdown bullet points, like '- content\n- content'. Here is the context:\n{abstract}"
        )
        lines = [line for line in response.splitlines() if line.strip()]
        bullet_points = "\n".join(lines)
        bullet_points = clean_text(bullet_points)
        # Fetch the publication year for the DOI
        year = df[df["DOI"] == doi]["Publication Year"].iloc[0]
        # Format the information in the desired markdown structure
        markdown_content += f"{cleaned_note}## {title}\n\n### Summary\n\n{bullet_points}\n\n<!-- {abstract} -->\n\n> {title} ({year}). DOI: {doi}\n\n---\n\n"
    markdown_content += "\n\n## Thank You for Listening\n"
    # Export the markdown content to a .md file
    with open("output.md", "w") as file:
        file.write(markdown_content)
Automatically generating slides
Introduction
🪴 I have shared before that when presenting a journal article, it is best to skim every reference so the presentation is more substantial. But while preparing a recent journal reading, I found I was spending far too much time as a manual laborer, endlessly copying and pasting material from the paper PDFs into my slides.
🪴 Just working through these papers (reading them, summarizing them, listing the key points) already feels like enough screen time for one day; having to turn them into slides on top of that has a very poor return on effort and feels like a waste of life. I would rather sit through ten awkward minutes and sleep three more hours. So I have long been wondering how to automate as much of this workflow as possible.
Method
🪴 In Zotero (a reference manager), install the muisedestiny/zotero-reference plugin. It automatically lists a paper's citations, so I can add them all to a collection in one go. I then work through those papers at my own pace and highlight the key sentences in Zotero. Zotero has a nice feature here: highlights can be added to the item's notes as you mark them. Finally, I export the collection as a CSV file.
🪴 This CSV file is the distilled essence: it contains each paper's title, DOI, and my note content. Then I can run
python csv_to_marp.py -f "my_collection.csv"
and this script (I had chatGPT write most of the code for me) generates the slide deck, inserting `---` to break the output into separate slides.
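🪴 As a quick sanity check before running it: the only columns the script actually reads from the Zotero export are DOI, Title, Notes, and Publication Year. A minimal sketch to confirm they are present, reusing the same example file name as above:

import pandas as pd

# Columns csv_to_marp.py looks up; anything else Zotero exports is simply ignored
required = ["DOI", "Title", "Notes", "Publication Year"]
df = pd.read_csv("my_collection.csv")
missing = [col for col in required if col not in df.columns]
if missing:
    raise SystemExit(f"CSV is missing columns: {missing}")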
Result
🪴 The finished product looks roughly like the screenshot. It spares me the mindless copy-and-paste work; at most I still drop in a few important figures and tables. In the time it takes to visit the bathroom, I get a slide deck with the titles and bulleted summaries of 30 papers, which genuinely puts me in a good mood.
Discussion
🪴 That said, this approach has a bit of a technical barrier to entry: you need to know what markdown is, how to manage references with Zotero, why DOIs are useful, how to call the PubMed and ChatGPT APIs from Python and read a CSV, and how to use Marp to turn the markdown into slides.
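🪴 For that last Marp step, once output.md exists the deck can typically be rendered with a single command such as `marp output.md -o slides.pdf` (illustrative only, assuming marp-cli is installed).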
🪴 Other possible uses come to mind: run an advanced PubMed search such as "articles published in NEJM in the past week" or "recently published trials on some topic", save all of the entries, run the pipeline above, and then add edge-tts (which I have shared before) so the Microsoft voice reads the result into an mp3. That would give me a DIY weekly Chinese-language podcast.
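🪴 A minimal sketch of that text-to-speech step, assuming the edge-tts Python package; the voice name, sample text, and output file are placeholders chosen for illustration, not part of the original script:

import asyncio

import edge_tts


async def summary_to_mp3(text: str, out_path: str = "weekly_digest.mp3") -> None:
    # "zh-TW-HsiaoChenNeural" is one of the Taiwanese Mandarin voices available through edge-tts
    communicate = edge_tts.Communicate(text, voice="zh-TW-HsiaoChenNeural")
    await communicate.save(out_path)


if __name__ == "__main__":
    asyncio.run(summary_to_mp3("這週的 NEJM 重點摘要測試。"))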
🪴 Medical and academic institutions are welcome to reach out and collaborate 🙏