StevenSong · May 16, 2023 15:15
diff --git a/extract-pubmed-abstracts.py b/extract-pubmed-abstracts.py
 import os
 import re
 import gzip
 import hashlib
 from tqdm import tqdm
 from bs4 import BeautifulSoup, SoupStrainer

 # Gather the PubMed XML files in a deterministic (sorted) order.
 fnames = sorted(
     os.path.join('PubMed', fname)
     for fname in os.listdir('PubMed')
     if fname.endswith('.xml')
 )

 # Only these two elements are parsed and written out.
 wanted_tags = ['ArticleTitle', 'AbstractText']
 strainer = SoupStrainer(wanted_tags)
 # Matches a trailing "(ABSTRACT ... AT 250/400 WORDS)" style marker.
 # Raw string so the regex escapes are not read as (invalid) string escapes;
 # compiled once here instead of once per element.
 # I've seen lots of typos or truncations of " TRUNCATED", seems like ABSTRACT usually makes it through okay
 truncation_re = re.compile(r' *\(ABSTRACT.{0,10}( AT (250|400) WORDS)?\)?[\. ]*$')
 title_count = 0  # titles written to the output file
 abstract_count = 0  # written titles that were followed by a written abstract
 last_was_title = False
 # Explicit UTF-8 so the output does not depend on the platform default encoding.
 with open('pubmed-immune-abstracts.txt', 'w', encoding='utf-8') as out:
    for fname in tqdm(fnames):
        # read titles + abstracts; binary mode lets the XML parser decode the
        # bytes itself instead of relying on the locale's default encoding
        with open(fname, 'rb') as fp:
            soup = BeautifulSoup(fp, 'xml', parse_only=strainer)

        # A bare find_all() returns every tag in the soup, including any
        # markup nested inside a title or abstract, which would then be
        # written a second time by the AbstractText branch below.
        for x in soup.find_all(wanted_tags):
            is_title = x.name == 'ArticleTitle'
            line = truncation_re.sub('', x.text)
            if 'immun' not in line.lower():
                if is_title:
                    # A skipped title begins another article, so a later
                    # abstract must not be credited to the previous title.
                    last_was_title = False
                continue
            if is_title:
                out.write(f'\n{line}\n')
                last_was_title = True
                title_count += 1
            else: # AbstractText
                out.write(f'{line}\n')
                # Count each written title at most once, even when several
                # abstract sections follow it.
                if last_was_title:
                    abstract_count += 1
                last_was_title = False

 print(f'Saved {title_count} titles with {abstract_count} abstracts.')
	import os
	import re
	import gzip
	import hashlib
	from tqdm import tqdm
	from bs4 import BeautifulSoup, SoupStrainer

	# Gather the PubMed XML files in a deterministic (sorted) order.
	fnames = sorted(
	    os.path.join('PubMed', fname)
	    for fname in os.listdir('PubMed')
	    if fname.endswith('.xml')
	)

	# Only these two elements are parsed and written out.
	wanted_tags = ['ArticleTitle', 'AbstractText']
	strainer = SoupStrainer(wanted_tags)
	# Matches a trailing "(ABSTRACT ... AT 250/400 WORDS)" style marker.
	# Raw string so the regex escapes are not read as (invalid) string escapes;
	# compiled once here instead of once per element. The alternation uses a
	# plain `|` (an escaped `\|` would only match a literal pipe character).
	# I've seen lots of typos or truncations of " TRUNCATED", seems like ABSTRACT usually makes it through okay
	truncation_re = re.compile(r' *\(ABSTRACT.{0,10}( AT (250|400) WORDS)?\)?[\. ]*$')
	title_count = 0  # titles written to the output file
	abstract_count = 0  # written titles that were followed by a written abstract
	last_was_title = False
	# Explicit UTF-8 so the output does not depend on the platform default encoding.
	with open('pubmed-immune-abstracts.txt', 'w', encoding='utf-8') as out:
	    for fname in tqdm(fnames):
	        # read titles + abstracts; binary mode lets the XML parser decode the
	        # bytes itself instead of relying on the locale's default encoding
	        with open(fname, 'rb') as fp:
	            soup = BeautifulSoup(fp, 'xml', parse_only=strainer)

	        # A bare find_all() returns every tag in the soup, including any
	        # markup nested inside a title or abstract, which would then be
	        # written a second time by the AbstractText branch below.
	        for x in soup.find_all(wanted_tags):
	            is_title = x.name == 'ArticleTitle'
	            line = truncation_re.sub('', x.text)
	            if 'immun' not in line.lower():
	                if is_title:
	                    # A skipped title begins another article, so a later
	                    # abstract must not be credited to the previous title.
	                    last_was_title = False
	                continue
	            if is_title:
	                out.write(f'\n{line}\n')
	                last_was_title = True
	                title_count += 1
	            else: # AbstractText
	                out.write(f'{line}\n')
	                # Count each written title at most once, even when several
	                # abstract sections follow it.
	                if last_was_title:
	                    abstract_count += 1
	                last_was_title = False

	print(f'Saved {title_count} titles with {abstract_count} abstracts.')