Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save ginixsan/14142ae50387f9d958f8371a79da8336 to your computer and use it in GitHub Desktop.
Save ginixsan/14142ae50387f9d958f8371a79da8336 to your computer and use it in GitHub Desktop.
import os
import re
import gzip
import hashlib
from tqdm import tqdm
from bs4 import BeautifulSoup, SoupStrainer
# Collect every .xml file directly inside the PubMed directory, as a path
# prefixed with 'PubMed/', in sorted order so processing is deterministic.
fnames = sorted(
    'PubMed/' + entry
    for entry in os.listdir('PubMed')
    if entry.endswith('.xml')
)
# Extract titles and abstracts mentioning "immun" from every file in `fnames`
# and write them to pubmed-immune-abstracts.txt: each matching title is
# preceded by a blank line, each matching abstract goes on its own line.

# Only these two elements are parsed; the strainer skips the rest of the XML.
wanted_tags = ['ArticleTitle', 'AbstractText']
strainer = SoupStrainer(wanted_tags)

# Strips a trailing "(ABSTRACT TRUNCATED AT 250 WORDS)"-style marker.
# I've seen lots of typos or truncations of " TRUNCATED", seems like ABSTRACT
# usually makes it through okay, hence the loose `.{0,10}` after it.
# Raw string so `\(` and `\.` are regex escapes rather than invalid string
# escapes; compiled once here instead of being looked up per element.
truncation_marker = re.compile(
    r' *\(ABSTRACT.{0,10}( AT (250|400) WORDS)?\)?[\. ]*$'
)

title_count = 0
abstract_count = 0
# True while the most recently seen title was written and no abstract has
# been written for it yet.
last_was_title = False

# Explicit encoding so the output does not depend on the platform locale.
with open('pubmed-immune-abstracts.txt', 'w', encoding='utf-8') as out:
    for fname in tqdm(fnames):
        # read titles + abstracts
        # Binary mode lets the XML parser work out the encoding from the
        # data itself instead of decoding with the locale default.
        with open(fname, 'rb') as fp:
            soup = BeautifulSoup(fp, 'xml', parse_only=strainer)

        # Ask for the two tags by name: a bare find_all() would also yield
        # any tags nested inside them and treat those as abstracts.
        for x in soup.find_all(wanted_tags):
            line = truncation_marker.sub('', x.text)
            matches = 'immun' in line.lower()

            if x.name == 'ArticleTitle':
                # Update the flag on every title, matching or not, so a
                # later abstract is never credited to a stale earlier title.
                # NOTE(review): a matching abstract whose own title did not
                # match is still written, as before -- confirm that is wanted.
                last_was_title = matches
                if matches:
                    out.write(f'\n{line}\n')
                    title_count += 1
            elif matches:  # AbstractText
                out.write(f'{line}\n')
                # Count at most one abstract per written title, even when
                # several AbstractText elements follow it.
                if last_was_title:
                    abstract_count += 1
                last_was_title = False

print(f'Saved {title_count} titles with {abstract_count} abstracts.')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment