Skip to content

Instantly share code, notes, and snippets.

@dela3499
Last active November 19, 2015 01:46
Show Gist options
  • Save dela3499/8fcdc573841dcec61f68 to your computer and use it in GitHub Desktop.
Save dela3499/8fcdc573841dcec61f68 to your computer and use it in GitHub Desktop.
PubMed abstract similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
# String -> [String]
def parse_abstracts(filename):
    """ Given a text file containing PubMed abstracts,
        return these abstracts as a list of strings.

        An abstract begins on the line that starts with its running number
        ("1. ", "2. ", ...) and extends up to the line before the next number.
        Any text ahead of abstract 1 is discarded. """
    # The with-block closes the file even when reading raises.
    with open(filename) as f:
        lines = f.read().split("\n")
    docs = [[]]  # docs[0] soaks up whatever precedes the first abstract
    n = 0        # number of abstracts started so far
    for line in lines:
        if line.startswith("{}. ".format(n + 1)):
            n += 1
            docs.append([])
        docs[-1].append(line)
    # A comprehension gives a real list; on Python 3 map() is a lazy,
    # single-use iterator, which would contradict the documented return.
    return [format_lines(doc) for doc in docs[1:]]
# [String] -> String
def format_lines(lines):
    """ Collapse the lines of one abstract into a single string: the lines
        are joined with spaces, and every blank line is turned into a
        newline so that paragraph breaks survive. """
    pieces = ["\n" if text == "" else text for text in lines]
    return ' '.join(pieces)
# Search URL : http://www.ncbi.nlm.nih.gov/pubmed/?term=(%22aging%22%5BMeSH+Terms%5D+OR+%22aging%22%5BAll+Fields%5D)+OR+(%22aging%22%5BMeSH+Terms%5D+OR+%22aging%22%5BAll+Fields%5D+OR+%22ageing%22%5BAll+Fields%5D)+AND+(hasabstract%5Btext%5D+AND+(%222015%2F01%2F01%22%5BPDAT%5D+%3A+%222015%2F12%2F31%22%5BPDAT%5D)+AND+%22humans%22%5BMeSH+Terms%5D)
abstracts = parse_abstracts("/path/to/pubmed_result.txt")
# get text similarity (http://stackoverflow.com/questions/8897593/similarity-between-two-text-documents?answertab=votes#tab-top)
vect = TfidfVectorizer(min_df=1)
tfidf = vect.fit_transform(abstracts)
# Entry (i, j) of the product is the dot product of the tf-idf rows of
# abstracts i and j; with TfidfVectorizer's default L2 row normalisation
# that is their cosine similarity.
# .dot() and .toarray() are spelled out instead of `*` and `.A`: the short
# forms only mean "matrix product" / "densify" on scipy's legacy sparse-matrix
# classes, while the explicit methods behave the same on every sparse type.
mat = tfidf.dot(tfidf.T).toarray()
# save to file
df = pd.DataFrame(mat)
df.to_csv("/path/to/abstract_similarities.csv", index=False, header=False)
## Imperative Version
# String -> [String]
def parse_abstracts(filename):
    """ Given a text file containing PubMed abstracts,
        return these abstracts as a list of strings.

        Lines are grouped by scanning for the next expected running number
        ("1. ", then "2. ", ...) at the start of a line; lines seen before
        abstract 1 are dropped. """
    # Use a context manager so the handle is released even if read() fails.
    with open(filename) as f:
        lines = f.read().split("\n")
    docs = [[]]  # first entry gathers the text ahead of abstract 1
    n = 0        # how many abstracts have been opened so far
    for line in lines:
        if line.startswith("{}. ".format(n + 1)):
            n += 1
            docs.append([])
        docs[-1].append(line)
    # Return a concrete list (map() would be a one-shot iterator on Python 3).
    return [format_lines(doc) for doc in docs[1:]]
## Functional Version
# (n, [[String]]) -> String -> (n, [[String]])
def parse_line(state, line):
    """ Fold step: add one line of the PubMed file to the parse state.

        state is the pair (n, docs), where n counts the abstracts started so
        far and docs is the list of per-abstract line lists built up to now.
        Returns the updated pair. """
    # Unpack inside the body: tuple parameters in a signature are a syntax
    # error on Python 3, while this form works on both Python 2 and 3.
    n, docs = state
    test_string = "{}. ".format(n + 1)
    if line.startswith(test_string):        # True on first line of new abstract
        return (n + 1, push(docs, [line]))  # Put line as first of new doc
    else:
        return (n, pushlast(docs, line))    # Add line to last doc
# String -> [String]
def parse_abstracts(filename):
    """ Given a text file containing PubMed abstracts,
        return these abstracts as a list of strings.

        Written as a pipeline: thread_last feeds each step's result to the
        next step as its last argument. """
    return tz.thread_last(
        filename,
        readfile,
        lambda text: text.split("\n"),
        (foldl, parse_line, (0, [[]])),  # state: (abstract count, docs so far)
        snd,                             # keep the docs, drop the counter
        tail,                            # docs[0] is the text before abstract 1
        (map, format_lines),
        list)                            # map() is lazy on Python 3; return a list
## Functional Version 2
# [[String]] -> String -> [[String]]
def parse_line(docs, line):
    """ Fold step: add one line of the PubMed file to docs, the list of
        per-abstract line lists, and return the resulting list.

        The number the next abstract must carry is taken from len(docs);
        this works when the fold is seeded with [[]], whose first entry
        stands for the text ahead of abstract 1. """
    marker = "{}. ".format(len(docs))
    opens_new_abstract = line.startswith(marker)
    if not opens_new_abstract:
        # Ordinary line: it belongs to the abstract currently being read.
        return pushlast(docs, line)
    # Numbered line: it becomes the first line of a new abstract.
    return push(docs, [line])
# String -> [String]
def parse_abstracts(filename):
    """ Given a text file containing PubMed abstracts,
        return these abstracts as a list of strings.

        Written as a pipeline: thread_last feeds each step's result to the
        next step as its last argument. """
    return tz.thread_last(
        filename,
        readfile,
        (argflip(str.split), "\n"),  # i.e. text.split("\n")
        (foldl, parse_line, [[]]),   # docs[0] gathers text before abstract 1
        tail,                        # ...and is dropped here
        (map, format_lines),
        list)                        # map() is lazy on Python 3; return a list
import toolz as tz
# (a -> b -> c) -> (b -> a -> c)
def argflip(f):
    """ Wrap the two-argument function f so that it receives its
        arguments in the opposite order. """
    def flipped(a, b):
        return f(b, a)
    return flipped
# (a -> b -> a) -> a -> [b] -> a
def foldl(f, accum, xs):
    """ Left fold: starting from accum, combine the running result with
        each element of xs in order via f(result, element) and return the
        final result (accum itself when xs is empty). """
    result = accum
    for item in xs:
        result = f(result, item)
    return result
# String -> String
def readfile(filename):
    """ Return the entire contents of the file at filename as one string. """
    # The with-block closes the handle even when read() raises; a separate
    # close() call after read() would be skipped in that case.
    with open(filename) as f:
        return f.read()
# [a] -> a -> [a]
def push(xs, x):
    """ Return a new list made of the elements of xs followed by x. """
    singleton = [x]
    return xs + singleton
# [[a]] -> a -> [[a]]
def pushlast(xs, x):
    """ Return a copy of xs in which x has been appended to the last
        sublist; the other sublists are carried over as they are. """
    leading, final = xs[:-1], xs[-1]
    return leading + [final + [x]]
# [a] -> [a]
def tail(xs):
    """ Drop the first element of xs and return everything after it. """
    after_head = slice(1, None)
    return xs[after_head]
# (a,b) -> b
def snd(pair):
    """ Return the second element of a two-element pair. """
    # Unpack inside the body: tuple parameters in a signature are a syntax
    # error on Python 3. Unpacking still rejects anything that is not a pair.
    _, second = pair
    return second
# [String] -> String
def format_lines(lines):
    """ Collapse the lines of one abstract into a single string: blank
        lines are replaced by newlines, then everything is joined with
        spaces. """
    def blank_to_newline(text):
        return "\n" if text == "" else text

    return tz.thread_last(
        lines,
        (map, blank_to_newline),
        ' '.join)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment