Last active
November 19, 2015 01:46
-
-
Save dela3499/8fcdc573841dcec61f68 to your computer and use it in GitHub Desktop.
PubMed abstract similarity
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.feature_extraction.text import TfidfVectorizer | |
import pandas as pd | |
# String -> [String]
def parse_abstracts(filename):
    """ Given a text file containing PubMed abstracts,
        return these abstracts as a list of strings.

        An abstract starts at a line that begins with its running number
        followed by ". " ("1. ", then "2. ", ...). Every following line
        belongs to that abstract until the next number is seen. Lines that
        appear before abstract 1 are discarded. """
    # Context manager: the handle is closed even if read() raises.
    with open(filename) as f:
        lines = f.read().split("\n")
    docs = [[]]  # docs[0] only collects lines that precede abstract 1
    n = 0        # number of abstracts started so far
    for line in lines:
        if line.startswith("{}. ".format(n + 1)):
            n += 1
            docs.append([])
        docs[-1].append(line)
    # Build a real list: map() would be a one-shot lazy iterator on Python 3.
    return [format_lines(doc) for doc in docs[1:]]
# [String] -> String
def format_lines(lines):
    """ Given a list of lines with information for one abstract,
        return a single, well-formatted string.

        Empty lines (paragraph breaks) are turned into "\\n"; all pieces
        are then joined with a single space. """
    return ' '.join("\n" if line == "" else line for line in lines)
# Search URL : http://www.ncbi.nlm.nih.gov/pubmed/?term=(%22aging%22%5BMeSH+Terms%5D+OR+%22aging%22%5BAll+Fields%5D)+OR+(%22aging%22%5BMeSH+Terms%5D+OR+%22aging%22%5BAll+Fields%5D+OR+%22ageing%22%5BAll+Fields%5D)+AND+(hasabstract%5Btext%5D+AND+(%222015%2F01%2F01%22%5BPDAT%5D+%3A+%222015%2F12%2F31%22%5BPDAT%5D)+AND+%22humans%22%5BMeSH+Terms%5D)
abstracts = parse_abstracts("/path/to/pubmed_result.txt")
# get text similarity (http://stackoverflow.com/questions/8897593/similarity-between-two-text-documents?answertab=votes#tab-top)
vect = TfidfVectorizer(min_df=1)
tfidf = vect.fit_transform(abstracts)  # sparse matrix, one row per abstract
# Pairwise row dot products give an (abstracts x abstracts) matrix.
# NOTE(review): this equals cosine similarity only if the vectorizer
# L2-normalises rows (believed to be its default) -- confirm in sklearn docs.
# .dot() keeps this a matrix product for any sparse type, and .toarray()
# replaces the `.A` shortcut, which newer SciPy releases no longer provide.
mat = tfidf.dot(tfidf.T).toarray()
# save to file
df = pd.DataFrame(mat)
df.to_csv("/path/to/abstract_similarities.csv", index = False, header = False)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Imperative Version
# String -> [String]
def parse_abstracts(filename):
    """ Given a text file containing PubMed abstracts,
        return these abstracts as a list of strings.

        An abstract starts at a line that begins with its running number
        followed by ". " ("1. ", then "2. ", ...). Every following line
        belongs to that abstract until the next number is seen. Lines that
        appear before abstract 1 are discarded. """
    # Context manager: the handle is closed even if read() raises.
    with open(filename) as f:
        lines = f.read().split("\n")
    docs = [[]]  # docs[0] only collects lines that precede abstract 1
    n = 0        # number of abstracts started so far
    for line in lines:
        if line.startswith("{}. ".format(n + 1)):
            n += 1
            docs.append([])
        docs[-1].append(line)
    # Build a real list: map() would be a one-shot lazy iterator on Python 3.
    return [format_lines(doc) for doc in docs[1:]]
## Functional Version
# (n, [[String]]) -> String -> (n, [[String]])
def parse_line(state, line):
    """ Fold step: consume one line and return the updated (n, docs) state.

        state is a pair (n, docs): n is the number of abstracts started so
        far and docs is the list of per-abstract line lists built so far.
        A line starting with "<n+1>. " opens a new abstract; any other line
        is added to the most recent one. """
    # Unpack in the body: tuple parameters in a def are a SyntaxError on
    # Python 3. Callers still pass the state as one positional pair.
    n, docs = state
    test_string = "{}. ".format(n + 1)
    if line.startswith(test_string):  # True on first line of new abstract
        return (n + 1, push(docs, [line]))  # Put line as first of new doc
    else:
        return (n, pushlast(docs, line))  # Add line to last doc
# String -> [String]
def parse_abstracts(filename):
    """ Given a text file containing PubMed abstracts,
        return these abstracts as a list of strings.

        Written as a pipeline; each step receives the previous result as
        its last argument. """
    return tz.thread_last(
        filename,
        readfile,                        # file name -> whole text
        lambda text: text.split("\n"),   # text -> lines
        (foldl, parse_line, (0, [[]])),  # lines -> (n, docs)
        snd,                             # keep docs, drop the counter
        tail,                            # drop docs[0] (text before abstract 1)
        (map, format_lines),             # one string per abstract
        list)                            # real list, also on Python 3 where map is lazy
## Functional Version 2
# [[String]] -> String -> [[String]]
def parse_line(docs, line):
    """ Fold step: consume one line and return the updated list of docs.

        docs is the list of per-abstract line lists built so far. The number
        of the next abstract is taken to be len(docs), so no separate
        counter is carried (the fold is seeded with one placeholder doc).
        A line starting with that number and ". " opens a new abstract; any
        other line is added to the most recent one. """
    test_string = "{}. ".format(len(docs))
    if line.startswith(test_string):  # True on first line of new abstract
        return push(docs, [line])  # Put line as first of new doc
    else:
        return pushlast(docs, line)  # Add line to last doc
# String -> [String]
def parse_abstracts(filename):
    """ Given a text file containing PubMed abstracts,
        return these abstracts as a list of strings.

        Written as a pipeline; each step receives the previous result as
        its last argument. """
    return tz.thread_last(
        filename,
        readfile,                         # file name -> whole text
        (argflip(str.split), "\n"),       # text -> lines, i.e. text.split("\n")
        (foldl, parse_line, [[]]),        # lines -> docs
        tail,                             # drop docs[0] (text before abstract 1)
        (map, format_lines),              # one string per abstract
        list)                             # real list, also on Python 3 where map is lazy
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import toolz as tz | |
# (a -> b -> c) -> (b -> a -> c)
def argflip(f):
    """ Return a two-argument function that calls f with its
        arguments in swapped order. """
    return lambda a, b: f(b, a)
# (a -> b -> a) -> a -> [b] -> a
def foldl(f, accum, xs):
    """ Left fold: combine the items of xs into accum, left to right,
        using f(accum, x). Returns accum unchanged when xs is empty. """
    # Local import keeps the block self-contained; reduce lives in functools
    # on both Python 2.6+ and Python 3.
    from functools import reduce
    return reduce(f, xs, accum)
# String -> String
def readfile(filename):
    """ Return the entire contents of the file at filename as one string. """
    # Context manager: the handle is closed even if read() raises.
    with open(filename) as f:
        return f.read()
# [a] -> a -> [a]
def push(xs, x):
    """ Return a new list: xs with x added at the end. xs is not modified. """
    return xs + [x]
# [[a]] -> a -> [[a]]
def pushlast(xs, x):
    """ Push x onto the last sublist of xs.

        The result is built from xs[:-1] plus an extended copy of xs[-1].
        Raises IndexError when xs is empty (there is no last sublist). """
    return push(xs[:-1], push(xs[-1], x))
# [a] -> [a]
def tail(xs):
    """ Return list of xs, excluding first element.
        An empty input yields an empty result. """
    return xs[1:]
# (a,b) -> b
def snd(pair):
    """ Return the second element of a two-element pair.

        Raises an unpacking error if pair does not have exactly two items. """
    # Unpack in the body: tuple parameters in a def are a SyntaxError on
    # Python 3. Unpacking keeps the "exactly two items" check.
    _, second = pair
    return second
# [String] -> String
def format_lines(lines):
    """ Given a list of lines with information for one abstract,
        return a single, well-formatted string.

        Empty lines are turned into "\\n"; all pieces are then joined
        with a single space. """
    return tz.thread_last(
        lines,
        (map, lambda x: "\n" if x == "" else x),  # empty line -> paragraph break
        (str.join, ' '))                          # i.e. ' '.join(pieces)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment