dela3499 · November 19, 2015 01:46
diff --git a/abstract-similarity.py b/abstract-similarity.py
 from sklearn.feature_extraction.text import TfidfVectorizer
 import pandas as pd

 # String -> [String]
 def parse_abstracts(filename):
    """ Given a text file containing PubMed abstracts,
        return these abstracts as a list of strings.

        A new abstract begins at a line starting with "<k>. ", where k is
        one more than the number of abstracts started so far. Lines that
        come before the first such marker are discarded. The lines of
        each abstract are combined into one string by format_lines. """
    # The context manager closes the file even if reading raises.
    with open(filename) as f:
        lines = f.read().split("\n")
    docs = [[]]  # docs[0] collects any lines that precede abstract 1
    n = 0        # number of abstracts started so far
    for line in lines:
        if line.startswith("{}. ".format(n + 1)):
            n += 1
            docs.append([])
        docs[-1].append(line)
    # Build a real list (map() is lazy on Python 3), skipping docs[0].
    return [format_lines(doc) for doc in docs[1:]]

 # [String] -> String
 def format_lines(lines):
    """ Join the lines of one abstract into a single string.
        Lines are separated by single spaces, and every empty line is
        replaced by a newline character before joining. """
    pieces = []
    for text in lines:
        pieces.append("\n" if text == "" else text)
    return ' '.join(pieces)

 # Search URL : http://www.ncbi.nlm.nih.gov/pubmed/?term=(%22aging%22%5BMeSH+Terms%5D+OR+%22aging%22%5BAll+Fields%5D)+OR+(%22aging%22%5BMeSH+Terms%5D+OR+%22aging%22%5BAll+Fields%5D+OR+%22ageing%22%5BAll+Fields%5D)+AND+(hasabstract%5Btext%5D+AND+(%222015%2F01%2F01%22%5BPDAT%5D+%3A+%222015%2F12%2F31%22%5BPDAT%5D)+AND+%22humans%22%5BMeSH+Terms%5D)

 abstracts = parse_abstracts("/path/to/pubmed_result.txt")

 # get text similarity (http://stackoverflow.com/questions/8897593/similarity-between-two-text-documents?answertab=votes#tab-top)
 # The product of the TF-IDF matrix with its own transpose holds the
 # pairwise dot products between the vectorized abstracts.
 vect = TfidfVectorizer(min_df=1)
 tfidf = vect.fit_transform(abstracts)
 # .toarray() replaces the `.A` shorthand, which newer SciPy releases no
 # longer provide on sparse results.
 mat = (tfidf * tfidf.T).toarray()

 # save to file (plain matrix: no index column, no header row)
 df = pd.DataFrame(mat)
 df.to_csv("/path/to/abstract_similarities.csv", index = False, header = False)
diff --git a/comparison.py b/comparison.py
 ## Imperative Version

 # String -> [String]
 def parse_abstracts(filename):
    """ Given a text file containing PubMed abstracts,
        return these abstracts as a list of strings.

        A new abstract begins at a line starting with "<k>. ", where k is
        one more than the number of abstracts started so far. Lines that
        come before the first such marker are discarded. The lines of
        each abstract are combined into one string by format_lines. """
    # The context manager closes the file even if reading raises.
    with open(filename) as f:
        lines = f.read().split("\n")
    docs = [[]]  # docs[0] collects any lines that precede abstract 1
    n = 0        # number of abstracts started so far
    for line in lines:
        if line.startswith("{}. ".format(n + 1)):
            n += 1
            docs.append([])
        docs[-1].append(line)
    # Build a real list (map() is lazy on Python 3), skipping docs[0].
    return [format_lines(doc) for doc in docs[1:]]
    
    
 ## Functional Version    
    
 # (n, [[String]]) -> String -> (n, [[String]])
 def parse_line(state, line):
    """ Fold step: add one line to the abstracts collected so far.

        state is an (n, docs) pair: n is the number of abstracts started
        so far and docs is the list of line-lists built so far.
        Returns the updated (n, docs) pair. """
    # Unpack in the body: a tuple parameter in the signature, as in
    # `def parse_line((n, docs), line)`, is a SyntaxError on Python 3.
    n, docs = state
    test_string = "{}. ".format(n + 1)
    if line.startswith(test_string): # True on first line of new abstract
        return (n + 1, push(docs, [line])) # Put line as first of new doc
    else:
        return (n, pushlast(docs, line)) # Add line to last doc
    
 # String -> [String]
 def parse_abstracts(filename):
    """ Given a text file containing PubMed abstracts,
        return these abstracts as a list of strings.

        The steps run top to bottom; thread_last passes each intermediate
        value as the last argument of the next step. """
    return tz.thread_last(
        filename,
        readfile,                        # filename -> file contents
        lambda text: text.split("\n"),   # contents -> lines
        (foldl, parse_line, (0, [[]])),  # lines -> (n, docs)
        snd,                             # keep docs, drop the counter
        tail,                            # drop docs[0] (lines before abstract 1)
        (map, format_lines),             # one string per abstract
        list)                            # materialize: map is lazy on Python 3
   
        
 ## Functional Version 2
    
 # [[String]] -> String -> [[String]]
 def parse_line(docs, line):
    """ Fold step: return docs with line added, either as the first
        line of a new abstract or at the end of the current one. """
    # len(docs) serves as the number of the next abstract to expect.
    # NOTE(review): this relies on docs starting out as [[]] - confirm at
    # the call site.
    starts_new_doc = line.startswith("{}. ".format(len(docs)))
    if not starts_new_doc:
        return pushlast(docs, line) # Add line to last doc
    return push(docs, [line]) # Put line as first of new doc
    
 # String -> [String]
 def parse_abstracts(filename):
    """ Given a text file containing PubMed abstracts,
        return these abstracts as a list of strings.

        The steps run top to bottom; thread_last passes each intermediate
        value as the last argument of the next step. """
    return tz.thread_last(
        filename,
        readfile,                     # filename -> file contents
        (argflip(str.split), "\n"),   # contents -> lines, i.e. str.split(contents, "\n")
        (foldl, parse_line, [[]]),    # lines -> docs
        tail,                         # drop docs[0] (lines before abstract 1)
        (map, format_lines),          # one string per abstract
        list)                         # materialize: map is lazy on Python 3

diff --git a/utils.py b/utils.py
 import toolz as tz

 # (a -> b -> c) -> (b -> a -> c)
 def argflip(f):
    """ Return a two-argument function that calls f with its two
        arguments in the opposite order. """
    def flipped(a, b):
        return f(b, a)
    return flipped

 # (a -> b -> a) -> a -> [b] -> a
 def foldl(f, accum, xs):
    """ Left fold: combine the items of xs into one value, starting
        from accum and applying f(result_so_far, item) in order.
        Returns accum unchanged when xs is empty. """
    result = accum
    for item in xs:
        result = f(result, item)
    return result

 # String -> String
 def readfile(filename):
    """ Return the entire contents of the file at filename as one string. """
    # `with` closes the handle even when read() raises; the explicit
    # open/read/close sequence leaked it in that case.
    with open(filename) as f:
        return f.read()

 # [a] -> a -> [a]
 def push(xs, x):
    """ Return a new list made of the items of xs followed by x;
        xs itself is not modified. """
    appended = [x]
    return xs + appended

 # [[a]] -> a -> [[a]]
 def pushlast(xs, x):
    """ Return a list like xs, except that x has been added to the
        end of its final sublist. """
    leading = xs[:-1]                # every sublist except the last
    extended_last = push(xs[-1], x)  # last sublist with x added
    return push(leading, extended_last)

 # [a] -> [a]
 def tail(xs):
    """ Return everything in xs after its first element
        (an empty result when xs has fewer than two elements). """
    rest = xs[1:]
    return rest

 # (a,b) -> b
 def snd(pair):
    """ Return the second element of a two-element pair. """
    # Unpack in the body: a tuple parameter in the signature, as in
    # `def snd((a,b))`, is a SyntaxError on Python 3. Unpacking (rather
    # than indexing) still rejects arguments that are not exactly a pair.
    _, b = pair
    return b

 # [String] -> String
 def format_lines(lines):
    """ Join the lines of one abstract into a single string.
        Lines are separated by single spaces, and every empty line is
        replaced by a newline character before joining. """
    def blank_to_newline(text):
        return "\n" if text == "" else text
    return tz.thread_last(
        lines,
        (map, blank_to_newline),
        (str.join, ' '))
	from sklearn.feature_extraction.text import TfidfVectorizer
	import pandas as pd

	# String -> [String]
	def parse_abstracts(filename):
	    """ Given a text file containing PubMed abstracts,
	        return these abstracts as a list of strings.

	        A new abstract begins at a line starting with "<k>. ", where k is
	        one more than the number of abstracts started so far. Lines that
	        come before the first such marker are discarded. The lines of
	        each abstract are combined into one string by format_lines. """
	    # The context manager closes the file even if reading raises.
	    with open(filename) as f:
	        lines = f.read().split("\n")
	    docs = [[]]  # docs[0] collects any lines that precede abstract 1
	    n = 0        # number of abstracts started so far
	    for line in lines:
	        if line.startswith("{}. ".format(n + 1)):
	            n += 1
	            docs.append([])
	        docs[-1].append(line)
	    # Build a real list (map() is lazy on Python 3), skipping docs[0].
	    return [format_lines(doc) for doc in docs[1:]]

	# [String] -> String
	def format_lines(lines):
	    """ Given a list of lines with information for one abstract,
	        return a single, well-formatted string.

	        Lines are joined with single spaces; each empty line is
	        replaced by a newline character before joining. """
	    return ' '.join(map(lambda x: x if x != "" else "\n", lines))

	# Search URL : http://www.ncbi.nlm.nih.gov/pubmed/?term=(%22aging%22%5BMeSH+Terms%5D+OR+%22aging%22%5BAll+Fields%5D)+OR+(%22aging%22%5BMeSH+Terms%5D+OR+%22aging%22%5BAll+Fields%5D+OR+%22ageing%22%5BAll+Fields%5D)+AND+(hasabstract%5Btext%5D+AND+(%222015%2F01%2F01%22%5BPDAT%5D+%3A+%222015%2F12%2F31%22%5BPDAT%5D)+AND+%22humans%22%5BMeSH+Terms%5D)

	abstracts = parse_abstracts("/path/to/pubmed_result.txt")

	# get text similarity (http://stackoverflow.com/questions/8897593/similarity-between-two-text-documents?answertab=votes#tab-top)
	# The product of the TF-IDF matrix with its own transpose holds the
	# pairwise dot products between the vectorized abstracts.
	vect = TfidfVectorizer(min_df=1)
	tfidf = vect.fit_transform(abstracts)
	# .toarray() replaces the `.A` shorthand, which newer SciPy releases no
	# longer provide on sparse results.
	mat = (tfidf * tfidf.T).toarray()

	# save to file (plain matrix: no index column, no header row)
	df = pd.DataFrame(mat)
	df.to_csv("/path/to/abstract_similarities.csv", index = False, header = False)
	## Imperative Version

	# String -> [String]
	def parse_abstracts(filename):
	    """ Given a text file containing PubMed abstracts,
	        return these abstracts as a list of strings.

	        A new abstract begins at a line starting with "<k>. ", where k is
	        one more than the number of abstracts started so far. Lines that
	        come before the first such marker are discarded. The lines of
	        each abstract are combined into one string by format_lines. """
	    # The context manager closes the file even if reading raises.
	    with open(filename) as f:
	        lines = f.read().split("\n")
	    docs = [[]]  # docs[0] collects any lines that precede abstract 1
	    n = 0        # number of abstracts started so far
	    for line in lines:
	        if line.startswith("{}. ".format(n + 1)):
	            n += 1
	            docs.append([])
	        docs[-1].append(line)
	    # Build a real list (map() is lazy on Python 3), skipping docs[0].
	    return [format_lines(doc) for doc in docs[1:]]


	## Functional Version

	# (n, [[String]]) -> String -> (n, [[String]])
	def parse_line(state, line):
	    """ Fold step: add one line to the abstracts collected so far.

	        state is an (n, docs) pair: n is the number of abstracts started
	        so far and docs is the list of line-lists built so far.
	        Returns the updated (n, docs) pair. """
	    # Unpack in the body: a tuple parameter in the signature, as in
	    # `def parse_line((n, docs), line)`, is a SyntaxError on Python 3.
	    n, docs = state
	    test_string = "{}. ".format(n + 1)
	    if line.startswith(test_string): # True on first line of new abstract
	        return (n + 1, push(docs, [line])) # Put line as first of new doc
	    else:
	        return (n, pushlast(docs, line)) # Add line to last doc

	# String -> [String]
	def parse_abstracts(filename):
	    """ Given a text file containing PubMed abstracts,
	        return these abstracts as a list of strings.

	        The steps run top to bottom; thread_last passes each intermediate
	        value as the last argument of the next step. """
	    return tz.thread_last(
	        filename,
	        readfile,                        # filename -> file contents
	        lambda text: text.split("\n"),   # contents -> lines
	        (foldl, parse_line, (0, [[]])),  # lines -> (n, docs)
	        snd,                             # keep docs, drop the counter
	        tail,                            # drop docs[0] (lines before abstract 1)
	        (map, format_lines),             # one string per abstract
	        list)                            # materialize: map is lazy on Python 3


	## Functional Version 2

	# [[String]] -> String -> [[String]]
	def parse_line(docs, line):
	    """ Fold step: return docs with line added, either as the first
	        line of a new abstract or at the end of the current one. """
	    # len(docs) serves as the number of the next abstract to expect.
	    # NOTE(review): this relies on docs starting out as [[]] - confirm at
	    # the call site.
	    test_string = "{}. ".format(len(docs))
	    if line.startswith(test_string): # True on first line of new abstract
	        return push(docs, [line]) # Put line as first of new doc
	    else:
	        return pushlast(docs, line) # Add line to last doc

	# String -> [String]
	def parse_abstracts(filename):
	    """ Given a text file containing PubMed abstracts,
	        return these abstracts as a list of strings.

	        The steps run top to bottom; thread_last passes each intermediate
	        value as the last argument of the next step. """
	    return tz.thread_last(
	        filename,
	        readfile,                     # filename -> file contents
	        (argflip(str.split), "\n"),   # contents -> lines, i.e. str.split(contents, "\n")
	        (foldl, parse_line, [[]]),    # lines -> docs
	        tail,                         # drop docs[0] (lines before abstract 1)
	        (map, format_lines),          # one string per abstract
	        list)                         # materialize: map is lazy on Python 3
	import toolz as tz

	# (a -> b -> c) -> (b -> a -> c)
	def argflip(f):
	    """ Return a two-argument function that calls f with its two
	        arguments in the opposite order. """
	    return lambda a,b: f(b,a)

	# (a -> b -> a) -> a -> [b] -> a
	def foldl(f, accum, xs):
	    """ Left fold: combine the items of xs into one value, starting
	        from accum and applying f(accum, x) in order.
	        Returns accum unchanged when xs is empty. """
	    for x in xs:
	        accum = f(accum, x)
	    return accum

	# String -> String
	def readfile(filename):
	    """ Return the entire contents of the file at filename as one string. """
	    # `with` closes the handle even when read() raises; the explicit
	    # open/read/close sequence leaked it in that case.
	    with open(filename) as f:
	        return f.read()

	# [a] -> a -> [a]
	def push(xs, x):
	    """ Return a new list made of the items of xs followed by x;
	        xs itself is not modified. """
	    return xs + [x]

	# [[a]] -> a -> [[a]]
	def pushlast(xs, x):
	    """ Push x onto the last sublist of xs.

	        Returns the resulting list of sublists: every sublist of xs
	        except the last, followed by the last sublist with x added. """
	    return push(xs[:-1], push(xs[-1], x))

	# [a] -> [a]
	def tail(xs):
	    """ Return list of xs, excluding first element
	        (an empty result when xs has fewer than two elements). """
	    return xs[1:]

	# (a,b) -> b
	def snd(pair):
	    """ Return the second element of a two-element pair. """
	    # Unpack in the body: a tuple parameter in the signature, as in
	    # `def snd((a,b))`, is a SyntaxError on Python 3. Unpacking (rather
	    # than indexing) still rejects arguments that are not exactly a pair.
	    _, b = pair
	    return b

	# [String] -> String
	def format_lines(lines):
	    """ Given a list of lines with information for one abstract,
	        return a single, well-formatted string.

	        Lines are joined with single spaces; each empty line is
	        replaced by a newline character before joining. """
	    return tz.thread_last(
	        lines,
	        (map, lambda x: "\n" if x == "" else x),  # blank line -> "\n"
	        (str.join, ' '))                          # i.e. ' '.join(mapped)