davidrichards · May 25, 2015 20:14
diff --git a/extractor.py b/extractor.py
 from bs4 import BeautifulSoup
 from urllib import urlopen
 import textmining

 # Source pages to download (all three URLs point at childhood-leukemia articles).
 urls = [
     "http://www.webmd.com/cancer/childhood-leukemia-symptoms-treatments",
     "http://kidshealth.org/parent/medical/cancer/cancer_leukemia.html",
     "http://www.nlm.nih.gov/medlineplus/childhoodleukemia.html",
 ]

 # Empty term-document matrix; presumably the object handed to
 # add_document() as `m` -- no call site is visible here, so confirm.
 matrix = textmining.TermDocumentMatrix()

 def add_document(m, url):
     """Download ``url``, strip the markup and add the page text to ``m``.

     Args:
         m: Object exposing ``add_doc(text)``; it receives the extracted text.
         url: Address passed straight to ``urlopen``.

     Returns:
         Always ``True``. Nothing is caught here, so download or parsing
         failures propagate to the caller as exceptions.
     """
     response = urlopen(url)
     try:
         html = response.read()
     finally:
         # Close the connection even if the read fails. try/finally is used
         # instead of `with` because the urlopen imported by this file
         # (`from urllib import urlopen`) is not guaranteed to return a
         # context manager.
         response.close()

     # NOTE(review): no parser is named, so bs4 picks whichever one is
     # installed; pin one explicitly if the extracted text must be
     # reproducible across machines.
     soup = BeautifulSoup(html)
     text = soup.get_text()
     m.add_doc(text)
     return True
diff --git a/importer.py b/importer.py
 import numpy as np
 import pandas

 # Load matrix.csv and keep its raw values for the decomposition.
 terms = pandas.read_csv("matrix.csv")
 data = terms.values

 # SVD of the transposed data. numpy returns the singular values as a 1-D
 # array (`s`); `S` is the same values laid out on a diagonal matrix.
 # NOTE: numpy documents the third return value as the already-transposed
 # factor (Vh), even though it is bound to the name `V` here.
 U, s, V = np.linalg.svd(np.transpose(data))
 S = np.diag(s)
	from bs4 import BeautifulSoup
	from urllib import urlopen
	import textmining

	# Source pages to download (all three URLs point at childhood-leukemia articles).
	urls = [
		"http://www.webmd.com/cancer/childhood-leukemia-symptoms-treatments",
		"http://kidshealth.org/parent/medical/cancer/cancer_leukemia.html",
		"http://www.nlm.nih.gov/medlineplus/childhoodleukemia.html",
	]

	# Empty term-document matrix; presumably the object handed to
	# add_document() as `m` -- no call site is visible here, so confirm.
	matrix = textmining.TermDocumentMatrix()

	def add_document(m, url):
		"""Download ``url``, strip the markup and add the page text to ``m``.

		Args:
			m: Object exposing ``add_doc(text)``; it receives the extracted text.
			url: Address passed straight to ``urlopen``.

		Returns:
			Always ``True``. Nothing is caught here, so download or parsing
			failures propagate to the caller as exceptions.
		"""
		# The body is indented under the def again; at the def's own level
		# the function had no body and the code did not parse.
		response = urlopen(url)
		try:
			html = response.read()
		finally:
			# Close the connection even if the read fails. try/finally is used
			# instead of `with` because the urlopen imported by this file
			# (`from urllib import urlopen`) is not guaranteed to return a
			# context manager.
			response.close()

		# NOTE(review): no parser is named, so bs4 picks whichever one is
		# installed; pin one explicitly if the extracted text must be
		# reproducible across machines.
		soup = BeautifulSoup(html)
		text = soup.get_text()
		m.add_doc(text)
		return True
	import numpy as np
	import pandas

	# Load matrix.csv and keep its raw values for the decomposition.
	terms = pandas.read_csv("matrix.csv")
	data = terms.values

	# SVD of the transposed data. numpy returns the singular values as a 1-D
	# array (`s`); `S` is the same values laid out on a diagonal matrix.
	# NOTE: numpy documents the third return value as the already-transposed
	# factor (Vh), even though it is bound to the name `V` here.
	U, s, V = np.linalg.svd(np.transpose(data))
	S = np.diag(s)