
@kf0jvt
Last active January 2, 2016 01:39
just fucking around
http://www.utphysicians.com/21756/uthealth-informs-patients-incident-related-patient-information/ (20130830),http://healthitsecurity.com/2013/08/29/ut-physicians-informs-patients-of-data-breach/ (20130830)
https://oag.ca.gov/system/files/Final%20version%20of%20breach%20notification%20in%20PDF%20format%20%2800751822%29_0.PDF http://www.phiprivacy.net/burglar-snatches-laptop-with-patient-medical-records-from-san-jose-internists-office/
http://doj.nh.gov/consumer/security-breaches/documents/waste-management-20070403.pdf
# Script to pull all the VCDB incidents where a laptop was stolen, then build a
# document-term matrix from the articles used to code those incidents. Later we will
# use packages such as tm to analyze the text.
library(RMongo)
mongodb <- mongoDbConnect(dbName='kevin',host='localhost',port='27017')
collection <- 'vcdb'
# dot notation matches the variety field inside each element of the asset.assets array
stolen.querystring <- '{"action.physical.variety":"Theft","asset.assets.variety":"U - Laptop"}'
# Get the stolen laptop incidents from the database
stolen.laptops <- dbGetQuery(mongodb,collection,stolen.querystring)
# head(stolen.laptops['reference'])
# nrow(stolen.laptops)
# ncol(stolen.laptops)
# stolen.laptops[['reference']][1] or stolen.laptops$reference[1]
get.urls <- function(inUrl){
  # references are separated by semicolons and/or commas; normalize both to spaces,
  # split on spaces, then keep only the entries that look like URLs
  local.urls <- gsub(';', ' ', inUrl)
  local.urls <- gsub(',', ' ', local.urls)
  local.urls <- strsplit(local.urls, ' ')
  return(grep('^http.*[^pPdDfF].*$', local.urls[[1]], value=TRUE))
}
stolen.laptop.urls <- unlist(lapply(stolen.laptops$reference,get.urls))
# https://raw.github.com/tonybreyal/Blog-Reference-Functions/master/R/htmlToText/htmlToText.R
source("htmlToText.R")
stolen.laptop.text <- lapply(stolen.laptop.urls,htmlToText)
@krmaxwell

That regex on line #22 is wrong. I'd do something like:

^http.*(pdf|PDF)$

@krmaxwell

Missed that you wanted to exclude those. Not sure if there's a parameter in R's grep to return everything that doesn't match, but if you want everything that is a URL and does not end in 'PDF', use a negative lookahead

^http.*(?!.*pdf)$

and also pass ignore.case=TRUE.
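A minimal sketch of the lookahead approach in R's grep (the sample URLs are made up for illustration; note the lookahead is anchored right after `http` so it can scan the rest of the string, and `perl=TRUE` is required for lookaheads in R):

```r
urls <- c("http://example.com/breach-report.pdf",
          "http://example.com/article.html",
          "https://example.com/notice.PDF")
# negative lookahead anchored after 'http': keep only URLs that
# do not end in .pdf (case-insensitively)
non.pdf <- grep('^http(?!.*\\.pdf$)', urls,
                ignore.case = TRUE, perl = TRUE, value = TRUE)
# non.pdf: "http://example.com/article.html"
```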

@kf0jvt
Author

kf0jvt commented Jan 3, 2014

Turns out you also have to tell R that it is using Perl regular expressions, or it will bomb out with an invalid regex. The final version of line 22 is

return(grep('^http.*(?!.*pdf)$',local.urls[[1]],ignore.case=TRUE,perl=TRUE,value=TRUE))
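As an aside on the question above of whether R's grep can return the non-matches: it can, via `invert=TRUE`, which avoids the lookahead (and `perl=TRUE`) entirely. A minimal sketch with hypothetical URLs:

```r
urls <- c("http://example.com/breach-report.pdf",
          "http://example.com/article.html")
# invert=TRUE makes grep return the elements that do NOT match the pattern
non.pdf <- grep('\\.pdf$', urls, ignore.case = TRUE,
                invert = TRUE, value = TRUE)
# non.pdf: "http://example.com/article.html"
```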
