christophergandrud · December 26, 2011 07:50
diff --git a/fed.speeches.parse.R b/fed.speeches.parse.R
 #######################
 ## Open the downloaded speech files, parse each one individually and
 ## extract the text of the speeches.
 ##
 ## Every .txt file in ~/fed.text.indv/ is parsed as HTML, the text of its
 ## <p> nodes is kept, newlines and repeated navigation phrases are removed,
 ## and the result is written to ~/fed.text.parsed/parsed.<input file name>.
 ##
 ## NOTE(review): htmlParse(), xpathSApply() and xmlValue() are presumably
 ## provided by the XML package attached earlier in the file -- confirm.
 ## If they are not available, every file ends up in `missing`.

 # The file names in `files` are relative, so htmlParse() below resolves
 # them against this working directory
 setwd("~/fed.text.indv/")

 # Create list of text files to parse and extract speech text from.
 # `pattern` is a regular expression, not a shell glob, so match the
 # literal ".txt" extension at the end of the file name
 files <- list.files(path = "~/fed.text.indv/", pattern = "\\.txt$")

 # Create object to record the files that could not be parsed
 # (likely empty due to an error in the website download)
 missing <- NULL

 # Indicate folder to save cleaned files into
 outpathB <- "~/fed.text.parsed/"

 # Drop trailing slashes so file.path() does not produce "//" in the
 # output paths, and make sure the folder exists before writing into it
 out_dir <- sub("/+$", "", outpathB)
 dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)

 # Literal strings deleted from the extracted text, applied in this order
 boilerplate <- c(
   "\n",
   "Return to top",
   "Return to text",
   "Accessible Version",
   "Accessible version",
   "Speeches"
 )

 for (i in files) {

   # Parse HTML and extract the text of every <p> node. On failure the
   # condition object is returned instead of stopping the loop
   marker <- tryCatch(
     unlist(xpathSApply(doc = htmlParse(file = i), "//p", xmlValue)),
     error = function(e) e
   )

   # Record the name of the file that failed to parse, then skip it
   if (inherits(marker, "error")) {
     missing <- c(missing, i)
     next
   }

   # Further remove unwanted line breaks and repeated text
   for (phrase in boilerplate) {
     marker <- gsub(phrase, "", marker, fixed = TRUE)
   }

   # Collapse into a single character string
   marker <- paste(marker, collapse = "")

   # Save as new .txt file
   write(marker, file = file.path(out_dir, paste0("parsed.", i)))
 }

 # Report skipped files instead of leaving them only in `missing`
 if (length(missing) > 0) {
   message(
     length(missing), " file(s) could not be parsed and were skipped: ",
     paste(missing, collapse = ", ")
   )
 }
	#######################
	## Open the downloaded speech files, parse each one individually and
	## extract the text of the speeches.
	##
	## Every .txt file in ~/fed.text.indv/ is parsed as HTML, the text of its
	## <p> nodes is kept, newlines and repeated navigation phrases are removed,
	## and the result is written to ~/fed.text.parsed/parsed.<input file name>.
	##
	## NOTE(review): htmlParse(), xpathSApply() and xmlValue() are presumably
	## provided by the XML package attached earlier in the file -- confirm.
	## If they are not available, every file ends up in `missing`.

	# The file names in `files` are relative, so htmlParse() below resolves
	# them against this working directory
	setwd("~/fed.text.indv/")

	# Create list of text files to parse and extract speech text from.
	# `pattern` is a regular expression, not a shell glob, so match the
	# literal ".txt" extension at the end of the file name
	files <- list.files(path = "~/fed.text.indv/", pattern = "\\.txt$")

	# Create object to record the files that could not be parsed
	# (likely empty due to an error in the website download)
	missing <- NULL

	# Indicate folder to save cleaned files into
	outpathB <- "~/fed.text.parsed/"

	# Drop trailing slashes so file.path() does not produce "//" in the
	# output paths, and make sure the folder exists before writing into it
	out_dir <- sub("/+$", "", outpathB)
	dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)

	# Literal strings deleted from the extracted text, applied in this order
	boilerplate <- c(
		"\n",
		"Return to top",
		"Return to text",
		"Accessible Version",
		"Accessible version",
		"Speeches"
	)

	for (i in files) {

		# Parse HTML and extract the text of every <p> node. On failure the
		# condition object is returned instead of stopping the loop
		marker <- tryCatch(
			unlist(xpathSApply(doc = htmlParse(file = i), "//p", xmlValue)),
			error = function(e) e
		)

		# Record the name of the file that failed to parse, then skip it
		if (inherits(marker, "error")) {
			missing <- c(missing, i)
			next
		}

		# Further remove unwanted line breaks and repeated text
		for (phrase in boilerplate) {
			marker <- gsub(phrase, "", marker, fixed = TRUE)
		}

		# Collapse into a single character string
		marker <- paste(marker, collapse = "")

		# Save as new .txt file
		write(marker, file = file.path(out_dir, paste0("parsed.", i)))
	}

	# Report skipped files instead of leaving them only in `missing`
	if (length(missing) > 0) {
		message(
			length(missing), " file(s) could not be parsed and were skipped: ",
			paste(missing, collapse = ", ")
		)
	}
No results found