Created November 26, 2012 14:46
Twitter Commands
#!/bin/bash
# remove the comment lines below before running ------
# Example 7: connect to the Twitter firehose - best approach is to use the command line - run in Ubuntu 11.10
# Attempted to do this within R, but the approaches tried consumed computing resources at an incredible rate
# make this file executable from a command line in a Linux-type environment (run in Ubuntu) with chmod 755
# saves a data file every hour (closes curl, reopens curl) from the Twitter sample firehose as raw JSON; the raw data can then be processed in R or elsewhere (see the sketch after this script)
# if you are already in the directory containing the file, run ./filename.sh and press Enter
while true; do
  curl -s -m 3600 -u twitterusername:twitterpassword https://stream.twitter.com/1/statuses/sample.json -o "twtstream_$(date +%Y%m%d%H).txt"
done
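The hourly files written by this script contain the raw stream, one JSON object per line (the streaming API delimits statuses with newlines and sends blank keep-alive lines). As a minimal sketch of reading one of them back into R, assuming the rjson package is installed; the file name below is hypothetical and truncated lines are simply skipped:

## sketch: read one hourly raw file back into R and parse each line as JSON
library(rjson)
lines <- readLines("twtstream_2012112614.txt", warn = FALSE)  # hypothetical file name
lines <- lines[nchar(lines) > 0]                              # drop blank keep-alive lines
statuses <- lapply(lines, function(l) {
  tryCatch(fromJSON(l), error = function(e) NULL)             # skip truncated lines
})
statuses <- Filter(Negate(is.null), statuses)
length(statuses)                                              # number of parsed objects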
#==============================================================================
# Name: Streaming Twitter using RCurl
# Date: Jan 9th, 2012
#
# saves the data in two formats:
#  - raw text file of the response
#  - adds each raw response as a new element to an R list;
#     - each element can be processed with apply functions + parallel processing
#       (see the sketch after the script)
#     - lends itself to hadoop-style processing of the data
#
# concerns:
#  - incoming responses may arrive faster than the machine can process them
#    (cpu, disk speed, etc.)
#  - saving the data in multiple files on the hard disk is less than ideal
#     - use couchdb or mongodb?
#
# NOTE: this code slows down over time because each response requires opening
#       an increasingly larger .Rdata file.
#
# MOSTLY PROOF OF CONCEPT - NOT REALLY PRACTICAL FOR DATA COLLECTION.
#==============================================================================
require(RCurl)
# require(rjson)
# set the working directory
setwd("C:/Documents and Settings/BTIBERT/Desktop/Twitter Data")
## set the twitter account credentials
USER <- "twitteracctname"
PASSWORD <- "twitterpassword"
UPASS <- paste(USER, PASSWORD, sep=":")
#==============================================================================
## A function that we will use to save the twitter JSON response to disk
## Creates hourly files; each contains a list object to which responses are added
#==============================================================================
WRITE_TO_FILE <- function(x) {
  # inputs: x = the raw response received from Twitter
  ## build the file name based on the current system hour
  fname_base <- paste("twtstream_", format(Sys.time(), "%m%d%Y%H"), sep="")
  fname_txt <- paste(fname_base, ".txt", sep="")
  fname_r <- paste(fname_base, ".Rdata", sep="")
  ## check to see if the hourly R data file is in the current data directory
  ## load the R data file if it exists
  if (fname_r %in% list.files()) {
    load(fname_r)
  }
  ## check to see if the list exists; if not, create it
  ## be careful, this should already be loaded if the .Rdata file exists
  if (!exists("twt", mode="list")) {
    cat("creating twitter list object\n")
    twt <- list()
  }
  ## if the response is not empty, save two data files:
  ## 1) append to the text file  2) add to the list and save the Rdata file
  if (nchar(x) > 0) {
    ## write the response to a text file
    write.table(x, file=fname_txt, append=T, row.names=F, col.names=F)
    ## add the response to a new list element
    twt <- c(twt, x)
    save(twt, file=fname_r)
  }
  ## saved response
  cat("saved response-----------------\n")
}
# test <- function(x) {print(x)}
## windows users will need to download this certificate bundle to authenticate over SSL
download.file(url="http://curl.haxx.se/ca/cacert.pem", destfile="cacert.pem")
## write the raw JSON data from the Twitter firehose to the files specified in the write function
## enclose in a while-loop in case there is an error? hacky way to catch an error?
## (a tryCatch sketch follows the script)
while(TRUE) {
  getURL("https://stream.twitter.com/1/statuses/sample.json",
         userpwd=UPASS,
         cainfo = "cacert.pem",
         write=WRITE_TO_FILE)
}
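The bare while(TRUE) loop above simply restarts getURL() when it returns, but an uncaught error (dropped connection, bad credentials) will stop the loop entirely. A minimal sketch of a gentler restart, not part of the original script: wrap the call in tryCatch() and pause before reconnecting; the 10-second sleep is an arbitrary choice.

## sketch: restart the stream after an error instead of letting the loop die
while(TRUE) {
  tryCatch(
    getURL("https://stream.twitter.com/1/statuses/sample.json",
           userpwd = UPASS,
           cainfo  = "cacert.pem",
           write   = WRITE_TO_FILE),
    error = function(e) {
      cat("stream error:", conditionMessage(e), "\n")
      Sys.sleep(10)  # arbitrary pause before reconnecting
    }
  )
}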
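As the header notes, each hourly .Rdata file holds a list (twt) whose elements are raw responses, so the saved data can be worked over with apply-style functions. A sketch under a few assumptions: the file name is hypothetical, the rjson and parallel packages are available, and each element holds one complete JSON object (the stream usually delivers whole statuses per write callback, but not always), so unparseable elements are dropped.

## sketch: parse one hourly .Rdata file produced by WRITE_TO_FILE
library(rjson)
library(parallel)
load("twtstream_1126201214.Rdata")  # hypothetical file name; loads the list `twt`
cl <- makeCluster(2)                # 2 workers is an arbitrary choice
statuses <- parLapply(cl, twt, function(x) {
  tryCatch(rjson::fromJSON(x), error = function(e) NULL)  # drop partial/garbled elements
})
stopCluster(cl)
statuses <- Filter(Negate(is.null), statuses)
## e.g. pull the text of each tweet; delete notices have no $text and drop out
txt <- unlist(lapply(statuses, function(s) s$text))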