Created November 26, 2012 14:46
Twitter Commands
#!/bin/bash
# remove the comment lines below before running ------
# Example 7: connect to the Twitter firehose - best approach is to use the command line - run in Ubuntu 11.10
# Attempted to do this within R, but the approaches tried consumed computing resources at an incredible rate
# make this file executable from a command line in a Linux-type environment (run in Ubuntu) with chmod 755
# saves a data file every hour (closes curl, reopens curl) from the Twitter sample firehose as raw JSON; the raw data can then be processed in R or elsewhere (see the sketch after this script)
# if you are already in the directory containing the file, run ./filename.sh and press Enter
while true; do
  curl -s -m 3600 -u twitterusername:twitterpassword https://stream.twitter.com/1/statuses/sample.json -o "twtstream_$(date +%Y%m%d%H).txt"
done
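The hourly files written by this script contain the raw stream, one JSON object per line (the streaming API delimits statuses with newlines and sends blank keep-alive lines). As a minimal sketch of reading one of them back into R, assuming the rjson package is installed; the file name below is hypothetical and truncated lines are simply skipped:

## sketch: read one hourly raw file back into R and parse each line as JSON
library(rjson)
lines <- readLines("twtstream_2012112614.txt", warn = FALSE)  # hypothetical file name
lines <- lines[nchar(lines) > 0]                              # drop blank keep-alive lines
statuses <- lapply(lines, function(l) {
  tryCatch(fromJSON(l), error = function(e) NULL)             # skip truncated lines
})
statuses <- Filter(Negate(is.null), statuses)
length(statuses)                                              # number of parsed objects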
#==============================================================================
# Name: Streaming Twitter using RCurl
# Date: Jan 9th, 2012
#
# saves the data in two formats:
#  - raw text file of the response
#  - adds each raw response as a new element to an R list;
#     - each element can be processed with apply functions + parallel processing
#       (see the sketch after the script)
#     - lends itself to hadoop-style processing of the data
#
# concerns:
#  - incoming responses may arrive faster than the machine can process them
#    (cpu, disk speed, etc.)
#  - saving the data in multiple files on the hard disk is less than ideal
#     - use couchdb or mongodb?
#
# NOTE: this code slows down over time because each response requires opening
#       an increasingly larger .Rdata file.
#
# MOSTLY PROOF OF CONCEPT - NOT REALLY PRACTICAL FOR DATA COLLECTION.
#==============================================================================
require(RCurl)
# require(rjson)
# set the working directory
setwd("C:/Documents and Settings/BTIBERT/Desktop/Twitter Data")
## set the twitter account credentials
USER <- "twitteracctname"
PASSWORD <- "twitterpassword"
UPASS <- paste(USER, PASSWORD, sep=":")
#==============================================================================
## A function that we will use to save the twitter JSON response to disk
## Creates hourly files; each contains a list object to which responses are added
#==============================================================================
WRITE_TO_FILE <- function(x) {
  # inputs: x = the raw response received from Twitter
  ## build the file name based on the current system hour
  fname_base <- paste("twtstream_", format(Sys.time(), "%m%d%Y%H"), sep="")
  fname_txt <- paste(fname_base, ".txt", sep="")
  fname_r <- paste(fname_base, ".Rdata", sep="")
  ## check to see if the hourly R data file is in the current data directory
  ## load the R data file if it exists
  if (fname_r %in% list.files()) {
    load(fname_r)
  }
  ## check to see if the list exists; if not, create it
  ## be careful, this should already be loaded if the .Rdata file exists
  if (!exists("twt", mode="list")) {
    cat("creating twitter list object\n")
    twt <- list()
  }
  ## if the response is not empty, save two data files:
  ## 1) append to the text file  2) add to the list and save the Rdata file
  if (nchar(x) > 0) {
    ## write the response to a text file
    write.table(x, file=fname_txt, append=T, row.names=F, col.names=F)
    ## add the response to a new list element
    twt <- c(twt, x)
    save(twt, file=fname_r)
  }
  ## saved response
  cat("saved response-----------------\n")
}
# test <- function(x) {print(x)}
## windows users will need to download this certificate bundle to authenticate over SSL
download.file(url="http://curl.haxx.se/ca/cacert.pem", destfile="cacert.pem")
## write the raw JSON data from the Twitter firehose to the files specified in the write function
## enclose in a while-loop in case there is an error? hacky way to catch an error?
## (a tryCatch sketch follows the script)
while(TRUE) {
  getURL("https://stream.twitter.com/1/statuses/sample.json",
         userpwd=UPASS,
         cainfo = "cacert.pem",
         write=WRITE_TO_FILE)
}
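The bare while(TRUE) loop above simply restarts getURL() when it returns, but an uncaught error (dropped connection, bad credentials) will stop the loop entirely. A minimal sketch of a gentler restart, not part of the original script: wrap the call in tryCatch() and pause before reconnecting; the 10-second sleep is an arbitrary choice.

## sketch: restart the stream after an error instead of letting the loop die
while(TRUE) {
  tryCatch(
    getURL("https://stream.twitter.com/1/statuses/sample.json",
           userpwd = UPASS,
           cainfo  = "cacert.pem",
           write   = WRITE_TO_FILE),
    error = function(e) {
      cat("stream error:", conditionMessage(e), "\n")
      Sys.sleep(10)  # arbitrary pause before reconnecting
    }
  )
}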
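As the header notes, each hourly .Rdata file holds a list (twt) whose elements are raw responses, so the saved data can be worked over with apply-style functions. A sketch under a few assumptions: the file name is hypothetical, the rjson and parallel packages are available, and each element holds one complete JSON object (the stream usually delivers whole statuses per write callback, but not always), so unparseable elements are dropped.

## sketch: parse one hourly .Rdata file produced by WRITE_TO_FILE
library(rjson)
library(parallel)
load("twtstream_1126201214.Rdata")  # hypothetical file name; loads the list `twt`
cl <- makeCluster(2)                # 2 workers is an arbitrary choice
statuses <- parLapply(cl, twt, function(x) {
  tryCatch(rjson::fromJSON(x), error = function(e) NULL)  # drop partial/garbled elements
})
stopCluster(cl)
statuses <- Filter(Negate(is.null), statuses)
## e.g. pull the text of each tweet; delete notices have no $text and drop out
txt <- unlist(lapply(statuses, function(s) s$text))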