Last active
December 30, 2015 21:09
-
-
Save meefen/7885000 to your computer and use it in GitHub Desktop.
This gist contains Java code for archiving tweets in MongoDB, and R code for retrieving those tweets for analysis. Please note that there are a few dependencies: (1) You should set up your MongoDB database (I used the free plan at https://mongolab.com/);
(2) You should have set up your Twitter API credentials (check https://dev.twitter.com/);
(3) Java code: include twi…
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Retrieve archived tweets from MongoDB for analysis.
## Each MongoDB collection is used as one Twitter archive (one per keyword).
library(rmongodb)
library(plyr)

## Host info and credentials
host <- "ds053858.mongolab.com:53858"
username <- "your_username"
password <- "your_pass"
db <- "your_db"

## Connect to mongodb and stop early if the connection was not established,
## instead of letting every later call fail with a less obvious error
mongo <- mongo.create(host=host, db=db,
                      username=username, password=password)
if (!mongo.is.connected(mongo)) {
  stop("Could not connect to MongoDB at ", host)
}

## Get a list of collections within our namespace
# here I used each collection for a twitter archive
mongo.get.database.collections(mongo, db)

## Create a string that points to the namespace ("<db>.<collection>")
# the collection I'm interested in is "#mri13"
collection <- "#mri13"
namespace <- paste(db, collection, sep=".")

## Check the total number of tweets in "#mri13"
mongo.count(mongo, namespace, mongo.bson.empty())

## Build a query to find how many tweets were posted by me
buf <- mongo.bson.buffer.create()
mongo.bson.buffer.append(buf, "user_name", "bodongchen")
query <- mongo.bson.from.buffer(buf)
# get the count
count <- mongo.count(mongo, namespace, query)
count

## Get all tweets posted by me
tweets <- list()
cursor <- mongo.find(mongo, namespace, query)
while (mongo.cursor.next(cursor)) {
  val <- mongo.cursor.value(cursor)
  tweets[[length(tweets)+1]] <- mongo.bson.value(val, "tweet_text")
}
# release the server-side cursor once we are done iterating
mongo.cursor.destroy(cursor)
length(tweets)

## Retrieve all tweets and put into a dataframe
# Collect one single-row data frame per record and bind them all at once:
# calling rbind.fill() inside the loop copies the whole accumulated data
# frame on every iteration and becomes very slow on large collections.
rows <- list()
cursor <- mongo.find(mongo, namespace)
while (mongo.cursor.next(cursor)) {
  # iterate and grab the next record
  tmp <- mongo.bson.to.list(mongo.cursor.value(cursor))
  # make it a one-row dataframe
  rows[[length(rows)+1]] <- as.data.frame(t(unlist(tmp)), stringsAsFactors = FALSE)
}
mongo.cursor.destroy(cursor)
# rbind.fill() accepts a list of data frames; keep an empty data frame
# when the collection has no records
df_arch1 <- if (length(rows) > 0) {
  rbind.fill(rows)
} else {
  data.frame(stringsAsFactors = FALSE)
}
dim(df_arch1)

### Try with another collection
collection2 <- "#edtechchat"
namespace2 <- paste(db, collection2, sep=".")
mongo.count(mongo, namespace2, mongo.bson.empty())

## When finished with the session, release the connection with mongo.destroy(mongo)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package twitter_loop; | |
import com.mongodb.BasicDBObject; | |
import com.mongodb.DB; | |
import com.mongodb.DBCollection; | |
import com.mongodb.DBCursor; | |
import com.mongodb.Mongo; | |
import com.mongodb.MongoException; | |
import java.net.UnknownHostException; | |
import java.util.List; | |
import java.util.Scanner; | |
import twitter4j.Query; | |
import twitter4j.QueryResult; | |
import twitter4j.Status; | |
import twitter4j.Twitter; | |
import twitter4j.TwitterException; | |
import twitter4j.TwitterFactory; | |
import twitter4j.UserMentionEntity; | |
import twitter4j.conf.ConfigurationBuilder; | |
public class Twitter_loop { | |
/** | |
* Settings before running the program | |
*/ | |
// Mongodb info | |
private String host = "ds053858.mongolab.com"; | |
private int port = 53858; | |
private String db_name = "your_db"; | |
private String username = "your_username"; | |
private char[] password = "your_password".toCharArray(); | |
// Twitter api | |
private String consumerKey = "yourkey"; | |
private String consumerSecret = "yoursecret"; | |
private String accessToken = "yourtoken"; | |
private String accessTokenSecret = "yourtokensecret"; | |
// Time interval | |
private int seconds = 60; | |
// Number of tweets to get each time | |
private int count = 100; | |
private ConfigurationBuilder cb; | |
private DB db; | |
private DBCollection items; | |
/** | |
* static block used to construct a connection with tweeter with twitter4j | |
* configuration with provided settings. This configuration builder will be | |
* used for next search action to fetch the tweets from twitter.com. | |
*/ | |
public static void main(String[] args) throws InterruptedException { | |
Twitter_loop taskObj = new Twitter_loop(); | |
taskObj.loadMenu(); | |
} | |
public void loadMenu() throws InterruptedException { | |
System.out.print("Please choose your Keyword:\t"); | |
Scanner input = new Scanner(System.in); | |
String keyword = input.nextLine(); | |
connectdb(keyword); | |
int i = 0; | |
while (i < 1) { | |
cb = new ConfigurationBuilder(); | |
cb.setDebugEnabled(true); | |
cb.setOAuthConsumerKey(consumerKey); | |
cb.setOAuthConsumerSecret(consumerSecret); | |
cb.setOAuthAccessToken(accessToken); | |
cb.setOAuthAccessTokenSecret(accessTokenSecret); | |
getTweetByQuery(true, keyword); | |
cb = null; | |
Thread.sleep(seconds * 1000); // wait | |
} | |
} | |
public void connectdb(String keyword) { | |
try { | |
// on constructor load initialize MongoDB and load collection | |
initMongoDB(); | |
items = db.getCollection(keyword); | |
//make the tweet_ID unique in the database | |
BasicDBObject index = new BasicDBObject("tweet_ID", 1); | |
items.ensureIndex(index, new BasicDBObject("unique", true)); | |
} catch (MongoException ex) { | |
System.out.println("MongoException :" + ex.getMessage()); | |
} | |
} | |
/** | |
* initMongoDB been called in constructor so every object creation this | |
* initialize MongoDB. | |
*/ | |
public void initMongoDB() throws MongoException { | |
try { | |
System.out.println("Connecting to Mongo DB.."); | |
Mongo mongo; | |
// mongo = new Mongo("127.0.0.1"); | |
// db = mongo.getDB("tweetDB2"); | |
mongo = new Mongo(host, port); | |
db = mongo.getDB(db_name); | |
db.authenticate(username, password); | |
} catch (UnknownHostException ex) { | |
System.out.println("MongoDB Connection Error :" + ex.getMessage()); | |
} | |
} | |
public void getTweetByQuery(boolean loadRecords, String keyword) throws InterruptedException { | |
TwitterFactory tf = new TwitterFactory(cb.build()); | |
Twitter twitter = tf.getInstance(); | |
if (cb != null) { | |
try { | |
Query query = new Query(keyword); | |
query.setCount(count); | |
QueryResult result; | |
result = twitter.search(query); | |
System.out.println("Getting Tweets..."); | |
List<Status> tweets = result.getTweets(); | |
for (Status tweet : tweets) { | |
BasicDBObject basicObj = new BasicDBObject(); | |
basicObj.put("user_name", tweet.getUser().getScreenName()); | |
basicObj.put("retweet_count", tweet.getRetweetCount()); | |
basicObj.put("tweet_followers_count", tweet.getUser().getFollowersCount()); | |
basicObj.put("source", tweet.getSource()); | |
//basicObj.put("coordinates",tweet.getGeoLocation()); | |
UserMentionEntity[] mentioned = tweet.getUserMentionEntities(); | |
basicObj.put("tweet_mentioned_count", mentioned.length); | |
basicObj.put("tweet_ID", tweet.getId()); | |
basicObj.put("tweet_text", tweet.getText()); | |
try { | |
items.insert(basicObj); | |
} catch (Exception e) { | |
System.out.println("MongoDB Connection Error : " + e.getMessage()); | |
} | |
} | |
// Printing fetched records from DB. | |
if (loadRecords) { | |
getTweetsRecords(); | |
} | |
} catch (TwitterException te) { | |
System.out.println("te.getErrorCode() " + te.getErrorCode()); | |
System.out.println("te.getExceptionCode() " + te.getExceptionCode()); | |
System.out.println("te.getStatusCode() " + te.getStatusCode()); | |
if (te.getStatusCode() == 401) { | |
System.out.println("Twitter Error : \nAuthentication credentials (https://dev.twitter.com/pages/auth) were missing or incorrect.\nEnsure that you have set valid consumer key/secret, access token/secret, and the system clock is in sync."); | |
} else { | |
System.out.println("Twitter Error : " + te.getMessage()); | |
} | |
} | |
} else { | |
System.out.println("MongoDB is not Connected! Please check mongoDB intance running.."); | |
} | |
} | |
public void getTweetsRecords() throws InterruptedException { | |
BasicDBObject fields = new BasicDBObject("_id", true).append("user_name", true).append("tweet_text", true); | |
DBCursor cursor = items.find(new BasicDBObject(), fields); | |
while (cursor.hasNext()) { | |
System.out.println(cursor.next()); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Dear dirkchen, I'm currently working with the same code to transfer my tweet collections from my MongoDB databases to R, for text mining and social network analysis.
I did not understand how you convert your collection with the paste() function, which gives me a NULL result.
Anyway, I have run into a limitation with the rmongodb and plyr packages on large tweet collections, and things don't work properly for me. I have posted a Stack Overflow question here: http://stackoverflow.com/questions/22445419/transfer-large-mongodb-collections-to-data-frame-in-r-with-rmongodb-and-plyr
Any help would be greatly appreciated.
Cyrille