Skip to content

Instantly share code, notes, and snippets.

@meefen
Last active December 30, 2015 21:09
Show Gist options
  • Select an option

  • Save meefen/7885000 to your computer and use it in GitHub Desktop.

Select an option

Save meefen/7885000 to your computer and use it in GitHub Desktop.
This gist contains Java code for archiving tweets in MongoDB, and R code for retrieving tweets for analysis. Please note that there are a few dependencies: (1) You should set up your MongoDB (I used the free plan at https://mongolab.com/); (2) You should have set up your Twitter API access (check https://dev.twitter.com/); (3) Java code: include twi…
library(rmongodb)

## Host info and credentials
host <- "ds053858.mongolab.com:53858"
username <- "your_username"
password <- "your_pass"
db <- "your_db"

## Connect to mongodb
mongo <- mongo.create(host=host, db=db,
                      username=username, password=password)
# mongo.create() hands back an object even when the connection attempt failed,
# so verify the connection before issuing any queries.
if (!mongo.is.connected(mongo)) {
  stop("Could not connect to MongoDB at ", host)
}

## Get a list of collections within our namespace
# here I used each collection for a twitter archive
mongo.get.database.collections(mongo, db)

## Create a string that points to the namespace
# the collection I'm interested in is "#mri13"
collection <- "#mri13"
namespace <- paste(db, collection, sep=".")

## Check the total number of tweets in "#mri13"
mongo.count(mongo, namespace, mongo.bson.empty())

## Build a query to find how many tweets were posted by me
buf <- mongo.bson.buffer.create()
mongo.bson.buffer.append(buf, "user_name", "bodongchen")
query <- mongo.bson.from.buffer(buf)
# get the count
count <- mongo.count(mongo, namespace, query)
count

## Get all tweets posted by me
tweets <- list()
cursor <- mongo.find(mongo, namespace, query)
while (mongo.cursor.next(cursor)) {
  val <- mongo.cursor.value(cursor)
  tweets[[length(tweets)+1]] <- mongo.bson.value(val, "tweet_text")
}
# Release the cursor once iteration is finished.
mongo.cursor.destroy(cursor)
length(tweets)
## Retrieve all tweets and put into a dataframe
library(plyr)
# Collect one single-row data frame per document and bind them all in a single
# rbind.fill() call at the end. Calling rbind.fill() inside the loop re-copies
# the growing data frame on every iteration, which becomes very slow on large
# collections.
rows <- list()
cursor <- mongo.find(mongo, namespace)
while (mongo.cursor.next(cursor)) {
  # iterate and grab the next record
  tmp = mongo.bson.to.list(mongo.cursor.value(cursor))
  # make it a one-row dataframe and stash it
  rows[[length(rows) + 1]] <- as.data.frame(t(unlist(tmp)), stringsAsFactors = FALSE)
}
# Release the cursor once iteration is finished.
mongo.cursor.destroy(cursor)
# rbind.fill() accepts a list of data frames; fall back to an empty data frame
# when the collection had no documents so df_arch1 is always a data frame.
df_arch1 <- if (length(rows) > 0) {
  rbind.fill(rows)
} else {
  data.frame(stringsAsFactors = FALSE)
}
dim(df_arch1)
### Count the tweets archived in a second collection
collection2 <- "#edtechchat"
# namespace is "<database>.<collection>"
namespace2 <- paste0(db, ".", collection2)
mongo.count(mongo, namespace2, mongo.bson.empty())
package twitter_loop;
import com.mongodb.BasicDBObject;
import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.Mongo;
import com.mongodb.MongoException;
import java.net.UnknownHostException;
import java.util.List;
import java.util.Scanner;
import twitter4j.Query;
import twitter4j.QueryResult;
import twitter4j.Status;
import twitter4j.Twitter;
import twitter4j.TwitterException;
import twitter4j.TwitterFactory;
import twitter4j.UserMentionEntity;
import twitter4j.conf.ConfigurationBuilder;
public class Twitter_loop {
/**
* Settings before running the program
*/
// Mongodb info
private String host = "ds053858.mongolab.com";
private int port = 53858;
private String db_name = "your_db";
private String username = "your_username";
private char[] password = "your_password".toCharArray();
// Twitter api
private String consumerKey = "yourkey";
private String consumerSecret = "yoursecret";
private String accessToken = "yourtoken";
private String accessTokenSecret = "yourtokensecret";
// Time interval
private int seconds = 60;
// Number of tweets to get each time
private int count = 100;
private ConfigurationBuilder cb;
private DB db;
private DBCollection items;
/**
* static block used to construct a connection with tweeter with twitter4j
* configuration with provided settings. This configuration builder will be
* used for next search action to fetch the tweets from twitter.com.
*/
public static void main(String[] args) throws InterruptedException {
Twitter_loop taskObj = new Twitter_loop();
taskObj.loadMenu();
}
public void loadMenu() throws InterruptedException {
System.out.print("Please choose your Keyword:\t");
Scanner input = new Scanner(System.in);
String keyword = input.nextLine();
connectdb(keyword);
int i = 0;
while (i < 1) {
cb = new ConfigurationBuilder();
cb.setDebugEnabled(true);
cb.setOAuthConsumerKey(consumerKey);
cb.setOAuthConsumerSecret(consumerSecret);
cb.setOAuthAccessToken(accessToken);
cb.setOAuthAccessTokenSecret(accessTokenSecret);
getTweetByQuery(true, keyword);
cb = null;
Thread.sleep(seconds * 1000); // wait
}
}
public void connectdb(String keyword) {
try {
// on constructor load initialize MongoDB and load collection
initMongoDB();
items = db.getCollection(keyword);
//make the tweet_ID unique in the database
BasicDBObject index = new BasicDBObject("tweet_ID", 1);
items.ensureIndex(index, new BasicDBObject("unique", true));
} catch (MongoException ex) {
System.out.println("MongoException :" + ex.getMessage());
}
}
/**
* initMongoDB been called in constructor so every object creation this
* initialize MongoDB.
*/
public void initMongoDB() throws MongoException {
try {
System.out.println("Connecting to Mongo DB..");
Mongo mongo;
// mongo = new Mongo("127.0.0.1");
// db = mongo.getDB("tweetDB2");
mongo = new Mongo(host, port);
db = mongo.getDB(db_name);
db.authenticate(username, password);
} catch (UnknownHostException ex) {
System.out.println("MongoDB Connection Error :" + ex.getMessage());
}
}
public void getTweetByQuery(boolean loadRecords, String keyword) throws InterruptedException {
TwitterFactory tf = new TwitterFactory(cb.build());
Twitter twitter = tf.getInstance();
if (cb != null) {
try {
Query query = new Query(keyword);
query.setCount(count);
QueryResult result;
result = twitter.search(query);
System.out.println("Getting Tweets...");
List<Status> tweets = result.getTweets();
for (Status tweet : tweets) {
BasicDBObject basicObj = new BasicDBObject();
basicObj.put("user_name", tweet.getUser().getScreenName());
basicObj.put("retweet_count", tweet.getRetweetCount());
basicObj.put("tweet_followers_count", tweet.getUser().getFollowersCount());
basicObj.put("source", tweet.getSource());
//basicObj.put("coordinates",tweet.getGeoLocation());
UserMentionEntity[] mentioned = tweet.getUserMentionEntities();
basicObj.put("tweet_mentioned_count", mentioned.length);
basicObj.put("tweet_ID", tweet.getId());
basicObj.put("tweet_text", tweet.getText());
try {
items.insert(basicObj);
} catch (Exception e) {
System.out.println("MongoDB Connection Error : " + e.getMessage());
}
}
// Printing fetched records from DB.
if (loadRecords) {
getTweetsRecords();
}
} catch (TwitterException te) {
System.out.println("te.getErrorCode() " + te.getErrorCode());
System.out.println("te.getExceptionCode() " + te.getExceptionCode());
System.out.println("te.getStatusCode() " + te.getStatusCode());
if (te.getStatusCode() == 401) {
System.out.println("Twitter Error : \nAuthentication credentials (https://dev.twitter.com/pages/auth) were missing or incorrect.\nEnsure that you have set valid consumer key/secret, access token/secret, and the system clock is in sync.");
} else {
System.out.println("Twitter Error : " + te.getMessage());
}
}
} else {
System.out.println("MongoDB is not Connected! Please check mongoDB intance running..");
}
}
public void getTweetsRecords() throws InterruptedException {
BasicDBObject fields = new BasicDBObject("_id", true).append("user_name", true).append("tweet_text", true);
DBCursor cursor = items.find(new BasicDBObject(), fields);
while (cursor.hasNext()) {
System.out.println(cursor.next());
}
}
}
@SONEINT

SONEINT commented Mar 22, 2014

Copy link
Copy Markdown

Dear dirkchen, I'm currently working on the same code to transfer my collections of tweets from my MongoDB databases to R for text mining & social network analysis.
I did not understand the conversion of your collection with the function paste(), which gave a NULL result for me.
Anyway, I have hit a limitation with the rmongodb and plyr packages on large tweet collections, and things don't work properly for me. I have posted a Stack Overflow question here: http://stackoverflow.com/questions/22445419/transfer-large-mongodb-collections-to-data-frame-in-r-with-rmongodb-and-plyr
Any help would be greatly appreciated.

Cyrille

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment