Skip to content

Instantly share code, notes, and snippets.

@HyungSeokPark
Last active March 29, 2018 10:18
Show Gist options
  • Save HyungSeokPark/3c2d058021bd1b2075a1 to your computer and use it in GitHub Desktop.
Save HyungSeokPark/3c2d058021bd1b2075a1 to your computer and use it in GitHub Desktop.
/**
* (1)Get tweets by search keyword and (2)Stemming(Korean), (3)save to mysql finally
*/
var Promise = require('bluebird');
var TwitterKoreanText = require('twtkrjs');
var twitterKoreanText = new TwitterKoreanText({
stemmer: true, // (optional default: true)
normalizer: true, // (optional default: true)
spamfilter: true // (optional default: false)
});
var mysql = require('mysql');
var CONN_INFO = {
host : 'localhost',
user : 'user',
password : 'password',
database : 'database'
};
var TWITTER_CONSUMER_KEY = 'TWITTER_CONSUMER_KEY';
var TWITTER_CONSUMER_SECRET = 'TWITTER_CONSUMER_SECRET';
var ACCESS_TOKEN_KEY = 'ACCESS_TOKEN_KEY-RAlRjfkj0L2QSpD9G8kXRp7HLmfOphnYX5ewWSLtk';
var ACCESS_TOKEN_SECRET = 'ACCESS_TOKEN_SECRET';
var Twitter = require('twitter');
var twitterClient = new Twitter({
consumer_key: TWITTER_CONSUMER_KEY,
consumer_secret: TWITTER_CONSUMER_SECRET,
access_token_key: ACCESS_TOKEN_KEY,
access_token_secret: ACCESS_TOKEN_SECRET
});
var SEARCH_KEYWORD = 'math';
/**
* 마지막에 DB에 저장된 트윗의 id값 가져오기
* Getting the last saved tweet id from the db
*/
var getLastTweetId = function () {
return new Promise(function(resolve, reject) {
var sql = 'SELECT CAST(id AS CHAR) as id_str FROM tweets ORDER BY id DESC LIMIT 1';
var connection = mysql.createConnection(CONN_INFO);
connection.connect();
connection.query(sql, function(err, rows) {
if (err){
reject(err);
return;
}
var len = rows.length;
if(len < 1) {
resolve(-1);
return;
}
var lastTweet = rows[0];
console.log('last id=' + lastTweet.id_str);
resolve(lastTweet.id_str);
});
connection.end();
});
};
/**
* getting tweets by search keyword
* @returns {*}
*/
var getTweetsJson = function (lastTweetId) {
return new Promise(function(resolve, reject) {
// https://dev.twitter.com/rest/reference/get/search/tweets
twitterClient.get('search/tweets', {q: SEARCH_KEYWORD, count:100, result_type:'recent', since_id:lastTweetId, lang:'ko'}
, function(error, tweetsJson, response){
if(error) {
reject(error);
return;
}
resolve(tweetsJson);
return;
});
});
};
// Korean stemming
var makeMorphemeFromTweet = function (tweet) {
var txt = tweet.text;
return new Promise(function (resolve, reject) {
twitterKoreanText.extractPhrases(txt, function (err, result) {
if (err) throw err;
// filters after stemming
var len = result.length;
var morpheme = '';
for (var i = 0; i < len; ++i) {
var word = result[i].text;
// except numbers
var matches = word.match(/\d+/g);
if (matches != null) {
continue;
}
// except SEARCH_KEYWORD
if (word === SEARCH_KEYWORD) {
continue;
}
morpheme += (morpheme === ''
? ''
: ',');
morpheme += word
}
resolve({morphemeText:morpheme, tweet:tweet});
}); //extractPhrases
});//Promise
};
var saveToDB = function (tweetsJson) {
return new Promise(function(resolve, reject){
var arrTweets = tweetsJson.statuses;
var len = Number(arrTweets.length);
var RT_STR = 'RT @';
for(var i=0; i<len; ++i) {
var aTweet = arrTweets[i];
// except RT
if(Boolean(aTweet.retweeted)) {
continue;
}
var aTweetTxt = String(aTweet.txt);
if(aTweetTxt.substr(0, 4) === RT_STR) {
continue;
}
makeMorphemeFromTweet(aTweet).then(function(result){
var morphemeText = result.morphemeText;
var tweet = result.tweet;
var id_str = String(tweet.id_str);
if(!id_str) {
return;
}
var source = String(tweet.source);
var tweet_text = String(tweet.text);
var user_banner_img = String(tweet.user.profile_banner_url);
var user_id = String(tweet.user.id);
var user_img = String(tweet.user.profile_image_url);
var user_name = tweet.user.name;
var user_profile = tweet.user.description;
var user_screen_name = tweet.user.screen_name;
var user_url = tweet.user.url;
var created_at = tweet.created_at;
var created_date = new Date(created_at);
created_at = created_date.getFullYear() + '-' + (created_date.getMonth()+1) + '-' + created_date.getDate()
+ ' ' + created_date.getHours() + ':' + created_date.getMinutes() + ':' + created_date.getSeconds();
var sql = 'INSERT INTO tweets (id, source, tweet_text, morpheme, user_banner_img \
,user_id, user_img, user_name, user_profile, user_screen_name , user_url, twt_created_at) \
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)';
var arrParam = [id_str, source, tweet_text, morphemeText, user_banner_img
, user_id, user_img, user_name, user_profile, user_screen_name, user_url, created_at];
var connection = mysql.createConnection(CONN_INFO);
connection.connect();
connection.query(sql, arrParam, function(err, result) {
if (err) {
if(err.code == 'ER_DUP_ENTRY') {
console.log("mylog : ER_DUP_ENTRY detected, error=" + err);
return;
}
throw err;
}
});
connection.end();
});
}
});
};
getLastTweetId()
.then(getTweetsJson)
.then(saveToDB);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment