Last active
March 29, 2018 10:18
-
-
Save HyungSeokPark/3c2d058021bd1b2075a1 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* (1)Get tweets by search keyword and (2)Stemming(Korean), (3)save to mysql finally | |
*/ | |
var Promise = require('bluebird'); | |
var TwitterKoreanText = require('twtkrjs'); | |
var twitterKoreanText = new TwitterKoreanText({ | |
stemmer: true, // (optional default: true) | |
normalizer: true, // (optional default: true) | |
spamfilter: true // (optional default: false) | |
}); | |
var mysql = require('mysql'); | |
var CONN_INFO = { | |
host : 'localhost', | |
user : 'user', | |
password : 'password', | |
database : 'database' | |
}; | |
var TWITTER_CONSUMER_KEY = 'TWITTER_CONSUMER_KEY'; | |
var TWITTER_CONSUMER_SECRET = 'TWITTER_CONSUMER_SECRET'; | |
var ACCESS_TOKEN_KEY = 'ACCESS_TOKEN_KEY-RAlRjfkj0L2QSpD9G8kXRp7HLmfOphnYX5ewWSLtk'; | |
var ACCESS_TOKEN_SECRET = 'ACCESS_TOKEN_SECRET'; | |
var Twitter = require('twitter'); | |
var twitterClient = new Twitter({ | |
consumer_key: TWITTER_CONSUMER_KEY, | |
consumer_secret: TWITTER_CONSUMER_SECRET, | |
access_token_key: ACCESS_TOKEN_KEY, | |
access_token_secret: ACCESS_TOKEN_SECRET | |
}); | |
var SEARCH_KEYWORD = 'math'; | |
/** | |
* 마지막에 DB에 저장된 트윗의 id값 가져오기 | |
* Getting the last saved tweet id from the db | |
*/ | |
var getLastTweetId = function () { | |
return new Promise(function(resolve, reject) { | |
var sql = 'SELECT CAST(id AS CHAR) as id_str FROM tweets ORDER BY id DESC LIMIT 1'; | |
var connection = mysql.createConnection(CONN_INFO); | |
connection.connect(); | |
connection.query(sql, function(err, rows) { | |
if (err){ | |
reject(err); | |
return; | |
} | |
var len = rows.length; | |
if(len < 1) { | |
resolve(-1); | |
return; | |
} | |
var lastTweet = rows[0]; | |
console.log('last id=' + lastTweet.id_str); | |
resolve(lastTweet.id_str); | |
}); | |
connection.end(); | |
}); | |
}; | |
/** | |
* getting tweets by search keyword | |
* @returns {*} | |
*/ | |
var getTweetsJson = function (lastTweetId) { | |
return new Promise(function(resolve, reject) { | |
// https://dev.twitter.com/rest/reference/get/search/tweets | |
twitterClient.get('search/tweets', {q: SEARCH_KEYWORD, count:100, result_type:'recent', since_id:lastTweetId, lang:'ko'} | |
, function(error, tweetsJson, response){ | |
if(error) { | |
reject(error); | |
return; | |
} | |
resolve(tweetsJson); | |
return; | |
}); | |
}); | |
}; | |
// Korean stemming | |
var makeMorphemeFromTweet = function (tweet) { | |
var txt = tweet.text; | |
return new Promise(function (resolve, reject) { | |
twitterKoreanText.extractPhrases(txt, function (err, result) { | |
if (err) throw err; | |
// filters after stemming | |
var len = result.length; | |
var morpheme = ''; | |
for (var i = 0; i < len; ++i) { | |
var word = result[i].text; | |
// except numbers | |
var matches = word.match(/\d+/g); | |
if (matches != null) { | |
continue; | |
} | |
// except SEARCH_KEYWORD | |
if (word === SEARCH_KEYWORD) { | |
continue; | |
} | |
morpheme += (morpheme === '' | |
? '' | |
: ','); | |
morpheme += word | |
} | |
resolve({morphemeText:morpheme, tweet:tweet}); | |
}); //extractPhrases | |
});//Promise | |
}; | |
var saveToDB = function (tweetsJson) { | |
return new Promise(function(resolve, reject){ | |
var arrTweets = tweetsJson.statuses; | |
var len = Number(arrTweets.length); | |
var RT_STR = 'RT @'; | |
for(var i=0; i<len; ++i) { | |
var aTweet = arrTweets[i]; | |
// except RT | |
if(Boolean(aTweet.retweeted)) { | |
continue; | |
} | |
var aTweetTxt = String(aTweet.txt); | |
if(aTweetTxt.substr(0, 4) === RT_STR) { | |
continue; | |
} | |
makeMorphemeFromTweet(aTweet).then(function(result){ | |
var morphemeText = result.morphemeText; | |
var tweet = result.tweet; | |
var id_str = String(tweet.id_str); | |
if(!id_str) { | |
return; | |
} | |
var source = String(tweet.source); | |
var tweet_text = String(tweet.text); | |
var user_banner_img = String(tweet.user.profile_banner_url); | |
var user_id = String(tweet.user.id); | |
var user_img = String(tweet.user.profile_image_url); | |
var user_name = tweet.user.name; | |
var user_profile = tweet.user.description; | |
var user_screen_name = tweet.user.screen_name; | |
var user_url = tweet.user.url; | |
var created_at = tweet.created_at; | |
var created_date = new Date(created_at); | |
created_at = created_date.getFullYear() + '-' + (created_date.getMonth()+1) + '-' + created_date.getDate() | |
+ ' ' + created_date.getHours() + ':' + created_date.getMinutes() + ':' + created_date.getSeconds(); | |
var sql = 'INSERT INTO tweets (id, source, tweet_text, morpheme, user_banner_img \ | |
,user_id, user_img, user_name, user_profile, user_screen_name , user_url, twt_created_at) \ | |
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'; | |
var arrParam = [id_str, source, tweet_text, morphemeText, user_banner_img | |
, user_id, user_img, user_name, user_profile, user_screen_name, user_url, created_at]; | |
var connection = mysql.createConnection(CONN_INFO); | |
connection.connect(); | |
connection.query(sql, arrParam, function(err, result) { | |
if (err) { | |
if(err.code == 'ER_DUP_ENTRY') { | |
console.log("mylog : ER_DUP_ENTRY detected, error=" + err); | |
return; | |
} | |
throw err; | |
} | |
}); | |
connection.end(); | |
}); | |
} | |
}); | |
}; | |
getLastTweetId() | |
.then(getTweetsJson) | |
.then(saveToDB); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment