const fs = require('fs');

// CHANGE TO YOUR USERNAME
const USERNAME = "sadalsvvd";

// Read and preprocess the input file.
// tweets.js is a JavaScript assignment (`window.YTD.tweets.part0 = [...]`), not plain
// JSON, so the assignment prefix is stripped before parsing. The pattern is anchored to
// the start of the file and tolerates leading whitespace / BOM and any part number.
const rawData = fs.readFileSync('./data/tweets.js', 'utf8');
const jsonData = rawData.replace(/^\s*window\.YTD\.tweets\.part\d+\s*=\s*/, '');
const tweetData = JSON.parse(jsonData);

// Number of tweets kept so far; incremented by filterTweetProperties for every
// tweet that is not skipped as a retweet.
let count = 0;

/**********************************
 * JSON FORMATTING FUNCTIONALITY
 */

/**
 * Link tweets into reply trees.
 *
 * Each `tweet` object is mutated in place: it receives a `children` array, and every
 * reply is appended to the `children` of the tweet it replies to.
 *
 * A tweet counts as a reply when `in_reply_to_status_id` is truthy. Replies whose
 * parent is not present in the input are neither attached nor returned as roots.
 *
 * @param {Array<{tweet: Object}>} tweetDataArray - Entries wrapping a `tweet` object
 *   with `id_str` and, for replies, `in_reply_to_status_id(_str)`.
 * @returns {Object[]} The non-reply tweets, in input order.
 */
function buildTweetTree(tweetDataArray) {
    const tweetsById = new Map();
    const roots = [];

    // First pass: index every tweet so a parent can be found regardless of input order.
    for (const { tweet } of tweetDataArray) {
        tweet.children = [];
        tweetsById.set(String(tweet.id_str), tweet);
    }

    // Second pass: attach replies to their parent, collect everything else as a root.
    for (const { tweet } of tweetDataArray) {
        if (!tweet.in_reply_to_status_id) {
            roots.push(tweet);
            continue;
        }

        // Prefer the string id; fall back to the id used for the reply check above so
        // the detection and the lookup cannot disagree when `_str` is absent.
        const parentId = String(tweet.in_reply_to_status_id_str || tweet.in_reply_to_status_id);
        const parentTweet = tweetsById.get(parentId);
        if (parentTweet) {
            parentTweet.children.push(tweet);
        }
    }

    return roots;
}
/**
 * Reduce a tweet (and, recursively, its `children`) to the subset of properties
 * written to the output files.
 *
 * Side effect: increments the module-level `count` once for every tweet kept.
 *
 * @param {Object} tweet - Tweet object; `children` is read when present.
 * @returns {Object|null} The simplified tweet, or `null` when `full_text` starts
 *   with 'RT @' (treated as a retweet).
 */
function filterTweetProperties(tweet) {
    // Skip retweets
    if (tweet.full_text.startsWith('RT @')) {
        return null;
    }

    count++;

    const filteredTweet = {
        id: tweet.id,
        id_str: tweet.id_str,
        full_text: tweet.full_text,
        favorite_count: tweet.favorite_count,
        retweet_count: tweet.retweet_count,
        created_at: new Date(tweet.created_at).toISOString(),
        url: `https://twitter.com/${USERNAME}/status/${tweet.id_str}`,
        children: [],
    };

    if (tweet.in_reply_to_status_id) {
        filteredTweet.in_reply_to_status_id = tweet.in_reply_to_status_id;
        filteredTweet.in_reply_to_screen_name = tweet.in_reply_to_screen_name;
        filteredTweet.in_reply_to_url = `https://twitter.com/${tweet.in_reply_to_screen_name}/status/${tweet.in_reply_to_status_id_str}`;
    }

    if (tweet.extended_entities && tweet.extended_entities.media) {
        filteredTweet.media = tweet.extended_entities.media.map((mediaItem) =>
            filterMediaProperties(mediaItem, tweet.id)
        );
    }

    // Only the expanded URLs are kept from `entities`.
    if (tweet.entities && tweet.entities.urls) {
        filteredTweet.entities = tweet.entities.urls.map((u) => u.expanded_url);
    }

    if (tweet.children && tweet.children.length > 0) {
        // A child that is skipped as a retweet comes back as null; drop it so that
        // consumers walking `children` never hit a null entry.
        filteredTweet.children = tweet.children
            .map((child) => filterTweetProperties(child))
            .filter((child) => child !== null);
    }

    return filteredTweet;
}
/**
 * Reduce a media entry to the properties written to the output files and add the
 * relative path built from the tweet id and the media file name.
 *
 * @param {Object} mediaItem - Media entry with `media_url_https`, `type`,
 *   `source_user_id` and `display_url`.
 * @param {string|number} tweetId - Id of the tweet the media belongs to.
 * @returns {Object} Simplified media entry including `relative_media_path`.
 */
function filterMediaProperties(mediaItem, tweetId) {
    // The file name is the last path segment of the media URL.
    const filename = mediaItem.media_url_https.split('/').pop();

    return {
        media_url_https: mediaItem.media_url_https,
        type: mediaItem.type,
        source_user_id: mediaItem.source_user_id,
        display_url: mediaItem.display_url,
        relative_media_path: `data/tweets_media/${tweetId}-${filename}`,
    };
}
/**
 * Build the reply trees and reduce every tweet to its simplified form.
 *
 * @param {Array<{tweet: Object}>} tweetDataArray - Entries passed to buildTweetTree.
 * @returns {Object[]} Simplified root tweets; roots for which filterTweetProperties
 *   returns null are removed.
 */
function buildFilteredTweetTree(tweetDataArray) {
    const tweetTree = buildTweetTree(tweetDataArray);

    return tweetTree
        .map((tweet) => filterTweetProperties(tweet))
        .filter((tweet) => tweet !== null);
}

/*******************************
 * CSV FORMATTING FUNCTIONALITY
 */

/**
 * Flatten tweet trees into a single list, each tweet followed by its descendants
 * (depth-first, pre-order).
 *
 * @param {Object[]} tweets - Tweets whose optional `children` arrays hold replies.
 * @returns {Object[]} Every tweet in the trees; null/undefined entries are skipped.
 */
function flattenTweets(tweets) {
    const flattened = [];

    // One shared accumulator instead of spreading each sub-result into push():
    // avoids re-copying nested results and the argument-count limit of spread calls.
    const visit = (nodes) => {
        for (const node of nodes) {
            if (!node) {
                continue;
            }
            flattened.push(node);
            if (node.children && node.children.length > 0) {
                visit(node.children);
            }
        }
    };

    visit(tweets);
    return flattened;
}
/**
 * Format a simplified tweet as one CSV row, in the column order of generateCsvHeader.
 *
 * The text, entities and media columns are always wrapped in double quotes, with
 * embedded double quotes doubled. Multiple entity URLs are joined with ';' and
 * multiple media values with a newline inside the quoted field.
 *
 * @param {Object} tweet - Simplified tweet as produced by filterTweetProperties.
 * @returns {string} The comma-separated row, without a trailing newline.
 */
function tweetToCsvRow(tweet) {
    // Quote a value and escape embedded quotes so the field cannot break the row.
    const quote = (value) => `"${String(value).replace(/"/g, '""')}"`;

    const entities = tweet.entities ? tweet.entities.join(';') : '';
    const mediaPaths = tweet.media ? tweet.media.map((m) => m.relative_media_path).join('\n') : '';
    const mediaUrls = tweet.media ? tweet.media.map((m) => m.media_url_https).join('\n') : '';

    const fields = [
        tweet.id,
        quote(tweet.full_text),
        tweet.favorite_count,
        tweet.retweet_count,
        tweet.created_at,
        tweet.url,
        tweet.in_reply_to_status_id || '',
        tweet.in_reply_to_screen_name || '',
        tweet.in_reply_to_url || '',
        quote(entities),
        quote(mediaPaths),
        quote(mediaUrls),
    ];

    return fields.join(',');
}
/**
 * Build the CSV header line.
 *
 * @returns {string} Comma-separated column names, without a trailing newline.
 */
function generateCsvHeader() {
    const columns = [
        'id',
        'full_text',
        'favorite_count',
        'retweet_count',
        'created_at',
        'url',
        'in_reply_to_status_id',
        'in_reply_to_screen_name',
        'in_reply_to_url',
        'entities',
        'media_paths',
        'media_urls',
    ];

    return columns.join(',');
}
/**
 * Flatten the tweet trees and write them to a CSV file: one header line followed
 * by one row per tweet, joined with '\n' (no trailing newline).
 *
 * @param {Object[]} tweets - Simplified root tweets with nested `children`.
 * @param {string} outputFile - Path of the CSV file to write (overwritten if present).
 */
function writeTweetsToCsv(tweets, outputFile) {
    const flattenedTweets = flattenTweets(tweets);
    const header = generateCsvHeader();
    const rows = flattenedTweets.map((tweet) => tweetToCsvRow(tweet));
    const csv = [header, ...rows].join('\n');

    fs.writeFileSync(outputFile, csv, 'utf8');
}
// Write preprocessed data to tweets.json
fs.writeFileSync('tweets.json', jsonData, 'utf8');
console.log('Extracted Twitter archive tweet data to: tweets.json');

// Build the simplified reply trees; `count` is updated while tweets are filtered.
const roots = buildFilteredTweetTree(tweetData);
console.log('Final count of original tweets:', count);

// Write outputs
fs.writeFileSync('tweets_simplified.json', JSON.stringify(roots, null, 4), 'utf8');
console.log('JSON file created successfully at: tweets_simplified.json');
writeTweetsToCsv(roots, 'tweets.csv');
console.log('CSV file created successfully at: tweets.csv');