Skip to content

Instantly share code, notes, and snippets.

@danielepolencic
Created April 16, 2018 14:39
Show Gist options
  • Save danielepolencic/3e5acd8e8c7409301ad9c5015e69dd07 to your computer and use it in GitHub Desktop.
Save danielepolencic/3e5acd8e8c7409301ad9c5015e69dd07 to your computer and use it in GitHub Desktop.
Crunch tweets on Huginn
// Object that the `check` and `receive` handlers further down are attached to.
// It is typed as `any`, so nothing assigned to or called on it is type-checked.
// NOTE(review): presumably the Huginn runtime supplies the real Agent object and this
// line only exists so the file compiles/runs standalone — confirm before deploying.
var Agent = {} as any;
/** A single tweet reduced to the fields the grouping logic works with. */
interface ITweet {
  /** Tweet identifier; also used to build the identity hash of a container. */
  id: string;
  /** Tweet text as received (links included). */
  text: string;
  /** Screen name of the author. */
  user: string;
  /** Expanded link URLs, or a single placeholder entry when the tweet has none. */
  urls: string[];
  /** Media URLs, or a single placeholder entry when the tweet has none. */
  media: string[];
  /** Creation time in milliseconds; compared against `Date#getTime()` values. */
  timestamp: number;
}
/** A bundle of one or more tweets that have been judged to belong together. */
interface IContainer {
  /** Smallest `timestamp` among the bundled tweets. */
  oldestTweet: number;
  /** Links collected from the bundled tweets. */
  urls: string[];
  /** Texts collected from the bundled tweets. */
  texts: string[];
  /** The tweets themselves. */
  tweets: ITweet[];
}
/** All containers that share one grouping key. */
interface IContainerGroup {
  /** The shared key (a text or a URL, depending on the grouping function used). */
  value: string;
  /** Containers that produced `value`. */
  containers: IContainer[];
}
/**
 * Placeholder entries stored in a tweet's `urls` / `media` list when the tweet
 * carries none, so those lists are never empty.
 */
enum Const {
  /** Stands in for "this tweet has no link". */
  NO_URL = 'NO_URL',
  /** Stands in for "this tweet has no media". */
  NO_MEDIA = 'NO_MEDIA'
}
/**
 * `Array.prototype.filter` callback that keeps only the first occurrence of
 * each value (compared with strict equality, as `indexOf` does).
 *
 * @param value Element currently being tested.
 * @param index Position of `value` in `self`.
 * @param self  The array being filtered.
 * @returns `true` when `index` is the first position at which `value` occurs.
 */
function onlyUnique<T>(value: T, index: number, self: readonly T[]): boolean {
  return self.indexOf(value) === index;
}
/**
 * Buckets containers by the keys that `fn` derives from each of them.
 *
 * A container lands in one group per distinct key it yields, so the same
 * container may appear in several groups, but at most once per group.
 *
 * @param tweets Containers to bucket.
 * @param fn     Returns the grouping keys for one container.
 * @returns One group per distinct key.
 */
function groupByKey(tweets: IContainer[], fn: (t: IContainer) => string[]): IContainerGroup[] {
  // Prototype-less dictionary: keys are arbitrary strings, so a key such as
  // "constructor" must not resolve to something inherited from Object.prototype.
  const groupsByValue: {[value: string]: IContainerGroup} = Object.create(null);

  tweets.forEach(function(container) {
    // Keys already handled for this container; a repeated key must not add
    // the same container to its group a second time.
    const handledKeys: {[value: string]: boolean} = Object.create(null);

    fn(container).forEach(function(value) {
      if (handledKeys[value]) {
        return;
      }
      handledKeys[value] = true;

      const group = groupsByValue[value];
      if (group) {
        group.containers.push(container);
      } else {
        groupsByValue[value] = {containers: [container], value: value};
      }
    });
  });

  return Object.keys(groupsByValue).map(function(key) {
    return groupsByValue[key];
  });
}
/**
 * Collapses every non-empty group into a single merged container.
 *
 * The merged container holds the union of the group's texts, urls and tweets
 * (first occurrence wins, input order preserved). Tweets are de-duplicated by
 * `id`, so a tweet reachable through several containers of the group is
 * listed once. `oldestTweet` is the smallest timestamp among all tweets of
 * the group.
 *
 * @param containerGroups Groups to collapse; groups without containers are dropped.
 * @returns One container per non-empty group, in group order.
 */
function reduceGroup(containerGroups: IContainerGroup[]): IContainer[] {
  // Keeps the first item seen for each key, preserving input order.
  function firstPerKey<T>(items: T[], keyOf: (item: T) => string): T[] {
    // Prototype-less so keys like "constructor" are not mistaken for seen ones.
    const seen: {[key: string]: boolean} = Object.create(null);
    return items.filter(function(item) {
      const key = keyOf(item);
      if (seen[key]) {
        return false;
      }
      seen[key] = true;
      return true;
    });
  }

  function itself(value: string): string {
    return value;
  }

  function tweetId(tweet: ITweet): string {
    return tweet.id;
  }

  return containerGroups
    .filter(function(group) {
      return group.containers.length > 0;
    })
    .map(function(group): IContainer {
      let allTweets: ITweet[] = [];
      let allTexts: string[] = [];
      let allUrls: string[] = [];
      group.containers.forEach(function(container) {
        allTweets = allTweets.concat(container.tweets);
        allTexts = allTexts.concat(container.texts);
        allUrls = allUrls.concat(container.urls);
      });

      return {
        // Computed before de-duplication so the value matches every timestamp seen.
        oldestTweet: Math.min.apply(null, allTweets.map(function(tweet) {
          return tweet.timestamp;
        })),
        texts: firstPerKey(allTexts, itself),
        urls: firstPerKey(allUrls, itself),
        tweets: firstPerKey(allTweets, tweetId),
      };
    });
}
/**
 * Drops containers whose hash has already been seen, keeping the first
 * container for each hash and preserving input order.
 *
 * @param containers Containers to filter.
 * @param predicate  Computes the identity hash of a container.
 * @returns The containers with distinct hashes.
 */
function uniquify(containers: IContainer[], predicate: (c: IContainer) => string): IContainer[] {
  // Prototype-less lookup table: with a plain `{}` a hash such as
  // "constructor" would look "already seen" and the container would be lost.
  const seenHashes: {[hash: string]: boolean} = Object.create(null);

  return containers.filter(function(container) {
    const hash = predicate(container);
    if (seenHashes[hash]) {
      return false;
    }
    seenHashes[hash] = true;
    return true;
  });
}
/**
 * Merges containers that belong together in two passes:
 *
 * 1. containers whose texts are equal once links are stripped are merged;
 * 2. the results are merged again when they share a URL.
 *
 * Because a container can fall into several URL groups, the second pass can
 * produce the same merged container more than once; those repeats are removed
 * by comparing the list of tweet ids.
 *
 * @param containers Containers to merge.
 * @returns The merged containers.
 */
function groupAll(containers: IContainer[]): IContainer[] {
  const mergedByText = reduceGroup(groupByKey(containers, function(container) {
    return container.texts.map(removeUrls);
  }));

  const mergedByUrl = reduceGroup(groupByKey(mergedByText, function(container) {
    return container.urls;
  }));

  return uniquify(mergedByUrl, function(container) {
    // Joined with a separator: without one, the id lists ['1', '23'] and
    // ['12', '3'] would both hash to "123" and one container would be dropped.
    return container.tweets.map(function(tweet) {
      return tweet.id;
    }).join(',');
  });
}
/**
 * Strips links from a piece of text so that texts differing only in their
 * links compare equal.
 *
 * A link starts with `http://`, `https://` or `www.`, continues with a host
 * label of one or more characters followed by a dot, and runs up to the next
 * whitespace. Surrounding whitespace is left untouched.
 *
 * @param text Text that may contain links.
 * @returns `text` with every link removed.
 */
function removeUrls(text: string): string {
  // The host label is "alphanumeric, optionally followed by alphanumerics or
  // hyphens and ending in an alphanumeric", which covers labels of any length
  // (including two-character hosts such as ow.ly or fb.me).
  return text.replace(/(?:https?:\/\/(?:www\.|(?!www))|www\.)[a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?\.[^\s]{2,}/gi, '');
}
/**
 * Wraps the raw fields of one tweet into a single-tweet container.
 *
 * @param id        Tweet identifier.
 * @param text      Tweet text.
 * @param user      Author's screen name.
 * @param urls      Links of the tweet; an empty list is replaced by `[Const.NO_URL]`.
 * @param media     Media of the tweet; an empty list is replaced by `[Const.NO_MEDIA]`.
 * @param timestamp Creation time as a decimal string; parsed with `parseInt`,
 *                  so a non-numeric value yields `NaN`.
 * @returns A container holding just this tweet.
 */
function liftTweet(id: string, text: string, user: string, urls: string[], media: string[], timestamp: string): IContainer {
  const hasUrls = urls.length > 0;
  const hasMedia = media.length > 0;
  const createdAt = parseInt(timestamp, 10);

  const single: ITweet = {
    id,
    text,
    user,
    urls: hasUrls ? urls : [Const.NO_URL],
    media: hasMedia ? media : [Const.NO_MEDIA],
    timestamp: createdAt,
  };

  const container: IContainer = {
    oldestTweet: single.timestamp,
    urls: single.urls,
    texts: [single.text],
    tweets: [single],
  };
  return container;
}
// Durations expressed in milliseconds, the unit of `Date#getTime()`.
// One hour is the age after which the check handler emits a container.
var ONE_HOUR_IN_MILLISECONDS = 60 * 60 * 1000;
var ONE_DAY_IN_MILLISECONDS = 24 * ONE_HOUR_IN_MILLISECONDS;
/**
 * Check handler: regroups the containers stored under the `items` memory key,
 * emits one event carrying every container whose oldest tweet is more than an
 * hour old, and stores the remaining containers back under `items`.
 * Does nothing when no items are stored.
 */
Agent.check = function() {
  // Anything other than an array in memory is treated as "no items".
  var items = [];
  if (Array.isArray(this.memory('items'))) {
    items = this.memory('items');
  }
  if (items.length === 0) {
    return;
  }

  var now = new Date().getTime();
  var grouped = groupAll(items);
  var cutoff = now - ONE_HOUR_IN_MILLISECONDS;

  var keep: IContainer[] = [];
  var discard: IContainer[] = [];
  grouped.forEach(function(container) {
    if (container.oldestTweet < cutoff) {
      discard.push(container);
    } else {
      keep.push(container);
    }
  });

  // Only emit when there is something to report.
  if (discard.length > 0) {
    this.createEvent({tweets: discard});
  }
  this.memory('items', keep);
};
/**
 * Receive handler: converts the payload of every incoming event into a
 * single-tweet container and appends those containers to the list stored
 * under the `items` memory key.
 *
 * Links and media are read from `extended_tweet.entities` when the payload
 * has an `extended_tweet`, otherwise from `entities`; a missing or non-array
 * `urls` / `media` entry counts as empty.
 */
Agent.receive = function() {
  // Anything other than an array in memory is treated as "no items".
  var items = [];
  if (Array.isArray(this.memory('items'))) {
    items = this.memory('items');
  }

  var lifted = this.incomingEvents().map(function(incoming) {
    var payload = incoming.payload;
    var entities = payload.extended_tweet ? payload.extended_tweet.entities : payload.entities;

    var urlEntities = entities.urls;
    var mediaEntities = entities.media;
    if (!Array.isArray(urlEntities)) {
      urlEntities = [];
    }
    if (!Array.isArray(mediaEntities)) {
      mediaEntities = [];
    }

    return liftTweet(
      payload.id_str,
      payload.text,
      payload.user.screen_name,
      urlEntities.map(function(entry) {
        return entry.expanded_url;
      }),
      mediaEntities.map(function(entry) {
        return entry.media_url_https;
      }),
      payload.timestamp_ms
    );
  });

  this.memory('items', items.concat(lifted));
}
// --- Inline self-checks for groupByKey (run at load time via console.assert) ---

// A single container grouped by text yields exactly one group.
console.assert(groupByKey([
liftTweet('1', 'text1', 'user1', ['url1'], [], '0'),
], function(container) {return container.texts;}).length === 1)
// Two containers with the same text end up in the same (single) group,
// regardless of their URLs.
console.assert(groupByKey([
liftTweet('1', 'text1', 'user1', ['url1', 'url2'], [], '0'),
liftTweet('1', 'text1', 'user1', ['url1'], [], '0'),
], function(container) {return container.texts;}).length === 1)
// The first group ('text1') collects the three containers with that text;
// the 'text2' container forms a separate group.
console.assert(groupByKey([
liftTweet('1', 'text1', 'user1', ['url1', 'url2'], [], '0'),
liftTweet('1', 'text1', 'user1', ['url1'], [], '0'),
liftTweet('1', 'text1', 'user1', ['url2'], [], '0'),
liftTweet('1', 'text2', 'user1', ['url2', 'url3'], [], '0'),
], function(tweet) {return tweet.texts;})[0].containers.length === 3)
// --- Inline self-checks for reduceGroup ---

// A group without containers produces no output container.
console.assert(reduceGroup([{containers: [], value: '1'}]).length === 0);
// Two containers in one group are merged into a single container.
console.assert(reduceGroup([{containers: [
liftTweet('1', 'user', 'me', ['url1', 'url2'], [], '0'),
liftTweet('2', 'user', 'me', ['url1'], [], '0'),
], value: '1'}]).length === 1);
// Two groups: the first merges two tweets (timestamps 1 and 2, sharing 'url1'),
// the second holds a single tweet (timestamp 0).
var containersTest1 = reduceGroup([
{
containers: [
liftTweet('1', 'text1', 'user1', ['url1', 'url2'], [], '1'),
liftTweet('2', 'text2', 'user2', ['url1'], [], '2'),
], value: '1'
},
{
containers: [
liftTweet('3', 'text3', 'user3', ['url3'], [], '0'),
], value: '2'
},
]);
// One merged container per group, in group order.
console.assert(containersTest1.length === 2);
// Texts are concatenated in container order.
console.assert(containersTest1[0].texts.join('') === 'text1text2');
console.assert(containersTest1[1].texts.join('') === 'text3');
// URLs are de-duplicated: 'url1' appears only once in the merged list.
console.assert(containersTest1[0].urls.join('') === 'url1url2');
console.assert(containersTest1[1].urls.join('') === 'url3');
// oldestTweet is the smallest timestamp within each group.
console.assert(containersTest1[0].oldestTweet === 1);
console.assert(containersTest1[1].oldestTweet === 0);
// --- Inline self-checks for groupAll ---

// Same text apart from the trailing t.co link, and different expanded URLs:
// the tweets are merged because their texts match once links are removed.
console.assert(groupAll([
liftTweet('1', 'Just in! Introduction to Kubernetes and Minikube. Learn how to use Kubernetes to deploy a small microservices-based… https://t.co/iRqhIRfu0k', 'mimacom', ['http://bit.ly/2HmgZdA'], [], '1523866451489'),
liftTweet('2', 'Just in! Introduction to Kubernetes and Minikube. Learn how to use Kubernetes to deploy a small microservices-based… https://t.co/RkwPllbdxC', 'mimacom', ['http://bit.ly/2HAQhLS'], [], '1523866451489')
]).length === 1)
// Same text apart from the trailing t.co link and the same expanded URL
// (different timestamps): a single container is produced.
console.assert(groupAll([
liftTweet('985841185938919425', 'TRAINING: 2 days #Kubernetes Dojo Training! Plus here is a discount code for you: K8SLAB20 For more information a… https://t.co/z2zodbQGa6', 'kloia_com', ['https://kloia.com/training/kubernetes/'], [], '1523877829428'),
liftTweet('985841185938919425', 'TRAINING: 2 days #Kubernetes Dojo Training! Plus here is a discount code for you: K8SLAB20 For more information a… https://t.co/9jCsyU8YM0', 'kloia_com', ['https://kloia.com/training/kubernetes/'], [], '1523878186274'),
]).length === 1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment