# Whatsapp chat log parser for pisg
# (GitHub gist antonini/0f2620d8378905b19df41e5bbe4c9fae)
# Whatsapp log parser for pisg, made by Tim van Werkhoven
# Features: parses logs, detects subject changes
# Known issues: only works on chats exported without images; 'image omitted'
# messages are not filtered out but replaced by the single word '<picture>'
# Use Charset = "utf-8" in config file to enable emoji
package Pisg::Parser::Format::whatsapp;

use strict;
use warnings;

# Also switch on the global runtime warning flag.
$^W = 1;
# Constructor: builds the parser object for WhatsApp chat exports.
#
# Arguments (name => value pairs following the class name):
#   cfg - the configuration hash reference; stored unchanged in $self->{cfg}.
#
# Returns the blessed parser object. Besides 'cfg' it holds one pattern
# string per line type; the method of the same name matches that pattern
# against each log line.
sub new
{
    my ($class, %args) = @_;

    # Tested at http://www.regexplanet.com/advanced/perl/index.html
    # Example lines (one has a comma after the date, the other does not,
    # so both patterns accept an optional comma):
    #   27/11/13 21:40:56: Tim: Zo, even whatsapp geleegd
    #   27/11/2013, 21:40:55: Timmeh: Zo, even whatsapp geleegd
    my $self = {
        cfg => $args{cfg},

        # Captures: hour, nick, message text.
        normalline => '^\d+\/\d+\/\d+,?\s(\d+):\d+\:\d+\:\s+([^:]+)\: (.+)',

        # No action-line format is defined for these logs. '(?!)' is an
        # empty negative lookahead and therefore never matches; a literal
        # placeholder such as '^NA' would match any text starting with "NA".
        actionline => '(?!)',

        # Captures: hour, minute, remaining event text.
        thirdline => '^\d+\/\d+\/\d+,?\s(\d+):(\d+)\:\d+\:\s+(.+)',
    };
    #$self->{cfg}->{botnicks} .= ' Hub-Security';

    return bless($self, $class);
}
# Parse a normal (chat message) line.
#
# Arguments:
#   $line  - one line of the exported chat log
#   $lines - accepted for interface compatibility; not used here
#
# Returns a hash reference with the keys 'hour', 'nick' and 'saying', or
# nothing (undef in scalar context) when the line does not match the
# 'normalline' pattern or when the nick appears in the 'botnicks' setting.
sub normalline
{
    my ($self, $line, $lines) = @_;

    # No /o modifier: it would freeze the pattern of whichever parser
    # object happened to be used first.
    return unless $line =~ /$self->{normalline}/;

    # Copy the captures right away; later matches would overwrite them.
    my ($hour, $nick, $saying) = ($1, $2, $3);

    # Fix <image omitted>: replace with <picture> (a single word that can
    # be logged and tallied).
    $saying = '<picture>' if $saying =~ /image omitted/;

    # Skip lines said by bots. A missing 'botnicks' setting is treated as
    # an empty list instead of triggering an "uninitialized" warning.
    my $botnicks = $self->{cfg}->{botnicks};
    $botnicks = '' unless defined $botnicks;
    return if $botnicks =~ /\b\Q$nick\E\b/;

    return { hour => $hour, nick => $nick, saying => $saying };
}
# Parse an action line.
#
# Arguments:
#   $line  - one line of the exported chat log
#   $lines - accepted for interface compatibility; not used here
#
# Returns a hash reference with the keys 'hour', 'nick' and 'saying', or
# nothing (undef in scalar context) when the line does not match the
# 'actionline' pattern or when that pattern did not capture all three
# fields.
sub actionline
{
    my ($self, $line, $lines) = @_;

    # No /o modifier: it would freeze the pattern of whichever parser
    # object happened to be used first.
    return unless $line =~ /$self->{actionline}/;

    my ($hour, $nick, $saying) = ($1, $2, $3);

    # A pattern without capture groups (such as a placeholder) can match
    # without capturing anything; do not hand back a hash of undefined values.
    return unless defined $hour && defined $nick && defined $saying;

    return { hour => $hour, nick => $nick, saying => $saying };
}
# Parses the 'third' line - (the third line is everything else, like
# topic changes, mode changes, kicks, etc.)
#
# thirdline() has to return a hash with the following keys, for
# every format:
#   hour            - the hour we're in (for timestamp logging)
#   min             - the minute we're in (for timestamp logging)
#   nick            - the nick
#   kicker          - the nick which kicked somebody (if any)
#   newtopic        - the new topic (if any)
#   newmode         - deops or ops, must be '+o' or '-o', or '+ooo'
#   newjoin         - a new nick which has joined the channel
#   newnick         - a person has changed nick and this is the new nick
#
# It should return a hash with the following (for formatting lines in html)
#
#   kicktext        - the kick reason (if any)
#   modechanges     - data of the mode change ('Nick' in '+o Nick')
#
# The hash may also have a "repeated" key indicating the number of times
# the line was repeated. (Used by eggdrops log for example.)
#
# This implementation fills in 'hour', 'min' and 'nick', plus 'newtopic'
# for subject changes. For any other event the whole event text ends up
# in 'nick'. Returns nothing (undef in scalar context) when the line does
# not match the 'thirdline' pattern.
sub thirdline
{
    my ($self, $line, $lines) = @_;

    # No /o modifier: it would freeze the pattern of whichever parser
    # object happened to be used first.
    return unless $line =~ /$self->{thirdline}/;

    # Copy the captures right away; the match below overwrites $1..$3.
    my ($hour, $min, $event) = ($1, $2, $3);
    my %hash = (hour => $hour, min => $min, nick => $event);

    # Format-specific stuff goes here.
    # Example line: Tim changed the subject to "#!test"
    # (the subject text may also contain emoji)
    if ($event =~ /^(.*?) changed the subject to (.+)/) {
        $hash{nick}     = $1;
        $hash{newtopic} = $2;
    }

    return \%hash;
}

1;
# End of Pisg::Parser::Format::whatsapp