Skip to content

Instantly share code, notes, and snippets.

@kimmel
kimmel / method_bench.pl
Created September 19, 2012 12:22
A Perl benchmark of foreach loops
#!/usr/bin/perl
use v5.16;
use warnings;
use autodie qw( :all );
use utf8::all;
use List::MoreUtils qw( uniq any );
use Benchmark qw( cmpthese :hireswallclock );
my %file_names = ();
@kimmel
kimmel / gist:3689276
Created September 10, 2012 06:42
simple text matching with index
#!/usr/bin/perl
use v5.14;
use warnings;
use utf8::all;
use File::Slurp qw( read_file );
# Load the pattern table by evaluating fw.pl; `do FILE` yields the value of
# the file's last expression. The keys are used as the patterns below, so a
# hash reference is expected (NOTE(review): confirm against fw.pl).
my $pattern_list = do 'fw.pl';

# Fail fast when the file is missing, does not compile, or returns something
# other than an unblessed hash reference. Without this check, dereferencing
# an undef result below would autovivify an empty hash and silently produce
# zero patterns.
die 'cannot load fw.pl: ' . ( $@ || $! || 'no hash reference returned' ) . "\n"
    if ref $pattern_list ne 'HASH';

# Dereference explicitly: `keys $ref` relied on the experimental autoderef
# feature, which was removed in Perl 5.24.
my @patterns = keys %{$pattern_list};

# Read the whole text to be scanned into a single scalar.
my $content = read_file( 'dracula.txt' );
@kimmel
kimmel / gist:3689246
Created September 10, 2012 06:32
scan dracula for 4k patterns
#!/usr/bin/perl
use v5.14;
use warnings;
use utf8::all;
use File::Slurp qw( read_file );
# Load the pattern table by evaluating fw.pl; `do FILE` yields the value of
# the file's last expression. The keys are used as the patterns below, so a
# hash reference is expected (NOTE(review): confirm against fw.pl).
my $pattern_list = do 'fw.pl';

# Fail fast when the file is missing, does not compile, or returns something
# other than an unblessed hash reference. Without this check, dereferencing
# an undef result below would autovivify an empty hash and silently produce
# zero patterns.
die 'cannot load fw.pl: ' . ( $@ || $! || 'no hash reference returned' ) . "\n"
    if ref $pattern_list ne 'HASH';

# Dereference explicitly: `keys $ref` relied on the experimental autoderef
# feature, which was removed in Perl 5.24.
my @patterns = keys %{$pattern_list};

# Read the whole text to be scanned into a single scalar.
my $content = read_file( 'dracula.txt' );
@kimmel
kimmel / gist:3688579
Created September 10, 2012 02:57
brute force all match patterns
#!/usr/bin/perl
use v5.14;
use warnings;
use utf8::all;
use File::Slurp qw( read_file );
...
# Precompile every key of the pattern table as a case-insensitive regex
# delimited by word boundaries. The hash reference is dereferenced explicitly
# because `keys $ref` (experimental autoderef) was removed in Perl 5.24.
# NOTE(review): keys are interpolated unescaped and under /x, so any
# whitespace or metacharacters in a key are treated as regex syntax --
# confirm that is intended.
my @patterns = map { qr/\b$_\b/ixms } keys %{$pattern_list};
@kimmel
kimmel / gist:3688004
Created September 9, 2012 23:55
text normalization and token splitting
#!/usr/bin/perl
use v5.14;
use warnings;
use utf8::all;
use List::Util qw( reduce );
use List::MoreUtils qw( uniq any );
use Path::Class::Rule;
use File::Slurp qw( read_file );
@kimmel
kimmel / gist:3681026
Created September 8, 2012 23:28
decode_json() takes a binary encoded string
#!/usr/bin/perl
use v5.14;
use warnings;
use utf8::all;
use Encode;
use Data::Dumper;
use JSON::XS qw( decode_json );
# JSON sample whose keys and values contain non-ASCII text.
# NOTE(review): utf8::all turns on `use utf8`, so this literal holds decoded
# characters, while decode_json() expects UTF-8 encoded bytes -- presumably
# the rest of the script encodes it before decoding; confirm.
my $wl = '{"creche":"crèche", "¥":"£", "₡":"волн"}';
@kimmel
kimmel / gist:3482317
Created August 26, 2012 18:23
perl regexp html parsing
$html =~ m{
>\s*$num\.</td>\s*<td>\s*<center>\s*<a\s+id=up_
(\d+) # $1 -> id
\s+href="vote\?for=\g1&dir=up&whence=[%a-e0-9]+">\s*<img\s+src="http://yc
ombinator\.com/images/grayarrow\.gif"\s+border=\d+\s+vspace=\d+\s+hspace=
\d+>\s*</a>\s*<span\s+id=down_\g1>\s*</span>\s*</center>\s*</td>\s*<td\s+
class="title">\s*<a\s+href="
([^"]+) # $2 -> uri
">
([^<]+) # $3 -> desc
@kimmel
kimmel / gist:3482230
Created August 26, 2012 18:10
python HTMLParser regexp
locatestarttagend = re.compile(r"""
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
(?:[\s/]* # optional whitespace before attribute name
(?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name
(?:\s*=+\s* # value indicator
(?:'[^']*' # LITA-enclosed value
|"[^"]*" # LIT-enclosed value
|(?!['"])[^>\s]* # bare value
)
)?(?:\s|/(?!>))*
@kimmel
kimmel / gist:3482220
Created August 26, 2012 18:09
python Beautiful Soup regexp 2
# Methods for supporting CSS selectors.
tag_name_re = re.compile('^[a-z0-9]+$')
# /^(\w+)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
# \---/ \---/\-------------/ \-------/
# | | | |
# | | | The value
# | | ~,|,^,$,* or =
# | Attribute
@kimmel
kimmel / gist:3482211
Created August 26, 2012 18:08
python Beautiful Soup regexp 1
from datetime import datetime
import BeautifulSoup as soup
import requests
...
# Fetch the page; `host` and `page` are defined in the elided part of the script.
r = requests.get(host + page)
# Parse the raw response body with BeautifulSoup.
doc = soup.BeautifulSoup(r.content)
# Within the first <table> of the document, collect every tag (True matches
# any tag name) whose class attribute is "title".
titles = doc.table.findAll(True, {'class': 'title'})