Skip to content

Instantly share code, notes, and snippets.

@JEEN
Created June 15, 2011 15:04
Show Gist options
  • Save JEEN/1027294 to your computer and use it in GitHub Desktop.
Save JEEN/1027294 to your computer and use it in GitHub Desktop.
TokyoCabinet and corr
#!/usr/bin/env perl
use TokyoCabinet;
use strict;
use warnings;
use Text::CSV_XS;
# Bulk-load a space-separated correlation file ("probe1 probe2 corr" per
# line) into per-probe Tokyo Cabinet hash databases.  Every pair is stored in
# both directions: the forward direction is written immediately, the reverse
# direction is batched in memory and flushed periodically.

# Unbuffer STDOUT so the "\r" progress counter shows up immediately.
$| = 1;

if (@ARGV < 1) {
    print "Usage: $0 corrfile\n";
    exit 1;
}

my $corrfile = shift;
unless (-e $corrfile) {
    print "$corrfile not found.\n";
    exit 2;
}

# Count the lines in Perl instead of parsing `wc -l` output: the position of
# the number in wc's output differs between platforms, and interpolating the
# file name into a shell command breaks on names with spaces or shell
# metacharacters.
my $wc = 0;
{
    open my $fh_count, '<', $corrfile
        or die "Cannot open $corrfile: $!\n";
    while (defined(my $line = <$fh_count>)) {
        $wc++;
    }
    close $fh_count;
}

open my $fh_corr, '<', $corrfile
    or die "Cannot open $corrfile: $!\n";

my $csv = Text::CSV_XS->new({ sep_char => " ", eol => $/ })
    or die "Cannot create CSV parser: " . Text::CSV_XS->error_diag . "\n";

while (my $row = $csv->getline($fh_corr)) {
    my ($probe1, $probe2, $corr) = @{$row};

    if (defined $corr) {
        insert($probe1, $probe2, $corr);
        insert_batch($probe2, $probe1, $corr);
    }
    else {
        # Blank or truncated line: there is no complete pair to store.
        warn "Skipping row without three fields\n";
    }

    # $wc counts down the remaining lines; show it every 10000 rows and
    # flush the large reverse-direction batches every 100000 rows.
    printf "%10d\r", $wc if $wc % 10000 == 0;
    $wc--;
    insert_flush() if $wc % 100000 == 0;
}

# getline returns false both at end of file and on a parse error.
die "Failed to parse $corrfile: " . $csv->error_diag . "\n"
    unless $csv->eof;

# Write out everything that is still batched, then close the databases that
# are still open so their contents are synced to disk.
insert_flush(1);
close_db();
close $fh_corr;
# -----------------------------------------------------------------------
# Pending reverse-direction records, keyed by the probe whose database they
# belong to.  Each value is an array ref of [ other_probe, corr ] pairs.
# Left uninitialised on purpose: the subs capture this lexical at compile
# time and fill it before this statement is reached at run time.
my %_cache;

# insert_batch($probe, $other_probe, $corr)
# Queue one record for $probe's database without touching the database.
# Returns the number of records now queued for $probe.
sub insert_batch {
    my ($probe, $other_probe, $corr) = @_;
    my $pending = $_cache{$probe} ||= [];
    return push @{$pending}, [ $other_probe, $corr ];
}
# insert_flush($full)
# Write batched records from %_cache to their databases via insert().
#   $full false/omitted: only probes with more than 1000 queued records are
#                        written; smaller batches stay queued.
#   $full true:          every queued record is written.
# A probe's queue is removed from the cache once it has been written.
sub insert_flush {
    my ($full) = @_;

    # keys() returns a snapshot list, so deleting entries inside the loop
    # is safe.
    for my $probe (keys %_cache) {
        my $pending = $_cache{$probe};
        next unless $full || @{$pending} > 1000;

        insert($probe, @{$_}) for @{$pending};
        delete $_cache{$probe};
    }
    return;
}
# Open Tokyo Cabinet hash-database handles, keyed by probe name.
my %_dbs;

# open_db($probe)
# Return the raw HDB handle for "db/$probe.tch", opening (and creating, if
# necessary) the database on first use and caching the handle afterwards.
# Once more than 20 databases are open, all of them are closed before the new
# one is opened.  Dies with the Tokyo Cabinet error message if the database
# cannot be opened.
sub open_db {
    my ($probe) = @_;
    return $_dbs{$probe} if exists $_dbs{$probe};

    close_db() if keys(%_dbs) > 20;

    # hdb_new() returns a raw handle, not an object, so the open-mode
    # constants have to be taken from the TokyoCabinet::HDB class.
    my $hdb  = TokyoCabinet::hdb_new();
    my $path = "db/$probe.tch";
    my $mode = TokyoCabinet::HDB->OWRITER | TokyoCabinet::HDB->OCREAT;

    # hdb_open() returns a success flag; the handle to keep is $hdb itself.
    unless (TokyoCabinet::hdb_open($hdb, $path, $mode)) {
        my $reason = TokyoCabinet::hdb_errmsg(TokyoCabinet::hdb_ecode($hdb));
        TokyoCabinet::hdb_del($hdb);
        die "Cannot open $path: $reason\n";
    }

    $_dbs{$probe} = $hdb;
    return $hdb;
}
# close_db()
# Close every cached database handle and empty the handle cache.  A failed
# close is reported with a warning and the remaining handles are still
# closed.  Each handle is released with hdb_del() afterwards, because raw
# handles from hdb_new() are not freed automatically.
sub close_db {
    for my $probe (keys %_dbs) {
        my $hdb = delete $_dbs{$probe};

        unless (TokyoCabinet::hdb_close($hdb)) {
            my $reason = TokyoCabinet::hdb_errmsg(TokyoCabinet::hdb_ecode($hdb));
            warn "Cannot close db/$probe.tch: $reason\n";
        }
        TokyoCabinet::hdb_del($hdb);
    }
    return;
}
# insert($probe1, $probe2, $corr)
# Store $corr under key $probe2 in $probe1's database, opening the database
# through open_db() if needed.  Returns 1 on success and dies with the Tokyo
# Cabinet error message if the record cannot be written, so a failed store
# is never lost silently.
sub insert {
    my ($probe1, $probe2, $corr) = @_;
    my $hdb = open_db($probe1);

    unless (TokyoCabinet::hdb_put($hdb, $probe2, $corr)) {
        my $reason = TokyoCabinet::hdb_errmsg(TokyoCabinet::hdb_ecode($hdb));
        die "Cannot store $probe2 in db/$probe1.tch: $reason\n";
    }
    return 1;
}
@JEEN
Copy link
Author

JEEN commented Jun 15, 2011

$hdb->open, $hdb->put 등의 경우에는 인수 체크 등으로 쓸데없는 시간이 소모되기에...

TokyoCabinet::hdb_open, TokyoCabinet::hdb_put 등을 이용해서 XS 코드를 직접 호출하도록 합니다.

나중에 putasync 도 한번 테스트를...

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment