Created
June 15, 2011 15:04
-
-
Save JEEN/1027294 to your computer and use it in GitHub Desktop.
TokyoCabinet and corr
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl
#
# Bulk-load a space-separated correlation file into TokyoCabinet hash
# databases, one database file per probe under db/.
#
# Each input row is "probe1 probe2 corr". The pair is stored in both
# directions: probe1's record is written immediately via insert(), while the
# reverse record is queued with insert_batch() and written out later by
# insert_flush().
#
# Usage: script corrfile
# Exit status: 1 when no file argument is given, 2 when the file is missing.
use TokyoCabinet;
use strict;
use warnings;
use Text::CSV_XS;

$| = 1;    # unbuffered STDOUT so the "\r" progress counter redraws in place

if (@ARGV < 1) {
    print "Usage: $0 corrfile\n";
    exit 1;
}

my $corrfile = shift;
unless (-e $corrfile) {
    print "$corrfile not found.\n";
    exit 2;
}

# Count the rows in Perl rather than shelling out to `wc -l`: interpolating
# the filename into a shell command is unsafe, and the position of the count
# in wc's output depends on whether the platform pads it with leading blanks.
my $wc = 0;
open my $fh_count, '<', $corrfile or die "Cannot open $corrfile: $!\n";
while (defined(my $line = <$fh_count>)) {
    $wc++;
}
close $fh_count;

open my $fh_corr, '<', $corrfile or die "Cannot open $corrfile: $!\n";
my $csv = Text::CSV_XS->new({ sep_char => " ", eol => $/ })
    or die "Cannot create CSV parser: " . Text::CSV_XS->error_diag() . "\n";

while (my $row = $csv->getline($fh_corr)) {
    my ($probe1, $probe2, $corr) = @{ $row };

    if (defined $corr) {
        insert($probe1, $probe2, $corr);
        insert_batch($probe2, $probe1, $corr);
    }
    else {
        warn "Skipping malformed row at line $.\n";
    }

    # $wc counts down the rows still to go. The progress test uses the value
    # before the decrement; the printed number and the flush test use the
    # value after it.
    my $at_progress_mark = ($wc % 10000 == 0);
    $wc--;
    printf "%10d\r", $wc if $at_progress_mark;
    insert_flush() if $wc % 100000 == 0;
}

# getline() returns false both at end of file and on a parse error; remember
# which it was, but flush and close first so rows already read are not lost.
my $reached_eof = $csv->eof;
my $parse_error = $reached_eof ? '' : '' . $csv->error_diag();

insert_flush(1);
close $fh_corr;

# Close every database still held open so its contents are committed.
close_db();

die "Parse error in $corrfile: $parse_error\n" unless $reached_eof;
# ----------------------------------------------------------------------- | |
# Queued records that have not been written yet, keyed by the probe whose
# database they belong to. Each value is an array ref of [ key, value ] pairs.
my %_cache;

# Queue one record for later writing by insert_flush().
#
# Arguments: the probe that owns the target database, the record key, and
# the record value.
# Returns the number of records now queued for that probe.
sub insert_batch {
    my ($owner, $partner, $value) = @_;
    my $queue = $_cache{$owner} ||= [];
    return push @{$queue}, [ $partner, $value ];
}
# Write queued records from %_cache to their databases via insert().
#
# Arguments:
#   $full - when true, every queued probe is written out; when false or
#           omitted, only probes with more than 1000 queued records are
#           written and smaller queues are left to keep accumulating.
#
# A probe's queue is removed from %_cache once it has been written.
sub insert_flush {
    my ($full) = @_;

    foreach my $probe1 (keys %_cache) {
        my $pending = $_cache{$probe1};

        # Small queues wait for more records unless a full flush was requested.
        next unless $full || @{$pending} > 1000;

        foreach my $array_ref (@{$pending}) {
            insert($probe1, @{$array_ref});
        }
        delete $_cache{$probe1};
    }

    return;
}
# Currently open database handles, keyed by probe name.
my %_dbs;

# Return an open, writable TokyoCabinet hash-database handle for a probe,
# creating db/<probe>.tch if it does not exist.
#
# Handles are kept in %_dbs and reused. Once more than 20 are open, all of
# them are closed via close_db() before the new one is opened.
#
# Arguments: the probe name (used as the database file name).
# Returns the raw handle from TokyoCabinet::hdb_new().
# Dies when the database cannot be opened.
sub open_db {
    my ($probe) = @_;
    return $_dbs{$probe} if exists $_dbs{$probe};

    close_db() if keys %_dbs > 20;

    my $path = "db/$probe.tch";
    my $hdb  = TokyoCabinet::hdb_new();

    # hdb_new() hands back a raw handle rather than a TokyoCabinet::HDB
    # object, so the mode constants are read from the class, not the handle.
    my $mode = TokyoCabinet::HDB->OWRITER | TokyoCabinet::HDB->OCREAT;

    # hdb_open() reports success or failure; the handle to keep and pass to
    # hdb_put()/hdb_close() is $hdb itself, not hdb_open()'s return value.
    unless (TokyoCabinet::hdb_open($hdb, $path, $mode)) {
        my $reason = TokyoCabinet::hdb_errmsg(TokyoCabinet::hdb_ecode($hdb));
        TokyoCabinet::hdb_del($hdb);
        die "Cannot open $path: $reason\n";
    }

    $_dbs{$probe} = $hdb;
    return $hdb;
}
# Close every database handle held in %_dbs and empty the table.
#
# A handle that fails to close is reported with a warning rather than a die,
# so the remaining handles still get closed. Each handle is also released
# with hdb_del() so the native object is not leaked.
sub close_db {
    foreach my $probe (keys %_dbs) {
        my $hdb = delete $_dbs{$probe};

        TokyoCabinet::hdb_close($hdb)
            or warn "Failed to close db/$probe.tch cleanly\n";
        TokyoCabinet::hdb_del($hdb);
    }

    return;
}
# Store one record in a probe's database.
#
# Arguments:
#   $probe1 - probe whose database receives the record
#   $probe2 - record key
#   $corr   - record value
#
# Returns 1 on success. Dies when the store fails, so a failed write cannot
# go unnoticed during a bulk load.
sub insert {
    my ($probe1, $probe2, $corr) = @_;

    my $hdb = open_db($probe1);

    TokyoCabinet::hdb_put($hdb, $probe2, $corr)
        or die "Failed to store $probe2 in db/$probe1.tch\n";

    return 1;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
$hdb->open, $hdb->put 등의 경우에는 인수 체크 등으로 쓸데없는 시간이 소모되기에, TokyoCabinet::hdb_open, TokyoCabinet::hdb_put 등을 이용해서 XS 코드를 직접 호출하도록 합니다. 나중에 putasync 도 한번 테스트를...