-
-
Save JEEN/26331164e99e3d74d7a3 to your computer and use it in GitHub Desktop.
use strict; | |
use warnings; | |
use Lingua::Sentence; | |
use Data::Printer; | |
open my $fh, "<", $ARGV[0] or die "OH FILE!!"; | |
my $body = do { local $/; <$fh> }; | |
my $keyword = $ARGV[1]; | |
die "OH KEYWORD" unless $keyword; | |
#$body =~ s/ / /g; | |
#$body =~ s/\n+/\n/g; | |
#my ($date_of_name_change) = $body =~ /DATE OF NAME CHANGE:[\t\s]+([0-9-]+)/gmsi; | |
#print $accession_number."\n"; | |
#print $date_of_name_change."\n"; | |
my $splitter = Lingua::Sentence->new("en"); | |
my @arr = $splitter->split_array($body); | |
my $idx = 0; | |
my ($accession_number) = $body =~ /ACCESSION NUMBER:[\t\s]+([0-9-]+)/gmsi; | |
my @sentences = (); | |
for my $row (@arr) { | |
if ($row =~ /$keyword/) { | |
my @target_arr = @arr[$idx-1 .. $idx+1]; | |
if (grep { m{\$[0-9,\.]+ ?(?:million|billion|trillion)?}; } @target_arr) { | |
push @sentences, \@target_arr; | |
} | |
} | |
++$idx; | |
} | |
my $count = 0; | |
for my $r (@sentences) { | |
for my $sentence (@{ $r }) { | |
my (@amounts) = $sentence =~ /(\$[0-9,\.]+ ?(?:million|billion|trillion)?)/; | |
for my $amount (@amounts) { | |
print join("\t", $accession_number, ++$count, $amount, $sentence)."\n"; | |
} | |
} | |
} |
JEEN
commented
Sep 29, 2014
1,2,4 ,7,8,9,10 번 같은 경우는 "consolidated" 단어가 없는데도 뽑혀 나오는데 어찌된일인지 알수 있겠스미?
perl html_convert.pl 0000796343-09-000007.txt consolidated
0000796343-09-000007 1 $5.2 million As such, we recognized $5.2 million and $3.0 million in liabilities, related to the extended East and West Towers and Almaden Tower leases, respectively.
0000796343-09-000007 2 $35.0 million Specifically, there was a reclassification totaling $35.0 million from purchased intangibles to long-term and short-term other assets. See Notes 5 and 6 for additional information regarding this reclassification.
0000796343-09-000007 3 $4.3 million During fiscal 2008, we completed one business combination for cash consideration of approximately $4.3 million. This acquisition was not material to our consolidated balance sheet and results of operations.
0000796343-09-000007 4 $77.0 million During fiscal 2007, we completed two business combinations and one asset acquisition for cash consideration of $77.0 million.
0000796343-09-000007 5 $1.5 million Related to the acquisition that occurred during the second quarter of fiscal 2007, $1.5 million of in-process research and development was included in our amortization of purchased intangibles on our consolidated statements of income.
0000796343-09-000007 6 $1.5 million Related to the acquisition that occurred during the second quarter of fiscal 2007, $1.5 million of in-process research and development was included in our amortization of purchased intangibles on our consolidated statements of income.
0000796343-09-000007 7 $63.0 million In addition to the acquisition of Macromedia, during fiscal 2006, we completed three business combinations and five asset acquisitions for cash consideration of approximately $63.0 million.
0000796343-09-000007 8 $55.5 million Specifically, we reclassified $55.5 million of cost and $20.5 million of accumulated amortization ($35.0 million, net) from purchased intangibles to long-term and short-term other assets associated with certain technology license arrangements.
0000796343-09-000007 9 $35.0 million Specifically, there was a reclassification associated with certain technology licensing arrangements totaling $35.0 million, net from purchased intangibles of which $28.7 million and $4.7 million were reclassified to acquired rights to use technology and long-term prepaid royalties, respectively.
0000796343-09-000007 10 $5.2 million As such, we recognized $5.2 million and $3.0 million in liabilities, related to the extended East and West Towers and Almaden Tower leases, respectively.
0000796343-09-000007 11 $3.9 The adoption of FIN 48 resulted in an increase of $3.9 million to both assets and unrecognized tax benefits in our consolidated balance sheet as of the beginning of fiscal 2008.
0000796343-09-000007 12 $218.4 Upon adoption, the gross liability for unrecognized tax benefits at December 1, 2007 was $218.4 million, exclusive of interest and penalties.
0000796343-09-000007 13 $3.9 Thus, we recognized additional deferred income tax assets of $3.9 million to present the unrecognized tax benefits as gross amounts on our consolidated balance sheet.
0000796343-09-000007 14 $32.1 million In fiscal 2008, we recorded restructuring charges totaling $32.1 million of which $29.2 related to fiscal 2008 restructuring charges and $2.9 million related to changes in estimates associated with pre-existing facilities accruals for the Macromedia acquisition.
0000796343-09-000007 15 $13.1 million Accrued restructuring charges of $13.1 million at November 28, 2008 includes $6.9 million recorded in accrued restructuring, current and $6.2 million, related to long-term facilities obligations, recorded in accrued restructuring, non-current in the accompanying consolidated balance sheets.
0000796343-09-000007 16 $17.7 million Accrued restructuring charges of $17.7 million at November 30, 2007 included $3.7 million recorded in accrued restructuring, current and $14.0 million, related to long-term facilities obligations, recorded in accrued restructuring, non-current in the accompanying consolidated balance sheets.
0000796343-09-000007 17 $31.4 million At December 1, 2006, accrued restructuring charges of $31.4 million included $9.8 million recorded in accrued restructuring, current and $21.6 million, related to long-term facilities obligations, recorded in accrued restructuring, non-current in the accompanying consolidated balance sheets.
0000796343-09-000007 18 $0.3 million Accrued restructuring charges as of December 1, 2006 included $0.3 million recorded in accrued restructuring, current and $0.3 million, related to long-term facilities obligations recorded in accrued restructuring, non-current in the accompanying consolidated balance sheets.
0000796343-09-000007 19 $126.8 million As part of the lease extensions, we purchased the lease receivable from the lessor of the East and West Towers for $126.8 million and a portion of the lease receivable from the lessor of the Almaden Tower for $80.4 million, both of which are recorded as investments in lease receivables on our consolidated balance sheet.
0000796343-09-000007 20 $5.2 million As such, we recognized $5.2 million and $3.0 million in liabilities, related to the extended East and West Towers and Almaden Tower leases, respectively.
0000796343-09-000007 21 $47.8 million Royalty expense, which was recorded under our cost of products revenue on our consolidated statements of income, was approximately $47.8 million, $37.4 million and $19.1 million in fiscal 2008, 2007 and 2006, respectively.
0000796343-09-000007 22 $350.0 As of November 28, 2008 and November 30, 2007, the amount outstanding under this credit facility was $350.0 million and zero, respectively, which is included in long-term liabilities on our consolidated balance sheet.
어... 이런 걸 언제 남겼대...
43라인에서 if ($row =~ /$keyword/) {
체크를 한번 더 해야할 듯