Last active
October 2, 2023 11:05
-
-
Save Ovid/a2e828965ce3586a83bfedd7e86a8d20 to your computer and use it in GitHub Desktop.
Molecular Assembly Number In Pure Perl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
use v5.14.0; | |
use warnings; | |
use JSON::PP qw(decode_json); | |
use Data::Dumper; | |
use Getopt::Long; | |
# Command-line switches:
#   --perl     dump the decoded response as a Perl data structure
#   --verbose  echo the curl commands and the InChI that was found
my ( $perl, $verbose );
GetOptions(
    'perl'    => \$perl,
    'verbose' => \$verbose,
) or die "Bad options";

# Everything left on the command line is the molecule name; multi-word names
# such as "iso-propyl cyanide" are re-joined with single spaces.
my $molecule = join ' ', @ARGV;
die "Usage: $0 molecule" unless $molecule;

# Two-step lookup: molecule name -> InChI -> molecular assembly data.
my $inchi   = get_inchi( $molecule, $verbose );
my $ma_data = get_ma_data( $inchi, $verbose );

# By default the service response is printed exactly as received; with
# --perl it is decoded from JSON and pretty-printed with Data::Dumper.
my $output = $ma_data;
if ($perl) {
    local $Data::Dumper::Indent   = 1;
    local $Data::Dumper::Sortkeys = 1;
    local $Data::Dumper::Terse    = 1;
    $output = Dumper( decode_json($ma_data) );
}
print $output;
# uri_encode($molecule)
#
# Percent-encode a string for use inside a URL.
#
# Every octet is written as %XX (two upper-case hex digits), including
# letters and digits. Encoding everything is deliberate: the result contains
# only '%' and hex digits, so it is safe to drop into a URL path or a
# command line without further quoting.
#
# Strings whose characters are all in the range 0..255 are encoded one
# character per octet. A string containing any code point above 255 has no
# single-octet form, so it is converted to its UTF-8 octets first; otherwise
# those characters would be silently dropped.
#
# Returns the encoded string; an undefined or empty argument yields ''.
sub uri_encode {
    my $molecule = shift // '';

    # Only convert when a wide character is present, so byte strings (for
    # example raw @ARGV input) are never double-encoded.
    utf8::encode($molecule) if $molecule =~ /[^\x00-\xFF]/;

    return join '', map { sprintf '%%%02X', ord($_) } split //, $molecule;
}
# get_inchi($molecule, $verbose)
#
# Look up the standard InChI identifier for a molecule name by querying the
# cactus.nci.nih.gov structure resolver through curl.
#
#   $molecule - molecule name as typed by the user
#   $verbose  - when true, the curl command and the InChI found are echoed
#               with warn
#
# Returns the InChI string with its trailing newline removed. Does not
# return on failure: exits with status 1 when the response is empty (or
# otherwise false) and with status 2 when it does not start with "InChI=".
sub get_inchi {
    my ( $molecule, $verbose ) = @_;

    # The name is percent-encoded before it is placed in the URL path.
    my $url = 'https://cactus.nci.nih.gov/chemical/structure/'
        . uri_encode($molecule)
        . '/stdinchi';
    my $command = "curl -s $url";
    warn $command if $verbose;

    my $inchi = `$command`;
    chomp $inchi;

    # Nothing usable came back at all.
    unless ($inchi) {
        warn "Could not determine InChI for $molecule";
        exit 1;
    }

    # Something came back, but it is not an InChI identifier.
    unless ( $inchi =~ /\AInChI=/ ) {
        warn "Could not determine InChI for '$molecule'";
        exit 2;
    }

    warn "InChI: $inchi" if $verbose;
    return $inchi;
}
# get_ma_data($inchi, $verbose)
#
# Ask the batch_lookup service at croninburgh.chem.gla.ac.uk for the
# molecular assembly data of an InChI identifier, using curl.
#
#   $inchi   - InChI string to look up (sent as query parameter i0)
#   $verbose - when true, the curl command is echoed with warn
#
# Returns the raw response body exactly as curl printed it. Does not return
# on failure: warns and exits with status 2 when curl cannot be started or
# the response is empty.
sub get_ma_data {
    my ( $inchi, $verbose ) = @_;

    # The InChI comes from a remote server, so it must never pass through a
    # shell. The command is kept as an argument list and run with the list
    # form of pipe open, which hands each element to curl untouched.
    my @command = (
        'curl', '-s', '-G',
        'https://croninburgh.chem.gla.ac.uk/batch_lookup',
        '--data-urlencode', "i0=$inchi",
        '--data-urlencode', 'n=1',
    );
    if ($verbose) {
        warn "@command";
    }

    my $response = '';
    if ( open my $curl, '-|', @command ) {
        local $/;    # slurp the whole response in one read
        $response = <$curl> // '';

        # A non-zero curl exit status is not treated as fatal by itself; an
        # empty response is caught by the check below.
        close $curl;
    }
    else {
        warn "Could not run curl: $!";
    }

    if ( !$response ) {

        # Report the value this sub was actually given rather than reaching
        # for a variable owned by the calling script.
        warn "Could not determine molecular assembly index for $inchi";
        exit 2;
    }
    return $response;
}
__END__ | |
=head1 NAME | |
ma.pl - Given a molecule name, print the molecular assembly number | |
=head1 USAGE | |
$ perl ma.pl tryptophan | |
[{"MA":11,"inchi":"InChI=1S/C11H12N2O2/c12-9(11(14)15)5-7-6-13-10-4-2-1-3-8(7)10/h1-4,6,9,13H,5,12H2,(H,14,15)/t9-/m0/s1","method":"exact"}] | |
$ perl ma.pl iso-propyl cyanide | |
[{"MA":3,"inchi":"InChI=1S/C4H7N/c1-4(2)3-5/h4H,1-2H3","method":"exact"}] | |
Given a molecule name, this script attempts to print out the molecular assembly data. (The C<MA>
number is the assembly index.)
This data is currently being used, amongst other things, for searching for extraterrestrial | |
life. | |
http://molecular-assembly.com/ | |
=head1 OPTIONS | |
=over 4 | |
=item B<--perl,-p> | |
Print the data as Perl code. | |
=item B<--verbose,-v> | |
Print out the commands being run, along with the InChI code found (if any).
=back |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment