Skip to content

Instantly share code, notes, and snippets.

@nickpettican
Last active November 14, 2015 19:20
Show Gist options
  • Save nickpettican/55cc6b05fa9ea186a0b3 to your computer and use it in GitHub Desktop.
Save nickpettican/55cc6b05fa9ea186a0b3 to your computer and use it in GitHub Desktop.
Programming Project
#!/usr/bin/perl -w
use strict;
use warnings;
my $filename = 'dummy.pepmasses';
my $from = 0;
my $to = 0;
print "What range of information do you want to know ?\n";
{
my $str = <STDIN>;
chomp $str;
($from, $to) = split /\.\./, $str;
}
# open file
my $ifile_fh;
open $ifile_fh, '<', $filename or
die "Cannot open file $filename: $!\n";
# skip header
readline $ifile_fh;
my $prepnumber = 0;
my $summass = 0;
while (defined (my $line = <$ifile_fh>) ) {
# read each line in file
chomp $line;
my $mz_value = (split /\s+/, $line)[2];
if ($mz_value >= $from and $mz_value <= $to) {
# print matched line
print "$line\n";
++$prepnumber;
$summass = $mz_value+$summass;
}
}
print "prepnumber=$prepnumber\n" if $prepnumber;
print "average m/z number = $summass/$prepnumber\n";
close $ifile_fh;
exit;
#!/usr/bin/perl -w
#
BEGIN{
use File::Basename;
push @INC,
use Getopt::Long
#
my $binwidth;
my $bintarget;
GetOptions ("binwidth" => \$binwidth,
"bintarget" => \$bintarget);
if (GetOptions("binwidth" => \$binwidth,
"bintarget" => \$bintarget) {
print "OK";}
exit;
#!/usr/bin/perl -w
#
use strict;
use warnings;
use Statistics::Descriptive::LogScale;
my $stat = Statistics::Descriptive::LogScale->new (
base => 1.01,
linear_width => 0.001,
);
while (<>) {
$stat->add_data($_) for /(-?\d+(?:\.\d*)?)/g;
};
# add more data....
printf " minimal value: %f\n", $stat->min;
printf "%3uth percentile: %f\n", $_*10, $stat->percentile($_*10)
for 1..10;
#!/usr/bin/env perl
# SUMMARY
#
# This is a script that demonstrates the very basic abilities
# of Statistics::Secriptive::LogScale.
#
# It reads numbers from STDIN or input files
# and adds them to statistical data pool
# then prints a summary with mean, median, percentiles etc.
use strict;
use warnings;
# We would to have some extra modules, but that's optional.
my $can_size = eval { require Devel::Size; 1; };
my $can_getopt = eval { require Getopt::Long; 1; };
# Always prefer local version of our module
use FindBin qw($Bin);
use lib "$Bin/../lib";
use Statistics::Descriptive::LogScale;
# These are our storage parameters
my $base;
my $linear;
# Parse options, if absolutely needed
my $want_options = grep { qr/^-/ } @ARGV;
if ( $want_options ) {
if (grep { $_ eq '--help' } @ARGV) {
print "Usage: $0 [--base <1+small o> --precision <nnn>]\n";
print "Read numbers from STDIN, output stat summary\n";
print "NOTE that specifying options requires Getopt::Long";
exit 1;
};
$can_getopt or die "Options given, but Getopt::Long not detected!";
Getopt::Long->import;
GetOptions (
'base=s' => \$base,
'precision=s' => \$linear,
);
};
# HERE WE GO
# Initialize statistical storage
my $stat = Statistics::Descriptive::LogScale->new(
base => $base, linear_width => $linear);
# Read input
# We want to catch scientific notation as well.
my $re_num = qr/(?:[-+]?(?:\d+\.?\d*|\.\d+)(?:[Ee][-+]?\d+)?)/;
while (<>) {
$stat->add_data(/($re_num)/g);
};
# Print the most basic statistics. These can be done precisely in O(1) memory.
# See Statistics::Descriptive.
printf "Count: %u\nAverage/std. deviation: %f +- %f\nRange: %f .. %f\n",
$stat->count, $stat->mean, $stat->standard_deviation,
$stat->min, $stat->max;
# These two can be done in O(1) as well... But nobody does.
printf "Skewness: %f; kurtosis: %f\n",
$stat->skewness, $stat->kurtosis;
# The following requires storing data in memory
# Trimmed mean is the average of data w/o outliers
# in this case, 25% lowest and 25% highest numbers were discarded
printf "Trimmed mean(0.25): %f\n",
$stat->trimmed_mean(0.25);
# Print percentiles.
# Xth percentile is the point below which X% or data lies.
foreach (0.5, 1, 5, 10, 25, 50, 75, 90, 95, 99, 99.5) {
my $x = $stat->percentile($_);
$x = "-inf" unless defined $x;
printf "%4.1f%%: %f\n", $_, $x;
};
# Print how much memory we used (if possible)
if ($can_size) {
print "Memory usage: ".Devel::Size::total_size($stat)."\n";
};
# downloads Boston dataset from STATLIB
use LWP::Simple;
getstore(
"http://lib.stat.cmu.edu/datasets/boston",
"boston.raw" );
# corrects for record spanning two lines
open( IN, "boston.raw" );
open( OUT, ">boston.asc" );
do { $line = <IN> } until $. == 22
or eof; # Skips the first 22 lines of header
while ( $line = <IN> ) {
chomp $line;
$line .= <IN>; # joins two lines
print OUT $line;
}
close(OUT);
# sends data to R for regression analysis
open( RPROG,
"| c:/rw1081/bin/Rterm.exe --no-restore --no-save"
);
select(RPROG);
print <<’CODE’;
bost<-read.table("boston.asc",header=F)
names(bost)<- c( "CRIM", "ZN", "INDUS",
"CHAS", "NOX", "RM",
"AGE", "DIS", "RAD",
"TAX", "PTRAT", "B",
"LSTAT", "MEDV")
attach(bost)
boston.fit <- lm(log(MEDV) ~ CRIM + ZN + INDUS +
CHAS + I(NOX^2) + I(RM^2) + AGE + log(DIS) +
log(RAD) + TAX + PTRAT + B + log(LSTAT))
sum <- summary(boston.fit)$coe[,1:2]
write.table(sum,"boston.out",quote = FALSE)
q()
CODE
close(RPROG);
# creates LaTeX table with regression results
open( TABLE, "boston.out" );
open( TEX, ">table.tex" );
$prec = 3; # sets number of decimals
$width = 9; # sets the width of the field
do { <TABLE> } until $. == 1 or eof; # Skips the first line of header
while (<TABLE>) {
chomp;
@line = split;
printf TEX "%11s & %${width}.${prec}g & %${width}.${prec}g\\\\ \n", $line[0],
$line[1], $line[2];
}
close(TABLE);
#!/usr/bin/perl -w
use strict;
use Getopt::Long;
print "Please enter criteria for bin width\n";
my $data = "data_file";
my $binwidth = <>;
print "Please enter criteria for bin target\n";
my $bintarget = <>;
GetOptions ("length=i" => \$binwidth,
"file=s" => \$data,
"binlength" => \$bintarget)
or die("Error, could not run. Try again.\n");
exit;
#!/usr/bin/perl
# Program to extract ion intensities from a .peptSpectra.xml file
# Copyright (C) 2005 Jacques Colinge
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
# Contact:
# Prof. Jacques Colinge
# Upper Austria University of Applied Sciences at Hagenberg
# Hauptstrasse 117
# A-4232 Hagenberg, Austria
# http://www.fhs-hagenberg.ac.at
Show 31 lines of Pod
BEGIN{
use File::Basename;
push @INC, (dirname $0).'/../../lib';
}
use strict;
use XML::Parser;
use Getopt::Long;
use InSilicoSpectro;
use InSilicoSpectro::InSilico::MassCalculator;
use InSilicoSpectro::InSilico::Peptide;
use InSilicoSpectro::Spectra::PeakDescriptor;
use InSilicoSpectro::Spectra::ExpSpectrum;
use InSilicoSpectro::InSilico::MSMSTheoSpectrum;
use InSilicoSpectro::InSilico::MSMSOutput;
my ($help, $peptFile, $imposedCharge, $relint, $rFormat);
my $fragTypes = 'a,b,b-NH3,b-H2O,b++,y,y-NH3,y-H2O,y++';
my $tol = 600;
my $minTol = 0.2;
my $intSel = 'order';
my $matchSel = 'closest';
my $withPlots;
my $configFile;
if (!GetOptions('help' => \$help,
'h' => \$help,
'configfile=s' => \$configFile,
'pept=s' => \$peptFile,
'charge=i' => \$imposedCharge,
'tol=f' => \$tol,
'mintol=f' => \$minTol,
'intsel=s' => \$intSel,
'matchsel=s' => \$matchSel,
'withplots' => \$withPlots,
'rformat' => \$rFormat,
'frag=s' => \$fragTypes) || defined($help) || !defined($peptFile)){
print STDERR "Usage: getIonIntensities.pl --pept=peptfile [options]
--configFile=string xml configuration file for Masscalculator.pm
--frag=string list of fragment types separated by comas, default is [$fragTypes]
--charge=int only look at peptides at this charge state, default is no imposed charge
--tol=float relative mass error tolerance in ppm, default [$tol]
--mintol=float absolute mass error tolerance in Da, default [$minTol]
--intsel=string select the intensity normalization (original|log|order|relative)
--matchsel=string select the match algorithm (closest|greedy|mostintense), the order is given by frag in case of greedy
-withplots generates individual plots for all the matches (png format)
-rformat formats the output for R
-help
-h\n";
print "'$peptFile'\n";
exit(0);
}
if($configFile){
InSilicoSpectro::init($configFile);
}
else{
InSilicoSpectro::init();
}
my @fragTypes = split(/,/, $fragTypes);
my %intensStat;
my $nPlot = 0;
open(F, $peptFile);
my $parser = new XML::Parser(Style => 'Stream');
$parser->parse(\*F);
close(F);
my $maxLen = 0;
foreach my $frag (keys(%intensStat)){
if ((@{$intensStat{$frag}} > $maxLen)){
$maxLen = scalar(@{$intensStat{$frag}});
}
}
if (defined($rFormat)){
foreach my $frag (sort InSilicoSpectro::InSilico::MSMSOutput::cmpFragTypes (keys(%intensStat))){
print "\"$frag\"\t";
}
print "\n";
for (my $i = 1; $i <= $maxLen; $i++){
print "\"$i\"";
foreach my $frag (sort InSilicoSpectro::InSilico::MSMSOutput::cmpFragTypes (keys(%intensStat))){
if (defined($intensStat{$frag}[$i-1])){
printf "\t%.3f", $intensStat{$frag}[$i-1];
}
else{
print "\tNA";
}
}
print "\n";
}
}
else{
foreach my $frag (sort InSilicoSpectro::InSilico::MSMSOutput::cmpFragTypes (keys(%intensStat))){
print $frag;
foreach (@{$intensStat{$frag}}){
print "\t$_";
}
print "\n";
}
}
# ------------------------------------------------------------------------
# XML parsing
# ------------------------------------------------------------------------
my ($curChar, $massIndex, $intensityIndex, $charge, $peptide, $modif, $peaks, $itemIndex);
sub Text
{
$curChar .= $_;
} # Text
sub StartTag
{
my ($p, $el) = @_;
if ($el eq 'ple:ItemOrder'){
$itemIndex = 0;
}
elsif ($el eq 'ple:item'){
if ($_{type} eq 'mass'){
$massIndex = $itemIndex;
}
elsif (($_{type} eq 'intensity') || ($_{type} eq 'height')){
$intensityIndex = $itemIndex;
}
$itemIndex++;
}
undef($curChar);
} # StartTag
sub EndTag
{
my($p, $el)= @_;
if ($el eq 'idi:sequence'){
$peptide = $curChar;
}
elsif ($el eq 'idi:modif'){
$modif = $curChar;
}
elsif ($el eq 'idi:charge'){
$charge = $curChar;
}
elsif ($el eq 'ple:peaks'){
$peaks = $curChar;
}
elsif (($el eq 'idi:OneIdentification') && (!defined($imposedCharge) || ($charge == $imposedCharge))){
# Builds the peak list
my @lines = split(/\n/, $peaks);
my (@peaks, $maxIntensity);
foreach my $line (@lines){
$line =~ s/\r//g;
my @part = split(/\s+/, $line);
if ($part[$massIndex] > 0.0){
push(@peaks, [$part[$massIndex], $part[$intensityIndex]]);
$maxIntensity = $part[$intensityIndex] if ($part[$intensityIndex] > $maxIntensity);
}
}
if (defined($maxIntensity)){
# Matches with theoretical masses
my %spectrum;
if ($matchSel eq 'closest'){
matchSpectrumClosest(pept=>$peptide, modif=>$modif, spectrum=>\%spectrum, expSpectrum=>\@peaks, fragTypes=>\@fragTypes);
}
elsif ($matchSel eq 'greedy'){
matchSpectrumGreedy(pept=>$peptide, modif=>$modif, spectrum=>\%spectrum, expSpectrum=>\@peaks, fragTypes=>\@fragTypes, order=>\@fragTypes, tol=>$tol);
}
elsif ($matchSel eq 'mostintense'){
matchSpectrumGreedy(pept=>$peptide, modif=>$modif, spectrum=>\%spectrum, expSpectrum=>\@peaks, fragTypes=>\@fragTypes, tol=>$tol);
}
else{
CORE::die("Unknown match type [$matchSel]");
}
if (defined($withPlots)){
my $msms = new InSilicoSpectro::InSilico::MSMSOutput(spectrum=>\%spectrum, prec=>2, modifLvl=>1, expSpectrum=>\@peaks, intSel=>'order', tol=>$tol, minTol=>$minTol);
$msms->plotSpectrumMatch(fname=>"$peptide-$$-$nPlot", format=>'png', fontChoice=>'default:Large', changeColModifAA=>1, legend=>'right', plotIntern=>1);
$nPlot++;
}
# Normalizes intensities
my %normInt;
normalizeIntensities($intSel, \@peaks, \%normInt);
# Extracts statistics
my $len = length($peptide);
foreach my $frag (keys(%{$spectrum{mass}{term}})){
for (my $i = 0; $i < @{$spectrum{ionType}{$frag}}; $i++){
for (my $j = $i*$len; $j < ($i+1)*$len; $j++){
if (defined($spectrum{mass}{term}{$frag}[$j]) && defined($spectrum{match}{term}{$frag}[$j])){
my $theoMass = $spectrum{mass}{term}{$frag}[$j];
my $expMass = $spectrum{match}{term}{$frag}[$j][0];
if ((abs($theoMass-$expMass)/($theoMass+$expMass)*2.0e6 <= $tol) || (abs($theoMass-$expMass) <= $minTol)){
push(@{$intensStat{$spectrum{ionType}{$frag}[$i]}}, $normInt{$spectrum{match}{term}{$frag}[$j][0]});
}
}
}
}
}
foreach my $frag (keys(%{$spectrum{match}{intern}})){
foreach my $aa (keys(%{$spectrum{match}{intern}{$frag}})){
if (defined($spectrum{match}{intern}{$frag}{$aa})){
my $theoMass = $spectrum{mass}{intern}{$frag}{$aa};
my $expMass = $spectrum{match}{intern}{$frag}{$aa}[0];
if ((abs($theoMass-$expMass)/($theoMass+$expMass)*2.0e6 <= $tol) || (abs($theoMass-$expMass) <= $minTol)){
push(@{$intensStat{"$frag($aa)"}}, $normInt{$spectrum{match}{intern}{$frag}{$aa}[0]});
}
}
}
}
}
}
} # EndTag
#!/usr/bin/perl -w
#
use strict;
use Getopt::Long;
# Defining bin size and parameters
my $data = "table.tsv";
my $AAfilter = "None";
print "Before handling options, the AA filter is: $AAfilter\n";
handleopts();
print "After handling options, the AA filter is: $AAfilter\n";
print "Please enter criteria for bin width\n";
my $binwidth = <>;
print "Please enter criteria for bin length\n";
my $binlength = <>;
sub handleopts {
if (!GetOptions (
"binwidth=i" => \$binwidth,
"file=s" => \$data,
"binlength=i" => \$binlength,
"aa-filter=s" => \$AAfilter)){
print STDERR "Error, failed to obtain information from user\n";
print "Help_page\n";
exit(0);
}
}
print "\n";
# Opening the data file
open(IFILE, $data) or die "Error, could not open input file\n";
my @lines = <IFILE>;
foreach my $line (@lines) {
processMzValue($line, $AAfilter);
}
# Reading the data
sub processMzValue {
# take the first parameter to the subroutine as a scalar variable, called line
my $line = shift;
my $AAfilter = shift;
if ($AAfilter ne "None") {
# For now this is not implemented
# Will filter for sequence column to generate stats
print "$AAfilter";
}
next if ($line =~ /^$/);
# this error-proofs the read so that at every loop it will ignore the spaces
my @column = split( /\t|\n/, $line );
# splits the column
my $mzcol = $column[2];
my ($mzcolmatch) = $mzcol =~ m#(\d+)#;
if ( $mzcol =~ /(\d+)/ ) {
my $mzcolmatch = $1;
# capture m/z value for the current row in table
print "$mzcolmatch\n" or die "Nope";
}
}
print "\n\nAll done\n";
exit;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment