#!perl -T

use strict;
use warnings;

use bignum;
use Getopt::Long;
use Pod::Usage;

# Command-line options.  Getopt::Long stores each value under the FIRST name
# of its specification (e.g. 'variance|var' ends up in $opt{variance}), so
# every later lookup must use that primary name.
my %opt;
GetOptions(
    \%opt,

    # functions
    'N|n|count',
    'max',
    'mean|avg|m',
    'median',
    'min',
    'mode',
    'percentile=i',
    'quartile=i',
    'sd|stdev',
    'sum|s',
    'variance|var',

    # predefined sets
    'summary|five',
    'complete|all',
    'default|basic',

    # output control
    'delimiter|d=s',
    'format|fmt|f=s',
    'no-header|nh',
    'transverse-output|to',
    'quiet|q',
    'help|h',
) or pod2usage(1);

# Ordered column lists for the predefined sets.  The names must match the
# keys of %st (and therefore the primary option names) or the column is
# silently dropped from the output.
my %predefined = (
    complete => [
        qw/N min q1 median q3 max sum mean sd variance percentile quartile mode/
    ],
    summary => [qw/min q1 median q3 max/],
    default => [qw/N min max sum mean sd/],
);

pod2usage(1) if $opt{help};

# Validate numeric option arguments before consuming any input, so a bad
# argument is reported immediately instead of after the whole stream is read.
if ( defined $opt{quartile}
    and ( $opt{quartile} < 0 or $opt{quartile} > 4 ) )
{
    die "Invalid quartile\n";
}
if ( defined $opt{percentile}
    and ( $opt{percentile} < 0 or $opt{percentile} > 100 ) )
{
    die "Invalid percentile ($opt{percentile})\n";
}

my ( $min, $max );
my ( $n, $sum, $mean ) = ( 0, 0, 0 );
my ( %count, $most_common );

# some functions require the full dataset
my @data;
my $keep_data = (
         defined $opt{median}
      || defined $opt{percentile}
      || defined $opt{quartile}
      || defined $opt{summary}
      || defined $opt{complete}
);

# some functions require frequencies (mode is part of the complete set too)
my $keep_frequencies = ( defined $opt{mode} || defined $opt{complete} );

# Running sum of squared deviations from the mean (Welford's online update).
my $M2 = 0;

# Accepted input: optional sign, then an integer, a decimal, or either of
# those followed by an exponent.  Both "E" and "e" are accepted so that the
# output of printf-style "%e" formats can be fed back in.
my $number_re = qr{
    ^ [+-]?
    (?:
        \. ? [0-9]+
      | [0-9]+ \. [0-9]*
      | \. ? [0-9]+ [eE] [+-]? [0-9]+
      | [0-9]* \. [0-9]+ [eE] [+-]? [0-9]+
    )
    $
}x;

# read data: one number per line, from the files named in @ARGV or STDIN
while ( my $num = <> ) {
    chomp $num;

    if ( $num !~ $number_re ) {
        warn "Invalid number '$num'\n" unless $opt{quiet};
        next;
    }

    $n++;

    # The second factor deliberately uses the mean AFTER it has been updated.
    my $delta = $num - $mean;
    $mean += $delta / $n;
    $M2   += $delta * ( $num - $mean );

    push @data, $num if $keep_data;

    if ($keep_frequencies) {
        $count{$num}++;
        $most_common = $count{$num}
          if ( !defined $most_common or $count{$num} > $most_common );
    }

    $min = $num if ( !defined $min or $num < $min );
    $max = $num if ( !defined $max or $num > $max );

    $sum += $num;
}

# silently exit if $n == 0
exit if ( !$n );

# Order statistics.  Each request is handled independently so that options
# can be combined (e.g. --median --percentile=90, or --complete --quartile=1).
my ( %summary, $percentile, $quartile, $median );

if ( $opt{summary} or $opt{complete} ) {
    my ( $q1, $q2, $q3 ) = percentiles( \@data, 25, 50, 75 );
    %summary = (
        min    => $min,
        q1     => $q1,
        median => $q2,
        q3     => $q3,
        max    => $max,
    );
}

if ( defined $opt{percentile} ) {
    ($percentile) = percentiles( \@data, $opt{percentile} );
}

if ( defined $opt{quartile} ) {
    ($quartile) = percentiles( \@data, $opt{quartile} * 25 );
}

if ( defined $opt{median} ) {
    ($median) = percentiles( \@data, 50 );
}

# Sample variance (n - 1 denominator); undefined for a single observation.
my $variance = $n > 1            ? $M2 / ( $n - 1 ) : undef;
my $sd       = defined $variance ? sqrt($variance)  : undef;

my @mode = grep { $count{$_} == $most_common } keys %count;
my $mode = scalar @mode == 1 ? $mode[0] : undef;   # don't deal with multimodes

# Every statistic that can be printed, keyed by column name.  %summary comes
# last so that its min/median/max entries take precedence when it is filled.
my %st = (
    N          => $n,
    max        => $max,
    mean       => $mean,
    median     => $median,
    min        => $min,
    mode       => $mode,
    percentile => $percentile,
    quartile   => $quartile,
    sd         => $sd,
    sum        => $sum,
    variance   => $variance,
    %summary,
);

# Remove the output-control options from %opt; whatever is left afterwards
# is the set of statistics the user asked for.
my $delimiter  = delete $opt{'delimiter'};
my $format     = delete $opt{'format'} || '%.2f';
my $no_header  = delete $opt{'no-header'};
my $transverse = delete $opt{'transverse-output'};
my $quiet      = delete $opt{'quiet'};

# Fall back to a tab only when no delimiter was given (or it is empty);
# a plain truth test would also discard the valid delimiter "0".
$delimiter = "\t" unless defined $delimiter and length $delimiter;

# Translate the two supported backslash escapes typed literally in the shell.
if ( $delimiter =~ /^\\[a-z]$/ ) {
    $delimiter =
        $delimiter eq '\t' ? "\t"
      : $delimiter eq '\n' ? "\n"
      :                      die "Invalid delimiter: '$delimiter'\n";
}

# Keep only the first recognised numeric conversion from the format string.
if ( $format =~ m{( \%[0-9]*\.?[0-9]* [deEfgGi] )}x ) {
    $format = $1;
}
else {
    die "Invalid format: '$format'\n";
}

# Choose the columns: a predefined set, or the individually requested
# statistics in the order of the complete set.
my @fields =
    !%opt || $opt{default} ? @{ $predefined{default} }
  : $opt{complete}         ? @{ $predefined{complete} }
  : $opt{summary}          ? @{ $predefined{summary} }
  :   grep { defined $opt{$_} } @{ $predefined{complete} };

# Drop statistics that could not be computed (e.g. sd of a single value).
@fields = grep { defined $st{$_} } @fields;

# A single statistic is printed bare: no header, no format applied.
if ( scalar @fields == 1 ) {
    print "$st{$fields[0]}\n";
    exit;
}

if ($transverse) {
    for my $field (@fields) {
        print "$field$delimiter" unless $no_header;
        print sprintf( $format, $st{$field} ), "\n";
    }
}
else {
    print join( $delimiter, @fields ), "\n" unless $no_header;
    print join( $delimiter, map { sprintf( $format, $st{$_} ) } @fields ),
      "\n";
}

exit;

###

# percentiles(\@data, @p)
#
# Returns one value per requested percentile in @p (each 0..100), in the
# same order.  @data is sorted numerically into a private copy; the caller's
# array is not modified.  When the fractional position p * (last index) / 100
# is a whole number the element at that position is returned, otherwise the
# average of the two neighbouring elements.
#
# Dies with "Invalid percentile (p)" if any p is outside 0..100.
sub percentiles {
    my ( $data, @p ) = @_;

    my @sorted = sort { $a <=> $b } @{$data};

    my @percentiles = ();

    for my $p (@p) {
        if ( $p < 0 or $p > 100 ) {
            die "Invalid percentile ($p)\n";
        }

        my $index = $p * $#sorted / 100;

        # A fractional $index is truncated when used as an array subscript.
        my $value =
            $index == int($index)
          ? $sorted[$index]
          : ( $sorted[$index] + $sorted[ $index + 1 ] ) / 2;

        push @percentiles, $value;
    }

    return @percentiles;
}

__END__

=head1 NAME

st - statistics from the command line interface (CLI)

=head1 DESCRIPTION

"st" is a command-line tool to calculate statistics from a file or
standard input.

=head1 USAGE

    st [file ...]

    st [options] [file ...]

=head2 OPTIONS

If no options are used, C<st> will print:

    N min max sum mean sd

The following options are available:

=head2 OUTPUT

    --N|n|count
    --max
    --mean|avg|m
    --median
    --min
    --mode
    --sd|stdev
    --sum|s
    --variance|var

    --percentile=<0..100>
    --quartile=<1..3>

    --summary   # five-number summary: min q1 median q3 max
    --complete  # everything

=head2 FORMAT

    --delimiter|d=<value>   # default: "\t"
    --format|fmt|f=<value>  # default: "%.2f"

    --no-header|nh          # don't display header
    --transverse-output|to  # output in multiple lines

    --quiet|q               # silently skip invalid input
    --help|h                # show usage and exit

=head2 EXAMPLES

    st [file ...]
    st --summary [file ...]
    st --complete [file ...]
    st --complete --transverse-output [file ...]
    st --no-header --delimiter='\n' --format='%.1e' [file ...]

=head1 AUTHOR

Nelson Ferraz

=head1 CONTRIBUTORS

imurray, who suggested a different algorithm for calculating variance.

asgeirn, who suggested an input filter and helped to remove some warnings.

gabeguz, who modified the script to make it more portable.

Send comments, suggestions and bug reports to:

https://github.com/nferraz/st/issues

Or fork the code on github:

https://github.com/nferraz/st

=head1 COPYRIGHT

Copyright (c) 2013 Nelson Ferraz.

This program is free software; you can redistribute it and/or modify
it under the MIT License (see LICENSE).