#!/bin/sh
# Release number: printed by '-V', in the headline of the short help ('-h')
# and as part of the '--version' output.
version=2.2.1
# Copyright and license notice printed after the version by '--version'.
license="Copyright (C) 1997, 2001, 2006-2007, 2009, 2011-2012 Dimitar Ivanov

License: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law."
#set -vx
################################################################################
#
# mintegrate - compute integral or derivative of 1-d numerical data using awk
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
################################################################################
xc=1              # x column ('-x'); 0 lets a running record index serve as x
yc=1              # f(x) column ('-y'): by default column 1 is summed up
dx=1.0            # x delta interval ('-d')
x0=00; x1=00      # x data range ('-r'); "00" marks a limit as not set
pr_digits="%.10g" # Print format of the result data ('-p')
################################################################################

### Procedure for testing that program is working correctly
#
# Runs every self-test case stored at the end of this script. A test case is
# a line that starts with _TEST_CASE_ and holds four ';'-separated fields:
#    <id> ; <description> ; <command-line arguments> ; <expected output>
# For each case the sample data found between the data start/end marker
# lines of this script (lines containing '#' are dropped) are piped into the
# script itself ($0), called with the arguments of the case. The arguments
# are evaluated by 'eval', thus they may contain further pipeline stages
# such as "|$0 -y2". Blanks are removed from the actual and from the
# expected output before both are compared.
#
# Outputs: one line "Testing <description> : ok|failed" per case on stdout
# Returns: 0 when all cases passed, 1 when at least one case failed
# Globals: sets and exports LC_ALL=C and sets IFS to ";"
#
test_myself_functional () {
    LC_ALL=C
    export LC_ALL
    IFS=";"
    grep "^_TEST_CASE_" "$0" \
    | (
        # Explicit subshell: its exit status becomes the status of the
        # pipeline and therefore of the function, in every shell flavor
        rc=0
        while read -r test
        do
              # Unquoted on purpose: IFS=";" splits the line into its fields
           set -- $test
           result=`\
                     sed -n '/^#_DATA_START_/,/^#_DATA_END_/p' "$0" \
                     |grep -v '#' |eval '"$0"' "$3" |tr -d ' '
                  `
           expected="`echo "$4" |tr -d ' '`"
           if [ "$result" = "$expected" ]; then
               status=ok
           else
               status=failed
               rc=1
           fi
              # $2 stays unquoted as before; with IFS=";" its blanks survive
           echo Testing $2 \: $status
        done
        exit $rc
      )
}

### Get options function
#
# Normalizes a command line so that the caller can walk through it word by
# word: clustered flags are separated, option arguments become own words and
# a "--" is put in front of the remaining operands.
#
# Arguments: $1     - option string in getopt/getopts syntax
#            $2...  - the command-line words to parse
# Outputs:   the normalized command line as a single line on stdout; after a
#            parsing error "-h" is emitted in place of the offending part, so
#            that the caller ends up showing the short help
# Returns:   0 on success, 1 after a parsing error
# Globals:   writes opts, getopt_rc, getopt, options and Option
#
getoptions() {
   opts=$1
   shift
   getopt_rc=0

      # Prefer an external getopt program; 'command -v' is the POSIX way to
      # look it up. Anything that is not an absolute path (a function or an
      # alias, for instance) is ignored.
   getopt=`command -v getopt 2>/dev/null`
   case $getopt in
      /*) ;;
      *)  getopt= ;;
   esac

   if [ -n "$getopt" ]; then
            # $* is split on purpose: traditional getopt works on plain words
         options=`"$getopt" "$opts" $*` || { options="-h"; getopt_rc=1; }
   else # build-in function
      options=
      while getopts "$opts" Option
      do
         [ "$Option" != '?' ] || { Option=h; getopt_rc=1; }
         options="$options -$Option $OPTARG"
      done
      shift `expr $OPTIND - 1`
      options="$options -- $*"
   fi
      # Unquoted: collapses the collected words into one blank-separated line
   echo $options

return $getopt_rc
}

# To create a man page try:
# mintegrate --help |sed -ne "s/Usage:/[SYNOPSIS]\n/;s/mintegrate/\nmintegrate/g;/SYNOPSIS/,/Options:/p" |grep -v Options: > /tmp/synopsis.txt ; help2man -N -n "`mintegrate -h |grep ^mintegrate |cut -f2- -d:`" mintegrate -I /tmp/synopsis.txt |sed 's/ \\fB/ /g' |man -l -

# Horizontal rule of 80 dashes that frames the headline of the usage text.
# It is built from a zero-padded number: the '0' flag is only defined for
# numeric conversions, zero-padding a string ("%080s") is undefined.
separator=`printf '%080d' 0 |tr 0 -`
### Print usage
#
# Writes the complete help text to stdout.
#
# Arguments: $1 - headline, printed between two $separator lines
#            $2 - filler put between every option name and its description
#                 (the callers in this script pass " " or "-")
# Globals:   reads separator, progname, xc, yc and pr_digits
#
# The short help ('-h') is this very text filtered down to the lines that
# start with the program name, "Usage:", "Options:" or with a dash after
# optional blanks; keep that in mind when changing the layout.
#
_print_usage_ () {
cat << HELP
$separator
$1
$separator

Usage: $progname [OPTION]... [FILE]

Options:
    -a         $2 compute mean value (arithmetic average) and standard deviation
    -c         $2 compute integral on closed x-data interval;
                 In case that dx is not specified by the '-d' flag, the data
                 are supposed to be from an irregular x-grid, and dx is computed
                 separately for every x-interval. The integral is computed
                 by the trapezoidal rule.
    -d <float> $2 compute integral on open x-data interval with the specified dx;
                 Can be used also in combination with '-D' and '-c'.
    -D         $2 compute difference btw. numbers or derivative of the y-data;
                 In the default scenario where x- and y-data column are same,
                 the difference btw. the current and the previous data value
                 will be output. In this case when '-d' is defined as 0, the 
                 x-data value will be print out in front of the calculated
                 difference. If x-and the y-column are different and if the
                 x-data resolution is not defined or it is !=0, then the
                 derivative of the y-data is calculated. When the x-data
                 resolution is constant, specify it explicitly by '-d' to
                 achieve a higher numerical precision by a 'leapfrog' algorithm.
    -x <int>   $2 x-data column (default is $xc). If 0, the x-range is an index;
    -y <int>   $2 y-data column, where y=f(x) (default is $yc)
    -r x_0:x_1 $2 x-data range to consider
    -s         $2 print out accumulated y_i sums: x_i versus accumulated f(x_i);
                 In the case of a closed integral you have to specify also the
                 x-data resolution dx (see '-d' above).
    -S         $2 compute the accumulated y_i-sums and add it to the output
    -p <str>   $2 print format of the result ("$pr_digits" is default)
    -t <str>   $2 output text in front of the result (invalid with '-s' or '-S');
                 A blank can be printed by using a double underscore character
                 '__'.
    -T         $2 run a self-test that the program is working correctly
    -V         $2 print version number
    --version  $2 output version and license message
    --help|-H  $2 display help
    -h         $2 display short help (options summary)


If none of the options '-a', '-D', '-d', or '-c' is used, then the sum of the
provided data will be computed. Empty lines or lines starting with '#' are
skipped.

This program is perfectly suitable as a basic tool for initial data analysis
and will meet the expected accuracy of a numerical solution for the most
demanding computer users and professionals. Yet be aware that, although the
computations are carried with double floating precision, the computational
techniques used for evaluating an integral or a standard deviation are
analytically low-order approximations, and thus not intended to be used for
numerical computations in engineering or mathematical sciences for cases
where an ultimate numerical precision is a must. For deeper understanding of
the topic see http://en.wikipedia.org/wiki/Numerical_analysis.

HELP
}

### Print version and license
#
# Writes "<progname> <version>" followed by the license notice to stdout.
# Globals: reads progname, version and license
#
_print_version_ () {
      printf '%s\n' "$progname $version" "$license"
}

################################################################################
#
# MAIN
#
# Name shown in the help and version output. "$0" is quoted, otherwise a
# path containing blanks is split and basename receives wrong arguments.
progname=`basename "$0"`

# The long options and '-H' are only recognized as the first argument; they
# are served here, before the short options are parsed.
case "$1" in
     --help|-H) separator=""   # the full help is printed without rule lines
                _print_usage_ "$progname is a program to compute averages, sums, integrals or derivatives of numerical 1-d data in situations where ultimate numerical precision is not needed." " "
                exit
     ;;
     --version) _print_version_
                exit
     ;;
esac
#
# Find proper awk flavor
#
# Every candidate is probed with a '-v' assignment and accepted when the
# probe prints nothing at all. The loop does not stop at the first hit, so
# the last accepted candidate in the list is the one stored in AWK.
# stderr is parked on descriptor 3 while probing, which keeps the terminal
# free of messages about missing candidates; afterwards stderr is restored
# and descriptor 3 is closed again, so that it is not inherited by every
# program started later on.
#
exec 3>&2 2>&-
for a in gawk nawk awk
do
  if [ "`echo |$a -v a=a '{}' 2>&1`" = "" ]; then
      AWK=$a
  fi
done
exec 2>&3 3>&-
if [ x"$AWK" = x ]; then
  echo "Error: can't find 'awk' supporting assignments" >&2
  exit 2
fi

# Short options understood by the program; a ':' marks an option argument
gops="VhDd:x:r:y:sSt:Tcap:"
# getoptions prints the normalized command line; after a parsing error it
# returns non-zero and emits '-h', so the short help is shown and the
# program exits with status 1 (urc).
if options=`getoptions $gops $*`; then
   urc=0
else
   urc=1
fi
set -- $options

# Walk through the normalized words up to the '--' that precedes the file
# operand. Options with an argument consume one extra word ('shift' inside
# the branch); words that match no branch are skipped silently.
while [ $# -gt 0 ]
do
   case $1 in
           # Short help: the full usage text reduced to the headline, the
           # "Usage:"/"Options:" lines and the lines starting with a dash.
           # 'grep -E' replaces the obsolescent egrep, and the blank in the
           # pattern is no longer escaped (a stray backslash in an ERE).
   -h ) _print_usage_ "$progname ${version}: evaluate average/sum/integral/derivative of 1-d numerical data" "-" \
              |grep -E "^($progname|Usage:|Options:| *-)"
        exit $urc
        ;;
   -d ) dx=$2
        dx_defined=1
        shift
        ;;
           # A derivative is evaluated by the closed-interval code path
   -D ) derivat=1
        closed=1
        ;;
   -c ) closed=1
        ;;
   -a ) dx=0
        average=1
        ;;
   -y ) yc=$2
        shift
        ;;
   -x ) xc=$2
        shift
        ;;
           # Range "x_0:x_1"
   -r ) x0=`echo "$2" |cut -f1 -d:`
        x1=`echo "$2" |cut -f2 -d:`
        data_range=1
        shift
        ;;
   -p ) pr_digits=$2
        shift
        ;;
   -s ) accu_sum=1
        ;;
   -S ) accu_sum=2
        ;;
           # Every '__' stands for a blank (formerly only the first one was
           # replaced); the quotes keep glob characters in the text intact
   -t ) text=`echo "$2" |sed "s/__/ /g"`
        shift
        ;;
   -T ) test_myself_functional
        exit
        ;;
   -V ) echo $version
        exit
        ;;
   -- ) shift
        break ;;
   esac
   shift
done

################################################################################
#
# MAIN: numerical processing
#
# The data file "$1" is streamed into the awk flavor found above. $1 is left
# unquoted on purpose: without a file operand it expands to nothing and cat
# reads standard input. The settings collected by the option parser are
# handed over as awk variables:
#
#   dx          x step: 1.0 by default, the '-d' value, or 0 after '-a'
#   dx_defined  1 when '-d' was given, empty otherwise
#   data_range  1 when '-r' was given, empty otherwise
#   xc, yc      x and y column; with xc=0 a running record index is the x value
#   x0, x1      range limits; the string "00" marks a limit as not set
#   text        text printed in front of the final result
#   pr_digits   printf format of the results
#   accu_sum    1 for '-s', 2 for '-S', empty otherwise
#   closed      1 for '-c' and for '-D'
#   average     1 for '-a'
#   derivat     1 for '-D'
#
# Outline of the awk program:
#   BEGIN      clears the accumulators, turns the range filter on when both
#              limits are set (otherwise every record is considered) and
#              selects the output format pd (pr_digits, else OFMT).
#   main rule  runs for every record that is neither blank nor a '#' comment.
#              It reads x_i and y_i, adds y_i to sum (and y_i^2 to sum2 for
#              the average). In closed mode it either prints difference or
#              derivative values or adds the term (y_i + y_i1) to sum1,
#              weighted with dx unless a fixed dx is applied later in END.
#              With accu_sum set the running total is printed per record.
#   END        prints "(awk) Error: no data to process" and exits with 7
#              when no record was counted. Otherwise it completes the
#              derivative output, or stops silently after accumulated sums,
#              or prints the optional text followed by: mean "+/-" standard
#              deviation, the closed integral (sum1 * 0.5), or sum * dx.
#
# NOTE(review): the "no data" message goes to stdout, not stderr - confirm.
# NOTE(review): with xc == yc and exactly two records END uses a plain
#               'print' (OFMT) instead of the pd format - confirm intended.
# NOTE(review): x_value_first is tested for truth, so a first x value of 0
#               looks like "not saved yet" - verify.
#
cat $1 |$AWK -v dx=$dx -v dx_defined=$dx_defined -v data_range=$data_range \
             -v yc=$yc -v xc=$xc -v x0=$x0 -v x1=$x1 -v text="$text" \
             -v pr_digits=$pr_digits -v accu_sum=$accu_sum \
             -v closed=$closed -v average=$average -v derivat=$derivat \
'BEGIN \
{
  sum=0; sum1=0; sum2=0; I=0; i=0; closed_values_read=0;

  if( x0 != "00" && x1 != "00" ) range_defined=1; 
  else                           consider_data=1;
     # Specify computational accuracy in awk
  OFMT="%.15g"
     # Print out results with accuracy
  if( pr_digits ) pd = pr_digits;
  else            pd = OFMT;
}
#
# Functions
#
function _derivat_method_find_out_()
{
  if ( range_defined && x_i > x_value_first )
       start_index = 1;
  else
       start_index = 2;
  if( xc == yc ) derivat_y_only = 1;
  if( xc != yc ) {
       if( dx_defined && dx == 0 ) derivat_x_and_y = 1;
           # Use leapfrog method dx_i = [ y_(i+1) - y_(i-1) ] / [ 2*dx ]
       if( dx_defined && dx != 1.0 ) derivat_dx_const = 1;
       if( ! dx_defined ) derivat_dx_var = 1;
  }
  if( dx ) one_two_dx = 0.5 / dx;
}

function _x_y_save_first_and_last_values_()
{
  if( ! x_value_first ) x_value_first = x_i;
  else                  x_value_last = x_i;
  if( y_i_save_current ) { y_i_save_current = 0; y_i2 = y_i; }
}

function _derivat_save_cur_prev_values_()
{
  dx_i1 = dx;
  x_i0  = x_i1;
  y_i0  = y_i1;
  x_i1  = x_i;
  y_i1  = y_i;
  y_i2  = y_i;
}

function _print_x_y_( x, y )
{
  printf( "%s "pd"\n", x, y );
}

function _print_y_( y )
{
  printf( pd"\n", y );
}

function _calculate_current_dx_( x1, x0 )
{
  _dx = x1 - x0;
  if( _dx < 0.0 ) _dx = -_dx;
  return _dx;
}

#
# Main loop: skip empty lines or comments
#
$0 !~ /^ *#/ && $0 !~ /^ *$/\
{
     # In case x-data column = 0, then index is used
  if( xc ) { x_i = $xc;    }
  else     { I++; x_i = I; }
  y_i = $yc;

  _x_y_save_first_and_last_values_();

  if( range_defined && x_i <= x1 && x_i >= x0 ) consider_data=1;
  
  if( closed && ! dx_defined )
      dx = _calculate_current_dx_( x_i, x_i1 );

     # If x-data are in the specified range
  if( consider_data ) {
      i++;
         # Integral / Sum
      sum  += y_i;

         # sum2 is used for computing the standard deviation
      if( average ) sum2 += y_i * y_i;

         # Integral or derivative over closed interval:
         # irregular grid sampling allowed
      if( closed ) {

          if( derivat ) {
              if( closed_values_read == 0 )
                  _derivat_method_find_out_();
              if( closed_values_read == start_index && ! range_defined ) {
                  if( derivat_y_only ) {
                      _print_y_( y_i1 - y_i0 );
                  } else if( derivat_x_and_y ) {
                         # The difference at the first x-data point 
                         # is defined as 'zero'
                      _print_x_y_( x_i0, "0" );
                  } else {
                         # Print out the derivative value at the first x-data
                         # point: use the same linear extrapolation backwards
                         # in both cases "derivat_dx_var" and 
                         # "derivat_dx_const"
                      dy_i1 = (y_i1 - y_i0)  / dx_i1; 
                      dy_i  = (y_i  - y_i1)  / dx;
                      slope = (dy_i - dy_i1) / dx_i1;
                      _print_x_y_( x_i0, dy_i1 - (slope * dx_i1) );
                  }
              }
              if( closed_values_read >= start_index ) {
                  if( derivat_y_only )
                      _print_y_( y_i - y_i1 );
                         # For all other print out the value at
                         # previous x-data point
                  else if( derivat_x_and_y )
                      _print_x_y_( x_i1, y_i1 - y_i0 );
                  else if( derivat_dx_const )
                      _print_x_y_( x_i1, (y_i - y_i0) * one_two_dx );
                  else
                      _print_x_y_( x_i1, (y_i1 - y_i0) / dx_i1 );
              }
          } else if ( closed_values_read > 0 ) {
              if( ! dx_defined || accu_sum )
                  sum1 += (y_i + y_i1) * dx;
              else
                  sum1 += (y_i + y_i1);
          }

          closed_values_read++;
      }

         # Accumulated values
      if( accu_sum ) {
          if( accu_sum < 2 && xc != 0 ) printf( "%s%s", $xc, FS ); # x
          else if( I )                  printf( "%s%s",   I, FS ); # index
          else                          printf( "%s%s",  $0, FS ); # line
          if( closed )                  printf( pd"\n", sum1 * 0.5 );
          else                          printf( pd"\n", sum * dx );
      }
  }

  if( closed ) {
      if( range_defined && x_i  > x1 ) next;
      if( range_defined && x_i == x1 ) y_i_save_current = 1;
          # Save current and previous values used for integral or derivative
      _derivat_save_cur_prev_values_();
  }

  if( range_defined ) consider_data=0;
}

END \
{
  if( i < 1 ) {
      print "(awk) Error: no data to process"
      exit 7;
  }

  if( derivat ) {
      if( derivat_y_only ) {
              # For the special case of computing differences where only two
              # data points are available
          if( i == 2 && ! data_range ) print x_i1 - x_i0
          exit
      }
      if( derivat_x_and_y ) {
          _print_x_y_( x_i1, y_i1 - y_i0 );
      } else if( derivat_dx_const && range_defined && x_i1 < x_value_last ) {
          _print_x_y_( x_i1, (y_i2 - y_i0) * one_two_dx );
      } else {
             # Print out the derivative value at the last x-data point:
             # the same value in both cases "derivat_dx_var" and 
             # "derivat_dx_const"
          _print_x_y_( x_i1, (y_i1 - y_i0) / dx_i1 );
      }
      exit;
  }

  if( accu_sum ) exit;

  if( text ) printf( "%s", text );

  if( average ) {
      if( i > 1 ) {
          dx=1/i;
             # Variance = [ Sum(y_i^2) - N*(y_i_mean)^2 ] / (N-1)
             # This formula magnifies roundoff errors significantly, but
             # it is an one-pass algorithm.
          sum2 = ( sum2 - dx * sum*sum ) / ( i - 1 );
          sum2 = sqrt( sum2 );
          sum  = sum * dx;
      } else {
          sum2 = 0;
      }
         # Mean value and standard deviation
      printf( pd"%s+/-%s"pd"\n", sum, FS, FS, sum2 );
  } else {
         # Integral (closed) - trapezoidal rule
      if( dx_defined && ! accu_sum ) sum1 = sum1 * dx;
      if( closed ) printf( pd"\n", sum1 * 0.5 );
         # Integral (open) / Sum
      else         printf( pd"\n", sum * dx );
  }
}'

# Leave with the status of the pipeline, i.e. of awk as its last stage.
# Nothing below this line is ever executed.
exit $?

   # Self-test suite used by the '-T' option. It is never executed: the
   # script exits above, and the whole test suite is put in a variable only
   # in order to satisfy 'sh -n'. The self-test reads this file as text:
   #   - the lines between the data start/end markers are the sample data
   #     (columns: x, y, y scaled by 1e-5);
   #   - every line starting with the test-case prefix describes one test as
   #     <id> ;<description>; <arguments> ; <expected output>
   #     where the arguments may pipe the output into a second run ($0).
   # Keep the markers and the test-case prefix at the very start of a line.
Data_and_Tests="

#_DATA_START_
-1 101083 1.01083
-0.9 75518 0.75518
-0.8 53452 0.53452
-0.7 44716 0.44716
-0.6 39858 0.39858
-0.5 36796 0.36796
-0.4 34758 0.34758
-0.3 33388 0.33388
-0.2 32504 0.32504
-0.1 32004 0.32004
0 31846 0.31846
0.1 32004 0.32004
0.2 32504 0.32504
0.3 33388 0.33388
0.4 34758 0.34758
0.5 36796 0.36796
0.6 39858 0.39858
0.7 44716 0.44716
0.8 53452 0.53452
0.9 75518 0.75518
1 101083 1.01083
#_DATA_END_

_TEST_CASE_a ;arithmetic average; -a ; 0 +/- 0.6204836823
_TEST_CASE_b ;sum               ; -y3 ; 10
_TEST_CASE_c ;integral (range)  ; -x1 -y2 -r 0.3:0.9 ; 318486
_TEST_CASE_d ;integral (closed) ; -p %.12g -c -d 0.1 -x1 -y3 ; 0.898917
_TEST_CASE_e ;derivative        ; -D -d 0.1 -y2 |$0 -y2 ; -34990
_TEST_CASE_f ;derivative (range); -D -d 0.1 -y2 -r -0.5:0.6 |$0 -y2 ; 39600
_TEST_CASE_g ;difference        ; -D -x2 -y2 |$0 ; 0
_TEST_CASE_h ;difference (x,y)  ; -D -d 0 -r 0:1 -y2 |$0 ; 5.5 
_TEST_CASE_i ;difference (2 pts); -D -x1 -y2 -r-0.6:-0.5 |$0 -D -x2 -y2 ; 17960
_TEST_CASE_j ;accumulate (range); -S -y3 -r -0.5:0.6 |$0 -y4 ; 26.3508

"
