#!/bin/sh
# PCP QA Test No. 956
# Exercise pmcd attribute PDU handling after agent failure.
#
# Copyright (c) 2015 Red Hat.
#

# The test's sequence number is simply the name of this script.
seq=$(basename $0)
echo "QA output created by $seq"

# Load the shared QA environment, output filters and helper checks.
. ./common.product
. ./common.filter
. ./common.check

# pmcd is reached through a unix domain socket below, so hand over to
# _notrun when $unix_domain_sockets does not evaluate to true
# (presumably set by _get_libpcp_config -- confirm in common.check).
_get_libpcp_config
if ! $unix_domain_sockets
then
    _notrun "No unix domain socket support available"
fi

# Local output filter (stdin -> stdout): every run of two or more
# consecutive "." characters is rewritten as exactly three dots, so the
# number of progress dots printed does not affect the test output.
_filter()
{
    sed -e 's/\.\{2,\}/.../g'
}

# Run the dynamic PMDA's Remove script from its own directory, appending
# everything it prints to $seq.full, and finish back in $here.
_remove_dynamic_pmda()
{
    cd $here/pmdas/dynamic
    $sudo ./Remove >>$here/$seq.full 2>&1
    cd $here
}

# Exit-time cleanup: stop the test's pmcd, drop the PMCD_* overrides,
# put the saved pmcd options file back, restart the PCP services, remove
# whatever is left of the dynamic PMDA and delete the temporary files.
_cleanup()
{
    _service pmcd stop | _filter_pcp_stop
    unset PMCD_PORT PMCD_SOCKET
    _restore_config $PCP_PMCDOPTIONS_PATH
    _service pcp start 2>&1 | _filter_pcp_start
    _wait_for_pmcd
    _restore_auto_restart pmcd
    _wait_for_pmlogger
    _restore_auto_restart pmlogger

    # pmcd still lists a "dynamic" agent => uninstall the PMDA
    if pmprobe -I pmcd.agent.status | grep '"dynamic"' >/dev/null; then
	_remove_dynamic_pmda
    fi

    # dynamic.* names are still in the PMNS => run Remove (again) to
    # clean up the namespace
    if pminfo | grep '^dynamic' >/dev/null; then
	_remove_dynamic_pmda
    fi

    $sudo rm -f $tmp.*
}

# Assume failure; only the very last step of the test sets status=0.
status=1
$sudo rm -rf $tmp.* $seq.full
signal=$PCP_BINADM_DIR/pmsignal
# Run _cleanup on normal exit (0) and on signals 1, 2, 3 and 15.  The
# backslash delays expansion of $status until the trap actually fires.
trap "_cleanup; exit \$status" 0 1 2 3 15

# Ask _get_port for a TCP port in 6060..6070; give up if none is returned.
port=$(_get_port tcp 6060 6070)
[ -n "$port" ] || {
    echo "Arrggh ... no free TCP port in the range 6060 ... 6070"
    $NETSTAT -a
    exit 1
}
echo "port=$port" >>$seq.full
_save_config $PCP_PMCDOPTIONS_PATH
# Replacement pmcd.options file: two comment lines plus a single -s
# option naming the test-private socket $tmp.socket.
printf '%s\n' \
    "# New pmcd.options file created by PCP QA test $seq" \
    "#" \
    "-s $tmp.socket" >$tmp.newoptions

# Stop the services (and their automatic restart) before swapping in
# the new options file.
_stop_auto_restart pmcd
_stop_auto_restart pmlogger
_service pcp stop | _filter_pcp_stop

$sudo cp $tmp.newoptions $PCP_PMCDOPTIONS_PATH

# pmcd has to be restarted from a generated shell script: sudo may
# cleanse the environment, so PMCD_PORT/PMCD_SOCKET are exported inside
# the script itself, and for the same reason the _service wrapper
# cannot be used here.
{
    echo "export PMCD_PORT=$port"
    echo "export PMCD_SOCKET=$tmp.socket"
    echo "$PCP_RC_DIR/pmcd restart"
} >$tmp.start
$sudo sh $tmp.start 2>&1 | _filter_pcp_start

# Same settings for this shell, so the commands run from here on see
# the private port and socket in their environment.
PMCD_PORT=$port
PMCD_SOCKET=$tmp.socket
export PMCD_PORT PMCD_SOCKET
_wait_for_pmcd

# Rebuild the dynamic PMDA from scratch (using GNUmakefile.install when
# that file is present), then install it.  Build output goes only to
# $seq.full; Install output goes to $seq.full and, filtered, to stdout.
cd $here/pmdas/dynamic
if [ -f GNUmakefile.install ]; then
    mkfile="-f GNUmakefile.install"
else
    mkfile=""
fi
# $mkfile is deliberately unquoted: it is either empty or two words.
$PCP_MAKE_PROG $mkfile clean >>$here/$seq.full 2>&1
$PCP_MAKE_PROG $mkfile >>$here/$seq.full 2>&1
$sudo ./Install </dev/null >$tmp.out 2>&1
cat $tmp.out \
    | tee -a $here/$seq.full \
    | _filter_pmda_install \
    | _filter
cd $here

# real QA test starts here
#
# Baseline: probe hinv.ncpu through the unix domain socket while the
# dynamic PMDA is still running.
echo "Initial check of some metric access"
pmprobe -h unix:$tmp.socket -i hinv.ncpu

pmsleep 0.2

echo "Terminate a PMDA needing attributes" | tee -a $here/$seq.full
# Everything in this group is appended to $seq.full: a timestamp, the
# matching ps lines, then two pmsignal KILL passes -- first unprivileged
# with -n (NOTE(review): looks like a dry run that only reports -- confirm
# against pmsignal usage), then the real one under $sudo.
{
    date
    $PCP_PS_PROG $PCP_PS_ALL_FLAGS | grep pmdadynamic
    $signal -s KILL -an pmdadynamic 2>&1
    $sudo $signal -s KILL -a pmdadynamic 2>&1
} >>$here/$seq.full

# wait up to 2 seconds (10 probes, 0.2 seconds apart) for dynamic PMDA
# to die
#
# $tmp.tmp holds the most recent ps snapshot.  It is removed as soon as
# the PMDA is seen to be defunct or absent, so if $tmp.tmp still exists
# after the loop the PMDA never went away.
#
# grep -E is used instead of egrep: egrep is obsolescent and recent GNU
# grep releases print a warning on stderr for it, which would end up in
# this test's output because stderr is not redirected here.
i=0
while [ $i -lt 10 ]
do
    $PCP_PS_PROG $PCP_PS_ALL_FLAGS >$tmp.tmp
    echo "ps probe #$i" >>$here/$seq.full
    # log the ps heading and any pmdadynamic lines for triage
    grep -E '[P]ID|[p]mdadynamic' $tmp.tmp >>$here/$seq.full
    # looking for a defunct PMDA process
    #
    if grep -E '(pmdadynamic.*defunct)|( Z .*pmdadynamic)|\(pmdadynamic\)' $tmp.tmp >/dev/null
    then
	rm -f $tmp.tmp
	break
    elif grep "pmdadynamic" $tmp.tmp >/dev/null
    then
	# still running, keep trying
	:
    else
	# not running at all
	rm -f $tmp.tmp
	break
    fi
    pmsleep 0.2
    i=$((i + 1))
done
if [ -f $tmp.tmp ]
then
    # status is still 1 here, so the exit trap reports failure
    echo "Arrgh ... dynamic PMDA won't die"
    grep -E '[P]ID|[p]mdadynamic' $tmp.tmp
    exit
fi
date >>$here/$seq.full

# _is_negative_int value
#
# Succeeds only when value is a well-formed integer less than zero.
# An empty string, a lone "-", anything containing characters other
# than digits and a leading "-", and all integers >= 0 fail.
_is_negative_int()
{
    case "$1" in
	''|-|*[!0-9-]*|?*-*)
	    return 1
	    ;;
    esac
    [ "$1" -lt 0 ]
}

# and a bit more for pmcd to notice the PMDA has exited
pmsleep 3.75

echo "Tickle access to the failed PMDA, must see 'Try Again'"
pmprobe -h unix:$tmp.socket -i hinv.ncpu | tee $tmp.tmp
# second field of the pmprobe output line; a negative value is what
# this step expects to see
nval=`$PCP_AWK_PROG <$tmp.tmp '{print $2}'`
# The value is validated before any numeric comparison.  A single
# [ -z ... -o ... -ge 0 ] test evaluates both operands, so an empty or
# non-numeric nval makes the test itself fail with an error and the
# diagnostics below would be skipped in exactly the case they matter.
if ! _is_negative_int "$nval"
then
    # this is not expected
    #
    echo "Error: nval=$nval not as expected ... see $seq.full"
    pmprobe -h unix:$tmp.socket -i hinv.ncpu >>$here/$seq.full
    pminfo -f pmcd.agent.status pmcd.agent.fenced >>$here/$seq.full
    echo "+++ Tickle failed `date` +++" >>$here/$seq.full
    cat $PCP_LOG_DIR/pmcd/pmcd.log >>$here/$seq.full
    cat $PCP_LOG_DIR/pmcd/dynamic.log >>$here/$seq.full
fi

# Final probe of the same metric through the unix domain socket; the
# heading states what the expected output should show.
printf '%s\n' "Verify subsequent return to healthy state"
pmprobe -h unix:$tmp.socket -i hinv.ncpu

# Getting this far means success: the exit trap runs _cleanup and then
# exits with $status, which is now 0.
status=0
exit
