#! /bin/sh
#
# This script tests each of the machines in the machines/machines.<arch> list
# to make sure that it is accessible and can run programs.  This is only
# a partial test.
#
# In order for this to run in the background, we're careful to use -n 
# in the remote shell commands
#
# Defaults: "rsh" is the remote shell command, and progress messages are off.
rsh="rsh"
verbose=0
# Could use LINUX
# and ch_p4 to see if we should check....
#
# Walk the command line.  Recognized options are handled in place; any other
# word is taken as the architecture name (the last such word wins).
for arg
do
    case "$arg" in
        -help)
            # Show the usage text ($rsh is expanded inside it) and stop.
            cat <<USAGE_END
Usage: 
     tstmachines [ -echo ] [ -v ] architecture
Tests that you can run remote shells with $rsh on the systems in 
your machines file and that executables are all cross-mounted.  To run this
test, you must be able to create files in the current directory.

If there are problems with some of the machines, this test may take several
minutes per problem machine.   If there are no problems, there will be no
output.  The option -v may be used to see what tests are being run.
USAGE_END
            exit 1
            ;;
        -v)
            # Announce each test before it is run.
            verbose=1
            ;;
        -echo)
            # Trace every command the script executes.
            set -x
            ;;
        *)
            arch="$arg"
            ;;
    esac
done
#
# Root of the MPICH installation; the tarch helper and the per-architecture
# machines files are looked up underneath it.
MPIR_HOME="/usr/lib/mpich"
# No architecture was given on the command line: ask tarch for one.
if [ -z "$arch" ] ; then
    arch=`"$MPIR_HOME/bin/tarch"`
fi
#
machineFile="${MPIR_HOME}/util/machines/machines.${arch}"
# The path is quoted so that an unusual architecture name can neither split
# it into several words nor be glob-expanded.  The file must be readable as
# well as present: if it cannot be read the list below comes out empty, none
# of the per-machine loops run, and the script would report no problems.
if [ ! -f "$machineFile" ] || [ ! -r "$machineFile" ] ; then
    echo "Cannot read list of nodes $machineFile"
    exit 1
fi
# Build one space-separated list of entries: every line containing '#' is
# dropped, and newlines are turned into (squeezed) spaces.
list=`sed -e '/#/d' "$machineFile" | tr -s '\012' ' '`
# Work out the directory name (PWD_TRIAL) under which files in the current
# directory will be looked for, and leave a scratch file "mpichfoo" in the
# current directory for the later tests (they use $PWD_TRIAL/mpichfoo).
/bin/rm -f mpichfoo
echo "A test" > mpichfoo
# Use same mechanism as in mpirun to get the value of pwd...
# NOTE(review): the string tested here is a non-empty literal, so this
# condition is always true and the else branch below never runs; presumably
# the literal was substituted in when this script was generated - confirm.
if [ -n "sed -e s@/tmp_mnt/@/@g" ] ; then
    # Candidate name: the output of pwd with each "/tmp_mnt/" replaced by "/".
    PWDtest=`pwd | sed -e s@/tmp_mnt/@/@g`
    # If the rewritten name is not a directory, fall back to plain pwd.
    if [ ! -d $PWDtest ] ; then
        PWDtest=`pwd`
    fi
    if [ -n "$PWD" ] ; then
        # PWD is set: check whether $PWD and $PWDtest name the same
        # directory by writing a file through one name and looking for it
        # through the other.  Start from a clean slate.
        /bin/rm -f $PWDtest/.mpirtmp $PWD/.mpirtmp
        echo "test" > $PWD/.mpirtmp
        # The file is missing or empty when seen via $PWDtest, so the two
        # names do not match: remove the probe and use $PWDtest as PWD.
        if [ ! -s $PWDtest/.mpirtmp ] ; then
	    /bin/rm -f $PWD/.mpirtmp
            PWD=$PWDtest
        fi
        # Remove the probe file under both names.
        /bin/rm -f $PWDtest/.mpirtmp $PWD/.mpirtmp
    else 
        # PWD is empty or unset: use the candidate name.
        PWD=$PWDtest
    fi
else
    PWD=`pwd`
fi
#
# Prefer PWD when it is non-empty, otherwise the candidate computed above.
if [ -n "$PWD" ] ; then
    PWD_TRIAL=$PWD
else
    PWD_TRIAL=$PWDtest
fi
# Last resort: if the chosen name is not a directory, warn and use the
# output of pwd as it is.
if [ ! -d $PWD_TRIAL ] ; then 
    echo "Warning: your default path uses the automounter; this may"
    echo "cause some problems if you use other NFS-connected systems."
    PWD_TRIAL=`pwd`
fi
#
# First try running a simple program
# (test for access or stty/who am i problems)
#
# explain_tty_command OUTPUT PATTERN EXAMPLE
#   When OUTPUT contains PATTERN (grep, case-insensitive), print the advice
#   about commands in .login/.cshrc that need a terminal, showing EXAMPLE as
#   the likely culprit.  Prints nothing otherwise.
#   OUTPUT is handed to grep through printf with a quoted argument, so text
#   produced by the remote side is never word-split, glob-expanded against
#   the current directory, or mistaken for an echo option.
explain_tty_command () {
    if printf '%s\n' "$1" | grep -i -e "$2" > /dev/null ; then
        echo "You may have a command like"
        echo "    $3"
        echo "in your .login or .cshrc file.  This command can only be"
        echo "used when a process is attached to a terminal."
        echo "See the Users Manual for ways to fix this."
    fi
}
#
myprog=$PWD_TRIAL/mpichfoo
errcnt=0
errsimple=0
livelist=""
printedheader=""
for machine in $list ; do
    # Strip cluster size from machine name
    ntest=`expr "$machine" : '.*:\([0-9]*\)'`
    if [ -n "$ntest" ] ; then
        machine=`expr "$machine" : '\(.*\):.*'`
    fi
    if [ "$verbose" = 1 ] ; then
        echo "Trying true on $machine ..."
    fi
    # "true" prints nothing, so any output at all (stderr included) is
    # treated as a failure.  $rsh is deliberately left unquoted so that it
    # may carry options of its own.
    output=`$rsh "$machine" -n true 2>&1`
    if [ -n "$output" ] ; then
        # The header is printed once, before the first failure only.
        if [ -z "$printedheader" ] ; then
            echo "Errors while trying to run true"
            printedheader=1
        fi
        echo "Unexpected response from $machine:"
        echo "--> $output"
        # Check for stty or who am i problems
        explain_tty_command "$output" 'am i' 'who am i'
        explain_tty_command "$output" 'stty' 'stty ....'
        errcnt=`expr $errcnt + 1`
        errsimple=`expr $errsimple + 1`
    else
        # Quiet response: the machine counts as reachable.
        livelist="$livelist $machine"
    fi
done
if [ $errsimple -gt 0 ] ; then
cat <<EOF
    The test of $rsh <machine> true  failed on some machines.
    This may be due to problems in your .login or .cshrc files; 
    some common problems are described when detected.  Look at the 
    output above to see what the problem is.

    If the problem is something like 'permission denied', then the 
    remote shell command $rsh does not allow you to run programs.
    See the documentation about remote shell and rhosts.

EOF
fi
# Next try running ls
# (Test for consistent filesystem)
#
myprog=$PWD_TRIAL/mpichfoo
# Keep the running error total instead of restarting it at zero: the final
# summary and the exit status are based on errcnt, so failures counted by
# the earlier "true" test must not be thrown away here.  (The default only
# matters if errcnt has not been set yet.)
errcnt=${errcnt:-0}
errls=0
# The loop below visits every machine in $list again, so the list of
# responsive machines is rebuilt from scratch by this test.
livelist=""
printedheader=""
#
# Get the output form to expect from ls.  
# Use /bin/ls to avoid any alias problems
tstout=`/bin/ls $myprog`
for machine in $list ; do
    # Strip cluster size from machine name
    ntest=`expr "$machine" : '.*:\([0-9]*\)'`
    if [ -n "$ntest" ] ; then
        machine=`expr "$machine" : '\(.*\):.*'`
    fi
    if [ "$verbose" = 1 ] ; then
        echo "Trying ls on $machine ..."
    fi
    # The remote listing must match the local one exactly; stderr is
    # captured too, so any extra remote chatter counts as a mismatch.
    output=`$rsh "$machine" -n /bin/ls $myprog 2>&1`
    if [ "$output" != "$tstout" ] ; then
        # The header is printed once, before the first failure only.
        if [ -z "$printedheader" ] ; then
            echo "Errors while trying to run ls $myprog"
            printedheader=1
        fi
        echo "Unexpected response from $machine:"
        echo "--> $output"
        errcnt=`expr $errcnt + 1`
        errls=`expr $errls + 1`
    else
        livelist="$livelist $machine"
    fi
done
# The scratch file is no longer needed once every machine has been asked.
/bin/rm -f mpichfoo
if [ $errls -gt 0 ] ; then
cat <<EOF
    The ls test failed on some machines.
    This usually means that you do not have a common filesystem on 
    all of the machines in your machines list; MPICH requires this
    for mpirun (it is possible to handle this in a procgroup file; see
    the documentation for more details).

    Other possible problems include:
        The remote shell command $rsh does not allow you to run ls.
           See the documentation about remote shell and rhosts.
        You have a common file system, but with inconsistent names.
           See the documentation on the automounter fix.

EOF
fi
#
# Now, try running a simple USER program
# (built here with cc, then started on each machine left in $livelist)
#
# Clear out anything left by an earlier run first: a stale tstfoo executable
# would otherwise satisfy the -x check below even when this build fails.
/bin/rm -f tstfoo.c tstfoo.o tstfoo
# main is given an explicit int return type and a (void) parameter list;
# the implicit-int form is rejected by current C compilers.
cat >tstfoo.c <<.
int main(void){return 0;}
.
cc -c tstfoo.c
cc -o tstfoo tstfoo.o
if [ ! -x tstfoo ] ; then
    echo "Could not build a sample program using cc and cc!"
    # Remove the object file as well as the source, so nothing is left behind.
    /bin/rm -f tstfoo.c tstfoo.o
    exit 1
fi
myprog=$PWD_TRIAL/tstfoo
# Only the machines that passed the previous test are tried; the names in
# $livelist already have any cluster size stripped.
list="$livelist"
livelist=""
printedheader=""
erruser=0
for machine in $list ; do
    if [ "$verbose" = 1 ] ; then
        echo "Trying user program on $machine ..."
    fi
    # The program prints nothing, so any output (stderr included) is an
    # error.  $rsh is left unquoted so that it may carry its own options.
    output=`$rsh "$machine" -n $myprog 2>&1`
    if [ "$output" != "" ] ; then
        # The header is printed once, before the first failure only.
        if [ -z "$printedheader" ] ; then
            echo "Errors while trying to run a simple C program"
            printedheader=1
        fi
        echo "Unexpected response from $machine:"
        echo "--> $output"
        errcnt=`expr $errcnt + 1`
        erruser=`expr $erruser + 1`
    else
        livelist="$livelist $machine"
    fi
done
/bin/rm -f tstfoo tstfoo.c tstfoo.o
if [ $erruser -gt 0 ] ; then
    cat <<EOF
    The simple program test failed.

    This test tries to run a simple program on the machines in your machines
    list with the command
        $rsh machinename -n program 

    This can fail if you do not have a common filesystem (this should have
    been detected above) or if the remote shell command $rsh does not allow 
    you to run the program on the indicated remote machines.  
        See the documentation about remote shell and rhosts for possible 
    fixes.
    
EOF
fi
#
#
# Final report.  A run without recorded errors ends quietly with status 0.
[ $errcnt -gt 0 ] || exit 0
#
# Otherwise print the error total, then either the machines that passed the
# last test (one per line) or a note that none did, and exit with status 1.
echo " "
echo "$errcnt errors were encountered while testing the machines list for $arch"
if [ -z "$livelist" ] ; then
    echo "No machines seem to be available!"
else
    echo "Only these machines seem to be available"
    for host in $livelist ; do
        echo "    $host"
    done
fi
exit 1

