📄 mpirun.ch_v3.in
字号:
#! /bin/sh
#
# MPICH-V3
# Copyright (C) 2002, 2003 Groupe Cluster et Grid, LRI, Universite de Paris Sud
#
#
# This file is part of MPICH-V3.
#
# MPICH-V3 is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# MPICH-V3 is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with MPICH-V3; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# $Id: mpirun.ch_v3.in,v 1.2 2004/03/23 19:47:36 herault Exp $
#
# Overview
#   1. Sets defaults, then sources $MPIRUN_HOME/mpirun.args and
#      $MPIRUN_HOME/mpirun.pg_v2.  NOTE(review): np, progname, nnodes,
#      machinelist, nprocuselist, eventLoggerPortList,
#      checkpointServerPortList, dualCapabilityList and security are read
#      below but never assigned here; presumably the sourced files provide
#      them -- confirm.
#   2. Writes three files:
#        $xwpgfile       one line per service/process (EL, CS, SC, CN)
#        $xwcommandsfile key=value list of the helper commands
#        $xwavailfile    one line per usable node
#   3. Runs $bindir/mpirun.v2run on the program-group file and exits with
#      its status.

exitstatus=1

# Default configurations for ch_v2 device
# Event logger related
rel=4
del=1
threadsEL=3
useExplicitELonly="0"
# Checkpoint server related
rcs=4
dcs=1
useExplicitCSonly="0"

just_testing=0
MPI_HOST=$HOST
nolocal=1
curarch=2
dispatcherHost=$HOST
dispatcherPort=5555
bindir=${MPIRUN_HOME}

if [ -z "$rshcmd" ] ; then
    rshcmd=$(which rsh)
fi
elcmd="$bindir/mpirun.v2eventlogger"
wrapcmd="$bindir/mpirun.v2d"
killcmd="$bindir/mpirun.v2kill"
cscmd="$bindir/mpirun.v2checkpointserver"
sccmd="$bindir/mpirun.v2checkpointscheduler"
tmpCS="/tmp"

if [ -z "$pel_default" ] ; then pel_default=5001 ; fi
if [ -z "$pcs_default" ] ; then pcs_default=4001 ; fi
if [ -z "$pdp_default" ] ; then pdp_default=5555 ; fi
# The closing "]" must be a separate word; without the space `test` fails
# with "missing ]" and the default is never applied.
if [ -z "$pComm" ] ; then pComm=9000 ; fi
if [ -z "$cschedPort" ] ; then cschedPort=2003 ; fi
dispatcherPort=$pdp_default

MAX_CLUSTER_SIZE=16
xwworkdir="."
mpirun_verbose=0
xwpgfile="v2pgfile.$job_id"
# Remembered so that we can tell, after the argument file has been sourced,
# whether somebody replaced the default name.
xwpgfileDefault=$xwpgfile

if [ -z "$argsset" ] ; then
    . "$MPIRUN_HOME/mpirun.args"
    argsset=1
fi
if [ -z "$keep_xwfile" ] ; then
    keep_xwfile=0
fi
if [ -z "$machineFile" ] ; then
    machineFile="machines.ch_v2"
fi
if [ -z "$job_id" ] ; then
    job_id=$$
fi
# job_id may only have received its value just above; rebuild the default
# file name so it really carries the job id.  An explicitly chosen name is
# left alone.
if [ "$xwpgfile" = "$xwpgfileDefault" ] ; then
    xwpgfile="v2pgfile.$job_id"
fi
v2_tmp="$PWD/.$job_id/"
xwavailfile="$xwpgfile.avail"
xwcommandsfile="$xwpgfile.commands"

# We now proceed with the different initializations of
# the number of event loggers and checkpoint servers.
# rel / rcs are the number of computing processes served by one event
# logger / checkpoint server; npEL / npCS are how many of each we start.
if [ -n "$eventLoggerRatio" ] ; then
    # This used to be stored in an unused variable, so the ratio was ignored.
    rel=$(expr $eventLoggerRatio + 0)
fi
if [ -n "$eventLoggerNumber" ] ; then
    npEL=$(expr $eventLoggerNumber + 0)
    # Round up: with a truncating division, rank / rel + 1 can exceed npEL
    # (or rel can be 0 when np < npEL).
    rel=$(expr \( $np + $npEL - 1 \) / $npEL)
else
    npEL=$(expr $np / $rel + 1)
fi
if [ -n "$checkPointServerRatio" ] ; then
    rcs=$(expr $checkPointServerRatio + 0)
fi
if [ -n "$checkPointServerNumber" ] ; then
    npCS=$(expr $checkPointServerNumber + 0)
    # Rounded up for the same reason as rel above.
    rcs=$(expr \( $np + $npCS - 1 \) / $npCS)
else
    npCS=$(expr $np / $rcs + 1)
fi
npTotal=$(expr $np + $npEL + $npCS)

if [ -n "$xwSecurityRatio" ] ; then
    # NOTE: we calculate on the number of processors, but the margin is a
    # number of NODES of course...
    xwSecurityMargin=$(expr $xwSecurityRatio \* $npTotal / 100)
else
    if [ -z "$xwSecurityMargin" ] ; then
        # The default ratio is 30%
        xwSecurityMargin=$(expr 30 \* $npTotal / 100)
    fi
fi

#
# Construct the procgroup file.
# If p4 was built with comm=shared, set MPI_MAX_CLUSTER_SIZE to 16 unless
# it has already been set.
if [ "$COMM" = "shared" ] ; then
    if [ -z "$MPI_MAX_CLUSTER_SIZE" ] ; then
        MPI_MAX_CLUSTER_SIZE=16
    fi
else
    MPI_MAX_CLUSTER_SIZE=1
fi
. "$MPIRUN_HOME/mpirun.pg_v2"
# machinelist has the hosts
# eventloggerPortList has the event logger ports
# checkpointServerPortList has the checkpoint servers ports
# dualcapabilitylist has the list indicating if a machine can be both EL and CS
# archuselist has the architectures
# nprocuselist has the number of processors
#
# We use this form instead of "local 0" in-case the user is trying to
# select a second network whose names are not those returned by
# "hostname". For example, a system with a DEC Gigiswitch, Myricom
# network, or IP over the IBM SP2 switch (HPS).
prognamemain=$progname

if [ -z "$xwpgfileGiven" ] ; then
    #xwpgfile="$PWD_TRIAL/PI$$"
    # We need to explicitly redirect stderr.
    # Under Solaris, an echo that fails aborts the script (!) There is
    # no way around this using echo, so we use cat instead.
    cat <<EOF 2>/dev/null > "$xwpgfile"
Test
EOF
    if [ ! -s "$xwpgfile" ] ; then
        # May not be able to write there. Try the user's home directory
        xwpgfile=$HOME/$xwpgfile
        # Keep the companion files next to the relocated file.
        xwavailfile="$xwpgfile.avail"
        xwcommandsfile="$xwpgfile.commands"
    else
        /bin/rm -rf "$xwpgfile"
    fi

    # cnt is the "index" into the list of machines
    procNum=1
    nprocval=$(echo $nprocuselist | cut -d' ' -f1)
    # for the "local" entry, the number is the number of ADDITIONAL
    # processes.
    if [ -z "$nprocval" ] ; then
        # just in case...
        nprocval=1
    fi
    nprocval=$(expr $nprocval - 1)
    archval=$archlocal
    proginstance=$(echo $progname | sed "s/%a/$archval/g")
    prognamemain=$proginstance

    if [ "$just_testing" = 1 ] ; then
        if [ "$nolocal" = 0 ] ; then
            if [ -z "$nproclocal" ] ; then
                nprocval=0
            else
                nprocval=$(expr $nproclocal - 1)
            fi
        else
            procNum=2
        fi
    else
        if [ "$nolocal" = 0 ] ; then
            if [ -z "$MPI_HOST" ] ; then
                echo "No value for MPI_HOST!"
                echo "MPI_HOST is set either from your environment or by"
                echo "processing for an MPI machine type of ch_p4, ch_tcp, "
                echo "ch_nexus, or sgi_mp. The machine type you used was $machine ."
                echo "Use the -machine <machinename> argument to select a "
                echo "specific machine type."
                exit 1
            fi
            if [ -z "$nproclocal" ] ; then
                nprocval=0
            else
                nprocval=$(expr $nproclocal - 1)
            fi
        else
            procNum=2
        fi
    fi

    if [ "$just_testing" = 1 ] ; then
        cmdline="echo"
    else
        cmdline="eval"
    fi

    # getHostIP NAME
    # Prints the first IPv4 address the `host` utility reports for NAME
    # (nothing if there is none).  Only "has address" lines are considered,
    # so IPv6 and alias lines cannot leak into the result.
    getHostIP() {
        host "$1" | awk '/ has address / && !seen { print $4; seen = 1 }'
    }

    # The following function is used to get the address of the machine if
    # it's connected to a high performance network. Returns the normal ip
    # if none exists.
    # Reads:  hostIP, ipTranslationFile
    # Writes: fastIP (and the scratch variable newIP)
    # The translation file is searched for a line holding hostIP as a whole
    # field; the second field of the first such line is the fast address.
    getFastIP() {
        fastIP=$hostIP
        if [ -n "$ipTranslationFile" ] && [ -n "$hostIP" ] ; then
            newIP=$(awk -v ip="$hostIP" \
                '{ for (f = 1; f <= NF; f++) if ($f == ip) { print $2; exit } }' \
                "$ipTranslationFile")
            if [ -n "$newIP" ] ; then
                fastIP=$newIP
            fi
        fi
    }

    # takeCpu INDEX
    # Removes one free CPU from machine number INDEX (1-based) in
    # nprocuselist.
    takeCpu() {
        tc_list=""
        tc_idx=0
        for tc_n in $nprocuselist ; do
            tc_idx=$(expr $tc_idx + 1)
            if [ "$tc_idx" = "$1" ] ; then
                tc_n=$(expr $tc_n - 1)
            fi
            tc_list="$tc_list $tc_n"
        done
        nprocuselist="$tc_list"
    }

    # We first take care of the event loggers
    eventLoggerUsed=""
    fastEventLoggerUsed=""
    eventLoggerPortUsed=""
    elidx=0
    elinc=0
#DEBUG
    elDistribution=1
#   elNb=2
#ENDDEBUG
    i=1
    elNb=$npEL
    while [ "$elidx" -lt "$elNb" ] ; do
        # Stop instead of walking past the end of the machine list.
        if [ -n "$nnodes" ] && [ "$i" -gt "$nnodes" ] ; then
            echo "mpirun: not enough usable machines for $elNb event logger(s)" >&2
            exit 1
        fi
        port=$(echo $eventLoggerPortList | cut -d' ' -f "$i")
        freecpus=$(echo $nprocuselist | cut -d' ' -f "$i")
        if [ "$port" = -1 ] || [ "$freecpus" = 0 ] ; then
            # This machine cannot host an event logger; try the next one.
            i=$(expr $i + 1)
        else
            hostName=$(echo $machinelist | cut -d' ' -f "$i")
            hostIP=$(getHostIP "$hostName")
            getFastIP
            eventLoggerUsed="$eventLoggerUsed $hostIP"
            fastEventLoggerUsed="$fastEventLoggerUsed $fastIP"
            eventLoggerPortUsed="$eventLoggerPortUsed $port"
            echo "EL $hostIP $fastIP $port"

            # Now we update the cpu list as well as the checkpointserver
            # property if the node cannot perform both capabilities at the
            # same time
            dc=$(echo $dualCapabilityList | cut -d' ' -f "$i")
            if [ "$dc" != "1" ] ; then
                # Rebuild the port list from scratch for every event logger;
                # carrying it over from the previous one would append a
                # second copy and shift every entry.
                newCSPList=""
                cpuidx=0
                for nprocs in $nprocuselist ; do
                    cpuidx=$(expr $cpuidx + 1)
                    if [ "$cpuidx" = "$i" ] ; then
                        cspe="-1"
                    else
                        cspe=$(echo $checkpointServerPortList | cut -d' ' -f "$cpuidx")
                    fi
                    newCSPList="$newCSPList $cspe"
                done
                checkpointServerPortList="$newCSPList"
            fi
            takeCpu "$i"
            elidx=$(expr $elidx + 1)
            i=$(expr $i + $elDistribution)
        fi
    done > "$xwpgfile"

    # We now take care of the checkpoint servers
    checkpointServerUsed=""
    fastCheckpointServerUsed=""
    checkpointServerPortUsed=""
    csidx=0
    csinc=0
#DEBUG
    csDistribution=1
#   csNb=2
#ENDDEBUG
    i=1
    csNb=$npCS
    while [ "$csidx" -lt "$csNb" ] ; do
        # Stop instead of walking past the end of the machine list.
        if [ -n "$nnodes" ] && [ "$i" -gt "$nnodes" ] ; then
            echo "mpirun: not enough usable machines for $csNb checkpoint server(s)" >&2
            exit 1
        fi
        port=$(echo $checkpointServerPortList | cut -d' ' -f "$i")
        freecpus=$(echo $nprocuselist | cut -d' ' -f "$i")
        if [ "$port" = -1 ] || [ "$freecpus" = 0 ] ; then
            # This machine cannot host a checkpoint server; try the next one.
            i=$(expr $i + 1)
        else
            hostName=$(echo $machinelist | cut -d' ' -f "$i")
            hostIP=$(getHostIP "$hostName")
            getFastIP
            checkpointServerUsed="$checkpointServerUsed $hostIP"
            fastCheckpointServerUsed="$fastCheckpointServerUsed $fastIP"
            checkpointServerPortUsed="$checkpointServerPortUsed $port"
            echo "CS $hostIP $fastIP $port $tmpCS"

            # Now we update the cpu list
            takeCpu "$i"
            csidx=$(expr $csidx + 1)
            i=$(expr $i + $csDistribution)
        fi
    done >> "$xwpgfile"

    # Then the checkpoint scheduler(s): it runs where the first event
    # logger runs.
    checkpointSchedulerUsed=$(echo $eventLoggerUsed | cut -d' ' -f 1)
    fastCheckpointSchedulerUsed=""
    {
        hostIP="$checkpointSchedulerUsed"
        getFastIP
        fastCheckpointSchedulerUsed="$fastCheckpointSchedulerUsed $fastIP"
        echo "SC $hostIP $fastIP $cschedPort"
    } >> "$xwpgfile"
#DEBUG
    current_scs=$fastIP
    current_scp=$cschedPort
#DEBUG

    # Finally the computing nodes, one CN line per rank.
    rank=0
    i=0   # The index of the machine in machinelist
    firstpass=1
    placedInPass=0   # set once a rank has been placed during the current sweep
    allMachines=""
    while [ "$rank" -lt "$np" ] ; do
        i=$(expr $i + 1)
        if [ "$i" -gt "$nnodes" ]; then
            # A complete sweep (other than the restricted first one) that
            # placed nothing means there is no free CPU left anywhere.
            if [ "$firstpass" = 0 ] && [ "$placedInPass" = 0 ] ; then
                echo "mpirun: not enough free CPUs to place $np processes" >&2
                exit 1
            fi
            i=1
            firstpass=0
            placedInPass=0
        fi

        # No need to go further if there are no CPUs free on this machine
        freecpus=$(echo $nprocuselist | cut -d' ' -f "$i")
        if [ "$freecpus" = 0 ] ; then continue ; fi

        machine=$(echo $machinelist | cut -d' ' -f "$i")
        hostIP=$(getHostIP "$machine")

        # As we would rather use as many machines as possible, try to avoid
        # using the EL and CS machines, for the first pass only of course.
        # Both lists hold IP addresses, so the comparison is done on hostIP.
        if [ "$firstpass" -eq 1 ] && [ -n "$hostIP" ] ; then
            case " $eventLoggerUsed $checkpointServerUsed " in
                *" $hostIP "*) continue ;;
            esac
        fi

        elidx=$(expr $rank / $rel + 1)
        current_els=$(echo $fastEventLoggerUsed | cut -d' ' -f "$elidx")
        current_elp=$(echo $eventLoggerPortUsed | cut -d' ' -f "$elidx")

        cpidx=$(expr $rank / $rcs + 1)
        current_cps=$(echo $fastCheckpointServerUsed | cut -d' ' -f "$cpidx")
        current_cpp=$(echo $checkpointServerPortUsed | cut -d' ' -f "$cpidx")

        getFastIP
        myPort=$(expr $pComm + $rank)

        echo "CN $rank $machine $hostIP $fastIP $myPort $current_els $current_elp $current_cps $current_cpp $current_scs $current_scp"

        # We now remove 1 processor from the machine in the nprocuselist
        # variable
        takeCpu "$i"
        rank=$(expr $rank + 1)
        placedInPass=1
    done >> "$xwpgfile"

    # We now write the command file
    {
        echo "pwd=$PWD"
        echo "rshcmd=$rshcmd"
        echo "cscmd=$cscmd"
        echo "sccmd=$sccmd"
        echo "elcmd=$elcmd"
        echo "prog=$progname"
        echo "wrapper=$wrapcmd"
        echo "kill=$killcmd"
        echo "just_testing=$just_testing"
        echo "v2tmp=$v2_tmp"
        echo "keep_v2file=$keep_xwfile"
    } > "$xwcommandsfile"

    # The avail file, specifying which nodes can be used, and with what
    # options.  Each entry of $security is split on ":" into
    # host, nprocs, EL port, CS port and dual-capability flag.
    # IFS is saved and restored: leaving it set to ":" would make the later
    # unquoted expansions split on colons (e.g. the "ip:port" debug address).
    savedIFS=$IFS
    IFS=" "
    for node in $security ; do
        IFS=":"
        set -- $node
        IFS=$savedIFS
        hostname=$1
        ip=$(getHostIP "$hostname")
        nprocs=$2
        elp=$3
        csp=$4
        dc=$5
        echo "$hostname $ip $nprocs $elp $csp $dc"
    done > "$xwavailfile"
    IFS=$savedIFS

    # make sure that all the files needed by runprog have been written
    if [ ! -r "$xwpgfile" ] ; then
        echo Failed to write "$xwpgfile" : Exiting.
        exit 1
    fi
    if [ ! -r "$xwcommandsfile" ] ; then
        echo Failed to write "$xwcommandsfile" : Exiting.
        exit 1
    fi
    if [ ! -r "$xwavailfile" ] ; then
        echo Failed to write "$xwavailfile" : Exiting.
        exit 1
    fi

    # NOW we launch the program
    if [ -z "$debug_command" ] ; then
        debug_command=$(getHostIP "$HOST")
        debug_command="$debug_command:1976"
    fi
    progArguments="$bindir/mpirun.v2run -f $xwpgfile -g $job_id -p $dispatcherPort -debug $debug_command"
    if [ -n "$checkpointTime" ] ; then
        progArguments="$progArguments -checkpoint $checkpointTime"
    fi
    if [ -n "$debugFile" ] ; then
        progArguments="$progArguments -debugfile $debugFile"
        echo $progArguments
    fi
    if [ -z "$noRun" ] ; then
        eval "$progArguments"
        # Report the outcome of the run rather than the initial value.
        exitstatus=$?
    fi

    # when the execution is finished, it's time to clean
    if [ "$keep_xwfile" -ne "1" ]; then
        rm -f "$xwpgfile"
        rm -f "$xwcommandsfile"
        rm -f "$xwavailfile"
    fi
fi
exit $exitstatus
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -