Viewing: launcher.sh

#!/usr/bin/bash
# SPDX-License-Identifier: GPL-2.0-only OR BSD-3-Clause
#
#
# Copyright(c) 2016 Intel Corporation.
#
#
# Contact Information:
# Cong Xu, cong.xu@intel.com
#
# BSD LICENSE
#
# Copyright(c) 2016 Intel Corporation.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in
# the documentation and/or other materials provided with the
# distribution.
# * Neither the name of Intel Corporation nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.




#######################################
# Print the command-line synopsis and abort the script.
#
# Every call site in this script invokes usage() because an option was
# unknown, lacked its argument, or a required option was missing, so the
# text is a diagnostic: it is written to stderr and the script terminates
# with a non-zero status so that callers can detect the failed launch.
#
# Globals:   none
# Arguments: none
# Outputs:   usage text on stderr
# Returns:   never returns; exits with status 1
#######################################
function usage() {
	cat >&2 << EOF
Usage: $0 [-a] [-d] [-l] [-h] [-m] [-n] [-o] [-u]
	-a  command to launch application
	-d  shared nfs directory to store LIOProf logs
	-l  lowest Lustre OSS node [Hostname]
	-h  highest Lustre OSS node [Hostname]
	-m  lowest Lustre Client [Hostname]
	-n  highest Lustre Client [Hostname]
	-o  use Obdfilter-survey to measure Lustre bandwidth
	-u  user name
EOF
	exit 1
}


# Parse the command line. The leading ':' in the option string selects
# getopts' silent mode, so an unknown option or a missing argument reaches
# the catch-all arm and prints the usage text.
# Each value-taking option is stored in a variable named after its own
# letter (-a -> $a, -d -> $d, ...); -o takes no value and only sets a marker.
while getopts ":a:d:l:h:m:n:ou:" opt; do
	case "${opt}" in
		[adlhmnu])
			# Assign OPTARG to the variable whose name is the option letter.
			printf -v "${opt}" '%s' "${OPTARG}"
			;;
		o)
			o="Obdfilter-survey"
			;;
		*)
			usage
			;;
	esac
done
# Drop the options that were consumed above from the positional parameters.
shift "$((OPTIND - 1))"

# Check that the option set matches the selected mode; anything else
# prints the usage text.
if [[ -n "${o}" ]]; then
	# Obdfilter-survey mode: -d, -l, -h and -u are required and an
	# application command (-a) must NOT be given.
	[[ -z "${a}" && -n "${d}" && -n "${l}" && -n "${h}" && -n "${u}" ]] \
		|| usage
else
	# Application mode: every value-taking option is required.
	[[ -n "${a}" && -n "${d}" && -n "${l}" && -n "${h}" \
		&& -n "${m}" && -n "${n}" && -n "${u}" ]] \
		|| usage
fi


# Print field number $1 of the dash-separated string $2.
# Host names are split on '-': the code below takes field 1 as the cluster
# name and field 2 as the node index.
dash_field() {
	printf '%s\n' "$2" | cut -d '-' -f "$1"
}

# Cluster name, taken from the lowest OSS host name only.
cluster_name=$(dash_field 1 "${l}")

# Index range of the Lustre OSS nodes.
OSS_MIN=$(dash_field 2 "${l}")
OSS_MAX=$(dash_field 2 "${h}")

# Index range of the Lustre clients.
CLIENT_MIN=$(dash_field 2 "${m}")
CLIENT_MAX=$(dash_field 2 "${n}")

# Account used for creating the shared log directories.
USER_NAME="${u}"

# External commands.
mpi_cmd="mpirun"
pdsh_cmd="/usr/bin/pdsh"

# Job identifier derived from the current epoch time in seconds.
job_id="job-$(date +%s)"
echo "Launch ${job_id}"


# Obdfilter-survey mode (-o): start obdfilter-survey on every OSS node and
# exit without launching an application.
if [ -n "${o}" ]; then
	# OBDfilter-survey (Obtain maximum available bandwidth of Lustre)
	echo "Running OBDfilter-survey in the background"

	# Per-job result directory on the shared directory given with -d,
	# created as the requested user and opened up so that the remote
	# commands can write into it.
	HOMEOBDFILTER="${d}/${job_id}/obdfilter"
	sudo -u "${USER_NAME}" mkdir -p "${HOMEOBDFILTER}"
	sudo -u "${USER_NAME}" chmod -R 777 "${d}/${job_id}"

	# The host list is quoted: unquoted, "name-[min-max]" is a glob pattern
	# for the local shell and would be replaced by any matching file name
	# in the current directory instead of reaching pdsh as a range.
	# The remote command string is expanded locally except for the escaped
	# backticks, which run on each node so every host writes its own file.
	# NOTE: ${HOMEOBDFILTER} is pasted unquoted into the remote command, so
	# the -d path must not contain whitespace or shell metacharacters.
	"${pdsh_cmd}" -R ssh -w "${cluster_name}-[${OSS_MIN}-${OSS_MAX}]" " \
		size=65536 nobjlo=1 nobjhi=2 thrlo=32 thrhi=64 \
		obdfilter-survey > ${HOMEOBDFILTER}/\`hostname -s\` & \
	"
	exit 0
fi


# Per-job log locations. Each kind of log (rpc, brw, iostat) has one
# directory under the node-local /lioprof_loc tree and one under the
# shared directory passed with -d.
local_job_dir="/lioprof_loc/${job_id}"
shared_job_dir="${d}/${job_id}"

LOCALRPC="${local_job_dir}/rpc"
LOCALBRW="${local_job_dir}/brw"
LOCALIOSTAT="${local_job_dir}/iostat"

HOMERPC="${shared_job_dir}/rpc"
HOMEBRW="${shared_job_dir}/brw"
HOMEIOSTAT="${shared_job_dir}/iostat"

# Prepare the nodes for a profiling run.
#
# Every pdsh host list below is quoted: unquoted, "name-[min-max]" is a glob
# pattern for the local shell and would be replaced by any matching file
# name in the current directory instead of reaching pdsh as a range.
# The remote command strings are expanded locally, so the log paths are
# pasted into them as-is and must not contain whitespace.

# Create logs directories
"${pdsh_cmd}" -R ssh -w "${cluster_name}-[${OSS_MIN}-${OSS_MAX}]" " \
	mkdir -p ${LOCALRPC} ${LOCALBRW} ${LOCALIOSTAT}; \
	"

# Create the shared log directories as the requested user and open up
# their permissions so the remote commands can write into them.
sudo -u "${USER_NAME}" mkdir -p "${HOMERPC}" "${HOMEBRW}" "${HOMEIOSTAT}"
sudo -u "${USER_NAME}" chmod -R 777 "${d}/${job_id}"

# Enable RPC Tracing
"${pdsh_cmd}" -R ssh -w "${cluster_name}-[${OSS_MIN}-${OSS_MAX}]" \
	"lctl set_param debug=rpctrace"

# Evaluate Performance

# Clear Lustre cache
# NOTE(review): the range runs from the lowest OSS index to the highest
# client index, so it also covers every node numbered in between --
# presumably intentional (OSS and client nodes numbered contiguously);
# confirm against the cluster layout.
"${pdsh_cmd}" -R ssh -w "${cluster_name}-[${OSS_MIN}-${CLIENT_MAX}]" " \
	echo 3 > /proc/sys/vm/drop_caches; echo 0 > /proc/sys/vm/drop_caches;
"

# Start RPC log service and brw_stats
# The brw_stats glob matches one file per obdfilter device. A redirection
# to a glob with several matches is rejected by bash as an ambiguous
# redirect, so the newline is written to every match through tee instead.
"${pdsh_cmd}" -R ssh -w "${cluster_name}-[${OSS_MIN}-${OSS_MAX}]" " \
	echo | tee /proc/fs/lustre/obdfilter/*/brw_stats > /dev/null; \
	lctl clear; lctl debug_daemon start ${LOCALRPC}/rpc.log 1024; \
	"

# Start iostat (one sample per second) in the background on each OSS node
"${pdsh_cmd}" -R ssh -w "${cluster_name}-[${OSS_MIN}-${OSS_MAX}]" " \
	iostat 1 > ${LOCALIOSTAT}/iostat.log&
	"
sleep 2

######################## Launch Application ########################
# ${a} is expanded unquoted on purpose: the -a value is a whole command
# line and relies on word splitting to become command + arguments.
# The redirect target is quoted so that a log directory containing
# whitespace does not fail with "ambiguous redirect".
# shellcheck disable=SC2086
${a} > "${d}/${job_id}/job-output"
sleep 2
####################################################################

# Collect Lustre RPC and brw_stats logs
#
# Every pdsh host list below is quoted: unquoted, "name-[min-max]" is a glob
# pattern for the local shell and would be replaced by any matching file
# name in the current directory instead of reaching pdsh as a range.

# Clear the lock LRU on the client nodes before the server-side logs are
# read.
"${pdsh_cmd}" -R ssh -w "${cluster_name}-[${CLIENT_MIN}-${CLIENT_MAX}]" " \
	lctl set_param ldlm.namespaces.*.lru_size=clear
	"
sleep 5

# On each OSS: stop the debug daemon, then copy the brw_stats counters and
# the decoded RPC log into the shared directories. The escaped backticks
# run on the remote node, so each host writes a file named after itself.
"${pdsh_cmd}" -R ssh -w "${cluster_name}-[${OSS_MIN}-${OSS_MAX}]" " \
	lctl debug_daemon stop; \
	cat /proc/fs/lustre/obdfilter/*/brw_stats > \
			${HOMEBRW}/brw-\`hostname -s\`; \
	lctl debug_file ${LOCALRPC}/rpc.log ${HOMERPC}/rpc-\`hostname -s\`; \
"

# Stop iostat and collect data
"${pdsh_cmd}" -R ssh -w "${cluster_name}-[${OSS_MIN}-${OSS_MAX}]" " \
	pkill iostat; cp -r ${LOCALIOSTAT}/iostat.log \
	${HOMEIOSTAT}/iostat-\`hostname -s\` \
"

# Change log file mode
# Only the directory prefixes are quoted; the trailing /* must stay
# unquoted so the shell expands it to the collected log files.
sleep 1
sudo -u root chmod -R 755 "${HOMERPC}"/* "${HOMEBRW}"/* "${HOMEIOSTAT}"/*

###################################################################
#### Warning! Pay much more attention to rm commands with root ####
###################################################################
# Clear local history logs
# This removes the whole node-local LIOProf directory on every OSS node,
# not just the sub-directory of the current job.
sleep 2
LOCAL_LIOPROF=/lioprof_loc
# The host list is quoted so the local shell cannot glob-expand the
# "[min-max]" range. ${LOCAL_LIOPROF:?} aborts locally instead of sending
# a bare "rm -rf" to the nodes if the variable is ever empty.
"${pdsh_cmd}" -R ssh -w "${cluster_name}-[${OSS_MIN}-${OSS_MAX}]" " \
	pkill iostat; \
	rm -rf ${LOCAL_LIOPROF:?}; \
"