#!/usr/bin/bash
# sgpdd-survey
# SPDX-License-Identifier: GPL-2.0
#
# This file is part of Lustre, http://www.lustre.org/
#
######################################################################
# customize per survey
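# CHOOSE EITHER scsidevs or rawdevs (set exactly one of them); each entry
# may be given as "device" or as "hostname:device".
# scsidevs: the SCSI devices to measure - WARNING: they will be erased.
# rawdevs: the raw devices to use.
# Examples:
# rawdevs=${rawdevs:-"/dev/raw/raw1"}
# scsidevs=`ls /dev/sd[a-z] /dev/sd[a-z][a-z]` # all devices, if you use udev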
# result file prefix.
# NB ensure the path exists on all servers if it includes subdirs
# Every tunable below keeps a value that is already set (and non-empty) in
# the environment and only falls back to the default shown here.

# directory and prefix of the result files.
# NB the path has to exist on all servers if it includes subdirs
: "${rslt_loc:=/tmp}"
: "${rslt:=$rslt_loc/sgpdd_survey_$(date +%F@%R)}"

# actions to perform (read and/or write)
: "${actions:=write read}"

# total size per device (MBytes)
# NB choosing it bigger than the device cache is good
: "${size:=8192}"

# record size range (KBytes)
: "${rszlo:=1024}"
: "${rszhi:=1024}"

# range of concurrent regions per device
: "${crglo:=1}"
: "${crghi:=256}"

# boundary blocks between concurrent regions per device
: "${boundary:=1024}"

# range of threads to share between concurrent regions per device
# several threads per region simulate a deeper request queue
# NB the survey skips #thr < #regions and #thr/#regions > SG_MAX_QUEUE
: "${thrlo:=1}"
: "${thrhi:=4096}"

# NUMA support
# dev2cpus names a user provided script that prints the cpu list of the
# device passed as its argument.  How to implement it depends on the kind
# of device (scsi/raw, with/without multipath, technology fc/sas/ib).
# For example:
# $ cat bin/dev2cpus
# #!/usr/bin/bash
# dev=$(basename $1)
# pci=$(readlink -f /sys/class/block/$dev | cut -d/ -f1-5)
# cat ${pci}/local_cpulist
: "${dev2cpus:=}"

#####################################################################
# leave the rest of this alone unless you know what you're doing...

# max # threads one instance will spawn
SG_MAX_QUEUE=16

# numactl command
: "${NUMACTL:=/usr/bin/numactl}"
# unique WORD...
# Print every distinct whitespace-separated word found in the arguments on
# a line of its own, ordered by "sort -u".  Nothing is printed when the
# arguments contain no word at all.
unique () {
	# printf, unlike echo, never mistakes a word such as "-n" for an option;
	# awk emits one word per line and produces nothing for blank input
	printf '%s\n' "$@" |
		awk '{ for (i = 1; i <= NF; i++) print $i }' |
		sort -u
}
# split_hostname [HOST:]DEVICE
# Print "HOST DEVICE" on stdout.  The text before the first ':' is the
# host, the remainder is the device; without a ':' the host defaults to
# localhost.
split_hostname () {
	local name=$1
	# keep host local so that a direct call cannot clobber a caller's $host
	local host=localhost

	case $name in
	*:*)
		host=${name%%:*}	# everything before the first ':'
		name=${name#*:}		# everything after the first ':'
		;;
	esac
	printf '%s %s\n' "$host" "$name"
}
# remote shell program used by dsh(); only "ssh" and "rsh" are supported
DSH=${DSH:-"ssh"}

# dsh NODE USER COMMAND...
# Run COMMAND (arguments joined by blanks) on NODE through $DSH, as USER
# when USER is not empty.  /sbin and /usr/sbin are prepended to the remote
# PATH.  Returns the status of $DSH, or 1 if $DSH is not supported.
dsh () {
	local node="$1"
	local user="$2"
	shift 2
	# \$PATH stays unexpanded here so that it is expanded on the remote side
	local command="export PATH=/sbin:/usr/sbin:\$PATH; $*"

	case $DSH in
	ssh)
		# ssh takes the login as a "user@" prefix of the node name
		"$DSH" "${user:+$user@}$node" "$command"
		;;
	rsh)
		# rsh takes the login through -l
		if [ -n "$user" ]; then
			"$DSH" -l "$user" "$node" "$command"
		else
			"$DSH" "$node" "$command"
		fi
		;;
	*)
		# fail loudly instead of silently running nothing
		echo "dsh: unsupported remote shell '$DSH' (use ssh or rsh)" >&2
		return 1
		;;
	esac
}
# how to run commands on other nodes
#
# remote_shell [USER@]HOST COMMAND...
# COMMAND (arguments joined by blanks into one string) is evaluated in the
# current shell when HOST is "localhost" or the local node name (uname -n);
# otherwise it is handed to dsh together with the optional USER.
# Returns the status of the evaluated command or of dsh.
remote_shell () {
	local host=$1
	shift
	local cmds="$*"

	if [[ $host == localhost || $host == "$(uname -n)" ]]; then
		eval "$cmds"
		return
	fi

	# split $host into $host and $user
	local user=""
	if [[ $host == *@* ]]; then
		user=${host%@*}
		host=${host#*@}
	fi
	dsh "$host" "$user" "$cmds"
}
# exactly one of scsidevs / rawdevs has to be given:
# "both set" is rejected as well as "neither set"
if [[ -z "$scsidevs$rawdevs" || ( -n "$scsidevs" && -n "$rawdevs" ) ]]; then
	echo "Must either specify scsidevs or rawdevs"
	exit 1
fi

# split each "hostname:device" entry into the parallel arrays hosts[] and
# devs[]; ndevs counts the entries
ndevs=0
devs=()
for d in $scsidevs $rawdevs; do
	str=($(split_hostname "$d"))
	hosts[ndevs]=${str[0]}
	devs[ndevs]=${str[1]}
	ndevs=$((ndevs + 1))
done

# every host is listed only once here, however many devices it holds
unique_hosts=($(unique ${hosts[@]}))
# get device cpu list
# devcpus[] stays empty unless a dev2cpus script was configured; otherwise
# the script is run once per device, on the host owning that device
devcpus=()
if [[ -n $dev2cpus ]]; then
	i=0
	while ((i < ndevs)); do
		devcpus[i]=$(remote_shell ${hosts[i]} $dev2cpus ${devs[i]})
		((i++))
	done
fi
# map given device names into SG device names
if [ "$scsidevs" ]; then
	# make sure sg kernel module is loaded
	for host in ${unique_hosts[@]}; do
		# The check is handed over as ONE pre-quoted string: remote_shell
		# joins its arguments into a command line that is parsed again,
		# so a separately quoted "^sg " argument would lose its trailing
		# blank and also match other modules whose name starts with "sg".
		sg_is_loaded=$(remote_shell $host "grep -q '^sg ' /proc/modules" \
			&& echo true || echo false)
		# $sg_is_loaded holds the word "true" or "false" and is run as
		# a command
		if ! $sg_is_loaded; then
			echo "loading the sg kernel module on $host"
			remote_shell $host modprobe sg
			# remember the hosts on which this script loaded the module
			sg_was_loaded_on="$sg_was_loaded_on $host"
		fi
	done

	for ((i=0; i < $ndevs; i++)); do
		# resolve symbolic link if any
		devs[$i]=$(remote_shell ${hosts[$i]} readlink -f ${devs[$i]})

		# retrieve associated sg device
		# we will test for a LUN, then test for a partition
		# if the partition number is > 9 this will fail
		# (sg_map output is matched on its 2nd column, the 1st is kept)
		tmp=$(remote_shell ${hosts[$i]} sg_map | \
			awk -v dev="${devs[$i]}" '{if ($2 == dev) print $1}')
		if [ -z "$tmp" ]; then
			echo "Can't find SG device for ${hosts[$i]}:${devs[$i]}, " \
				"testing for partition"
			# strip the trailing digits to get the whole-disk name
			pt=`echo ${devs[$i]} | sed 's/[0-9]*$//'`
			# Try again
			tmp=$(remote_shell ${hosts[$i]} sg_map | \
				awk -v dev="$pt" '{if ($2 == dev) print $1}')
			if [ -z "$tmp" ]; then
				echo -e "Can't find SG device ${hosts[$i]}:$pt.\n" \
					"Do you have the sg module configured for your kernel?"
				exit 1
			fi
		fi
		# from here on devs[] holds the sg device name
		devs[$i]=$tmp
	done
elif [ "$rawdevs" ]; then
	# raw devices are used as given; just verify that each one is bound
	for ((i=0; i < $ndevs; i++)); do
		RES=$(remote_shell ${hosts[$i]} raw -q ${devs[$i]})
		if [ $? -ne 0 ];then
			echo "Raw device ${hosts[$i]}:${devs[$i]} not set up"
			exit 1
		fi
	done
fi
# determine block size of each device. This should also work for raw devices
# If it fails, set to 512
for ((i = 0; i < ndevs; i++)); do
	# the sg_readcap output is consumed as two numbers: tmp[0] multiplied
	# by the block size gives the device size, tmp[1] is the block size
	# (in bytes)
	tmp=( $(remote_shell ${hosts[i]} sg_readcap -lb ${devs[i]}) )
	bs[i]=$((tmp[1]))
	if ((bs[i] == 0)); then
		echo "sg_readcap on device ${hosts[i]}:${devs[i]} failed, " \
			"setting block size to 512"
		bs[i]=512
	fi

	# device size in kbytes
	devsize=$((tmp[0] * bs[i] / 1024))

	# the record size has to be a multiple of the block size
	if ((rszlo * 1024 % bs[i] != 0)); then
		echo "Record size is not a multiple of block size (${bs[i]} bytes) " \
			"for device ${hosts[i]}:${devs[i]}"
		exit 1
	fi

	# the device has to hold at least $size MBytes
	if ((devsize < size * 1024)); then
		echo -e "device ${hosts[i]}:${devs[i]} not big enough: " \
			"$devsize < $((size*1024)).\nConsider reducing \$size"
		exit 1
	fi
done
# output files, all named after the result prefix $rslt:
# summary file, detail file and prefix of the per-host command scripts
cmdsf=${rslt}.script
workf=${rslt}.detail
rsltf=${rslt}.summary
# start with an empty summary and an empty detail file
: > $rsltf
: > $workf
# print_summary [-n] TEXT...
# Append TEXT (arguments joined by blanks) to the summary file $rsltf and
# print it on stdout as well.  With a leading -n no newline is added, so
# that later calls continue on the same line.
print_summary () {
	# local: the line-ending choice must not leak into the caller's scope
	local eol=$'\n'

	if [ "$1" = "-n" ]; then
		eol=""
		shift
	fi
	# printf with a fixed format prints TEXT verbatim, even if it looks
	# like an echo option
	printf '%s%s' "$*" "$eol" >> "$rsltf"
	printf '%s%s' "$*" "$eol"
}
# Main survey: for every combination of record size, number of concurrent
# regions and number of threads, run each action on all devices at once
# and report the results through print_summary and the detail file.
#
# first line of the summary: date, tested devices and local host name
print_summary "$(date) sgpdd-survey on $rawdevs$scsidevs from $(hostname)"
# each of the three ranges is walked from its lo to its hi bound by doubling
for ((rsz=$rszlo;rsz<=$rszhi;rsz*=2)); do
for ((crg=$crglo;crg<=$crghi;crg*=2)); do
for ((thr=$thrlo;thr<=$thrhi;thr*=2)); do
# skip combinations with fewer threads than regions, or with more than
# SG_MAX_QUEUE threads per region
if ((thr < crg || thr/crg > SG_MAX_QUEUE)); then
continue
fi
# compute total size (in kbytes)
# per device, $size is first rounded down to a number of blocks that is
# a multiple of crg
total_size=0
for ((i=0; i < $ndevs; i++)); do
tsize=$((size*1024*1024/bs[$i]/crg*crg*bs[$i]/1024))
total_size=$((total_size+tsize))
done
# show test parameters
# (crg and thr are per device, the printed values are totals over all devices)
str=`printf 'dev %2d sz %8dK rsz %4dK crg %5d thr %5d ' \
$ndevs $total_size $rsz $((crg*ndevs)) $((thr*ndevs))`
echo "==============> $str" >> $workf
print_summary -n "$str"
# check memory for each host
for host in ${unique_hosts[@]}; do
# count the devices tested on this host
numdevs=0
for ((i=0; i < $ndevs; i++)); do
if [ ${hosts[$i]} == $host ]; then
numdevs=$((numdevs+1))
fi
done
# NOTE: despite its name, freemem holds the MemTotal value of /proc/meminfo
freemem=$(remote_shell $host cat /proc/meminfo | \
awk '/^MemTotal:/ {printf "%d\n", $2}')
# estimated need: (rsz*thr/crg + 64) per region, for crg regions on each
# of the numdevs devices -- presumably in kbytes like MemTotal; TODO confirm
if (((rsz*thr/crg + 64)*crg*numdevs > freemem)); then
echo "ENOMEM on $host" >> $workf
print_summary "ENOMEM"
# leave the host loop and go on with the next thr value
continue 2
fi
done
# run tests
for action in $actions; do
declare -a pidarray
print_summary -n "$action "
echo "=====> $action" >> $workf
# prefix of the temporary files holding the sgp_dd output
tmpf=${workf}_tmp
# create per-host script files
for host in ${unique_hosts[@]}; do
echo -n > ${cmdsf}_${host}
done
# append one sgp_dd command per device and region to the script of the
# host owning the device
for ((i=0; i < $ndevs; i++)); do
# bpt: record size expressed in device blocks
bpt=$((rsz*1024/bs[$i]))
# blocks: number of device blocks in each region
blocks=$((size*((1024*1024)/bs[$i])/crg))
count=$blocks
host=${hosts[$i]}
dev=${devs[$i]}
# read: device -> /dev/null, the region offset is passed with skip=
# otherwise: /dev/zero -> device, the region offset is passed with seek=
if [ $action = read ]; then
inf="if=$dev"
outf="of=/dev/null"
skip=skip
else
inf="if=/dev/zero"
outf="of=$dev"
skip=seek
fi
# prefix the command with numactl only when a cpu list is known for the
# device and $NUMACTL is executable
if [ -n "${devcpus[$i]}" -a -x "$NUMACTL" ]; then
numacmd="$NUMACTL --physcpubind=${devcpus[$i]} --localalloc"
else
numacmd=""
fi
# one background sgp_dd per region; region j starts at block
# boundary+j*blocks and its stderr goes to ${tmpf}_${i}_${j}
for ((j=0;j<crg;j++)); do
echo >> ${cmdsf}_${host} \
"$numacmd " \
"sgp_dd 2> ${tmpf}_${i}_${j} $inf $outf " \
"${skip}=$((boundary+j*blocks)) " \
"thr=$((thr/crg)) count=$count bs=${bs[$i]} " \
"bpt=$bpt time=1&"
done
done
# each script ends with "wait" so that it only returns once all of its
# background sgp_dd commands have finished
for host in ${unique_hosts[@]}; do
echo "wait" >> ${cmdsf}_${host}
done
# run all the per-host script files in parallel: each one is fed to bash
# on its host through stdin; t0/t1 bracket the whole run
t0=`date +%s.%N`
pidcount=0
for host in ${unique_hosts[@]}; do
remote_shell $host bash < ${cmdsf}_${host} &
pidarray[$pidcount]=$!
pidcount=$((pidcount+1))
done
# wait for every host to finish
pidcount=0
for host in ${unique_hosts[@]}; do
wait ${pidarray[$pidcount]}
pidcount=$((pidcount+1))
done
t1=`date +%s.%N`
# clean up per-host script files
for host in ${unique_hosts[@]}; do
rm ${cmdsf}_${host}
done
# collect/check individual stats
# the output of every region is copied to a local file; a region counts
# as ok when its output has no 'error' and has a 'time to transfer data'
# line
echo > $tmpf
ok=0
for ((i=0;i<ndevs;i++)); do
for ((j=0;j<crg;j++)); do
rtmp=${tmpf}_${i}_${j}_local
remote_shell ${hosts[$i]} cat ${tmpf}_${i}_${j} > $rtmp
if grep 'error' $rtmp > /dev/null 2>&1; then
echo "Error found in $rtmp"
elif grep 'time to transfer data' $rtmp > /dev/null 2>&1; then
ok=$((ok + 1))
fi
# keep the output for the min/max statistics and in the detail file,
# then drop the local copy and the file on the host
cat ${rtmp} >> $tmpf
cat ${rtmp} >> $workf
rm ${rtmp}
remote_shell ${hosts[$i]} rm ${tmpf}_${i}_${j}
done
done
# report either the number of failed regions or the measured bandwidth
if ((ok != ndevs*crg)); then
print_summary -n "$((ndevs*crg - ok)) failed "
else
# compute bandwidth in MiB/s from total data / elapsed time
# (total_size is in kbytes, hence the division by 1024)
bw=`awk "BEGIN {printf \"%7.2f \", \
$total_size / (( $t1 - $t0 ) * 1024); exit}"`
# compute global min/max stats
# field 8 of each 'time to transfer data' line is divided by 1.048576 --
# presumably converting MB/s to MiB/s; verify against the sgp_dd output
minmax=`awk < $tmpf \
'/time to transfer data/ {mb=$8/1.048576; \
if (n == 0 || mb < min) min = mb; \
if (n == 0 || mb > max) max = mb; \
n++} \
END {printf "[ %7.2f, %7.2f] ",min,max;}'`
print_summary -n "$bw $minmax "
fi
rm $tmpf
done
# terminate the summary line of this parameter combination
print_summary ""
done
done
done
# clean up: $sg_was_loaded_on is a blank-separated list of hosts; the sg
# module is removed again on each of them
for host in $sg_was_loaded_on
do
	printf '%s\n' "unloading sg module on $host"
	remote_shell $host rmmod sg
done