Search Posts

Check Your AIX LPAR

Very useful script to check LPAR credits (Henrik Morsing)

#!/bin/ksh93

# Performance recommendation tool
#
# Copyright Henrik Morsing, 2022
#
# Initial version 1.0
# 09-11-2022    Henrik Morsing  1.1     Added more informative output
#                                       and correct when to alert (6 digits, not 5)

# Number of days the system has been up: third field of the `uptime` line.
# Only a line containing the word "days" is accepted, so ref stays empty
# when uptime does not report its value in days.

ref="$(uptime | awk '/days/ { print $3 }')"

# ref is the divisor for every "per 90 days" figure computed later in the
# script, so an empty value is fatal. Fewer than twenty days only produces
# a warning and the analysis carries on.

if [[ -z "${ref}" ]]
then
   echo "System uptime too low."
   exit 1
elif [[ "${ref}" -lt 20 ]]
then
   echo "System uptime too low to give accurate results."
fi

# Start-up banner: tool name, host name and uptime in days, followed by a
# reminder that the statistics are accumulated over time. Each argument is
# printed on its own line; empty arguments produce the blank lines.

printf '%s\n' \
   "" \
   "Starting System Performance Analyser v1.0" \
   "" \
   "System Name: $(uname -n) - System Uptime Days: ${ref}" \
   "" \
   "Please bear in mind, as stats used are accumulated over time," \
   "they can be a view of the past and issues may already have been rectified." \
   "" \
   ""

#####################
# MEMORY
#####################

echo "\t *** MEMORY CHECKS ***"
echo
echo "Add more memory to rectify these"
echo

# Collect three counters from `vmstat -s`. awk prints the first field of
# every matching line followed by a blank, and a single read splits the
# resulting line into the three variables.
# NOTE(review): the assignment relies on vmstat -s listing the lines in the
# order page outs, clock hand revolutions, free frame waits - confirm.

vmstat -s \
   | awk '/paging space page outs|revolutions of the clock hand|free frame waits/ { printf "%s ", $1 }' \
   | read page_outs revolutions frame_waits

# Scale each counter to a 90 day reference period (the division by the
# uptime in days is carried out before the multiplication).
frame_waits_90=$(( ${frame_waits}/${ref}*90 ))
revolutions_90=$(( ${revolutions}/${ref}*90 ))
page_outs_90=$(( ${page_outs}/${ref}*90 ))

# The page out and frame wait thresholds are expressed as a number of
# decimal digits, so keep the string length of the scaled values.
frame_waits_digits=${#frame_waits_90}
page_outs_digits=${#page_outs_90}

# Grade memory pressure in three tiers, most severe first. A tier is chosen
# when ANY of its three conditions holds; inside the tier only the counters
# that actually tripped are listed.
#   paging space page outs  - digit count of the 90 day figure
#   clock hand revolutions  - raw counter compared with a multiple of ${ref}
#   free frame waits        - digit count of the 90 day figure
# Nothing is printed when no tier matches.

if [[ ${page_outs_digits} -gt 7 || ${revolutions} -gt $(( ${ref}*100 )) || ${frame_waits_digits} -gt 6 ]]
then
   echo "You are extremely memory constrained:"
   [[ ${page_outs_digits} -gt 7 ]] && echo "- \033[1;31m'paging space page outs' is extremely high:\033[m ${page_outs} -> ${page_outs_90} per 90 days (${page_outs_digits} digits)"
   [[ ${revolutions} -gt $(( ${ref}*100 )) ]] && echo "- \033[1;31m'revolutions of the clock hand' is extremely high:\033[m ${revolutions} -> ${revolutions_90} per 90 days"
   [[ ${frame_waits_digits} -gt 6 ]] && echo "- \033[1;31m'free frame waits' is extremely high:\033[m ${frame_waits} -> ${frame_waits_90} per 90 days (${frame_waits_digits} digits)"

elif [[ ${page_outs_digits} -gt 6 || ${revolutions} -gt $(( ${ref}*10 )) || ${frame_waits_digits} -gt 5 ]]
then
   echo "You are very memory constrained:"
   [[ ${page_outs_digits} -gt 6 ]] && echo "- \033[1;33m'paging space page outs' is very high:\033[m ${page_outs} -> ${page_outs_90} per 90 days (${page_outs_digits} digits)"
   [[ ${revolutions} -gt $(( ${ref}*10 )) ]] && echo "- \033[1;33m'revolutions of the clock hand' is very high:\033[m ${revolutions} -> ${revolutions_90} per 90 days"
   [[ ${frame_waits_digits} -gt 5 ]] && echo "- \033[1;33m'free frame waits' is very high:\033[m ${frame_waits} -> ${frame_waits_90} per 90 days (${frame_waits_digits} digits)"

elif [[ ${page_outs_digits} -gt 5 || ${revolutions} -gt ${ref} || ${frame_waits_digits} -gt 4 ]]
then
   echo "You could benefit from adding more memory:"
   [[ ${page_outs_digits} -gt 5 ]] && echo "- 'paging space page outs' is high: ${page_outs} -> ${page_outs_90} per 90 days  (${page_outs_digits} digits)"
   [[ ${revolutions} -gt ${ref} ]] && echo "- 'revolutions of the clock hand' is high: ${revolutions} -> ${revolutions_90} per 90 days"
   [[ ${frame_waits_digits} -gt 4 ]] && echo "- 'free frame waits' is high: ${frame_waits} -> ${frame_waits_90} per 90 days (${frame_waits_digits} digits)"
fi


#####################
# PROCESSOR
#####################

echo
echo "\t *** PROCESSOR CHECKS ***"
echo

# Gather the LPAR facts used by the processor checks:
#   num_srads    - number of `lssrad -a` output lines that do not contain "SRAD"
#   vCPUs_online - last field of the "Online Virtual CPUs" line, with ".0" appended
#   vCPUs_max    - last field of the "Maximum Virtual CPUs" line
#   Entitlement  - last field of the "Entitled Capacity" line, ignoring
#                  any such line that also mentions "Pool"

num_srads="$(lssrad -a | grep -v SRAD | wc -l)"
vCPUs_online="$(lparstat -i | awk '/Online Virtual CPUs/ { print $NF }').0"
vCPUs_max="$(lparstat -i | awk '/Maximum Virtual CPUs/ { print $NF }')"
Entitlement="$(lparstat -i | awk '/Entitled Capacity/ && !/Pool/ { print $NF }')"

# Checking for LPAR SRAD spreading: advise when the line count is above two.

if [[ "${num_srads}" -gt 2 ]]; then
   echo "LPAR is spread across multiple SRADs (${num_srads}). If memory (2TB?) and max processor allocations (less than 15 vCPUs, currently ${vCPUs_max}) suggests it can be contained within one SRAD, powering the LPAR off and on again might align it correctly."
fi

echo
printf "*** Checking spreading factor ***"

# Spreading factor: online virtual processors divided by entitled capacity.
# vCPUs_online carries a trailing ".0", which makes ksh93 evaluate the
# division in floating point. When the entitlement is empty or not above
# zero the factor stays at 0, so the check reports OK rather than failing
# on a division by zero.
spreading=0
if [[ -n "${Entitlement}" && "${Entitlement}" -gt 0 ]]
then
   spreading=$(( ${vCPUs_online} / ${Entitlement} ))
fi

# The verdict is only printed when more than one virtual processor is
# online; otherwise the heading above is left without a status.
if [[ ${vCPUs_online} -gt "1" ]]
then
   if [[ ${spreading} -gt 2 ]]
   then
      echo "\t[\033[1;33mWARNING\033[m]"
      echo "Number of virtual processors is high compared to entitlement."
   else
      echo "\t[\033[1;32mOK\033[m]"
   fi
fi


#####################
# I/O
#####################

# Starting from the top, VGs first

echo
echo "\t *** I/O CHECKS ***"
echo

# url records whether any I/O buffer warning was raised, so that the link
# to the explanatory web page is printed only when it is relevant.
url=false

# For each volume group reported by `lsvg -o`, print a one-line status and,
# on a warning, the blocked I/O figures plus a suggested pv_pbuf_count.
for volgroup in $(lsvg -o)
do

   # The volume group name is passed as an argument, not as part of the
   # printf format string.
   printf '*** Checking %s ***' "${volgroup}"

   ##################
   # Checking pbufs #
   ##################

   # Count blocked I/Os with no pbuf
   pervg_blocked_io_count=$(/usr/sbin/lvmo -v "${volgroup}" -o pervg_blocked_io_count)

   # Reference to 90 days
   pbio_90=$(( ${pervg_blocked_io_count}/${ref}*90 ))

   # Find number of digits
   pbio_digits=${#pbio_90}

   # Recommendation based on number of digits
   if [[ ${pbio_digits} -gt 6 ]]
   then
      url=true
      echo "\t[\033[1;33mWARNING\033[m]"

      # Recommended pv_pbuf_count for the VG: current setting plus 16384
      pbuf_curr=$(/usr/sbin/lvmo -v "${volgroup}" -o pv_pbuf_count)
      pbuf_vg=$(( ${pbuf_curr}+16384 ))

      echo "Volume group ${volgroup} is extremely low on pbufs"
      # Report the blocked I/O counter itself; the current pv_pbuf_count is
      # shown separately as the starting point of the recommendation.
      echo "- \033[1;31m'pending disk I/Os blocked with no pbuf' is extremely high:\033[m ${pervg_blocked_io_count} -> ${pbio_90} per 90 days (${pbio_digits} digits). Increase 'pv_pbuf_count' from ${pbuf_curr} to ${pbuf_vg}.\n"
   else
      echo "\t[\033[1;32mOK\033[m]"
   fi
done


   ###################
   # Checking psbufs #
   ###################

   # Read two counters from `vmstat -v` in one pass: paging space I/Os
   # blocked with no psbuf, then external pager filesystem I/Os blocked
   # with no fsbuf.
   # NOTE(review): relies on vmstat -v printing them in that order - confirm.

   vmstat -v | grep -E 'paging space I/Os blocked with no psbuf|external pager filesystem I/Os blocked with no fsbuf' | awk '{ print $1 } ' | tr '\n' ' ' | read psbuf fsbuf

   # Reference to 90 days. Multiply before dividing so that integer division
   # does not round a counter smaller than ${ref} down to zero; an empty
   # counter is treated as 0.
   psio_90=$(( ${psbuf:-0}*90/${ref} ))

   # Two or more digits means at least 10 blocked I/Os per 90 days
   if [[ ${#psio_90} -gt 1 ]]
   then
      url=true
      printf "[\033[1;33mWARNING\033[m] "
      echo "\033[1;31mpsbufs is above 10\033[m, indicating severe memory restriction causing excessive paging. If you cannot add memory, alleviate by adding parallel paging spaces."
   fi


   ###################
   # Checking fsbufs #
   ###################
   echo
   # Count blocked external pager filesystem I/O with no fsbuf

   # Reference to 90 days, multiplying first for the same reason as above
   fsio_90=$(( ${fsbuf:-0}*90/${ref} ))

   # Three or more digits means at least 100 blocked I/Os per 90 days
   if [[ ${#fsio_90} -gt 2 ]]
   then
      url=true
      printf "[\033[1;33mWARNING\033[m] "
      echo "\033[1;31mfsbufs is above 100\033[m, indicating filesystem I/O over-load. Increase j2_dynamicBufferPreallocation with ioo to fix this. Start by doubling value."
      echo "Also consider splitting into smaller file systems."
   fi

   [[ "${url}" == "true" ]] && echo "Info on I/O buffers: https://www.ibm.com/support/pages/blocked-ios-due-buffers-shortage"

   # Clear the flag once the I/O buffer link has been handled, so the fibre
   # adapter checks that follow only print their own link when one of them
   # raised a warning.
   url=false

   ###################
   # Fibre Adapters  #
   ###################

# First column of every `lsdev -Ccadapter` line that contains "fcs".
adapters=$(lsdev -Ccadapter | awk '/fcs/ { print $1 }')

# Check No Command Resource Count (Update num_cmd_elems)

   for adapter in ${adapters}; do
      ncrc=$(fcstat -D ${adapter} | awk '/No Command Resource Count/ { print $NF }')

      # Scale the counter to a 90 day reference period
      ncrc_90=$(( ${ncrc}/${ref}*90 ))

      # The thresholds are a starting guess: warn from six digits upwards,
      # and word the message more strongly above six digits.

      if [[ ${#ncrc_90} -gt 5 ]]; then
         url=true
         printf "[\033[1;33mWARNING\033[m] "
         if [[ ${#ncrc_90} -gt 6 ]]; then
            echo "- \033[1;31mNo Command Resource Count for adapter ${adapter} is extremely high:\033[m ${ncrc} -> ${ncrc_90} per 90 days (${#ncrc_90} digits)"
         else
            echo "- \033[1;31mNo Command Resource Count for adapter ${adapter} is very high:\033[m ${ncrc} -> ${ncrc_90} per 90 days (${#ncrc_90} digits)"
         fi
         echo "Increase num_cmd_elems on ${adapter} to fix, but not higher than num_cmd_elems on the VIO physical adapter."
      fi
   done

   # Print the reference link only if a warning set the flag, then clear it
   [[ "${url}" == "true" ]] && echo "Info on fcs buffers: https://www.ibm.com/support/pages/no-command-resource-count-and-high-water-mark-active-and-pending-commands"
   url=false

   echo

# Check High water mark of active/pending commands (Update num_cmd_elems)
#
# For every adapter the sum of the two high water marks is compared with
# that adapter's own num_cmd_elems attribute. The high water marks are used
# as reported; they are not scaled to the 90 day reference.

   for adapter in ${adapters}
   do
      # Run fcstat once and keep only the paragraph (grep -p) that holds the
      # driver queue statistics; both values are extracted from that text.
      queue_stats=$(fcstat -D ${adapter} | grep -p "FC SCSI Adapter Driver Queue")
      hwmac=$(printf '%s\n' "${queue_stats}" | grep "High water mark  of active commands" | awk '{ print $NF }')
      hwmpc=$(printf '%s\n' "${queue_stats}" | grep "High water mark of pending commands" | awk '{ print $NF }')

      # A value that could not be read counts as 0
      hwm_summ=$(( ${hwmac:-0} + ${hwmpc:-0} ))

      # Current num_cmd_elems setting of the adapter being checked
      nce=$(lsattr -El ${adapter} -a num_cmd_elems -F value)

      # Skip the comparison when the attribute could not be read, so that an
      # empty value is not treated as a limit of zero.
      if [[ -n "${nce}" && ${hwm_summ} -gt ${nce} ]]
      then
         url=true
         printf "[\033[1;33mWARNING\033[m] "
         echo "- \033[1;31mHigh water mark for active/pending command for adapter ${adapter} is higher than num_cmd_elems:\033[m ${hwm_summ} vs. ${nce}"
         echo "Increase num_cmd_elems on ${adapter} to fix, but not higher than num_cmd_elems on the VIO physical adapter."
      fi
   done

   # Link to helpful web page.
   echo
   [[ "${url}" == "true" ]] && echo "Info on fcs buffers: https://www.ibm.com/support/pages/no-command-resource-count-and-high-water-mark-active-and-pending-commands"
   url=false
   echo


# Check No DMA Resource Count (Update max_xfer_size)

   for adapter in ${adapters}; do
      nodma=$(fcstat -D ${adapter} | awk '/No DMA Resource Count/ { print $NF }')

      # Scale the counter to a 90 day reference period
      nodma_90=$(( ${nodma}/${ref}*90 ))

      # Only figures with more than three digits are reported
      [[ ${#nodma_90} -gt 3 ]] || continue

      url=true
      printf "[\033[1;33mWARNING\033[m] "
      echo "- \033[1;31mNo DMA Resource Count for adapter ${adapter} is higher than 3 digits per 90 days:\033[m ${nodma_90}"
      echo "Increase max_xfer_size on ${adapter} to fix, but not higher than max_xfer_size on the VIO physical adapter."
   done

   # Print the reference link only if a warning set the flag, then clear it
   echo
   [[ "${url}" == "true" ]] && echo "Info on fcs buffers: https://www.ibm.com/support/pages/no-command-resource-count-and-high-water-mark-active-and-pending-commands"
   url=false

# Final blank line, then report success to the caller
echo
exit 0