#!/bin/sh
# @description check if rudder agent has no problem and is running properly
# @man Check that rudder agent is working properly.
# @man - generate missing UUID or keys
# @man - kill cfengine if there are too many processes
# @man - run cfengine if its daemon is missing
# @man - clean lock file if it is too big
# @man - check that policies have been properly copied
# @man - will sleep a random time (max half the run interval) when not run in interactive mode
# @man +
# @man *Options*:
# @man +
# @man *-q*: run the agent in quiet mode (display only error and warning messages)
# @man +
# @man *-c*: run the agent without color output
# @man +
# @man *-f*: prevent sleeping in non-interactive mode
# @man +
# @man *-u*: only check the uuid existence
# @man +
# @man *-r*: reset, do not try to restore backups


# Load the shared Rudder shell helpers
. "${BASEDIR}/../lib/common.sh"

# Defaults, each one can be changed by a command line option below
QUIET=false       # -q sets it to true
SLEEP=true        # -f sets it to false
ONLY_UUID=false   # -u sets it to true
RESET=false       # -r sets it to true
STOP_AT=""        # -s <step> stores the step name
OPT=""            # accumulates -q / -c for the nested "rudder agent" calls

# getopts reports unknown options itself; they match no arm and are skipped
while getopts "qcfurs:" opt; do
  case "${opt}" in
    q) QUIET=true; OPT="${OPT} -q" ;;
    c) clear_colors; OPT="${OPT} -c" ;;
    f) SLEEP=false ;;
    u) ONLY_UUID=true ;;
    r) RESET=true ;;
    s) STOP_AT="${OPTARG}" ;;
  esac
done

# Variables
BACKUP_DIR="/var/backups/rudder/"
CFE_DIR="${RUDDER_VAR}/cfengine-community"
CFE_BIN="/opt/rudder/bin"
CFE_DISABLE_FILE="${RUDDER_DIR}/etc/disable-agent"
UUID_FILE="${RUDDER_DIR}/etc/uuid.hive"
LAST_UPDATE_FILE="${CFE_DIR}/last_successful_inputs_update"

## Paths for Rudder Server Roles
RUDDER_SERVER_ROLES="${RUDDER_DIR}/etc/server-roles.d"

# set this to true to force an inventory at the end of the check
INVENTORY_NEEDED="false"

# Get common commands from common.sh
init_commands

# Ensure script is executed by root
# The refusal is an error: report it on stderr and with a non-zero status
# (a bare "exit" would return the status of the echo, i.e. 0)
MYUID=`id | cut -d\( -f2 | cut -d\) -f1`
if [ "${MYUID}" != 'root' ]; then
  echo "You must be root" >&2
  exit 1
fi

# only enable lock/sleep mechanisms in non-interactive mode
if [ ! -t 0 ]; then
  ## If we force, we *really* want it to be run (it's during upgrade)
  if [ "$SLEEP" = true ]; then
    ## Lock agent check to avoid piling up check processes
    ## This might be removable when 6.2 becomes the oldest supported (or not) because of #16859
    # NOTE(review): fixed file name in a world-writable directory, kept as is
    # because other tooling may rely on this path - consider moving it.
    LOCKFILE=/tmp/agent-check.lock
    # kill -0 only probes the recorded PID; its error output for a stale or
    # empty lock file is expected and therefore discarded
    if [ -e "${LOCKFILE}" ] && kill -0 "$(cat "${LOCKFILE}")" 2>/dev/null; then
        echo "rudder agent check is already running"
        exit 1
    fi
    # make sure the lockfile is removed when we exit and then claim it
    trap 'rm -f "${LOCKFILE}"; exit' INT TERM EXIT
    echo "$$" > "${LOCKFILE}"

    # sleep for a random time, if not interactive
    # we half the interval in the hope of not running at the same time as the agent
    # (AGENT_RUN_INTERVAL is multiplied by 30 to get the maximum in seconds)
    MAX_SLEEP=$(expr "${AGENT_RUN_INTERVAL}" \* 30) # in second
    SLEEP_DURATION=$(awk -v m="$MAX_SLEEP" 'BEGIN{srand(); print int(rand()*m)}')
    sleep "${SLEEP_DURATION}"
  fi
fi

##################################################################
#### Test functions, skip to 'TEST' tag to read testing order ####
##################################################################

# Remove the CFEngine lock database and its companion lock file.
# Globals: CFE_DIR (read)
# Paths are quoted so that a CFE_DIR containing blanks is handled correctly.
clean_cf_lock_files() {
  rm -f "${CFE_DIR}/state/cf_lock.lmdb" "${CFE_DIR}/state/cf_lock.lmdb.lock"
}

# Make sure a file exists and is not empty, restoring it from the most recent
# backup when needed.
# Arguments: $1 - path of the file to check
# Globals:   BACKUP_DIR (read, expected to end with a '/'), RESET, QUIET,
#            CFE_DIR, CP_A (read); INVENTORY_NEEDED (set to "true" after a restore)
# Returns:   0 when the file is present or has been restored,
#            1 when it is missing and could not be restored
exist_or_restore() {
  file_path="$1"
  file_name=$(basename "${file_path}")

  # A regular, non-empty file needs no restoration
  if [ -f "${file_path}" ] && [ -s "${file_path}" ]; then
    return 0
  fi

  # Backups are not used in reset mode, nor when there is no backup directory
  if [ ! -d "${BACKUP_DIR}" ] || [ "${RESET}" != "false" ]; then
    return 1
  fi

  # Pathname expansion is sorted, so the last regular file matched is the
  # latest backup (this replaces the former "ls | sort | tail" parsing)
  last_backup=""
  case "${file_path}" in
    *ppkeys*)
      # Some files are in a ppkeys-* folder, we need to handle them
      mkdir -p "${CFE_DIR}/ppkeys"
      chmod 700 "${CFE_DIR}/ppkeys"
      for candidate in "${BACKUP_DIR}"ppkeys-*/"${file_name}"; do
        [ -f "${candidate}" ] && last_backup="${candidate}"
      done
      ;;
    *)
      for candidate in "${BACKUP_DIR}${file_name}"-*; do
        [ -f "${candidate}" ] && last_backup="${candidate}"
      done
      ;;
  esac

  if [ -z "${last_backup}" ]; then
    return 1
  fi

  # Paths are passed as printf arguments so that a '%' in them is harmless
  if [ "$QUIET" = false ]; then
    printf "INFO: The file '%s' does not exist. The latest backup '%s' will be recovered..." "${file_path}" "${last_backup}"
  fi
  # CP_A is a command with its options, it must be word-split
  if ! ${CP_A} "${last_backup}" "${file_path}" >/dev/null; then
    if [ "$QUIET" = false ]; then
      echo " Failed"
    fi
    return 1
  fi
  # any file restored here could be important for the inventory
  INVENTORY_NEEDED="true"
  if [ "$QUIET" = false ]; then
    echo " Done"
  fi
  # Explicit status: the result must not depend on the QUIET test above,
  # otherwise a successful restore is reported as a failure in quiet mode
  return 0
}

# Compare the UID found in the agent certificate subject with the agent UUID.
# Globals: UUID_FILE, RUDDER_DIR (read); UUID, SUBJECT (written)
# Returns: 0 when both values are identical, non-zero otherwise
compare_uuid() {
  UUID=$(cat "${UUID_FILE}")
  # The sed expression keeps what follows "UID =" up to the next comma
  # The certificate path is quoted so that blanks in RUDDER_DIR do not split it
  SUBJECT=$(openssl x509 -in "${RUDDER_DIR}/etc/ssl/agent.cert" -noout -subject | sed 's/.*UID *= *\([^,]*\)\(,.*\)*/\1/')
  [ "${UUID}" = "${SUBJECT}" ]
}

# Tell whether the agent certificate and the agent private key share the same
# modulus, i.e. whether the certificate matches the key pair.
# Globals: CFE_DIR (read); modulus_cert, modulus_key (written)
# Returns: 0 when both openssl outputs are identical, 1 otherwise
compare_key() {
  modulus_cert=$(
    openssl x509 -noout -modulus -in "/opt/rudder/etc/ssl/agent.cert"
  )
  modulus_key=$(
    openssl rsa -noout -modulus -passin "pass:Cfengine passphrase" -in "${CFE_DIR}/ppkeys/localhost.priv"
  )
  if test "${modulus_cert}" = "${modulus_key}"; then
    return 0
  fi
  return 1
}

# Keys must be present with a certificate and have proper access rights.
# Steps: restore or create the private key, remove its passphrase, extract the
# public key, restore or regenerate the certificate, then tighten permissions.
# Globals: CFE_DIR, CFE_BIN, BACKUP_DIR, UUID_FILE, QUIET (read);
#          INVENTORY_NEEDED (set to "true" when the key is created or rewritten)
# Calls:   exist_or_restore, compare_uuid, compare_key
check_and_fix_keys() {
  priv_key="${CFE_DIR}/ppkeys/localhost.priv"
  pub_key="${CFE_DIR}/ppkeys/localhost.pub"
  agent_cert="/opt/rudder/etc/ssl/agent.cert"

  # Agent must have a private key
  if ! exist_or_restore "${priv_key}"; then
    [ "$QUIET" = false ] && printf "INFO: Rudder key pair is missing, creating it..."
    rm -f "${pub_key}"
    "${CFE_BIN}/cf-key" -T 4096
    [ "$QUIET" = false ] && echo " Done"
    INVENTORY_NEEDED="true"
  fi

  # Key must not have a passphrase
  if grep -q "ENCRYPTED" "${priv_key}" 2>/dev/null; then
    # The decrypted copy is created with a restrictive umask, and it only
    # replaces the key when openssl succeeded and produced something:
    # a failed conversion must never overwrite the existing private key
    if (umask 077 && openssl rsa -in "${priv_key}" -passin "pass:Cfengine passphrase" > "${priv_key}.unenc" 2>/dev/null) \
      && [ -s "${priv_key}.unenc" ]; then
      mv -f "${priv_key}.unenc" "${priv_key}"
      INVENTORY_NEEDED="true"
    else
      rm -f "${priv_key}.unenc"
      printf "${YELLOW}WARNING${NORMAL}: Unable to remove the passphrase of '%s', the key is left untouched\n" "${priv_key}" >&2
    fi
  fi

  # Extract public key if missing
  if [ ! -f "${pub_key}" ]; then
    [ "$QUIET" = false ] && printf "INFO: Rudder public key is missing, creating it..."
    openssl rsa -passin "pass:Cfengine passphrase" -in "${priv_key}" -RSAPublicKey_out -out "${pub_key}"
    [ "$QUIET" = false ] && echo " Done"
  fi

  # First restore certificate if not there
  exist_or_restore "${agent_cert}" || true

  # Regenerate the certificate if it is missing or invalid
  if [ ! -f "${agent_cert}" ] || ! compare_uuid || ! compare_key; then
    # Backup cert (the backup directory may not exist yet)
    if [ -f "${agent_cert}" ]; then
      mkdir -p "${BACKUP_DIR}"
      mv "${agent_cert}" "${BACKUP_DIR}/agent-$(date +%Y%m%d).cert"
    fi
    [ "$QUIET" = false ] && printf "INFO: Rudder certificate is missing or incorrect, creating one..."
    openssl req -new -batch -sha256 -key "${priv_key}" -out "${agent_cert}" -x509 -days 3650 -extensions agent_cert -config /opt/rudder/etc/ssl/openssl-agent.cnf -subj "/UID=$(cat "${UUID_FILE}")"
    [ "$QUIET" = false ] && echo " Done"
  fi

  # CFEngine key directory must not be accessible by 'group' or 'other'
  if [ -d "${CFE_DIR}/ppkeys" ]; then
    chmod 700 "${CFE_DIR}/ppkeys"
    # An unmatched pattern stays literal, hence the existence test
    for key_file in "${CFE_DIR}/ppkeys"/*; do
      if [ -e "${key_file}" ]; then
        chmod 600 "${key_file}"
      fi
    done
  fi
}

# There must be a policy server otherwise we can't do anything.
# When policy_server.dat is missing and cannot be restored, print the
# getting-started instructions and end the whole script with status 0.
# Globals: CFE_DIR (read)
# Calls:   exist_or_restore
check_policy_server_or_exit() {
  # The path is quoted so that it always reaches exist_or_restore as one argument
  if exist_or_restore "${CFE_DIR}/policy_server.dat"; then
    return 0
  fi

  printf '%s\n' \
    "********************************************************************************" \
    "rudder-agent has been installed (not started). This host can be a Rudder node." \
    "To get started, configure your Rudder server's hostname and continue the process:" \
    "# echo 'rudder.server' > ${CFE_DIR}/policy_server.dat" \
    "# rudder agent check" \
    "This node will then appear in the Rudder web interface under 'Accept new nodes'." \
    "********************************************************************************"
  # This is not an error
  exit 0
}

# Keep the number of CFEngine processes sane.
# There may be several cf-execd running because cf-execd now runs cf-agent from a child process
# We accept 6, as it allows for 5 agent running at the same time (historic max)
# and not too many agents
# Globals: PS_COMMAND, RUDDER_DIR, CFE_BIN, CFE_DIR, CFE_DISABLE_FILE, QUIET,
#          OPT (read)
# Calls:   service_enable, clean_cf_lock_files, "rudder agent stop/start"
check_and_fix_cfengine_processes() {
  # rudder-cf-execd must be enabled in systemd to be able to start it
  service_enable rudder-cf-execd enable

  # If there are too many cf-execd processes, we must kill them
  # A standard kill won't kill them, so the -9 is necessary to make sure they are stopped
  # They will be restarted by the check below, if the disable file is not set
  # List the cf-execd processes running (without the path, they can be run manually)
  # PS_COMMAND is a command with its options: it must be word-split
  CF_EXECD_RUNNING=$(${PS_COMMAND} | grep -E "(${RUDDER_DIR}|${CFE_BIN})/cf-execd" | sed -e '/grep/d' | cat)
  # tr removes the padding some wc implementations add
  NB_CF_EXECD_RUNNING=$(echo "${CF_EXECD_RUNNING}" | sed -e '/^$/d' | wc -l | tr -d ' ')
  if [ "${NB_CF_EXECD_RUNNING}" -gt 6 ]; then
    [ "$QUIET" = false ] && printf "${YELLOW}WARNING${NORMAL}: Too many instance of Rudder cf-execd processes running. Killing them..."
    # the second column of the process listing is used as the PID
    echo "${CF_EXECD_RUNNING}" | awk '{ print $2 }' | xargs kill -9 || true
    # Every executor has just been killed: without resetting the counter the
    # restart below would never be triggered
    NB_CF_EXECD_RUNNING=0
    [ "$QUIET" = false ] && echo " Done"
  fi

  # If no disable file AND no process of cf-execd, then relaunch the service
  if [ ! -f "${CFE_DISABLE_FILE}" ] && [ "${NB_CF_EXECD_RUNNING}" -eq 0 ]; then
    [ "$QUIET" = false ] && printf "INFO: No disable file detected and no agent executor process either. Restarting agent service..."
    # OPT holds zero or more options and must be word-split
    rudder agent stop -q ${OPT}
    rudder agent start -q ${OPT}
    [ "$QUIET" = false ] && echo " Done"
  fi

  # List the CFEngine processes running
  CF_PROCESS_RUNNING=$(${PS_COMMAND} | grep -E "(${RUDDER_DIR}|${CFE_DIR})/bin/cf-(agent|execd)" | cat)
  # Count the number of processes running, filtering empty lines
  NB_CF_PROCESS_RUNNING=$(echo "${CF_PROCESS_RUNNING}" | sed -e '/^$/d' | wc -l | tr -d ' ')

  # Check for anomalous number of CFEngine processes
  # If there are more than 11 agent/executor processes, we should kill them, and purge the lock database
  # 11 means 1 cf-execd + 5 cf-execd spawn + 5 cf-agent
  if [ "${NB_CF_PROCESS_RUNNING}" -gt 11 ]; then
    [ "$QUIET" = false ] && printf "${YELLOW}WARNING${NORMAL}: Too many instance of Agent processes running. Killing them, and purging the Agent lock database..."
    echo "${CF_PROCESS_RUNNING}" | awk '{ print $2 }' | xargs kill -9 || true
    rudder agent stop ${OPT}
    clean_cf_lock_files
    rudder agent start ${OPT}
    [ "$QUIET" = false ] && echo " Done"
  fi
}

# Agent must have been updated recently.
# When the last successful update is older than twice the agent run interval
# the agent is considered stuck and the CFEngine lock database is purged.
# Globals: AGENT_RUN_INTERVAL, LAST_UPDATE_FILE, CFE_DISABLE_FILE, QUIET (read);
#          CHECK_INTERVAL (written: twice AGENT_RUN_INTERVAL, used as minutes)
# Calls:   clean_cf_lock_files
check_and_fix_last_update() {
  CHECK_INTERVAL=$(expr "${AGENT_RUN_INTERVAL}" \* 2)

  if [ ! -f "${LAST_UPDATE_FILE}" ] || [ -f "${CFE_DISABLE_FILE}" ]; then
    # Either the file ${LAST_UPDATE_FILE} is not yet present, and this node is
    # probably not accepted yet, or the file ${CFE_DISABLE_FILE} is present, so
    # the agent won't update the ${LAST_UPDATE_FILE}.
    # In both cases, do nothing
    return 0
  fi

  # find prints the path only when the file is older than CHECK_INTERVAL minutes
  # (everything is quoted so that the test always receives a single word)
  if [ -n "$(find "${LAST_UPDATE_FILE}" -mmin "+${CHECK_INTERVAL}")" ]; then
    # CHECK_INTERVAL is already twice the run interval, the message says so
    [ "$QUIET" = false ] && printf "${YELLOW}WARNING${NORMAL}: The file %s is older than %s minutes (twice the agent run interval), the agent is probably stuck. Purging the CFEngine lock database..." "${LAST_UPDATE_FILE}" "${CHECK_INTERVAL}"
    clean_cf_lock_files
    [ "$QUIET" = false ] && echo " Done"
  fi
}

# Lock file must not be too big (10M is big)
# The CFEngine lock database is purged as soon as its size reaches 10485760 bytes.
# Globals: CFE_DIR, QUIET (read); MAX_CF_LOCK_SIZE, CF_LOCK_SIZE (written)
# Calls:   clean_cf_lock_files
check_and_fix_cf_lock() {
  MAX_CF_LOCK_SIZE=10485760

  # Without a lock database there is nothing to measure
  [ -f "${CFE_DIR}/state/cf_lock.lmdb" ] || return 0

  # Element 7 of perl's stat is the size in bytes
  CF_LOCK_SIZE=$(perl -e'@a=stat $ARGV[0]; print $a[7]' "${CFE_DIR}/state/cf_lock.lmdb")

  # Below the limit: leave the database alone
  [ "${CF_LOCK_SIZE}" -ge "${MAX_CF_LOCK_SIZE}" ] || return 0

  if [ "$QUIET" = false ]; then
    printf "${YELLOW}WARNING${NORMAL}: The file ${CFE_DIR}/state/cf_lock.lmdb is too big (${CF_LOCK_SIZE} bytes), purging it..."
  fi
  clean_cf_lock_files
  [ "$QUIET" = false ] && echo " Done"
}

# Rudder uuid must exist and be valid.
# A missing UUID is restored from backup or generated; an invalid one is
# backed up then replaced by a newly generated one.
# Globals: UUID_FILE, RUDDER_DIR, BACKUP_DIR, QUIET (read);
#          REGEX, CHECK_UUID (written)
# Calls:   exist_or_restore, ${RUDDER_DIR}/bin/rudder-uuidgen
check_and_fix_rudder_uuid() {
  # Generate a UUID if we don't have one yet
  if ! exist_or_restore "${UUID_FILE}"; then
    [ "$QUIET" = false ] && printf "INFO: The UUID of the node does not exist and no backup exist. A new one will be generated..."
    "${RUDDER_DIR}/bin/rudder-uuidgen" > "${UUID_FILE}"
    [ "$QUIET" = false ] && echo " Done"
  fi

  # UUID is valid only if it has been generated by uuidgen or if it is set to 'root' for policy server
  REGEX=$(x="[a-f0-9][a-f0-9][a-f0-9][a-f0-9]" && echo "$x$x-$x-$x-$x-$x$x$x")
  # Exactly one matching line is expected (tr removes the padding of some wc)
  CHECK_UUID=$(grep -E "^$REGEX|^root" "${UUID_FILE}" | wc -l | tr -d ' ')
  # If the UUID is not valid, regenerate it
  if [ "${CHECK_UUID}" -ne 1 ]; then
    [ "$QUIET" = false ] && printf "${YELLOW}WARNING${NORMAL}: Creating a new UUID for Rudder as the existing one is invalid..."
    # Keep a backup of UUID even if it is not valid
    mkdir -p "${BACKUP_DIR}"
    cp -f "${UUID_FILE}" "${BACKUP_DIR}/uuid-$(date +%Y%m%d).hive"
    # Generate a new one
    "${RUDDER_DIR}/bin/rudder-uuidgen" > "${UUID_FILE}"
    # The end of the message is subject to quiet mode like its beginning
    if [ "$QUIET" = false ]; then
      echo " Done."
    fi
  fi
}

# Important CFEngine input files must exist and pass cf-promises test
# This can run "rudder agent reset" when the policies are absent or invalid.
# Globals: CFE_DIR, CFE_DISABLE_FILE, RUDDER_DIR, QUIET, OPT (read)
check_and_fix_inputs() {
  # if a file is absent or empty there has been a problem with update
  if [ ! -s "${CFE_DIR}/inputs/common/1.0/update.cf" ] \
    || [ ! -s "${CFE_DIR}/inputs/failsafe.cf" ] \
    || [ ! -s "${CFE_DIR}/inputs/promises.cf" ]; then
    [ "$QUIET" = false ] && printf "INFO: Policies absent, restoring initial version, and updating ..."
    # OPT holds zero or more options and must be word-split
    rudder agent reset ${OPT}
    [ "$QUIET" = false ] && echo " Done"
  fi

  # Policies are not validated while the agent is disabled
  if [ -f "${CFE_DISABLE_FILE}" ]; then
    return 0
  fi

  # The binary path is quoted: a split path would be "command not found",
  # which would wrongly be taken for invalid policies
  if ! "${RUDDER_DIR}/bin/cf-promises" -f failsafe.cf > /dev/null \
    || ! "${RUDDER_DIR}/bin/cf-promises" > /dev/null; then
    [ "$QUIET" = false ] && printf "${YELLOW}WARNING${NORMAL}: Policies invalid, resetting to initial policies and updating..."
    rudder agent reset ${OPT}
    [ "$QUIET" = false ] && echo " Done"
  fi
}

# Inventory must have been sent less than 3 days ago
# An inventory is also sent when a previous check set INVENTORY_NEEDED to "true".
# Globals: RUDDER_VAR, INVENTORY_NEEDED, QUIET, OPT (read);
#          inventory_age (written)
check_and_fix_inventory() {
  # age in hours, computed from element 9 of perl's stat on the marker file
  # (the path is quoted so that perl always receives it as a single argument)
  inventory_age=$(perl -e '(@i) = stat($ARGV[0]); printf "%ld\n", (time-$i[9])/3600' "${RUDDER_VAR}/tmp/inventory_sent")
  # 72 hours = 3 days
  if [ "${inventory_age}" -gt 72 ] || [ "${INVENTORY_NEEDED}" = "true" ]; then
    [ "$QUIET" = false ] && printf "INFO: Inventory older than 3 days, resending ..."
    # OPT holds zero or more options and must be word-split
    rudder agent inventory ${OPT}
    [ "$QUIET" = false ] && echo " Done"
  fi
}

# Test whether the filesystem holding a directory is too full.
# Arguments: $1 - directory to check (nothing is done if it does not exist)
#            $2 - maximum allowed usage, in percent
# Globals:   stop_now (set to 1 when the usage is above the maximum);
#            dir, max_percent, usage_line, space, partition (written)
# Outputs:   a FATAL message through "logger -s" when the usage is too high
check_space() {
  dir="$1"
  max_percent="$2"
  if [ -d "${dir}" ]; then
    # -P forces one line per filesystem: without it df may wrap long device
    # names onto two lines, which shifts the columns read below
    usage_line=$(df -P "${dir}" | tail -n 1)
    space=$(echo "${usage_line}" | awk '{print $5}' | sed 's/%//')
    if [ "${space}" -gt "${max_percent}" ]; then
      partition=$(echo "${usage_line}" | awk '{print $1}')
      echo "FATAL: No space left on device for '${dir}' on partition '${partition}'" | logger -s
      stop_now=1
    fi
  fi
}

# There must be enough space for rudder components to run.
# When a database filesystem is more than 98% full, the agent is disabled, the
# Rudder services found in the server roles are stopped and the script exits
# with status 1.
# Globals: RUDDER_SERVER_ROLES, RUDDER_VAR, QUIET, OPT (read); stop_now (read,
#          set by check_space); PG_TABLES_PATH, POSTGRESQL_SERVICE_NAME (written)
# Calls:   check_space
check_varspace_or_exit() {
  # check max space available for databases and stop Rudder if there is a risk
  if [ -f "${RUDDER_SERVER_ROLES}/rudder-ldap" ]; then
    check_space "${RUDDER_VAR}/ldap/" 98
  fi

  if [ -f "${RUDDER_SERVER_ROLES}/rudder-reports" ]; then
    # Get the run path of postgresql
    # "psql -t" pads its output with blanks: they are trimmed, otherwise the
    # value is not an existing directory and the space check is skipped
    PG_TABLES_PATH=$(su - postgres -s /bin/sh -c 'psql -t -c  "show data_directory;"' \
      | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//' -e '/^$/d' | head -n 1)
    if [ -z "${PG_TABLES_PATH}" ]; then
      echo "FATAL: Cannot connect to PostgreSQL - please check its status" | logger -s
    else
      check_space "${PG_TABLES_PATH}" 98
    fi
  fi

  if [ "${stop_now}" = 1 ]; then
    # OPT holds zero or more options and must be word-split
    rudder agent disable -s ${OPT}

    if [ -f "${RUDDER_SERVER_ROLES}/rudder-server-root" ]; then
      systemctl stop rudder-server
      echo "FATAL: Rudder has been stopped and disabled to prevent data corruption" | logger -s
    elif [ -f "${RUDDER_SERVER_ROLES}/rudder-jetty" ]; then
      systemctl stop rudder-jetty
      echo "FATAL: Rudder Web Interface has been stopped and disabled to prevent data corruption" | logger -s
    fi
    if [ -f "${RUDDER_SERVER_ROLES}/rudder-reports" ]; then
      # Try with systemd
      POSTGRESQL_SERVICE_NAME=$(systemctl list-unit-files --type service | awk -F'.' '{print $1}' | grep -E "^postgresql-?[0-9]*$" | tail -n 1)

      # If nothing try with chkconfig (sles 12 only: postgresql is properly managed by systemd but cannot be detected with the line above)
      # chkconfig can only be queried when it is available (this test used to be inverted)
      if [ -z "${POSTGRESQL_SERVICE_NAME}" ] && type chkconfig >/dev/null 2>&1; then
        POSTGRESQL_SERVICE_NAME=$(chkconfig 2>/dev/null | awk '{ print $1 }' | grep "postgresql" | tail -n 1)
      fi

      # If nothing try default name (should not happen)
      if [ -z "${POSTGRESQL_SERVICE_NAME}" ]; then
        POSTGRESQL_SERVICE_NAME="postgresql"
      fi

      systemctl stop "${POSTGRESQL_SERVICE_NAME}"
      echo "FATAL: PostgreSQL has been shut down to prevent data corruption" | logger -s
    fi
    [ "$QUIET" = false ] && printf "${RED}ERROR${NORMAL}: Rudder couldn't find enough space to run and stopped services. Please free up space!\n"
    exit 1
  fi
}

# Migrate the system update hooks to their new location and leave a symbolic
# link behind. Nothing is done when the old location is absent or already a link.
# Arguments: $1 - old hooks directory (optional, default /var/rudder/system-update/hooks.d)
#            $2 - new hooks directory (optional, default /opt/rudder/var/system-update-hooks.d)
# Globals:   hooks_dir, new_dir (written)
check_hooks() {
  hooks_dir="${1:-/var/rudder/system-update/hooks.d}"
  new_dir="${2:-/opt/rudder/var/system-update-hooks.d}"
  # check for system update hooks
  if [ -e "${hooks_dir}" ] && ! [ -L "${hooks_dir}" ]; then
    mkdir -p "${new_dir}"
    # Hidden entries are moved too, otherwise the directory cannot be removed.
    # Unmatched patterns stay literal, hence the existence test.
    for hook_entry in "${hooks_dir}"/* "${hooks_dir}"/.[!.]* "${hooks_dir}"/..?*; do
      if [ -e "${hook_entry}" ] || [ -L "${hook_entry}" ]; then
        mv "${hook_entry}" "${new_dir}"/
      fi
    done
    # Only link when the old directory is really gone, otherwise ln would
    # create the link inside the remaining directory
    if rmdir "${hooks_dir}"; then
      ln -s "${new_dir}" "${hooks_dir}"
    fi
  fi
}

# Migrate the CFEngine modules to their new location and leave a symbolic link
# behind. Nothing is done when the old location is absent or already a link.
# Arguments: $1 - old modules directory (optional, default /var/rudder/cfengine-community/modules)
#            $2 - new modules directory (optional, default /opt/rudder/var/modules)
# Globals:   old_dir, new_dir (written)
check_modules() {
  old_dir="${1:-/var/rudder/cfengine-community/modules}"
  new_dir="${2:-/opt/rudder/var/modules}"
  # check for cfengine modules
  if [ -e "${old_dir}" ] && ! [ -L "${old_dir}" ]; then
    mkdir -p "${new_dir}"
    # Hidden entries are moved too, otherwise the directory cannot be removed.
    # Unmatched patterns stay literal, hence the existence test.
    for module_entry in "${old_dir}"/* "${old_dir}"/.[!.]* "${old_dir}"/..?*; do
      if [ -e "${module_entry}" ] || [ -L "${module_entry}" ]; then
        mv "${module_entry}" "${new_dir}"/
      fi
    done
    # Only link when the old directory is really gone, otherwise ln would
    # create the link inside the remaining directory
    if rmdir "${old_dir}"; then
      ln -s "${new_dir}" "${old_dir}"
    fi
  fi
}

#########################
#### TEST start here ####
#########################

# The checks below run in a fixed order. When STOP_AT (option -s) names a step,
# the script ends right after that step; the bare "exit" then returns the
# status of the test that precedes it, i.e. 0.

# Agent must have an UUID
check_and_fix_rudder_uuid
# With ONLY_UUID (option -u) every other check is skipped
if [ "${ONLY_UUID}" != true ]
then
  # Agent must have a key pair and a correct certificate
  # Don't mess with user-configured stuff
  # NOTE(review): is_cert_validated is not defined in this file, presumably it
  # comes from common.sh - confirm
  if ! is_cert_validated; then
    check_and_fix_keys
  fi
  [ "${STOP_AT}" = "keys" ] && exit
  # Policy server must be declared or next checks are useless
  check_policy_server_or_exit
  [ "${STOP_AT}" = "policy-server" ] && exit
  # Check for free space and stop if there is not enough
  check_varspace_or_exit
  [ "${STOP_AT}" = "varspace" ] && exit
  # check for migrated hooks
  check_hooks
  # check for migrated modules
  check_modules
  [ "${STOP_AT}" = "hooks" ] && exit
  # Do not go further if the agent is disabled
  [ -f "${CFE_DISABLE_FILE}" ] && exit 0
  # Important CFEngine input files must exist and pass cf-promises test
  check_and_fix_inputs
  [ "${STOP_AT}" = "inputs" ] && exit
  # There must be a cf-execd process running and not too many agents
  check_and_fix_cfengine_processes
  [ "${STOP_AT}" = "process" ] && exit
  # Agent must have been updated recently
  check_and_fix_last_update
  [ "${STOP_AT}" = "update" ] && exit
  # Lock file must not be too big
  check_and_fix_cf_lock
  [ "${STOP_AT}" = "lock" ] && exit
  # Inventory must have been sent recently (should be placed before cfengine check when shell based inventory is the norm)
  check_and_fix_inventory
  [ "${STOP_AT}" = "inventory" ] && exit
fi

# Final summary, skipped in quiet mode
if [ "$QUIET" = false ]; then
  printf "FINISH: Rudder agent check ran properly, please look at messages above to see if there has been any error.\n"
fi

