#!/bin/bash
#
#  Copyright (C) 2017-present ScyllaDB

# SPDX-License-Identifier: AGPL-3.0-or-later

# When a CLI tool is not installed, use relocatable CLI tool provided by Scylla
export PATH=$PATH:/opt/scylladb/bin

# Print the value of the first top-level "KEY: value" line of a YAML file.
# Arguments: $1 - key name (without the colon)
#            $2 - path to the YAML file (default: /etc/scylla/scylla.yaml)
# Outputs:   the value with surrounding blanks removed; nothing when the key
#            is absent or the file cannot be read.
scylla_yaml_value() {
    local key=$1
    local file=${2:-/etc/scylla/scylla.yaml}
    # Only the text up to the first colon is treated as the key, so values
    # that contain colons themselves (IPv6 addresses) are returned intact.
    awk -v key="$key" '
        index($0, key ":") == 1 {
            sub(/^[^:]*:[ \t]*/, "")
            sub(/[ \t]+$/, "")
            print
            exit
        }' "$file"
}

##Variables##
# Report file and collection directories, all relative to the current directory.
REPORT="./$(hostname)-health-check-report.txt"
OUTPUT_PATH="./output_files"
OUTPUT_PATH1="$OUTPUT_PATH/system_checks"
OUTPUT_PATH2="$OUTPUT_PATH/scylladb_checks"
OUTPUT_PATH3="$OUTPUT_PATH/nodetool_commands"
OUTPUT_PATH4="$OUTPUT_PATH/data_model"
OUTPUT_PATH5="$OUTPUT_PATH/network_checks"
# Distro-family flags; later sections run a family's commands when its flag is "0".
IS_FEDORA="0"
IS_DEBIAN="0"
IS_GENTOO="0"
# Default ports, overridable with -p / -q.
JMX_PORT="7199"
CQL_PORT="9042"
# Which optional sections are printed in the report (YES/NO).
PRINT_DM=NO
PRINT_NET=NO
PRINT_cfstats=NO
# Service state flags; the service checks set them to "1" on failure.
SCYLLA_SERVICE="0"
JMX_SERVICE="0"
LISTEN_ADDR=$(scylla_yaml_value listen_address)

# Print the help text shown for -h.
print_usage() {
    cat <<'EOF'

This script performs system review and generates health check report based on
the configuration data (hardware, OS, Scylla SW, etc.) collected from the node.

Usage:
-p   Port to use for nodetool commands (default: 7199)
-q   Port to use for cqlsh (default: 9042)
-c   Print cfstats output
-d   Print data model info
-n   Print network info
-a   Print all
-h   Display this help and exit

Note: output for the above is collected, but not printed in the report.
If you wish to have them printed, please supply the relevant flag/s.

EOF
}

# Succeed when $1 is a decimal TCP port in the range 1-65535.
is_valid_port() {
    [[ $1 =~ ^[0-9]{1,5}$ ]] && (( 10#$1 >= 1 && 10#$1 <= 65535 ))
}

# Parse the command-line flags.
# Globals:   JMX_PORT, CQL_PORT, PRINT_DM, PRINT_NET, PRINT_cfstats (written)
# Arguments: the script's command-line arguments
# Exits:     2 after printing help, on an unknown option, on a missing option
#            argument, or when a port value is not a number in 1-65535.
parse_options() {
    local opt OPTARG
    local OPTIND=1

    while getopts ":hdncap:q:" opt; do
        case "$opt" in
            h)
                print_usage
                exit 2
                ;;
            p)
                # The port is later embedded in command lines, so reject
                # anything that is not a plain port number.
                if ! is_valid_port "$OPTARG"; then
                    echo "Invalid port for -p: $OPTARG"
                    exit 2
                fi
                JMX_PORT=$OPTARG
                ;;
            q)
                # The port is later used as a search pattern on socket
                # listings, so reject anything that is not a plain port number.
                if ! is_valid_port "$OPTARG"; then
                    echo "Invalid port for -q: $OPTARG"
                    exit 2
                fi
                CQL_PORT=$OPTARG
                ;;
            d)
                PRINT_DM=YES
                ;;
            n)
                PRINT_NET=YES
                ;;
            c)
                PRINT_cfstats=YES
                ;;
            a)
                PRINT_DM=YES
                PRINT_NET=YES
                PRINT_cfstats=YES
                ;;
            \?)
                echo "Invalid option: -$OPTARG"
                exit 2
                ;;
            :)
                echo "Option -$OPTARG requires an argument."
                exit 2
                ;;
        esac
    done
}

parse_options "$@"

# Print the local address of the first socket in an `ss` listing that is bound
# to the given port.
# Arguments: $1 - port number
# Input:     `ss -tln`-style listing on stdin
# Outputs:   the address without its trailing ":PORT"; nothing when no field
#            of the listing ends in ":PORT".
listen_addr_for_port() {
    # Compare the whole ":PORT" suffix of each field so that e.g. port 9042
    # does not match a socket on 19042 or an unrelated number containing 9042.
    awk -v suffix=":$1" '
        {
            for (i = 1; i <= NF; i++) {
                cut = length($i) - length(suffix)
                if (cut >= 0 && substr($i, cut + 1) == suffix) {
                    print substr($i, 1, cut)
                    exit
                }
            }
        }'
}

# Get the CQL_ADDR from whatever is currently listening on $CQL_PORT (TCP,
# numeric output). If it's a wildcard, or nothing was found, use the
# listen_address from scylla.yaml
CQL_ADDR=$(ss -tln | listen_addr_for_port "$CQL_PORT")

case "$CQL_ADDR" in
    '' | '*' | '0.0.0.0' | '[::]' | '::')
        CQL_ADDR=$(grep -e '^listen_address:' /etc/scylla/scylla.yaml | awk '{print $NF}')
        ;;
esac

##Check server release (Fedora/Oracle/Debian/Gentoo)##
# Classify the host by searching an os-release file, case-insensitively, for
# distro family names.
# Globals:   IS_FEDORA, IS_DEBIAN, IS_GENTOO (written). A flag is "0" when the
#            family name was found and "1" when it was not; IS_FEDORA is "0"
#            when either "fedora" or "oracle" appears.
# Arguments: $1 - os-release file to inspect (default: /etc/os-release)
detect_os_family() {
    local os_release=${1:-/etc/os-release}

    IS_FEDORA="1"
    IS_DEBIAN="1"
    IS_GENTOO="1"

    if grep -qi -e fedora -e oracle "$os_release"; then
        IS_FEDORA="0"
    fi
    if grep -qi debian "$os_release"; then
        IS_DEBIAN="0"
    fi
    if grep -qi gentoo "$os_release"; then
        IS_GENTOO="0"
    fi
}

detect_os_family

# No known family matched: refuse to continue.
if [[ "$IS_FEDORA" == "1" && "$IS_DEBIAN" == "1" && "$IS_GENTOO" == "1" ]]; then
    echo "This is a Non-Supported OS, Please Review the Support Matrix"
    exit 222
fi

##Scylla-server service status##
printf '%s\n' \
    "--------------------------------------------------" \
    "Checking Scylla-server Service" \
    "--------------------------------------------------"

# Look for a process whose command name is "scylla"; SCYLLA_SERVICE is set to
# "1" when none is found and left untouched otherwise.
if ps -C scylla --no-headers &> /dev/null; then
    echo "Scylla-server Service: OK"
else
    SCYLLA_SERVICE="1"
    echo "ERROR: Scylla-server is NOT Running"
    echo "Cannot Collect Data Model Info"
fi
echo "--------------------------------------------------"


##Scylla-JMX service status##
printf '%s\n' \
    "Checking Scylla-JMX Service on Port $JMX_PORT" \
    "--------------------------------------------------"

# Probe with a throw-away `nodetool status`; JMX_SERVICE is set to "1" when the
# probe fails and left untouched otherwise.
if nodetool "-p$JMX_PORT" status &> /dev/null; then
    echo "Scylla-JMX Service (nodetool): OK"
else
    JMX_SERVICE="1"
    echo "ERROR: Scylla-JMX is NOT Running / NOT Listening on Port $JMX_PORT"
    echo "Cannot Collect Nodetool Info"
    echo "Use the '-p' Option to Provide the Scylla-JMX Port"
fi
echo "--------------------------------------------------"

#Create dir structure to save output_files#
printf '%s\n' \
    "--------------------------------------------------" \
    "Creating Output Files Directory" \
    "--------------------------------------------------"

# Parent first, then one sub-directory per collection category.
for output_dir in "$OUTPUT_PATH" "$OUTPUT_PATH1" "$OUTPUT_PATH2" "$OUTPUT_PATH3" "$OUTPUT_PATH4" "$OUTPUT_PATH5"; do
    mkdir -p "$output_dir"
done


##Output Collection##
#System Checks#
printf '%s\n' \
    "Collecting System Info" \
    "--------------------------------------------------"

# OS release file, kernel version, CPU layout and memory counters (in MB, with
# each line's whitespace collapsed by awk).
cp -p /etc/os-release "$OUTPUT_PATH1"
uname -r > "$OUTPUT_PATH1/kernel-release.txt"
lscpu > "$OUTPUT_PATH1/cpu-info.txt"
vmstat -s -S M | awk '{$1=$1};1' > "$OUTPUT_PATH1/vmstat.txt"

# Filesystem usage followed by the size of each entry under /var/lib/scylla.
# Each step runs only if the previous one succeeded.
capacity_report="$OUTPUT_PATH1/capacity-info.txt"
df -Th > "$capacity_report" \
    && echo "" >> "$capacity_report" \
    && sudo du -sh /var/lib/scylla/* >> "$capacity_report"

cp -p /proc/mdstat "$OUTPUT_PATH1"

# "path: value" for every sysfs "scheduler" node, a blank separator line (only
# when the loop itself succeeded), then the same for every "nomerges" node.
io_sched_report="$OUTPUT_PATH1/io-sched-conf.txt"
for sysfs_node in $(sudo find /sys -name scheduler); do
    printf '%s: ' "$sysfs_node"
    cat "$sysfs_node"
done > "$io_sched_report" && echo "" >> "$io_sched_report"
for sysfs_node in $(sudo find /sys -name nomerges); do
    printf '%s: ' "$sysfs_node"
    cat "$sysfs_node"
done >> "$io_sched_report"


#ScyllaDB Checks#
printf '%s\n' \
    "Collecting Scylla Info" \
    "--------------------------------------------------"

# Version string, configuration files and a listing of the coredump directory.
scylla --version > "$OUTPUT_PATH2/scylla-version.txt"
cp -p /etc/scylla/* "$OUTPUT_PATH2"
cp -p /etc/scylla.d/* "$OUTPUT_PATH2"
ls -ltrh /var/lib/scylla/coredump/ > "$OUTPUT_PATH2/coredump-folder.txt"

# Installed Scylla packages and the service defaults file, gathered with the
# tooling of whichever distro family flag is "0".
scylla_pkg_list="$OUTPUT_PATH2/scylla-pkgs.txt"

if [[ "$IS_FEDORA" == "0" ]]; then
    rpm -qa | grep -i scylla > "$scylla_pkg_list"
    cp -p /etc/sysconfig/scylla-server "$OUTPUT_PATH2"
fi

if [[ "$IS_DEBIAN" == "0" ]]; then
    dpkg -l | grep -i scylla > "$scylla_pkg_list"
    cp -p /etc/default/scylla-server "$OUTPUT_PATH2"
fi

if [[ "$IS_GENTOO" == "0" ]]; then
    # NOTE: this emerges app-portage/portage-utils before calling qlist.
    sudo emerge -1uq app-portage/portage-utils
    sudo qlist -ICv scylla > "$scylla_pkg_list"
    cp -p /etc/default/scylla-server "$OUTPUT_PATH2"
fi


#Scylla Logs#
printf '%s\n' \
    "--------------------------------------------------" \
    "Collecting Logs" \
    "--------------------------------------------------"

scylla_log="$OUTPUT_PATH/scylla-logs.txt"

# Use the journal when journalctl works; otherwise read the Scylla log file on
# Gentoo, or the scylla lines of syslog elsewhere.
if journalctl --help &> /dev/null; then
    journalctl -t scylla > "$scylla_log"
elif [[ "$IS_GENTOO" == "0" ]]; then
    cat /var/log/scylla/scylla.log > "$scylla_log"
else
    grep -i scylla /var/log/syslog > "$scylla_log"
fi

# Compress in place, overwriting any earlier archive of the same name.
gzip -f "$scylla_log"


#Nodetool commands#
# Save the output of several nodetool sub-commands under $OUTPUT_PATH3, unless
# the earlier JMX probe marked the service as unavailable (JMX_SERVICE = "1").
if [ "$JMX_SERVICE" == "1" ]; then
    echo "Skipping Nodetool Info Collection"
    echo "--------------------------------------------------"
else
    echo "Collecting Nodetool Commands Info (using port $JMX_PORT)"
    echo "--------------------------------------------------"

    for nodetool_cmd in status info netstats gossipinfo proxyhistograms; do
        nodetool "-p$JMX_PORT" "$nodetool_cmd" > "$OUTPUT_PATH3/nodetool-$nodetool_cmd.txt"
    done

    # Run cfstats once and derive both summaries from the same output, so the
    # keyspace and table views describe the same moment and the command is
    # not executed twice.
    cfstats_output=$(nodetool "-p$JMX_PORT" cfstats -H)

    # Each "Keyspace" line plus the four lines that follow it.
    grep Keyspace -A 4 <<< "$cfstats_output" > "$OUTPUT_PATH3/nodetool-cfstats-keyspace.txt"

    # Selected per-table lines with whitespace collapsed; a " --" separator is
    # started after every 7th line, and sed adds the same marker to line 1.
    grep -E 'Table:|SSTable count:|Compacted|tombstones' <<< "$cfstats_output" \
        | awk '{ $1 = $1; print; if (FNR % 7 == 0) printf "\n --" }' \
        > "$OUTPUT_PATH3/nodetool-cfstats-table.txt"
    sed -i '1s/^/ --/' "$OUTPUT_PATH3/nodetool-cfstats-table.txt"

    nodetool "-p$JMX_PORT" compactionstats > "$OUTPUT_PATH3/nodetool-compactionstats.txt"
    nodetool "-p$JMX_PORT" ring > "$OUTPUT_PATH3/nodetool-ring.txt"
fi
#not implemented: nodetool cfhistograms $KS $TN >> $OUTPUT_PATH3/nodetool-cfhistograms.txt#


#Data Model#
# Dump the schema and table list through cqlsh into $OUTPUT_PATH4. Skipped when
# the Scylla process check failed; reported as an error when cqlsh cannot run
# a trivial command against $CQL_ADDR / $CQL_PORT.
# $CQL_ADDR and $CQL_PORT are deliberately left unquoted, exactly as cqlsh has
# always been invoked here (an empty address yields no argument at all).
if [[ "$SCYLLA_SERVICE" == "1" ]]; then
    echo "Skipping Data Model Info Collection"
    echo "--------------------------------------------------"
# TODO: handle connecting with authentication
elif cqlsh $CQL_ADDR $CQL_PORT -e "HELP" &> /dev/null; then
    echo "Collecting Data Model Info (using port $CQL_PORT)"
    echo "--------------------------------------------------"
    cqlsh $CQL_ADDR $CQL_PORT -e "DESCRIBE SCHEMA" > "$OUTPUT_PATH4/describe-schema.txt"
    cqlsh $CQL_ADDR $CQL_PORT -e "DESCRIBE TABLES" > "$OUTPUT_PATH4/describe-tables.txt"
else
    printf '%s\n' \
        "ERROR: CQL is NOT Listening on Port $CQL_PORT" \
        "Cannot Collect Data Model Info" \
        "Use the '-q' Option to Provide the CQL Port" \
        "--------------------------------------------------"
fi


#Network Checks#
# Collect interface, IRQ, packet-steering, sysctl, firewall and TCP socket
# information into $OUTPUT_PATH5.
echo "Collecting Network Info"
echo "--------------------------------------------------"

# Overwrite (not append) so a re-run never accumulates stale output, matching
# every other collected file.
ifconfig -a > "$OUTPUT_PATH5/ifconfig.txt"

# Names of all entries under /sys/class/net except the loopback interface.
net_ifaces=()
for net_iface_path in /sys/class/net/*; do
    [[ -e "$net_iface_path" ]] || continue
    net_iface=${net_iface_path##*/}
    [[ "$net_iface" == "lo" ]] || net_ifaces+=("$net_iface")
done

# Driver information per interface.
for net_iface in "${net_ifaces[@]}"; do
    echo "--$net_iface"
    ethtool -i "$net_iface"
    echo ""
done > "$OUTPUT_PATH5/ethtool-NIC.txt"

cat /proc/interrupts > "$OUTPUT_PATH5/proc-interrupts.txt"

# smp_affinity of every entry under /proc/irq (the default_smp_affinity file
# is not an IRQ and is skipped).
for irq_path in /proc/irq/*; do
    [[ -e "$irq_path" ]] || continue
    irq=${irq_path##*/}
    [[ "$irq" == "default_smp_affinity" ]] && continue
    echo -n "--$irq:"
    sudo cat "/proc/irq/$irq/smp_affinity"
    echo ""
done > "$OUTPUT_PATH5/irq-smp-affinity.txt"

# Per-queue rps_cpus, xps_cpus and rps_flow_cnt values for each interface.
for net_iface in "${net_ifaces[@]}"; do
    echo "--$net_iface"
    cat "/sys/class/net/$net_iface/queues"/rx-*/rps_cpus
    echo ""
done > "$OUTPUT_PATH5/rps-conf.txt"
for net_iface in "${net_ifaces[@]}"; do
    echo "--$net_iface"
    cat "/sys/class/net/$net_iface/queues"/tx-*/xps_cpus
    echo ""
done > "$OUTPUT_PATH5/xps-conf.txt"
for net_iface in "${net_ifaces[@]}"; do
    echo "--$net_iface"
    cat "/sys/class/net/$net_iface/queues"/rx-*/rps_flow_cnt
    echo ""
done > "$OUTPUT_PATH5/rfs-conf.txt"

# The bracket keeps grep from matching its own command line, so the file only
# lists real irqbalance processes.
ps -elf | grep '[i]rqbalance' > "$OUTPUT_PATH5/irqbalance-conf.txt"

sudo sysctl -a > "$OUTPUT_PATH5/sysctl.txt" 2>&1

# Always leave an iptables.txt behind, empty when the binary is not present.
if [ -e /usr/sbin/iptables ]; then
    sudo iptables -L -v > "$OUTPUT_PATH5/iptables.txt"
else
    touch "$OUTPUT_PATH5/iptables.txt"
fi
netstat -an | grep tcp > "$OUTPUT_PATH5/netstat-tcp.txt"

# Console progress: collection is done, report generation starts; also echo
# which optional report sections were requested.
cat <<EOF
Output Collection Completed Successfully
--------------------------------------------------
EOF

##Generate Health Check Report##
cat <<EOF
Generating Health Check Report
--------------------------------------------------
Print cfstats: $PRINT_cfstats
Print Data Model: $PRINT_DM
Print Network Info: $PRINT_NET
--------------------------------------------------
EOF

# Start a fresh report file (truncating any previous one) with the date, the
# node name and the PURPOSE paragraph.
{
    echo ""
    date "+DATE: %m/%d/%y"
    printf '\n\n'
    echo "              Health Check Report for node: $(hostname)"
    printf '\n\n'

    cat <<'EOF'
PURPOSE
=======
This document first serves as a system review and health check report.
It is based on the configuration data (hardware, OS, Scylla SW, etc.) collected from the node.
Based on the review and analysis of the collected data, ScyllaDB can recommend on possible
ways to better utilize the cluster, based on both experience and best practices.
EOF
    printf '\n\n'
} > "$REPORT"


# SYSTEM INFO chapter: each sub-section is a title, an underline, the matching
# file collected under $OUTPUT_PATH1 and trailing blank lines.
{
    printf '%s\n' "SYSTEM INFO" "===========" ""

    printf '%s\n' "Host Operating System" "---------------------"
    cat "$OUTPUT_PATH1/os-release"
    # Red Hat style hosts carry an extra release file; include it when present.
    if [[ -f /etc/redhat-release ]]; then
        cat /etc/redhat-release
    fi
    printf '\n\n'

    printf '%s\n' "Kernel Release" "--------------"
    cat "$OUTPUT_PATH1/kernel-release.txt"
    printf '\n\n'

    printf '%s\n' "CPU Info" "--------"
    cat "$OUTPUT_PATH1/cpu-info.txt"
    printf '\n\n'

    printf '%s\n' "Memory Info in MB" "-----------------"
    cat "$OUTPUT_PATH1/vmstat.txt"
    printf '\n\n'

    printf '%s\n' "Storage/Disk Info" "-----------------"
    cat "$OUTPUT_PATH1/capacity-info.txt"
    printf '\n\n'

    printf '%s\n' "RAID Configuration" "------------------"
    cat "$OUTPUT_PATH1/mdstat"
    printf '\n\n'

    printf '%s\n' "I/O Scheduler Configuration" "---------------------------"
    cat "$OUTPUT_PATH1/io-sched-conf.txt"
    printf '\n\n\n'
} >> "$REPORT"


# ScyllaDB INFO chapter: versions, configuration files (comment and blank
# lines filtered out where the original did so) and the coredump listing.
{
    printf '%s\n' "ScyllaDB INFO" "=============" ""

    printf '%s\n' "SW Version (PKGs)" "-----------------"
    cat "$OUTPUT_PATH2/scylla-version.txt"
    echo ""
    cat "$OUTPUT_PATH2/scylla-pkgs.txt"
    printf '\n\n'

    printf '%s\n' "Configuration files" "-------------------"
    echo "## /etc/scylla/scylla.yaml ##"
    # Drop every line containing '#', then every blank line.
    grep -v "#" "$OUTPUT_PATH2/scylla.yaml" | grep -v "^[[:space:]]*$"
    printf '\n\n'

    # The heading names the location the scylla-server file was copied from.
    if [[ "$IS_FEDORA" == "0" ]]; then
        echo "## /etc/sysconfig/scylla-server ##"
    fi
    if [[ "$IS_DEBIAN" == "0" || "$IS_GENTOO" == "0" ]]; then
        echo "## /etc/default/scylla-server ##"
    fi
    grep -v "^[[:space:]]*$" "$OUTPUT_PATH2/scylla-server"
    printf '\n\n'

    echo "## /etc/scylla/cassandra-rackdc.properties ##"
    grep -v "#" "$OUTPUT_PATH2/cassandra-rackdc.properties" | grep -v "^[[:space:]]*$"
    printf '\n\n'

    printf '%s\n' "Check for Coredumps" "-------------------"
    cat "$OUTPUT_PATH2/coredump-folder.txt"
    printf '\n\n'
} >> "$REPORT"

# Nodetool status/info/gossipinfo section, written only when the JMX probe
# succeeded (JMX_SERVICE = "0").
if [[ "$JMX_SERVICE" == "0" ]]; then
    {
        printf '%s\n' "Nodetool Status/Info/Gossip" "---------------------------"

        # "Heading:file-suffix" pairs, emitted in this order.
        for nodetool_entry in "Status:status" "Info:info" "GossipInfo:gossipinfo"; do
            echo "## Nodetool ${nodetool_entry%%:*} ##"
            cat "$OUTPUT_PATH3/nodetool-${nodetool_entry##*:}.txt"
            printf '\n\n'
        done
    } >> "$REPORT"
fi

# Append the DATA MODEL INFO chapter to a report.
# The decision is based on whether the schema dumps were actually collected,
# not on a fresh cqlsh probe, so a skipped collection can never lead to
# printing files that do not exist.
# Arguments: $1 - report file to append to
#            $2 - directory holding describe-schema.txt / describe-tables.txt
# Outputs:   progress (or an error notice) on stdout; the chapter in $1.
append_data_model_section() {
    local report=$1
    local dm_dir=$2

    printf '%s\n' "DATA MODEL INFO" "===============" "" >> "$report"

    if [[ -f "$dm_dir/describe-schema.txt" && -f "$dm_dir/describe-tables.txt" ]]; then
        echo "Printing Data Model Info to Report"
        echo "--------------------------------------------------"

        {
            printf '%s\n' "Describe Schema" "---------------"
            cat "$dm_dir/describe-schema.txt"
            echo ""

            printf '%s\n' "Describe Tables" "---------------"
            cat "$dm_dir/describe-tables.txt"
            printf '\n\n'
        } >> "$report"
    else
        echo "ERROR: Data Model NOT Collected - Nothing to Print"
        echo "--------------------------------------------------"

        {
            echo "Data Model was not collected"
            printf '\n\n\n'
        } >> "$report"
    fi
}

# Only printed when requested with -d or -a.
if [[ "$PRINT_DM" == "YES" ]]; then
    append_data_model_section "$REPORT" "$OUTPUT_PATH4"
fi

# PERFORMANCE and METRICS INFO chapter. Without a working JMX connection only
# a one-line notice is written; cfstats is included only on request (-c / -a).
printf '%s\n' "PERFORMANCE and METRICS INFO" "============================" "" >> "$REPORT"

if [[ "$JMX_SERVICE" != "0" ]]; then
    {
        echo "Nodetool info was not collected"
        printf '\n\n\n'
    } >> "$REPORT"
else
    {
        printf '%s\n' "Nodetool Proxyhistograms (RD/WR latency)" "----------------------------------------"
        cat "$OUTPUT_PATH3/nodetool-proxyhistograms.txt"
        echo ""

        printf '%s\n' "Nodetool netstats" "-----------------"
        cat "$OUTPUT_PATH3/nodetool-netstats.txt"
        printf '\n\n'
    } >> "$REPORT"

    if [[ "$PRINT_cfstats" == "YES" ]]; then
        # Progress goes to the console, the data to the report.
        echo "Printing cfstats Output to Report"
        echo "--------------------------------------------------"

        {
            printf '%s\n' "Nodetool cfstats" "----------------"
            echo "## Keyspace Info ##"
            cat "$OUTPUT_PATH3/nodetool-cfstats-keyspace.txt"
            printf '\n\n'
            echo "## Tables Info ##"
            cat "$OUTPUT_PATH3/nodetool-cfstats-table.txt"
            printf '\n\n'
        } >> "$REPORT"
    fi

    {
        printf '%s\n' "Nodetool compactionstats" "------------------------"
        cat "$OUTPUT_PATH3/nodetool-compactionstats.txt"
        printf '\n\n\n'
    } >> "$REPORT"
fi

# NETWORK INFO chapter, printed only on request (-n / -a). Each sub-section is
# a title, an underline, the collected file and trailing blank lines.
if [[ "$PRINT_NET" == "YES" ]]; then
    # Progress goes to the console, the data to the report.
    echo "Printing Network Info to Report"
    echo "--------------------------------------------------"

    {
        printf '%s\n' "NETWORK INFO" "============" ""

        printf '%s\n' "ethtool per NIC" "---------------"
        cat "$OUTPUT_PATH5/ethtool-NIC.txt"
        printf '\n\n'

        printf '%s\n' "/proc/interrupts" "----------------"
        cat "$OUTPUT_PATH5/proc-interrupts.txt"
        printf '\n\n\n'

        printf '%s\n' "IRQ smp affinity" "----------------"
        cat "$OUTPUT_PATH5/irq-smp-affinity.txt"
        printf '\n\n'

        printf '%s\n' "sysctl -a" "---------"
        cat "$OUTPUT_PATH5/sysctl.txt"
        printf '\n\n\n'

        printf '%s\n' "iptables -L" "-----------"
        cat "$OUTPUT_PATH5/iptables.txt"
        printf '\n\n\n'

        printf '%s\n' "netstat -an | grep tcp" "----------------------"
        cat "$OUTPUT_PATH5/netstat-tcp.txt"
        printf '\n\n\n'
    } >> "$REPORT"
fi

# Pack the collected files into output_files.tgz; --remove-files deletes the
# originals once they are archived. tar's verbose listing goes to the console.
printf '%s\n' \
    "Archiving Output Files" \
    "--------------------------------------------------"
tar cvzf output_files.tgz "$OUTPUT_PATH" --remove-files
echo "--------------------------------------------------"

# Final console message with the report location.
printf '%s\n' \
    "Health Check Report Created Successfully" \
    "Path to Report: $REPORT" \
    "--------------------------------------------------"

