nagios系列(六)之nagios实现对服务器cpu温度的监控
生活随笔
收集整理的這篇文章主要介紹了
nagios系列(六)之nagios实现对服务器cpu温度的监控
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
1、安裝硬件傳感器監控軟件sensors
yum install -y lm_sensors*
2、運行sensors-detect進行傳感器檢測
##一路回車即可
Do you want to overwrite /etc/sysconfig/lm_sensors? (YES/no):
Starting lm_sensors: loading module coretemp               [  OK  ]
Unloading i2c-dev... OK
3、運行sensors,檢查是否能讀取到數據;輸出類似下面這樣即表示正常
# sensors
coretemp-isa-0000
Adapter: ISA adapter
ERROR: Can't get value of subfeature temp1_input: Can't read
Physical id 0:  +0.0°C  (high = +100.0°C, crit = +100.0°C)
ERROR: Can't get value of subfeature temp2_input: Can't read
Core 0:         +0.0°C  (high = +100.0°C, crit = +100.0°C)
ERROR: Can't get value of subfeature temp3_input: Can't read
Core 1:         +0.0°C  (high = +100.0°C, crit = +100.0°C)
coretemp-isa-0002
Adapter: ISA adapter
ERROR: Can't get value of subfeature temp1_input: Can't read
Physical id 1:  +0.0°C  (high = +100.0°C, crit = +100.0°C)
ERROR: Can't get value of subfeature temp2_input: Can't read
Core 0:         +0.0°C  (high = +100.0°C, crit = +100.0°C)
ERROR: Can't get value of subfeature temp3_input: Can't read
Core 1:         +0.0°C  (high = +100.0°C, crit = +100.0°C)
4、添加監控腳本vim /usr/local/nagios/libexec/check_cputemp
#!/bin/sh
#########check_cputemp###########
#date : May 2013
#Licence GPLv2
#by Barlow
#/usr/local/nagios/libexec/check_cputemp
#you can use NRPE to define service in nagios
#check_nrpe!check_cputemp
# Plugin return statements
# Exit status values for the check result; the script stores one of them in
# STATE and finishes with "exit $STATE".
STATE_OK=0
STATE_WARNING=1
STATE_CRITICAL=2
# The error paths in the visible script use a literal "exit 3" with this value.
STATE_UNKNOWN=3
# Print a one-line hint telling the user how to obtain the full help text.
# Globals: Echo (read) - the command used to print messages.
# Outputs: the hint on stdout.
print_help_msg() {
  $Echo "Usage: ${0} -h to get help."
}
# Print the complete usage text: synopsis, explanation and an example call.
# Globals: Echo (read) - the command used to print messages.
# Outputs: six lines on stdout, one $Echo invocation per line.
print_full_help_msg() {
  # Load the help lines into the function's own positional parameters so no
  # global variable is touched, then emit them one by one.
  set -- \
    "Usage:" \
    "$0 [ -v ] -m sensors -w cpuT -c cpuT" \
    "Sepicify the method to use the temperature data sensors." \
    "And the corresponding Critical value must greater than Warning value." \
    "Example:" \
    "${0} -m sensors -w 40 -c 50"
  while [ $# -gt 0 ]; do
    $Echo "$1"
    shift
  done
}
# Print a generic "Error." line followed by the full usage text.
# Globals: Echo (read) - the command used to print messages.
# Outputs: the error line and the output of print_full_help_msg on stdout.
print_err_msg() {
  $Echo "Error."
  print_full_help_msg
}
# Append a debug message to a per-process log file when verbose mode is on.
# Globals:   Debug (read) - logging happens only when it equals "true".
#            Echo  (read) - the command used to print messages.
# Arguments: the words of the message; they are joined into one string.
# Outputs:   nothing on stdout/stderr; the message (and any error output of
#            the print command) goes to /var/log/check_sys_temperature.log.<PID>.
to_debug() {
  # Verbose mode off: nothing to do.
  [ "$Debug" = "true" ] || return 0
  $Echo "$*" >> /var/log/check_sys_temperature.log.$$ 2>&1
}
# Drop LANG before calling external tools (presumably so that their output is
# not localized - confirm; LC_ALL/LC_* are left untouched).
unset LANG
# Message printer used by every helper as: $Echo "text".
# "echo -e" is only reliable when /bin/sh is bash; other sh implementations
# (e.g. dash) print the "-e" literally. printf with %b interprets the same
# backslash escapes and is portable. Every caller passes exactly one argument.
Echo='printf %b\n'

######### Command-line parsing #########
# Options:
#   -v          verbose mode: to_debug writes to its log file
#   -m METHOD   data source; only "sensors" is accepted
#   -w N        warning threshold
#   -c N        critical threshold
#   -h          print the full help text
# Every usage problem ends the plugin with exit status 3.
if [ $# -lt 1 ]; then
  print_help_msg
  exit 3
fi

while getopts :vhm:w:c: OPTION; do
  case $OPTION in
    v)
      Debug=true
      ;;
    m)
      method=$OPTARG
      ;;
    w)
      WARNING=$OPTARG
      ;;
    c)
      CRITICAL=$OPTARG
      ;;
    h)
      print_full_help_msg
      exit 3
      ;;
    :)
      # With a leading ":" in the option string, getopts reports a missing
      # argument as ":" and stores the offending option letter in OPTARG.
      $Echo "Error: Option -$OPTARG requires an argument."
      print_help_msg
      exit 3
      ;;
    *)
      $Echo "Error: Illegal Option."
      print_help_msg
      exit 3
      ;;
  esac
done

if [ "$method" = "sensors" ]; then
  use_sensors="true"
  to_debug use_sensors
else
  $Echo "Error. Must to sepcify the method to use sensors."
  print_full_help_msg
  exit 3
fi
to_debug "All Values are \" Warning: $WARNING and Critical: $CRITICAL \"."
######### lm_sensors: read the CPU core temperatures #########
# Sets TEMP to the integer average of all "Core N:" readings printed by
# sensors. Exits with status 3 when sensors is missing or yields no reading.
if [ "$use_sensors" = "true" ]; then
  # command -v is the portable way to locate a program.
  if ! sensorsCheckOut=$(command -v sensors 2>&1); then
    echo "sensors: command not found"
    echo Maybe you need to check your sensors.
    exit 3
  fi
  to_debug "Use $sensorsCheckOut to check system temperature"

  # Run sensors once and average every core line, whatever the core count
  # and wherever the lines appear in the output. Each reading such as
  # "+40.0°C" is converted through its numeric prefix and truncated to an
  # integer, so one- and three-digit temperatures are handled as well.
  TEMP=$(sensors | awk '
    /^Core [0-9]+:/ { sum += int($3 + 0); cores++ }
    END { if (cores > 0) print int(sum / cores) }')

  # awk prints nothing when no core line was found.
  if [ -z "$TEMP" ]; then
    $Echo "No Data been get here. Please confirm your ARGS and re-check it with Verbose mode, then to check the log."
    exit 3
  fi
  to_debug "temperature data is $TEMP"
else
  $Echo "Error. Must to sepcify the method to use sensors"
  print_full_help_msg
  exit 3
fi
######### Compare the temperature with the thresholds given by the user #########
CPU_TEMP=$TEMP

# A threshold of 0 disables the corresponding check; an omitted threshold is
# treated the same way instead of making the numeric test fail.
warn_limit=${WARNING:-0}
crit_limit=${CRITICAL:-0}

# A non-integer threshold would make "[ ... -gt ... ]" fail and the plugin
# would silently report OK, so reject it up front with the UNKNOWN status.
for threshold in "$warn_limit" "$crit_limit"; do
  case ${threshold#-} in
    '' | *[!0-9]*)
      $Echo "Error. Warning and Critical values must be integers."
      exit 3
      ;;
  esac
done

if [ "$crit_limit" != "0" ] && [ "$CPU_TEMP" -gt "$crit_limit" ]; then
  STATE="$STATE_CRITICAL"
  STATE_MESSAGE="CRITICAL"
elif [ "$warn_limit" != "0" ] && [ "$CPU_TEMP" -gt "$warn_limit" ]; then
  STATE="$STATE_WARNING"
  STATE_MESSAGE="WARNING"
else
  STATE="$STATE_OK"
  STATE_MESSAGE="OK"
fi
to_debug "$STATE , Message is $STATE_MESSAGE"

# The output must carry performance data, i.e. the part after the "|"
# separator. The data unit must not contain Chinese characters, otherwise
# graphing software such as PNP cannot plot it.
# NOTE(review): the perfdata label itself is still Chinese and "Celsius" does
# not look like a standard Nagios unit of measurement - confirm that the
# graphing tool in use accepts both before changing this format.
printf '%s\n' "The TEMPERATURE $STATE_MESSAGE - The CPU's Temperature is $CPU_TEMP ℃ ! | 溫度=${CPU_TEMP}Celsius;$WARNING;$CRITICAL"
exit "$STATE"
5、賦予腳本執行權限:
chmod +x /usr/local/nagios/libexec/check_cputemp
6、配置vim /usr/local/nagios/etc/nrpe.cfg,添加如下一行:
echo "command[check_cputemp]=/usr/local/nagios/libexec/check_cputemp -m sensors -w 38 -c 45" >>/usr/local/nagios/etc/nrpe.cfg
重新啟動客戶端nrpe服務
-w 表示警告值,-c表示關鍵(緊急)值,自行根據實際情況調整
注意:以上六步均在被監控機上完成。
在客戶端測試是否ok,虛擬機測試不成功,需要在物理機上實現
# /usr/local/nagios/libexec/check_cputemp -m sensors -w 38 -c 45
The TEMPERATURE OK - The CPU's Temperature is 14 ℃ ! | 溫度=14Celsius;38;45
服務端執行測試:
/usr/local/nagios/libexec/check_nrpe -H 192.168.8.93 -c check_cputemp
7、在Nagios服務端配置服務:
define service{
use                     generic-service
host_name               需要被監控的hostname
service_description CPU Temperature
check_command check_nrpe!check_cputemp
}
yum install -y lm_sensors*
2、運行sensors-detect進行傳感器檢測
##一路回車即可
Do you want to overwrite /etc/sysconfig/lm_sensors? (YES/no):
Starting lm_sensors: loading module coretemp               [  OK  ]
Unloading i2c-dev... OK
3、運行sensors,檢查是否能讀取到數據;輸出類似下面這樣即表示正常
# sensors
coretemp-isa-0000
Adapter: ISA adapter
ERROR: Can't get value of subfeature temp1_input: Can't read
Physical id 0:  +0.0°C  (high = +100.0°C, crit = +100.0°C)
ERROR: Can't get value of subfeature temp2_input: Can't read
Core 0:         +0.0°C  (high = +100.0°C, crit = +100.0°C)
ERROR: Can't get value of subfeature temp3_input: Can't read
Core 1:         +0.0°C  (high = +100.0°C, crit = +100.0°C)
coretemp-isa-0002
Adapter: ISA adapter
ERROR: Can't get value of subfeature temp1_input: Can't read
Physical id 1:  +0.0°C  (high = +100.0°C, crit = +100.0°C)
ERROR: Can't get value of subfeature temp2_input: Can't read
Core 0:         +0.0°C  (high = +100.0°C, crit = +100.0°C)
ERROR: Can't get value of subfeature temp3_input: Can't read
Core 1:         +0.0°C  (high = +100.0°C, crit = +100.0°C)
4、添加監控腳本vim /usr/local/nagios/libexec/check_cputemp
#!/bin/sh
#########check_cputemp###########
#date : May 2013
#Licence GPLv2
#by Barlow
#/usr/local/nagios/libexec/check_cputemp
#you can use NRPE to define service in nagios
#check_nrpe!check_cputemp
# Plugin return statements
# Exit status values for the check result; the script stores one of them in
# STATE and finishes with "exit $STATE".
STATE_OK=0
STATE_WARNING=1
STATE_CRITICAL=2
# The error paths in the visible script use a literal "exit 3" with this value.
STATE_UNKNOWN=3
# Print a one-line hint telling the user how to obtain the full help text.
# Globals: Echo (read) - the command used to print messages.
# Outputs: the hint on stdout.
print_help_msg() {
  $Echo "Usage: ${0} -h to get help."
}
# Print the complete usage text: synopsis, explanation and an example call.
# Globals: Echo (read) - the command used to print messages.
# Outputs: six lines on stdout, one $Echo invocation per line.
print_full_help_msg() {
  # Load the help lines into the function's own positional parameters so no
  # global variable is touched, then emit them one by one.
  set -- \
    "Usage:" \
    "$0 [ -v ] -m sensors -w cpuT -c cpuT" \
    "Sepicify the method to use the temperature data sensors." \
    "And the corresponding Critical value must greater than Warning value." \
    "Example:" \
    "${0} -m sensors -w 40 -c 50"
  while [ $# -gt 0 ]; do
    $Echo "$1"
    shift
  done
}
# Print a generic "Error." line followed by the full usage text.
# Globals: Echo (read) - the command used to print messages.
# Outputs: the error line and the output of print_full_help_msg on stdout.
print_err_msg() {
  $Echo "Error."
  print_full_help_msg
}
# Append a debug message to a per-process log file when verbose mode is on.
# Globals:   Debug (read) - logging happens only when it equals "true".
#            Echo  (read) - the command used to print messages.
# Arguments: the words of the message; they are joined into one string.
# Outputs:   nothing on stdout/stderr; the message (and any error output of
#            the print command) goes to /var/log/check_sys_temperature.log.<PID>.
to_debug() {
  # Verbose mode off: nothing to do.
  [ "$Debug" = "true" ] || return 0
  $Echo "$*" >> /var/log/check_sys_temperature.log.$$ 2>&1
}
# Drop LANG before calling external tools (presumably so that their output is
# not localized - confirm; LC_ALL/LC_* are left untouched).
unset LANG
# Message printer used by every helper as: $Echo "text".
# "echo -e" is only reliable when /bin/sh is bash; other sh implementations
# (e.g. dash) print the "-e" literally. printf with %b interprets the same
# backslash escapes and is portable. Every caller passes exactly one argument.
Echo='printf %b\n'

######### Command-line parsing #########
# Options:
#   -v          verbose mode: to_debug writes to its log file
#   -m METHOD   data source; only "sensors" is accepted
#   -w N        warning threshold
#   -c N        critical threshold
#   -h          print the full help text
# Every usage problem ends the plugin with exit status 3.
if [ $# -lt 1 ]; then
  print_help_msg
  exit 3
fi

while getopts :vhm:w:c: OPTION; do
  case $OPTION in
    v)
      Debug=true
      ;;
    m)
      method=$OPTARG
      ;;
    w)
      WARNING=$OPTARG
      ;;
    c)
      CRITICAL=$OPTARG
      ;;
    h)
      print_full_help_msg
      exit 3
      ;;
    :)
      # With a leading ":" in the option string, getopts reports a missing
      # argument as ":" and stores the offending option letter in OPTARG.
      $Echo "Error: Option -$OPTARG requires an argument."
      print_help_msg
      exit 3
      ;;
    *)
      $Echo "Error: Illegal Option."
      print_help_msg
      exit 3
      ;;
  esac
done

if [ "$method" = "sensors" ]; then
  use_sensors="true"
  to_debug use_sensors
else
  $Echo "Error. Must to sepcify the method to use sensors."
  print_full_help_msg
  exit 3
fi
to_debug "All Values are \" Warning: $WARNING and Critical: $CRITICAL \"."
######### lm_sensors: read the CPU core temperatures #########
# Sets TEMP to the integer average of all "Core N:" readings printed by
# sensors. Exits with status 3 when sensors is missing or yields no reading.
if [ "$use_sensors" = "true" ]; then
  # command -v is the portable way to locate a program.
  if ! sensorsCheckOut=$(command -v sensors 2>&1); then
    echo "sensors: command not found"
    echo Maybe you need to check your sensors.
    exit 3
  fi
  to_debug "Use $sensorsCheckOut to check system temperature"

  # Run sensors once and average every core line, whatever the core count
  # and wherever the lines appear in the output. Each reading such as
  # "+40.0°C" is converted through its numeric prefix and truncated to an
  # integer, so one- and three-digit temperatures are handled as well.
  TEMP=$(sensors | awk '
    /^Core [0-9]+:/ { sum += int($3 + 0); cores++ }
    END { if (cores > 0) print int(sum / cores) }')

  # awk prints nothing when no core line was found.
  if [ -z "$TEMP" ]; then
    $Echo "No Data been get here. Please confirm your ARGS and re-check it with Verbose mode, then to check the log."
    exit 3
  fi
  to_debug "temperature data is $TEMP"
else
  $Echo "Error. Must to sepcify the method to use sensors"
  print_full_help_msg
  exit 3
fi
######### Compare the temperature with the thresholds given by the user #########
CPU_TEMP=$TEMP

# A threshold of 0 disables the corresponding check; an omitted threshold is
# treated the same way instead of making the numeric test fail.
warn_limit=${WARNING:-0}
crit_limit=${CRITICAL:-0}

# A non-integer threshold would make "[ ... -gt ... ]" fail and the plugin
# would silently report OK, so reject it up front with the UNKNOWN status.
for threshold in "$warn_limit" "$crit_limit"; do
  case ${threshold#-} in
    '' | *[!0-9]*)
      $Echo "Error. Warning and Critical values must be integers."
      exit 3
      ;;
  esac
done

if [ "$crit_limit" != "0" ] && [ "$CPU_TEMP" -gt "$crit_limit" ]; then
  STATE="$STATE_CRITICAL"
  STATE_MESSAGE="CRITICAL"
elif [ "$warn_limit" != "0" ] && [ "$CPU_TEMP" -gt "$warn_limit" ]; then
  STATE="$STATE_WARNING"
  STATE_MESSAGE="WARNING"
else
  STATE="$STATE_OK"
  STATE_MESSAGE="OK"
fi
to_debug "$STATE , Message is $STATE_MESSAGE"

# The output must carry performance data, i.e. the part after the "|"
# separator. The data unit must not contain Chinese characters, otherwise
# graphing software such as PNP cannot plot it.
# NOTE(review): the perfdata label itself is still Chinese and "Celsius" does
# not look like a standard Nagios unit of measurement - confirm that the
# graphing tool in use accepts both before changing this format.
printf '%s\n' "The TEMPERATURE $STATE_MESSAGE - The CPU's Temperature is $CPU_TEMP ℃ ! | 溫度=${CPU_TEMP}Celsius;$WARNING;$CRITICAL"
exit "$STATE"
5、賦予腳本執行權限:
chmod +x /usr/local/nagios/libexec/check_cputemp
6、配置vim /usr/local/nagios/etc/nrpe.cfg,添加如下一行:
echo "command[check_cputemp]=/usr/local/nagios/libexec/check_cputemp -m sensors -w 38 -c 45" >>/usr/local/nagios/etc/nrpe.cfg
重新啟動客戶端nrpe服務
-w 表示警告值,-c表示關鍵(緊急)值,自行根據實際情況調整
注意:以上六步均在被監控機上完成。
在客戶端測試是否ok,虛擬機測試不成功,需要在物理機上實現
# /usr/local/nagios/libexec/check_cputemp -m sensors -w 38 -c 45
The TEMPERATURE OK - The CPU's Temperature is 14 ℃ ! | 溫度=14Celsius;38;45
服務端執行測試:
/usr/local/nagios/libexec/check_nrpe -H 192.168.8.93 -c check_cputemp
7、在Nagios服務端配置服務:
define service{
use                     generic-service
host_name               需要被監控的hostname
service_description CPU Temperature
check_command check_nrpe!check_cputemp
}
保存后重啟nagios服務
轉載于:https://www.cnblogs.com/reblue520/p/6239760.html
總結
以上是生活随笔為你收集整理的nagios系列(六)之nagios实现对服务器cpu温度的监控的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: max's java road
- 下一篇: reflow 和 repaint