Writing a daemon in Python to monitor a process and automatically recover it (with a Shell version)
A Hadoop cluster has a large number of datanodes, and with that many machines anything can happen; the most common failure is simply that the process dies. To save myself the trouble, I wrote this process-monitoring daemon based on someone else's code. With minor changes it can be used to monitor any other process that must stay resident. You only run it with start; there is no need to append & or prefix it with nohup.
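For example, assuming the script below is saved as watch_process.py (the file name is your choice), it is driven like this:

python watch_process.py start      # daemonizes itself; no & or nohup needed
python watch_process.py stop       # stops the watcher and the monitored services
python watch_process.py restart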
Many people find crashed processes a real headache: having to crawl out of bed in the middle of the night, log in to a server and restart a process is genuinely painful.

The daemon checks the process every 2 seconds and restarts it as soon as it disappears. The core idea is to fork a child process, detach it into the background, and exit the parent; the program itself then acts as a wrapper around the monitored process.
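Stripped of the daemonization boilerplate, the polling loop amounts to the following minimal sketch (it assumes the same hadoop-daemon.sh path used throughout this article; adjust the grep pattern and start command for other services). The full wrapper script follows.

#!/usr/bin/env python
# Minimal sketch of the polling loop only (no daemonization, pid file or
# stop/restart handling); the full script below adds all of that.
import os, time

HADOOP_DAEMON = '/opt/modules/hadoop/hadoop-0.20.203.0/bin/hadoop-daemon.sh'

while True:
    # count running java processes whose command line mentions "datanode"
    count = os.popen('ps -fe | grep "java" | grep "datanode" | grep -v "grep" | wc -l').read().strip()
    if count == '0':
        os.system('%s start datanode' % HADOOP_DAEMON)
    time.sleep(2)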
#!/usr/bin/env python
import sys, os, time, atexit
from signal import SIGTERM

class Daemon:
    def __init__(self, pidfile, stdin='/dev/null', stdout='/dev/null', stderr='/dev/null'):
        # For debugging output, change these to stdin='/dev/stdin',
        # stdout='/dev/stdout', stderr='/dev/stderr' and run as root.
        self.stdin = stdin
        self.stdout = stdout
        self.stderr = stderr
        self.pidfile = pidfile

    def _daemonize(self):
        try:
            pid = os.fork()
            if pid > 0:
                # exit the first parent
                sys.exit(0)
        except OSError, e:
            sys.stderr.write('fork #1 failed: %d (%s)\n' % (e.errno, e.strerror))
            sys.exit(1)

        os.chdir("/")
        os.setsid()
        os.umask(0)

        # fork a second child and exit the intermediate parent
        try:
            pid = os.fork()
            if pid > 0:
                sys.exit(0)
        except OSError, e:
            sys.stderr.write('fork #2 failed: %d (%s)\n' % (e.errno, e.strerror))
            sys.exit(1)

        # redirect the standard file descriptors
        sys.stdout.flush()
        sys.stderr.flush()
        si = open(self.stdin, 'r')
        so = open(self.stdout, 'a+')
        se = open(self.stderr, 'a+', 0)
        os.dup2(si.fileno(), sys.stdin.fileno())
        os.dup2(so.fileno(), sys.stdout.fileno())
        os.dup2(se.fileno(), sys.stderr.fileno())

        # write the pid file
        atexit.register(self.delpid)
        pid = str(os.getpid())
        open(self.pidfile, 'w+').write('%s\n' % pid)

    def delpid(self):
        os.remove(self.pidfile)

    def start(self):
        # check the pid file to see whether the daemon is already running
        try:
            pf = open(self.pidfile, 'r')
            pid = int(pf.read().strip())
            pf.close()
        except IOError:
            pid = None

        if pid:
            message = 'pidfile %s already exists. Daemon already running?\n'
            sys.stderr.write(message % self.pidfile)
            sys.exit(1)

        # start monitoring
        self._daemonize()
        self._run()

    def stop(self):
        # read the pid from the pid file
        try:
            pf = open(self.pidfile, 'r')
            pid = int(pf.read().strip())
            pf.close()
        except IOError:
            pid = None

        if not pid:
            message = 'pidfile %s does not exist. Daemon not running?\n'
            sys.stderr.write(message % self.pidfile)
            return  # not treated as an error, so restart still works

        # kill the watcher, then stop the processes it was monitoring
        try:
            while 1:
                os.kill(pid, SIGTERM)
                time.sleep(0.1)
                os.system('/opt/modules/hadoop/hadoop-0.20.203.0/bin/hadoop-daemon.sh stop datanode')
                os.system('/opt/modules/hadoop/hadoop-0.20.203.0/bin/hadoop-daemon.sh stop tasktracker')
        except OSError, err:
            err = str(err)
            if err.find('No such process') > 0:
                if os.path.exists(self.pidfile):
                    os.remove(self.pidfile)
            else:
                print str(err)
                sys.exit(1)

    def restart(self):
        self.stop()
        self.start()

    def _run(self):
        while True:
            # count processes whose command line contains "java" and
            # "datanode"/"tasktracker" but not "grep"; adjust these commands
            # to monitor other applications
            datanode = os.popen('ps -fe | grep "java" | grep "datanode" | grep -v "grep" | wc -l').read().strip()
            tasktracker = os.popen('ps -fe | grep "java" | grep "tasktracker" | grep -v "grep" | wc -l').read().strip()
            if datanode == '0':
                # change the start command here
                os.system('/opt/modules/hadoop/hadoop-0.20.203.0/bin/hadoop-daemon.sh start datanode')
            if tasktracker == '0':
                # change the start command here
                os.system('/opt/modules/hadoop/hadoop-0.20.203.0/bin/hadoop-daemon.sh start tasktracker')
            time.sleep(2)
            # change the polling interval here

if __name__ == '__main__':
    daemon = Daemon('/tmp/watch_process.pid')
    if len(sys.argv) == 2:
        if 'start' == sys.argv[1]:
            daemon.start()
        elif 'stop' == sys.argv[1]:
            daemon.stop()
        elif 'restart' == sys.argv[1]:
            daemon.restart()
        else:
            print 'Unknown command'
            sys.exit(2)
        sys.exit(0)
    else:
        print 'usage: %s start|stop|restart' % sys.argv[0]
        sys.exit(2)
In practice it has worked quite well: add the program to the system startup files, and after the server reboots it will automatically start the processes it monitors. It uses only basic Python modules, so there are no missing-dependency problems.
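One way to do that, as a sketch only (it assumes the script is saved at /opt/watch_process.py and that your distribution runs /etc/rc.local at boot; adjust both to your setup):

# appended to /etc/rc.local, before any final "exit 0"
python /opt/watch_process.py start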
The Shell version does the same job with two separate scripts, one for the datanode and one for the tasktracker:

#!/bin/sh
# restart the datanode whenever no matching java process is found
# (grep -v "grep" exits non-zero when nothing matches, so $? != 0 means the process is gone)
while true; do
  count=`ps -fe | grep "java" | grep "datanode" | grep -v "grep"`
  if [ "$?" != "0" ]; then
    /opt/modules/hadoop/hadoop-0.20.203.0/bin/hadoop-daemon.sh start datanode
  fi
  sleep 2
done

#!/bin/sh
# restart the tasktracker whenever no matching java process is found
while true; do
  count=`ps -fe | grep "java" | grep "tasktracker" | grep -v "grep"`
  if [ "$?" != "0" ]; then
    /opt/modules/hadoop/hadoop-0.20.203.0/bin/hadoop-daemon.sh start tasktracker
  fi
  sleep 2
done
The effect is the same; the only difference is that one runs in the foreground and the other in the background, and one ties up a bash process while the other ties up a python process.
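If you prefer to push the shell version into the background anyway, the usual nohup form works (assuming the first script is saved as watch_datanode.sh, a name chosen here purely for illustration):

nohup sh watch_datanode.sh > /dev/null 2>&1 &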