检测服务器状态
检测服务器状态脚本
实现比较简陋 (主打一个抽象): 用 Flask 提供 HTTP 接口, 返回本机的 CPU、内存和 GPU 使用情况. 代码如下:
import os
import secrets
import subprocess

import psutil
from flask import Flask, request, redirect, url_for, session, render_template
# Flask application object; the route decorators in this file register on it.
app = Flask(__name__)
# Session-signing key. Taken from the SECRET_KEY environment variable when it
# is set; otherwise a fresh random key is generated at startup instead of
# shipping a fixed, guessable literal in the source. With a generated key,
# sessions do not survive a restart and are not shared between processes.
app.secret_key = os.environ.get('SECRET_KEY') or secrets.token_hex(32)
def get_cpu_usage():
    """Return the current CPU utilisation as a string such as ``'12.5%'``.

    The value comes from ``psutil.cpu_percent(interval=1)``, i.e. it is
    measured over a one-second sampling interval, so each call takes about
    one second to return.
    """
    usage = psutil.cpu_percent(interval=1)
    return f'{usage}%'
def bytes_to_gb(bytes):
    """Convert a byte count to gigabytes, using 1 GB = 1024 ** 3 bytes.

    Returns a float (true division), so partial gigabytes are preserved.
    """
    bytes_per_gb = 1 << 30  # same value as 1024 ** 3
    return bytes / bytes_per_gb
def get_mem_usage():
    """Return RAM usage formatted as ``'<used>/<total> GB(<percent>%)'``.

    ``used`` is shown with two decimals, ``total`` is truncated to a whole
    number of gigabytes, and ``percent`` is the value reported by psutil.
    """
    # Take a single snapshot so that used, total and percent all describe the
    # same moment instead of three separate readings.
    mem = psutil.virtual_memory()
    return '{:.2f}/{} GB({}%)'.format(
        bytes_to_gb(mem.used),
        int(bytes_to_gb(mem.total)),
        mem.percent,
    )
def _gpu_status(label, placeholder=None):
    """Build a result dict that carries a status label instead of GPU data."""
    filler = [] if placeholder is None else [placeholder]
    return {
        'id': [label],
        'gpu_mem_usage': list(filler),
        'gpu_utilization': list(filler),
    }


def get_gpu_info():
    """Query ``nvidia-smi`` and summarise every GPU it reports.

    Returns:
        A dict with three parallel lists:
        ``id`` (GPU names, or a single status label when no data is
        available), ``gpu_mem_usage`` (``'<used>/<total> MB (<mem util>%)'``)
        and ``gpu_utilization`` (``'<gpu util>%'``).

    No exception is propagated from running ``nvidia-smi``: a missing binary,
    any other launch failure, and output that cannot be parsed are each
    reported through a status label in ``id``.
    """
    query_fields = (
        'index,name,memory.total,memory.used,memory.free,'
        'utilization.gpu,utilization.memory'
    )
    try:
        result = subprocess.run(
            ['nvidia-smi', '--query-gpu=' + query_fields, '--format=csv,noheader,nounits'],
            capture_output=True,
            text=True,
        ).stdout.strip()
    except FileNotFoundError:
        # The executable is not installed. Catching the exception type avoids
        # depending on the (locale/OS specific) wording of the error message.
        return _gpu_status('无显卡')
    except Exception as e:
        print(e)
        return _gpu_status('未知显卡错误')

    if "NVIDIA-SMI has failed " in result:
        return _gpu_status('显卡驱动异常', 'N/A')
    if "Unable to determine the device handle" in result:
        return _gpu_status('显卡状态异常', 'N/A')
    if not result:
        # nvidia-smi ran but printed nothing to stdout: nothing to report.
        return _gpu_status('无显卡')

    names = []
    gpu_mem_usage = []
    gpu_utilization = []
    for line in result.splitlines():
        # Field order follows query_fields: index, name, memory.total,
        # memory.used, memory.free, utilization.gpu, utilization.memory.
        fields = line.split(', ')
        if len(fields) < 7:
            # Not a CSV data row (e.g. a diagnostic message); skip it rather
            # than fail with an IndexError.
            continue
        names.append(fields[1])
        gpu_mem_usage.append('{}/{} MB ({}%)'.format(fields[3], fields[2], fields[6]))
        gpu_utilization.append('{}%'.format(fields[5]))

    if not names:
        # Non-empty output without a single parseable row.
        print(result)
        return _gpu_status('未知显卡错误')
    return {'id': names, 'gpu_mem_usage': gpu_mem_usage, 'gpu_utilization': gpu_utilization}
@app.route('/get_server_info', methods=['GET'])
def get_server_info():
    """Handle ``GET /get_server_info``: report CPU, memory and GPU status.

    The returned dict has the keys ``cpu_usage``, ``mem_usage`` and ``gpus``,
    filled from get_cpu_usage(), get_mem_usage() and get_gpu_info().
    """
    cpu = get_cpu_usage()
    mem = get_mem_usage()
    gpus = get_gpu_info()
    return {'cpu_usage': cpu, 'mem_usage': mem, 'gpus': gpus}
@app.route('/get_system_info', methods=['GET'])
def get_system_info():
    """Handle ``GET /get_system_info``: report the output of ``uname -r``.

    Returns:
        ``{"result": <uname output>, "code": 200}`` on success, or
        ``{"result": <error text>, "code": 500}`` when the command could not
        be run. ``code`` is a field of the returned dict; no HTTP status is
        set explicitly here.
    """
    try:
        result = subprocess.run(['uname', '-r'], capture_output=True, text=True).stdout.strip()
    except Exception as e:
        # `result` is never assigned when subprocess.run raises, so report the
        # error text instead of referencing an unbound local.
        return {"result": str(e), "code": 500}
    return {"result": result, "code": 200}
if __name__ == '__main__':
    # Listen on every interface so that other machines can query this server.
    # Debug mode is kept off: combined with host='0.0.0.0' it would expose the
    # interactive debugger, which can execute arbitrary code, to the network.
    app.run(debug=False, host='0.0.0.0')
然后选一台服务器作为网站服务器, 在上面配置好各台被监控服务器的 IP, 由它向所有服务器的 /get_server_info 接口发送请求, 即可得到每台服务器的状态.
虽然可能有做得更好的开源方案, 但是懒得折腾了, 属于能用就行.
crontab -e
# 2
# @reboot bash xxx.sh
service cron reload