检测服务器状态脚本

实现比较简陋 (主打一个抽象). 每台被监控的服务器上运行下面这个 Flask 服务, 代码如下:

import os
import subprocess

import psutil
from flask import Flask, request, redirect, url_for, session, render_template

app = Flask(__name__)
# Secret key Flask uses to sign session cookies. A value supplied through the
# FLASK_SECRET_KEY environment variable takes precedence; the historical
# hard-coded key is kept only as a fallback so existing deployments keep
# working.
# NOTE(review): the fallback is neither random nor secret (it is published in
# this file) -- set FLASK_SECRET_KEY before relying on sessions.
app.secret_key = os.environ.get('FLASK_SECRET_KEY', 'keyyyyy')

def get_cpu_usage():
    """Return the current CPU utilisation as a percentage string, e.g. ``'12.5%'``.

    The value comes from ``psutil.cpu_percent(interval=1)``, i.e. it is
    measured over a one-second sampling interval.
    """
    usage = psutil.cpu_percent(interval=1)
    return f'{usage}%'

def bytes_to_gb(bytes):
    """Convert a byte count to gigabytes.

    Uses the binary definition (1 GB = 1024 ** 3 bytes) and returns a float.
    """
    bytes_per_gb = 1 << 30  # same value as 1024 ** 3
    return bytes / bytes_per_gb

def get_mem_usage():
    """Return RAM usage formatted as ``'<used>/<total> GB(<percent>%)'``.

    ``used`` is shown with two decimals, ``total`` is truncated to a whole
    number of GB, and ``percent`` is the value reported by psutil.
    """
    # Take a single snapshot so that used, total and percent all describe the
    # same instant (the figures could otherwise come from separate readings).
    mem = psutil.virtual_memory()
    return '{:.2f}/{} GB({}%)'.format(
        bytes_to_gb(mem.used),
        int(bytes_to_gb(mem.total)),
        mem.percent,
    )

def get_gpu_info():
    """Query ``nvidia-smi`` and summarise every GPU it reports.

    Returns:
        dict: three lists under the keys ``'id'``, ``'gpu_mem_usage'`` and
        ``'gpu_utilization'``.

        * On success the lists are parallel, one entry per GPU: the GPU name,
          ``'<used>/<total> MB (<utilization.memory>%)'`` and
          ``'<utilization.gpu>%'``.
        * When ``nvidia-smi`` cannot be run or reports a driver/device
          failure, ``'id'`` holds a single status message and the other lists
          are empty or contain ``'N/A'``.
        * When the output contains no parseable GPU line, all three lists are
          empty.
    """
    query = [
        'nvidia-smi',
        '--query-gpu=index,name,memory.total,memory.used,memory.free,utilization.gpu,utilization.memory',
        '--format=csv,noheader,nounits',
    ]
    try:
        completed = subprocess.run(query, capture_output=True, text=True)
    except FileNotFoundError:
        # The executable is missing. Checking the exception type is more
        # reliable than matching the (platform-dependent) error message text.
        return {'id': ['无显卡'], 'gpu_mem_usage': [], 'gpu_utilization': []}
    except Exception as e:
        print(e)
        return {'id': ['未知显卡错误'], 'gpu_mem_usage': [], 'gpu_utilization': []}

    result = completed.stdout.strip()
    # Failure messages are plain text; look at stderr as well in case the
    # message is not written to stdout.
    diagnostics = '{}\n{}'.format(result, completed.stderr or '')
    if "NVIDIA-SMI has failed " in diagnostics:
        return {'id': ['显卡驱动异常'], 'gpu_mem_usage': ['N/A'], 'gpu_utilization': ['N/A']}
    if "Unable to determine the device handle" in diagnostics:
        return {'id': ['显卡状态异常'], 'gpu_mem_usage': ['N/A'], 'gpu_utilization': ['N/A']}

    names = []
    gpu_mem_usage = []
    gpu_utilization = []
    for line in result.split('\n'):
        # Field order follows --query-gpu: index, name, memory.total,
        # memory.used, memory.free, utilization.gpu, utilization.memory
        fields = line.split(', ')
        if len(fields) < 7:
            # Blank or unexpected line: skip it instead of raising IndexError.
            continue
        names.append(fields[1])
        # NOTE(review): the percentage shown here is nvidia-smi's
        # "utilization.memory" field, not used/total -- confirm this is the
        # figure the dashboard is meant to display.
        gpu_mem_usage.append('{}/{} MB ({}%)'.format(fields[3], fields[2], fields[6]))
        gpu_utilization.append('{}%'.format(fields[5]))
    return {'id': names, 'gpu_mem_usage': gpu_mem_usage, 'gpu_utilization': gpu_utilization}

@app.route('/get_server_info', methods=['GET'])
def get_server_info():
    """Handle ``GET /get_server_info``.

    Returns a dict with the keys ``cpu_usage``, ``mem_usage`` and ``gpus``,
    filled from ``get_cpu_usage()``, ``get_mem_usage()`` and
    ``get_gpu_info()`` respectively.
    """
    payload = {}
    payload['cpu_usage'] = get_cpu_usage()
    payload['mem_usage'] = get_mem_usage()
    payload['gpus'] = get_gpu_info()
    return payload

@app.route('/get_system_info', methods=['GET'])
def get_system_info():
    """Handle ``GET /get_system_info`` by reporting the output of ``uname -r``.

    Returns:
        dict: ``{"result": <uname -r output>, "code": 200}`` on success, or
        ``{"result": <error message>, "code": 500}`` when the command cannot
        be run. The code is carried in the returned dict only; no HTTP status
        is set explicitly here.
    """
    try:
        result = subprocess.run(['uname', '-r'], capture_output=True, text=True).stdout.strip()
    except Exception as e:
        # `result` is unbound when subprocess.run itself raises, so report the
        # error text instead of referencing it (that used to raise
        # UnboundLocalError inside this handler).
        return {"result": str(e), "code": 500}
    return {"result": result, "code": 200}
            
if __name__ == '__main__':
    # Listen on every interface so that another machine can poll this host.
    # Debug mode stays off: with debug=True the Werkzeug interactive debugger
    # is reachable by anyone who can connect to the port and allows arbitrary
    # code execution.
    app.run(debug=False, host='0.0.0.0')

然后选一台服务器作为网站服务器, 在上面配置好各台服务器的 IP, 由它向所有服务器的 `/get_server_info` 接口发送请求, 即可得到每台服务器的状态.

虽然可能有开源项目做得更好, 但是懒得折腾了, 属于能用就行.

设置开机自启动 (通过 cron 的 @reboot 任务):

crontab -e
# 2  (推测: 首次运行 crontab -e 时选择编辑器所输入的编号, 以实际提示为准)
# 在打开的文件中添加一行: @reboot bash xxx.sh
service cron reload