247 lines
9.0 KiB
Python
247 lines
9.0 KiB
Python
|
|
# resource_monitor.py
|
|||
|
|
import psutil
|
|||
|
|
import time
|
|||
|
|
import threading
|
|||
|
|
from datetime import datetime
|
|||
|
|
|
|||
|
|
import torch
|
|||
|
|
|
|||
|
|
from log import logger
|
|||
|
|
from global_data import gd
|
|||
|
|
|
|||
|
|
# Optional dependency: GPU monitoring works only when GPUtil is importable.
try:
    import GPUtil
except ImportError:
    GPU_AVAILABLE = False
    logger.warning("GPUtil 未安装,GPU监控不可用")
else:
    GPU_AVAILABLE = True
|
|||
|
|
|
|||
|
|
|
|||
|
|
class ResourceMonitor:
    """System resource monitor.

    Samples CPU / memory / disk / network / GPU usage on a background
    daemon thread, keeps a bounded in-memory history, and dynamically
    adjusts the allowed number of concurrent tasks based on headroom
    against configured limits.  Snapshots are also published to the
    global data store (``gd``) for other components to read.
    """

    def __init__(self, config):
        """
        Args:
            config: mapping; may contain a ``'resource_limits'`` dict.
                Missing limit keys fall back to built-in defaults.
        """
        self.config = config
        self.running = True
        self.monitor_thread = None   # created by start()
        self.resource_history = []   # bounded list of snapshots, oldest first
        self.max_history = 100
        self.lock = threading.Lock()  # guards resource_history

        # Merge user-supplied limits OVER the defaults.  The previous code
        # replaced the whole dict with config['resource_limits'], so a config
        # missing any key (notably 'check_interval', which the defaults never
        # contained at all) raised KeyError in adjust_concurrent_tasks.
        limits = {
            'max_cpu_percent': 80,
            'max_memory_percent': 80,
            'max_gpu_memory_percent': 80,
            'max_concurrent_tasks': 5,
            'min_concurrent_tasks': 1,
            'check_interval': 30,  # seconds between concurrency adjustments
        }
        limits.update(config.get('resource_limits', {}))
        self.resource_limits = limits

        # Dynamic concurrency state.
        self.current_max_tasks = self.resource_limits['max_concurrent_tasks']
        self.adjustment_factor = 1.0
        self.last_adjustment = time.time()

        # Static per-GPU metadata; stays None when GPU monitoring is unavailable.
        self.gpu_info = None
        if GPU_AVAILABLE and torch.cuda.is_available():
            self.init_gpu_monitor()

    def init_gpu_monitor(self):
        """Collect static metadata for each visible GPU via GPUtil.

        On any failure, logs the error and leaves ``self.gpu_info`` as None
        so the rest of the monitor degrades to CPU/memory-only mode.
        """
        try:
            gpus = GPUtil.getGPUs()
            self.gpu_info = []
            for gpu in gpus:
                self.gpu_info.append({
                    'id': gpu.id,
                    'name': gpu.name,
                    'memory_total': gpu.memoryTotal,
                    # NOTE(review): torch.version.cuda is the CUDA toolkit
                    # version, not the driver version proper — confirm intent.
                    'driver_version': torch.version.cuda if torch.cuda.is_available() else 'Unknown'
                })
            logger.info(f"GPU监控已初始化: {len(gpus)}个GPU")
        except Exception as e:
            logger.error(f"GPU监控初始化失败: {str(e)}")
            self.gpu_info = None

    def get_system_resources(self):
        """Return a snapshot dict of current system resource usage.

        Includes a ``'gpus'`` list only when GPU monitoring is initialized.
        Memory figures are in MB; percentages are 0–100 floats.
        """
        vmem = psutil.virtual_memory()  # sample once, reuse below
        resources = {
            'timestamp': datetime.now().isoformat(),
            'cpu_percent': psutil.cpu_percent(interval=0.1),
            'memory_percent': vmem.percent,
            'memory_used': vmem.used // (1024 * 1024),    # MB
            'memory_total': vmem.total // (1024 * 1024),  # MB
            'disk_percent': psutil.disk_usage('/').percent,
            'network_io': psutil.net_io_counters()._asdict(),
            'process_count': len(psutil.pids()),
        }

        # Live per-GPU metrics (only when init_gpu_monitor succeeded).
        if self.gpu_info is not None:
            gpu_data = []
            for i, gpu in enumerate(GPUtil.getGPUs()):
                total = gpu.memoryTotal
                gpu_data.append({
                    'id': gpu.id,
                    'name': gpu.name,
                    'load': gpu.load * 100,
                    'memory_used': gpu.memoryUsed,
                    'memory_total': total,
                    # Guard against a zero total reported by the driver.
                    'memory_percent': (gpu.memoryUsed / total) * 100 if total else 0.0,
                    'temperature': gpu.temperature,
                    'driver_version': self.gpu_info[i]['driver_version'] if i < len(self.gpu_info) else 'Unknown'
                })
            resources['gpus'] = gpu_data

        return resources

    def check_resource_limits(self, resources):
        """Return a list of human-readable limit-violation messages.

        An empty list means every monitored resource is within its limit.
        """
        violations = []

        # CPU check
        if resources['cpu_percent'] > self.resource_limits['max_cpu_percent']:
            violations.append(f"CPU使用率过高: {resources['cpu_percent']:.1f}%")

        # Memory check
        if resources['memory_percent'] > self.resource_limits['max_memory_percent']:
            violations.append(f"内存使用率过高: {resources['memory_percent']:.1f}%")

        # Per-GPU memory check (no-op when the snapshot has no 'gpus' key).
        for gpu in resources.get('gpus', []):
            if gpu['memory_percent'] > self.resource_limits['max_gpu_memory_percent']:
                violations.append(f"GPU{gpu['id']}内存使用率过高: {gpu['memory_percent']:.1f}%")

        return violations

    def adjust_concurrent_tasks(self, resources):
        """Adjust and return the allowed number of concurrent tasks.

        Backs off by one task when any limit is violated; cautiously grows
        by one when the scarcest resource has >30% headroom, and shrinks
        when headroom drops below 10%.  Adjustments are rate-limited to one
        per 'check_interval' seconds.
        """
        current_time = time.time()

        # Rate-limit: skip if the last adjustment was too recent.
        if current_time - self.last_adjustment < self.resource_limits['check_interval']:
            return self.current_max_tasks

        violations = self.check_resource_limits(resources)

        if violations:
            # Over the limits: back off by one task, down to the floor.
            if self.current_max_tasks > self.resource_limits['min_concurrent_tasks']:
                self.current_max_tasks -= 1
                logger.warning(f"资源使用过高,减少并发任务数至: {self.current_max_tasks}")
                logger.warning(f"违规项: {', '.join(violations)}")
        else:
            # Within limits: scale with the scarcest resource's headroom
            # (each headroom is a 0–1 fraction of the configured cap).
            cpu_headroom = (self.resource_limits['max_cpu_percent'] - resources['cpu_percent']) / 100
            memory_headroom = (self.resource_limits['max_memory_percent'] - resources['memory_percent']) / 100

            gpu_headroom = 1.0
            if 'gpus' in resources:
                gpu_headrooms = [
                    (self.resource_limits['max_gpu_memory_percent'] - gpu['memory_percent']) / 100
                    for gpu in resources['gpus']
                ]
                gpu_headroom = min(gpu_headrooms) if gpu_headrooms else 1.0

            available_resources = min(cpu_headroom, memory_headroom, gpu_headroom)

            if available_resources > 0.3:  # >30% headroom: grow by one task
                if self.current_max_tasks < self.resource_limits['max_concurrent_tasks']:
                    self.current_max_tasks += 1
                    logger.info(f"资源充足,增加并发任务数至: {self.current_max_tasks}")
            elif available_resources < 0.1:  # <10% headroom: shrink by one
                if self.current_max_tasks > self.resource_limits['min_concurrent_tasks']:
                    self.current_max_tasks -= 1
                    logger.warning(f"资源紧张,减少并发任务数至: {self.current_max_tasks}")

        self.last_adjustment = current_time
        return self.current_max_tasks

    def monitor_loop(self):
        """Background loop: sample, record, adjust, publish — every 5 seconds."""
        logger.info("资源监控线程启动")

        while self.running:
            try:
                resources = self.get_system_resources()

                # Hold the lock only for the history mutation so readers
                # (get_resource_history / get_current_resources) are never
                # blocked across the 5s sleep.
                with self.lock:
                    self.resource_history.append(resources)
                    if len(self.resource_history) > self.max_history:
                        self.resource_history.pop(0)

                # Dynamically adjust the concurrency limit.
                self.adjust_concurrent_tasks(resources)

                # Publish for other components.
                gd.set_value('system_resources', resources)
                gd.set_value('max_concurrent_tasks', self.current_max_tasks)

                # Log a summary roughly once per minute (5s * 12 = 60s).
                if len(self.resource_history) % 12 == 0:
                    self.log_resource_summary(resources)

                time.sleep(5)  # sampling period

            except Exception as e:
                logger.error(f"资源监控异常: {str(e)}")
                time.sleep(10)  # back off after a failure

        logger.info("资源监控线程停止")

    def log_resource_summary(self, resources):
        """Log a one-line summary of the given snapshot."""
        summary = [
            f"CPU: {resources['cpu_percent']:.1f}%",
            f"内存: {resources['memory_percent']:.1f}% ({resources['memory_used']}/{resources['memory_total']}MB)",
        ]

        for gpu in resources.get('gpus', []):
            summary.append(f"GPU{gpu['id']}: {gpu['load']:.1f}%负载, {gpu['memory_percent']:.1f}%内存")

        summary.append(f"并发任务限制: {self.current_max_tasks}")

        logger.info("资源使用摘要: " + " | ".join(summary))

    def start(self):
        """Start the monitoring loop on a daemon thread."""
        self.running = True
        self.monitor_thread = threading.Thread(target=self.monitor_loop, daemon=True)
        self.monitor_thread.start()

    def stop(self):
        """Signal the loop to stop and wait up to 5s for the thread to exit."""
        self.running = False
        if self.monitor_thread and self.monitor_thread.is_alive():
            self.monitor_thread.join(5.0)

    def get_resource_history(self, count=10):
        """Return the most recent ``count`` snapshots (oldest first)."""
        with self.lock:
            return self.resource_history[-count:] if self.resource_history else []

    def get_current_resources(self):
        """Return the latest snapshot, or None if nothing has been sampled yet."""
        with self.lock:
            return self.resource_history[-1] if self.resource_history else None
|
|||
|
|
|
|||
|
|
|
|||
|
|
# 全局资源监控器实例
|
|||
|
|
# Module-level singleton monitor instance; set by init_resource_monitor().
resource_monitor = None


def init_resource_monitor(config):
    """Create the global ResourceMonitor, start its thread, and return it."""
    global resource_monitor
    monitor = ResourceMonitor(config)
    monitor.start()
    resource_monitor = monitor
    return resource_monitor
|