Yolov/resource_monitor.py

247 lines
9.0 KiB
Python
Raw Normal View History

2025-12-11 13:41:07 +08:00
# resource_monitor.py
import psutil
import time
import threading
from datetime import datetime
import torch
from log import logger
from global_data import gd
# Optional dependency: GPUtil is only needed for GPU metrics. When the import
# fails the module still loads and GPU_AVAILABLE tells callers to skip GPU work.
try:
    import GPUtil
except ImportError:
    GPU_AVAILABLE = False
    logger.warning("GPUtil 未安装GPU监控不可用")
else:
    GPU_AVAILABLE = True
class ResourceMonitor:
    """Background monitor for CPU, memory, disk, network and (optionally) GPU usage.

    A daemon thread samples the system periodically, keeps a bounded history
    of samples, publishes the latest sample and the current concurrency limit
    through ``gd.set_value`` and raises or lowers ``current_max_tasks``
    depending on how much headroom is left under the configured limits.
    """

    # Fallback for every limit key. Keys missing from the configured
    # ``resource_limits`` are filled in from here. ``check_interval`` is the
    # minimum number of seconds between two concurrency adjustments.
    DEFAULT_RESOURCE_LIMITS = {
        'max_cpu_percent': 80,
        'max_memory_percent': 80,
        'max_gpu_memory_percent': 80,
        'max_concurrent_tasks': 5,
        'min_concurrent_tasks': 1,
        'check_interval': 30,
    }
    SAMPLE_INTERVAL = 5    # seconds between two samples
    ERROR_BACKOFF = 10     # seconds to wait after a failed sample
    SUMMARY_EVERY = 12     # log a summary every N samples (12 * 5 s = 60 s)

    def __init__(self, config):
        """Set up monitor state; the thread itself is started by ``start()``.

        Args:
            config: Mapping that may contain a ``'resource_limits'`` mapping.
                Limits it does not define fall back to
                ``DEFAULT_RESOURCE_LIMITS``.
        """
        self.config = config
        self.running = True
        self.monitor_thread = None
        self.resource_history = []
        self.max_history = 100
        self.lock = threading.Lock()
        # Set by stop() so the loop's wait is interrupted immediately.
        self._stop_event = threading.Event()

        # Resource limits: configured values override the defaults key by key,
        # so a partial configuration can no longer cause a KeyError later on.
        configured_limits = config.get('resource_limits') or {}
        self.resource_limits = {**self.DEFAULT_RESOURCE_LIMITS, **configured_limits}

        # Dynamic adjustment state.
        self.current_max_tasks = self.resource_limits['max_concurrent_tasks']
        self.adjustment_factor = 1.0
        self.last_adjustment = time.time()

        # Static GPU information; stays None when GPU monitoring is unavailable.
        self.gpu_info = None
        if GPU_AVAILABLE and torch.cuda.is_available():
            self.init_gpu_monitor()

    def init_gpu_monitor(self):
        """Collect static per-GPU information into ``self.gpu_info``.

        On any failure the error is logged and ``self.gpu_info`` is reset to
        ``None``, which disables GPU sampling.
        """
        try:
            gpus = GPUtil.getGPUs()
            # NOTE(review): the value stored under 'driver_version' is
            # torch.version.cuda - confirm whether the NVIDIA driver version
            # was intended instead.
            cuda_version = torch.version.cuda if torch.cuda.is_available() else 'Unknown'
            self.gpu_info = [
                {
                    'id': gpu.id,
                    'name': gpu.name,
                    'memory_total': gpu.memoryTotal,
                    'driver_version': cuda_version,
                }
                for gpu in gpus
            ]
            logger.info(f"GPU监控已初始化: {len(gpus)}个GPU")
        except Exception as e:
            logger.error(f"GPU监控初始化失败: {str(e)}")
            self.gpu_info = None

    def _gpu_snapshot(self, index, gpu):
        """Build the per-GPU entry of a sample from one GPUtil GPU object."""
        total = gpu.memoryTotal
        known = self.gpu_info[index] if index < len(self.gpu_info) else None
        return {
            'id': gpu.id,
            'name': gpu.name,
            'load': gpu.load * 100,
            'memory_used': gpu.memoryUsed,
            'memory_total': total,
            # A device reporting no memory would otherwise divide by zero.
            'memory_percent': (gpu.memoryUsed / total) * 100 if total else 0.0,
            'temperature': gpu.temperature,
            'driver_version': known['driver_version'] if known else 'Unknown',
        }

    def get_system_resources(self):
        """Take one sample of current system resource usage.

        Returns:
            dict with ``timestamp`` (ISO string), ``cpu_percent``,
            ``memory_percent``, ``memory_used`` / ``memory_total`` (MB),
            ``disk_percent`` for ``/``, ``network_io`` and ``process_count``.
            When GPU monitoring is initialised a ``gpus`` list with one dict
            per GPU is added.
        """
        timestamp = datetime.now().isoformat()
        cpu_percent = psutil.cpu_percent(interval=0.1)
        # One call, so percent / used / total describe the same instant.
        memory = psutil.virtual_memory()
        mib = 1024 * 1024
        resources = {
            'timestamp': timestamp,
            'cpu_percent': cpu_percent,
            'memory_percent': memory.percent,
            'memory_used': memory.used // mib,  # MB
            'memory_total': memory.total // mib,  # MB
            'disk_percent': psutil.disk_usage('/').percent,
            'network_io': psutil.net_io_counters()._asdict(),
            'process_count': len(psutil.pids()),
        }

        if self.gpu_info is not None:
            resources['gpus'] = [
                self._gpu_snapshot(index, gpu)
                for index, gpu in enumerate(GPUtil.getGPUs())
            ]
        return resources

    def check_resource_limits(self, resources):
        """Return a list of messages, one per limit exceeded by ``resources``.

        Args:
            resources: A sample as produced by ``get_system_resources``.

        Returns:
            list[str]; empty when CPU, memory and every GPU's memory are
            within their configured maximum.
        """
        limits = self.resource_limits
        violations = []

        if resources['cpu_percent'] > limits['max_cpu_percent']:
            violations.append(f"CPU使用率过高: {resources['cpu_percent']:.1f}%")

        if resources['memory_percent'] > limits['max_memory_percent']:
            violations.append(f"内存使用率过高: {resources['memory_percent']:.1f}%")

        violations.extend(
            f"GPU{gpu['id']}内存使用率过高: {gpu['memory_percent']:.1f}%"
            for gpu in resources.get('gpus', [])
            if gpu['memory_percent'] > limits['max_gpu_memory_percent']
        )
        return violations

    def adjust_concurrent_tasks(self, resources):
        """Raise or lower ``current_max_tasks`` based on one sample.

        Nothing changes until ``check_interval`` seconds have passed since the
        previous adjustment. After that the limit drops by one when any limit
        is violated or less than 10% headroom remains, and rises by one when
        more than 30% headroom remains, always staying within
        ``min_concurrent_tasks`` .. ``max_concurrent_tasks``.

        Args:
            resources: A sample as produced by ``get_system_resources``.

        Returns:
            The (possibly updated) ``current_max_tasks``.
        """
        limits = self.resource_limits
        now = time.time()

        # .get() keeps this working when resource_limits was replaced by a
        # mapping that does not define check_interval.
        interval = limits.get(
            'check_interval', self.DEFAULT_RESOURCE_LIMITS['check_interval']
        )
        if now - self.last_adjustment < interval:
            return self.current_max_tasks

        violations = self.check_resource_limits(resources)
        if violations:
            # Usage is over a limit: shed one task slot if we still can.
            if self.current_max_tasks > limits['min_concurrent_tasks']:
                self.current_max_tasks -= 1
                logger.warning(f"资源使用过高,减少并发任务数至: {self.current_max_tasks}")
                logger.warning(f"违规项: {', '.join(violations)}")
        else:
            # Usage is fine: the tightest resource decides what happens next.
            cpu_headroom = (limits['max_cpu_percent'] - resources['cpu_percent']) / 100
            memory_headroom = (limits['max_memory_percent'] - resources['memory_percent']) / 100
            gpu_headroom = min(
                (
                    (limits['max_gpu_memory_percent'] - gpu['memory_percent']) / 100
                    for gpu in resources.get('gpus', [])
                ),
                default=1.0,
            )
            available_resources = min(cpu_headroom, memory_headroom, gpu_headroom)

            if available_resources > 0.3:  # more than 30% headroom
                if self.current_max_tasks < limits['max_concurrent_tasks']:
                    self.current_max_tasks += 1
                    logger.info(f"资源充足,增加并发任务数至: {self.current_max_tasks}")
            elif available_resources < 0.1:  # less than 10% headroom
                if self.current_max_tasks > limits['min_concurrent_tasks']:
                    self.current_max_tasks -= 1
                    logger.warning(f"资源紧张,减少并发任务数至: {self.current_max_tasks}")

        self.last_adjustment = now
        return self.current_max_tasks

    def monitor_loop(self):
        """Thread body: sample, record, adjust and publish until stopped.

        Any exception raised while sampling is logged and followed by a
        longer wait; it never terminates the loop.
        """
        logger.info("资源监控线程启动")
        # A stale stop request from an earlier run must not turn the waits
        # below into a busy loop.
        self._stop_event.clear()
        # Counted separately from the history length, which stops growing once
        # the history is full and therefore cannot drive periodic logging.
        samples_taken = 0
        while self.running:
            delay = self.SAMPLE_INTERVAL
            try:
                # Sampling blocks briefly, so it is done outside the lock.
                resources = self.get_system_resources()
                with self.lock:
                    self.resource_history.append(resources)
                    overflow = len(self.resource_history) - self.max_history
                    if overflow > 0:
                        del self.resource_history[:overflow]
                    self.adjust_concurrent_tasks(resources)
                    gd.set_value('system_resources', resources)
                    gd.set_value('max_concurrent_tasks', self.current_max_tasks)
                samples_taken += 1
                if samples_taken % self.SUMMARY_EVERY == 0:
                    self.log_resource_summary(resources)
            except Exception as e:
                logger.error(f"资源监控异常: {str(e)}")
                delay = self.ERROR_BACKOFF
            # Returns early as soon as stop() sets the event.
            self._stop_event.wait(delay)
        logger.info("资源监控线程停止")

    def log_resource_summary(self, resources):
        """Log a one-line summary of ``resources`` and the current task limit."""
        summary = [
            f"CPU: {resources['cpu_percent']:.1f}%",
            f"内存: {resources['memory_percent']:.1f}% ({resources['memory_used']}/{resources['memory_total']}MB)",
        ]
        summary.extend(
            f"GPU{gpu['id']}: {gpu['load']:.1f}%负载, {gpu['memory_percent']:.1f}%内存"
            for gpu in resources.get('gpus', [])
        )
        summary.append(f"并发任务限制: {self.current_max_tasks}")
        logger.info("资源使用摘要: " + " | ".join(summary))

    def start(self):
        """Start the monitoring thread; does nothing if it is already running."""
        self.running = True
        self._stop_event.clear()
        if self.monitor_thread is not None and self.monitor_thread.is_alive():
            return
        self.monitor_thread = threading.Thread(target=self.monitor_loop, daemon=True)
        self.monitor_thread.start()

    def stop(self):
        """Ask the monitoring thread to exit and wait up to 5 seconds for it."""
        self.running = False
        self._stop_event.set()
        thread = self.monitor_thread
        if thread and thread.is_alive():
            thread.join(5.0)

    def get_resource_history(self, count=10):
        """Return the most recent ``count`` samples, oldest first.

        Args:
            count: Maximum number of samples to return. Zero or a negative
                value yields an empty list.

        Returns:
            A new list holding at most ``count`` sample dicts.
        """
        if count <= 0:
            # history[-0:] would return the whole history instead of nothing.
            return []
        with self.lock:
            return self.resource_history[-count:]

    def get_current_resources(self):
        """Return the latest sample, or ``None`` if nothing was recorded yet."""
        with self.lock:
            return self.resource_history[-1] if self.resource_history else None
# Module-level ResourceMonitor instance; None until init_resource_monitor() assigns it.
resource_monitor = None
def init_resource_monitor(config):
    """Create and start the module-level resource monitor.

    If a monitor from an earlier call is still registered it is stopped
    first, so repeated initialisation does not leave a second monitoring
    thread running next to the new one.

    Args:
        config: Mapping passed unchanged to ``ResourceMonitor``.

    Returns:
        The newly started ``ResourceMonitor``, which is also stored in the
        module-level ``resource_monitor``.
    """
    global resource_monitor
    previous = resource_monitor
    if previous is not None:
        previous.stop()
    resource_monitor = ResourceMonitor(config)
    resource_monitor.start()
    return resource_monitor