Yolov/resource_monitor.py

247 lines
9.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# resource_monitor.py
import psutil
import time
import threading
from datetime import datetime
import torch
from log import logger
from global_data import gd
# Optional dependency: GPU monitoring needs GPUtil; degrade gracefully
# when it is not installed instead of failing at import time.
try:
    import GPUtil
    GPU_AVAILABLE = True
except ImportError:
    GPU_AVAILABLE = False
    # Warning text (Chinese): "GPUtil not installed, GPU monitoring unavailable"
    logger.warning("GPUtil 未安装GPU监控不可用")
class ResourceMonitor:
"""系统资源监控器"""
def __init__(self, config):
self.config = config
self.running = True
self.monitor_thread = None
self.resource_history = []
self.max_history = 100
self.lock = threading.Lock()
# 资源限制
self.resource_limits = config.get('resource_limits', {
'max_cpu_percent': 80,
'max_memory_percent': 80,
'max_gpu_memory_percent': 80,
'max_concurrent_tasks': 5,
'min_concurrent_tasks': 1
})
# 动态调整参数
self.current_max_tasks = self.resource_limits['max_concurrent_tasks']
self.adjustment_factor = 1.0
self.last_adjustment = time.time()
# GPU信息
self.gpu_info = None
if GPU_AVAILABLE and torch.cuda.is_available():
self.init_gpu_monitor()
def init_gpu_monitor(self):
"""初始化GPU监控"""
try:
gpus = GPUtil.getGPUs()
self.gpu_info = []
for gpu in gpus:
self.gpu_info.append({
'id': gpu.id,
'name': gpu.name,
'memory_total': gpu.memoryTotal,
'driver_version': torch.version.cuda if torch.cuda.is_available() else 'Unknown'
})
logger.info(f"GPU监控已初始化: {len(gpus)}个GPU")
except Exception as e:
logger.error(f"GPU监控初始化失败: {str(e)}")
self.gpu_info = None
def get_system_resources(self):
"""获取系统资源使用情况"""
resources = {
'timestamp': datetime.now().isoformat(),
'cpu_percent': psutil.cpu_percent(interval=0.1),
'memory_percent': psutil.virtual_memory().percent,
'memory_used': psutil.virtual_memory().used // (1024 * 1024), # MB
'memory_total': psutil.virtual_memory().total // (1024 * 1024), # MB
'disk_percent': psutil.disk_usage('/').percent,
'network_io': psutil.net_io_counters()._asdict(),
'process_count': len(psutil.pids()),
}
# GPU信息
if self.gpu_info is not None:
gpus = GPUtil.getGPUs()
gpu_data = []
for i, gpu in enumerate(gpus):
gpu_data.append({
'id': gpu.id,
'name': gpu.name,
'load': gpu.load * 100,
'memory_used': gpu.memoryUsed,
'memory_total': gpu.memoryTotal,
'memory_percent': (gpu.memoryUsed / gpu.memoryTotal) * 100,
'temperature': gpu.temperature,
'driver_version': self.gpu_info[i]['driver_version'] if i < len(self.gpu_info) else 'Unknown'
})
resources['gpus'] = gpu_data
return resources
def check_resource_limits(self, resources):
"""检查资源是否超过限制"""
violations = []
# CPU检查
if resources['cpu_percent'] > self.resource_limits['max_cpu_percent']:
violations.append(f"CPU使用率过高: {resources['cpu_percent']:.1f}%")
# 内存检查
if resources['memory_percent'] > self.resource_limits['max_memory_percent']:
violations.append(f"内存使用率过高: {resources['memory_percent']:.1f}%")
# GPU检查
if 'gpus' in resources:
for gpu in resources['gpus']:
if gpu['memory_percent'] > self.resource_limits['max_gpu_memory_percent']:
violations.append(f"GPU{gpu['id']}内存使用率过高: {gpu['memory_percent']:.1f}%")
return violations
def adjust_concurrent_tasks(self, resources):
"""根据资源使用动态调整并发任务数"""
current_time = time.time()
# 检查调整间隔
if current_time - self.last_adjustment < self.resource_limits['check_interval']:
return self.current_max_tasks
violations = self.check_resource_limits(resources)
if violations:
# 资源使用过高,减少并发任务
if self.current_max_tasks > self.resource_limits['min_concurrent_tasks']:
self.current_max_tasks -= 1
logger.warning(f"资源使用过高,减少并发任务数至: {self.current_max_tasks}")
logger.warning(f"违规项: {', '.join(violations)}")
else:
# 资源使用正常,尝试增加并发任务
safety_margin = 0.8 # 安全边际
cpu_headroom = (self.resource_limits['max_cpu_percent'] - resources['cpu_percent']) / 100
memory_headroom = (self.resource_limits['max_memory_percent'] - resources['memory_percent']) / 100
# 考虑GPU内存
gpu_headroom = 1.0
if 'gpus' in resources:
gpu_headrooms = []
for gpu in resources['gpus']:
gpu_headrooms.append((self.resource_limits['max_gpu_memory_percent'] - gpu['memory_percent']) / 100)
gpu_headroom = min(gpu_headrooms) if gpu_headrooms else 1.0
# 计算可用资源比例
available_resources = min(cpu_headroom, memory_headroom, gpu_headroom)
# 根据可用资源调整任务数
if available_resources > 0.3: # 有30%以上余量
if self.current_max_tasks < self.resource_limits['max_concurrent_tasks']:
self.current_max_tasks += 1
logger.info(f"资源充足,增加并发任务数至: {self.current_max_tasks}")
elif available_resources < 0.1: # 余量不足10%
if self.current_max_tasks > self.resource_limits['min_concurrent_tasks']:
self.current_max_tasks -= 1
logger.warning(f"资源紧张,减少并发任务数至: {self.current_max_tasks}")
self.last_adjustment = current_time
return self.current_max_tasks
def monitor_loop(self):
"""监控循环"""
logger.info("资源监控线程启动")
while self.running:
try:
# 获取资源使用情况
resources = self.get_system_resources()
with self.lock:
# 保存历史记录
self.resource_history.append(resources)
if len(self.resource_history) > self.max_history:
self.resource_history.pop(0)
# 动态调整并发任务数
self.adjust_concurrent_tasks(resources)
# 更新全局数据
gd.set_value('system_resources', resources)
gd.set_value('max_concurrent_tasks', self.current_max_tasks)
# 记录资源使用情况(每分钟一次)
if len(self.resource_history) % 12 == 0: # 5秒 * 12 = 60秒
self.log_resource_summary(resources)
time.sleep(5) # 5秒检查一次
except Exception as e:
logger.error(f"资源监控异常: {str(e)}")
time.sleep(10)
logger.info("资源监控线程停止")
def log_resource_summary(self, resources):
"""记录资源使用摘要"""
summary = [
f"CPU: {resources['cpu_percent']:.1f}%",
f"内存: {resources['memory_percent']:.1f}% ({resources['memory_used']}/{resources['memory_total']}MB)",
]
if 'gpus' in resources:
for gpu in resources['gpus']:
summary.append(f"GPU{gpu['id']}: {gpu['load']:.1f}%负载, {gpu['memory_percent']:.1f}%内存")
summary.append(f"并发任务限制: {self.current_max_tasks}")
logger.info("资源使用摘要: " + " | ".join(summary))
def start(self):
"""启动监控"""
self.running = True
self.monitor_thread = threading.Thread(target=self.monitor_loop, daemon=True)
self.monitor_thread.start()
def stop(self):
"""停止监控"""
self.running = False
if self.monitor_thread and self.monitor_thread.is_alive():
self.monitor_thread.join(5.0)
def get_resource_history(self, count=10):
"""获取最近资源历史"""
with self.lock:
return self.resource_history[-count:] if self.resource_history else []
def get_current_resources(self):
"""获取当前资源使用情况"""
with self.lock:
return self.resource_history[-1] if self.resource_history else None
# Module-level singleton instance, populated by init_resource_monitor().
resource_monitor = None


def init_resource_monitor(config):
    """Create, start, and return the module-level ResourceMonitor.

    Args:
        config: configuration dict forwarded to ResourceMonitor.

    Returns:
        The started ResourceMonitor instance (also stored in the
        module-level `resource_monitor` global).
    """
    global resource_monitor
    resource_monitor = ResourceMonitor(config)
    resource_monitor.start()
    return resource_monitor