247 lines
9.0 KiB
Python
247 lines
9.0 KiB
Python
# resource_monitor.py
|
||
import psutil
|
||
import time
|
||
import threading
|
||
from datetime import datetime
|
||
|
||
import torch
|
||
|
||
from log import logger
|
||
from global_data import gd
|
||
|
||
try:
|
||
import GPUtil
|
||
|
||
GPU_AVAILABLE = True
|
||
except ImportError:
|
||
GPU_AVAILABLE = False
|
||
logger.warning("GPUtil 未安装,GPU监控不可用")
|
||
|
||
|
||
class ResourceMonitor:
    """System resource monitor.

    Samples CPU / memory / disk / network / GPU usage on a daemon thread,
    keeps a bounded in-memory history, and dynamically adjusts the allowed
    number of concurrent tasks based on available resource headroom.
    The current snapshot and task cap are published through the global
    data store (``gd``) for other components to read.
    """

    def __init__(self, config):
        """Create the monitor.

        Args:
            config: dict-like configuration; the optional ``resource_limits``
                entry may override any of the default limit keys below.
        """
        self.config = config
        self.running = True
        self.monitor_thread = None
        self.resource_history = []  # bounded FIFO of snapshot dicts
        self.max_history = 100
        self.lock = threading.Lock()  # guards resource_history

        # Merge caller-supplied limits OVER the defaults so a partial config
        # cannot drop required keys.  The previous wholesale
        # config.get('resource_limits', {...}) replaced the defaults entirely,
        # and the defaults lacked 'check_interval', causing a KeyError in
        # adjust_concurrent_tasks for most configs.
        defaults = {
            'max_cpu_percent': 80,
            'max_memory_percent': 80,
            'max_gpu_memory_percent': 80,
            'max_concurrent_tasks': 5,
            'min_concurrent_tasks': 1,
            'check_interval': 30,  # seconds between concurrency adjustments
        }
        self.resource_limits = {**defaults, **config.get('resource_limits', {})}

        # Dynamic concurrency state.
        self.current_max_tasks = self.resource_limits['max_concurrent_tasks']
        self.adjustment_factor = 1.0
        self.last_adjustment = time.time()

        # Static GPU metadata; stays None when GPUtil/CUDA are unavailable.
        self.gpu_info = None
        if GPU_AVAILABLE and torch.cuda.is_available():
            self.init_gpu_monitor()

    def init_gpu_monitor(self):
        """Cache static per-GPU metadata (id, name, total memory, CUDA version).

        On any failure, logs the error and leaves ``gpu_info`` as None so the
        rest of the monitor degrades to CPU/memory-only sampling.
        """
        try:
            gpus = GPUtil.getGPUs()
            self.gpu_info = [
                {
                    'id': gpu.id,
                    'name': gpu.name,
                    'memory_total': gpu.memoryTotal,
                    # Guard kept for direct callers, although __init__ only
                    # reaches here when CUDA is available.
                    'driver_version': torch.version.cuda if torch.cuda.is_available() else 'Unknown',
                }
                for gpu in gpus
            ]
            logger.info(f"GPU监控已初始化: {len(gpus)}个GPU")
        except Exception as e:
            logger.error(f"GPU监控初始化失败: {str(e)}")
            self.gpu_info = None

    def get_system_resources(self):
        """Return a snapshot dict of current system resource usage.

        Keys: timestamp, cpu_percent, memory_percent, memory_used/total (MB),
        disk_percent, network_io, process_count, and 'gpus' (list of per-GPU
        dicts) when GPU monitoring is active.
        """
        # Sample virtual memory once; three separate calls could disagree.
        vm = psutil.virtual_memory()
        resources = {
            'timestamp': datetime.now().isoformat(),
            'cpu_percent': psutil.cpu_percent(interval=0.1),
            'memory_percent': vm.percent,
            'memory_used': vm.used // (1024 * 1024),  # MB
            'memory_total': vm.total // (1024 * 1024),  # MB
            'disk_percent': psutil.disk_usage('/').percent,
            'network_io': psutil.net_io_counters()._asdict(),
            'process_count': len(psutil.pids()),
        }

        # Live GPU usage; static info was cached at init.
        if self.gpu_info is not None:
            gpu_data = []
            for i, gpu in enumerate(GPUtil.getGPUs()):
                gpu_data.append({
                    'id': gpu.id,
                    'name': gpu.name,
                    'load': gpu.load * 100,
                    'memory_used': gpu.memoryUsed,
                    'memory_total': gpu.memoryTotal,
                    # Guard against a zero-sized report from the driver.
                    'memory_percent': (gpu.memoryUsed / gpu.memoryTotal) * 100 if gpu.memoryTotal else 0.0,
                    'temperature': gpu.temperature,
                    # Fall back when the GPU list grew since init.
                    'driver_version': self.gpu_info[i]['driver_version'] if i < len(self.gpu_info) else 'Unknown',
                })
            resources['gpus'] = gpu_data

        return resources

    def check_resource_limits(self, resources):
        """Return a list of human-readable limit violations (empty if none).

        Args:
            resources: a snapshot dict as produced by get_system_resources().
        """
        violations = []

        # CPU limit.
        if resources['cpu_percent'] > self.resource_limits['max_cpu_percent']:
            violations.append(f"CPU使用率过高: {resources['cpu_percent']:.1f}%")

        # Memory limit.
        if resources['memory_percent'] > self.resource_limits['max_memory_percent']:
            violations.append(f"内存使用率过高: {resources['memory_percent']:.1f}%")

        # Per-GPU memory limit.
        if 'gpus' in resources:
            for gpu in resources['gpus']:
                if gpu['memory_percent'] > self.resource_limits['max_gpu_memory_percent']:
                    violations.append(f"GPU{gpu['id']}内存使用率过高: {gpu['memory_percent']:.1f}%")

        return violations

    def adjust_concurrent_tasks(self, resources):
        """Adjust and return the concurrent-task cap based on current usage.

        Backs off by one task on any limit violation; grows by one task when
        every resource has >30% headroom; shrinks when headroom drops under
        10%.  Adjustments are rate-limited to one per ``check_interval``
        seconds.
        """
        current_time = time.time()

        # Rate-limit adjustments.  Default to 30s when 'check_interval' is
        # absent (the original indexed the key and raised KeyError, since the
        # built-in defaults never defined it).
        check_interval = self.resource_limits.get('check_interval', 30)
        if current_time - self.last_adjustment < check_interval:
            return self.current_max_tasks

        violations = self.check_resource_limits(resources)

        if violations:
            # Over limit: back off by one task, never below the floor.
            if self.current_max_tasks > self.resource_limits['min_concurrent_tasks']:
                self.current_max_tasks -= 1
                logger.warning(f"资源使用过高,减少并发任务数至: {self.current_max_tasks}")
                logger.warning(f"违规项: {', '.join(violations)}")
        else:
            # Within limits: measure headroom (fraction of total, 0.0-1.0)
            # for each resource and act on the tightest one.
            cpu_headroom = (self.resource_limits['max_cpu_percent'] - resources['cpu_percent']) / 100
            memory_headroom = (self.resource_limits['max_memory_percent'] - resources['memory_percent']) / 100

            gpu_headroom = 1.0
            if 'gpus' in resources:
                gpu_headrooms = [
                    (self.resource_limits['max_gpu_memory_percent'] - gpu['memory_percent']) / 100
                    for gpu in resources['gpus']
                ]
                gpu_headroom = min(gpu_headrooms) if gpu_headrooms else 1.0

            available_resources = min(cpu_headroom, memory_headroom, gpu_headroom)

            if available_resources > 0.3:  # >30% headroom: try to grow
                if self.current_max_tasks < self.resource_limits['max_concurrent_tasks']:
                    self.current_max_tasks += 1
                    logger.info(f"资源充足,增加并发任务数至: {self.current_max_tasks}")
            elif available_resources < 0.1:  # <10% headroom: shrink
                if self.current_max_tasks > self.resource_limits['min_concurrent_tasks']:
                    self.current_max_tasks -= 1
                    logger.warning(f"资源紧张,减少并发任务数至: {self.current_max_tasks}")

        self.last_adjustment = current_time
        return self.current_max_tasks

    def monitor_loop(self):
        """Background sampling loop: snapshot, record, adjust, publish.

        Runs until ``self.running`` is cleared; any per-iteration error is
        logged and retried after a longer back-off.
        """
        logger.info("资源监控线程启动")

        ticks = 0
        while self.running:
            try:
                resources = self.get_system_resources()

                with self.lock:
                    # Bounded history: drop the oldest entry past max_history.
                    self.resource_history.append(resources)
                    if len(self.resource_history) > self.max_history:
                        self.resource_history.pop(0)

                    self.adjust_concurrent_tasks(resources)

                    # Publish for other components.
                    gd.set_value('system_resources', resources)
                    gd.set_value('max_concurrent_tasks', self.current_max_tasks)

                # Summary roughly once a minute (12 ticks * 5s).  A loop
                # counter is used instead of len(resource_history) % 12: once
                # the history saturated at max_history (100 % 12 != 0) the
                # old condition could never fire again, silently stopping
                # summaries after ~8 minutes.
                ticks += 1
                if ticks % 12 == 0:
                    self.log_resource_summary(resources)

                time.sleep(5)  # sampling period

            except Exception as e:
                logger.error(f"资源监控异常: {str(e)}")
                time.sleep(10)  # longer back-off after a failure

        logger.info("资源监控线程停止")

    def log_resource_summary(self, resources):
        """Log a one-line summary of the given resource snapshot."""
        summary = [
            f"CPU: {resources['cpu_percent']:.1f}%",
            f"内存: {resources['memory_percent']:.1f}% ({resources['memory_used']}/{resources['memory_total']}MB)",
        ]

        if 'gpus' in resources:
            for gpu in resources['gpus']:
                summary.append(f"GPU{gpu['id']}: {gpu['load']:.1f}%负载, {gpu['memory_percent']:.1f}%内存")

        summary.append(f"并发任务限制: {self.current_max_tasks}")

        logger.info("资源使用摘要: " + " | ".join(summary))

    def start(self):
        """Start the monitoring daemon thread."""
        self.running = True
        self.monitor_thread = threading.Thread(target=self.monitor_loop, daemon=True)
        self.monitor_thread.start()

    def stop(self):
        """Signal the loop to stop and wait (bounded) for the thread to exit."""
        self.running = False
        if self.monitor_thread and self.monitor_thread.is_alive():
            # Bounded join: the loop sleeps up to 10s, don't block forever.
            self.monitor_thread.join(5.0)

    def get_resource_history(self, count=10):
        """Return up to the last ``count`` snapshots (oldest first)."""
        with self.lock:
            # Slicing an empty list already yields [].
            return self.resource_history[-count:]

    def get_current_resources(self):
        """Return the most recent snapshot, or None if nothing was sampled."""
        with self.lock:
            return self.resource_history[-1] if self.resource_history else None
# Module-level singleton monitor instance (None until initialized).
resource_monitor = None


def init_resource_monitor(config):
    """Create the global ResourceMonitor from *config*, start it, return it."""
    global resource_monitor
    resource_monitor = ResourceMonitor(config)
    resource_monitor.start()
    return resource_monitor