# resource_monitor.py
"""System resource monitor.

Samples CPU / memory / disk / network / GPU usage on a daemon thread,
keeps a bounded snapshot history, and dynamically adjusts the allowed
number of concurrent tasks based on measured resource headroom.
"""
import psutil
import time
import threading
from datetime import datetime

import torch

from log import logger
from global_data import gd

try:
    import GPUtil
    GPU_AVAILABLE = True
except ImportError:
    GPU_AVAILABLE = False
    logger.warning("GPUtil 未安装,GPU监控不可用")


class ResourceMonitor:
    """Periodic system-resource monitor with dynamic concurrency limiting."""

    def __init__(self, config):
        """Create the monitor.

        Args:
            config: mapping; may contain a 'resource_limits' dict with keys
                max_cpu_percent / max_memory_percent / max_gpu_memory_percent /
                max_concurrent_tasks / min_concurrent_tasks /
                check_interval (seconds between concurrency adjustments).
        """
        self.config = config
        self.running = True
        self.monitor_thread = None
        self.resource_history = []    # bounded list of recent snapshots
        self.max_history = 100
        self.lock = threading.Lock()  # guards resource_history

        # Resource limits.
        # BUGFIX: 'check_interval' is read in adjust_concurrent_tasks but had
        # no default here, so the first monitor cycle raised a KeyError
        # (silently swallowed by monitor_loop's broad except) whenever the
        # caller's config omitted it. A 30s default is now provided.
        self.resource_limits = config.get('resource_limits', {
            'max_cpu_percent': 80,
            'max_memory_percent': 80,
            'max_gpu_memory_percent': 80,
            'max_concurrent_tasks': 5,
            'min_concurrent_tasks': 1,
            'check_interval': 30,
        })

        # Dynamic-adjustment state.
        self.current_max_tasks = self.resource_limits['max_concurrent_tasks']
        self.adjustment_factor = 1.0
        self.last_adjustment = time.time()

        # Static GPU metadata (filled only when GPUtil and CUDA are usable).
        self.gpu_info = None
        if GPU_AVAILABLE and torch.cuda.is_available():
            self.init_gpu_monitor()

    def init_gpu_monitor(self):
        """Enumerate GPUs once and cache their static metadata.

        On failure, leaves self.gpu_info as None so get_system_resources
        skips GPU sampling entirely.
        """
        try:
            gpus = GPUtil.getGPUs()
            self.gpu_info = []
            for gpu in gpus:
                self.gpu_info.append({
                    'id': gpu.id,
                    'name': gpu.name,
                    'memory_total': gpu.memoryTotal,
                    # GPUtil has no driver field here; the CUDA version is
                    # recorded in its place.
                    'driver_version': torch.version.cuda if torch.cuda.is_available() else 'Unknown'
                })
            logger.info(f"GPU监控已初始化: {len(gpus)}个GPU")
        except Exception as e:
            logger.error(f"GPU监控初始化失败: {str(e)}")
            self.gpu_info = None

    def get_system_resources(self):
        """Return a snapshot dict of current system resource usage."""
        vmem = psutil.virtual_memory()  # single call instead of three
        resources = {
            'timestamp': datetime.now().isoformat(),
            'cpu_percent': psutil.cpu_percent(interval=0.1),
            'memory_percent': vmem.percent,
            'memory_used': vmem.used // (1024 * 1024),    # MB
            'memory_total': vmem.total // (1024 * 1024),  # MB
            'disk_percent': psutil.disk_usage('/').percent,
            'network_io': psutil.net_io_counters()._asdict(),
            'process_count': len(psutil.pids()),
        }

        # Dynamic GPU metrics (static metadata was cached at init).
        # Guarded so a transient GPU query failure does not also lose the
        # CPU/memory metrics gathered above.
        if self.gpu_info is not None:
            try:
                gpu_data = []
                for i, gpu in enumerate(GPUtil.getGPUs()):
                    gpu_data.append({
                        'id': gpu.id,
                        'name': gpu.name,
                        'load': gpu.load * 100,
                        'memory_used': gpu.memoryUsed,
                        'memory_total': gpu.memoryTotal,
                        'memory_percent': (gpu.memoryUsed / gpu.memoryTotal) * 100,
                        'temperature': gpu.temperature,
                        'driver_version': self.gpu_info[i]['driver_version'] if i < len(self.gpu_info) else 'Unknown'
                    })
                resources['gpus'] = gpu_data
            except Exception as e:
                logger.error(f"GPU状态查询失败: {str(e)}")

        return resources

    def check_resource_limits(self, resources):
        """Return a list of human-readable limit violations (empty if none).

        Args:
            resources: snapshot dict from get_system_resources().
        """
        violations = []

        # CPU check
        if resources['cpu_percent'] > self.resource_limits['max_cpu_percent']:
            violations.append(f"CPU使用率过高: {resources['cpu_percent']:.1f}%")

        # Memory check
        if resources['memory_percent'] > self.resource_limits['max_memory_percent']:
            violations.append(f"内存使用率过高: {resources['memory_percent']:.1f}%")

        # Per-GPU memory check
        if 'gpus' in resources:
            for gpu in resources['gpus']:
                if gpu['memory_percent'] > self.resource_limits['max_gpu_memory_percent']:
                    violations.append(f"GPU{gpu['id']}内存使用率过高: {gpu['memory_percent']:.1f}%")

        return violations

    def adjust_concurrent_tasks(self, resources):
        """Adjust and return the allowed concurrent-task count.

        Backs off by one task on any limit violation; when resources are
        comfortably under limit, grows by one task. Throttled to at most one
        change per check_interval seconds.
        """
        current_time = time.time()

        # Throttle adjustments.
        # BUGFIX: fall back to 30s when 'check_interval' is absent from a
        # caller-supplied resource_limits dict (previously raised KeyError).
        check_interval = self.resource_limits.get('check_interval', 30)
        if current_time - self.last_adjustment < check_interval:
            return self.current_max_tasks

        violations = self.check_resource_limits(resources)

        if violations:
            # Over limit: back off by one task (never below the minimum).
            if self.current_max_tasks > self.resource_limits['min_concurrent_tasks']:
                self.current_max_tasks -= 1
                logger.warning(f"资源使用过高,减少并发任务数至: {self.current_max_tasks}")
                logger.warning(f"违规项: {', '.join(violations)}")
        else:
            # Under limit: compute the fraction of headroom left on each
            # resource and scale on the tightest one.
            cpu_headroom = (self.resource_limits['max_cpu_percent'] - resources['cpu_percent']) / 100
            memory_headroom = (self.resource_limits['max_memory_percent'] - resources['memory_percent']) / 100

            # GPU memory headroom (tightest GPU wins).
            gpu_headroom = 1.0
            if 'gpus' in resources:
                gpu_headrooms = []
                for gpu in resources['gpus']:
                    gpu_headrooms.append(
                        (self.resource_limits['max_gpu_memory_percent'] - gpu['memory_percent']) / 100)
                gpu_headroom = min(gpu_headrooms) if gpu_headrooms else 1.0

            available_resources = min(cpu_headroom, memory_headroom, gpu_headroom)

            if available_resources > 0.3:
                # More than 30% headroom: allow one more task.
                if self.current_max_tasks < self.resource_limits['max_concurrent_tasks']:
                    self.current_max_tasks += 1
                    logger.info(f"资源充足,增加并发任务数至: {self.current_max_tasks}")
            elif available_resources < 0.1:
                # Less than 10% headroom: shed one task.
                if self.current_max_tasks > self.resource_limits['min_concurrent_tasks']:
                    self.current_max_tasks -= 1
                    logger.warning(f"资源紧张,减少并发任务数至: {self.current_max_tasks}")

        self.last_adjustment = current_time
        return self.current_max_tasks

    def monitor_loop(self):
        """Background loop: sample, record, adjust, publish — every 5s."""
        logger.info("资源监控线程启动")

        while self.running:
            try:
                resources = self.get_system_resources()

                with self.lock:
                    # Keep a bounded history of snapshots.
                    self.resource_history.append(resources)
                    if len(self.resource_history) > self.max_history:
                        self.resource_history.pop(0)

                # Dynamically adjust the concurrency cap.
                self.adjust_concurrent_tasks(resources)

                # Publish to shared global state for other components.
                gd.set_value('system_resources', resources)
                gd.set_value('max_concurrent_tasks', self.current_max_tasks)

                # Log a summary roughly once a minute (5s * 12 = 60s).
                if len(self.resource_history) % 12 == 0:
                    self.log_resource_summary(resources)

                time.sleep(5)  # sample every 5 seconds
            except Exception as e:
                logger.error(f"资源监控异常: {str(e)}")
                time.sleep(10)  # back off after an error

        logger.info("资源监控线程停止")

    def log_resource_summary(self, resources):
        """Log a one-line usage summary for the given snapshot."""
        summary = [
            f"CPU: {resources['cpu_percent']:.1f}%",
            f"内存: {resources['memory_percent']:.1f}% ({resources['memory_used']}/{resources['memory_total']}MB)",
        ]

        if 'gpus' in resources:
            for gpu in resources['gpus']:
                summary.append(f"GPU{gpu['id']}: {gpu['load']:.1f}%负载, {gpu['memory_percent']:.1f}%内存")

        summary.append(f"并发任务限制: {self.current_max_tasks}")
        logger.info("资源使用摘要: " + " | ".join(summary))

    def start(self):
        """Start the daemon monitor thread (no-op if already running)."""
        # Robustness: avoid spawning a second monitor thread on repeat calls.
        if self.monitor_thread is not None and self.monitor_thread.is_alive():
            return
        self.running = True
        self.monitor_thread = threading.Thread(target=self.monitor_loop, daemon=True)
        self.monitor_thread.start()

    def stop(self):
        """Signal the monitor loop to exit and wait up to 5s for it."""
        self.running = False
        if self.monitor_thread and self.monitor_thread.is_alive():
            self.monitor_thread.join(5.0)

    def get_resource_history(self, count=10):
        """Return the most recent `count` snapshots (newest last)."""
        with self.lock:
            return self.resource_history[-count:] if self.resource_history else []

    def get_current_resources(self):
        """Return the latest snapshot, or None if none collected yet."""
        with self.lock:
            return self.resource_history[-1] if self.resource_history else None


# Module-level singleton, created by init_resource_monitor().
resource_monitor = None


def init_resource_monitor(config):
    """Create, start, and return the module-level ResourceMonitor."""
    global resource_monitor
    resource_monitor = ResourceMonitor(config)
    resource_monitor.start()
    return resource_monitor