#!/usr/bin/env python3
"""
Minimal restart daemon.

It holds no strategy state; it only watches the clock and restarts the
strategy processes.  Design goal: a crash of this daemon must not affect the
running strategies, and a restarted daemon can take over seamlessly.
"""

import sys
import time
import psutil
import logging
from pathlib import Path
from datetime import datetime
import threading
import subprocess


class RestartDaemon:
    """Restart strategy child processes at fixed times every day.

    Strategies are discovered through ``*.pid`` files in ``pid_dir``.  A file
    named ``<Strategy>_<Symbol>.pid`` maps to the config file
    ``strategies/<Strategy>/<Symbol>.py``, which is handed to ``launcher.py``.
    """

    # Daily restart times, local wall-clock time formatted as "HH:MM".
    RESTART_TIMES = ["08:50", "20:50"]

    # Seconds between clock checks.  Deliberately well below 60: the schedule
    # is matched on the exact "HH:MM" string, so a period of 60 s plus loop
    # overhead could step over a scheduled minute without ever sampling it.
    CHECK_INTERVAL_SECONDS = 30

    # Seconds to wait for a process to exit after terminate() / after kill().
    STOP_TIMEOUT_SECONDS = 30
    KILL_TIMEOUT_SECONDS = 5

    # Outcomes of _stop_strategy().
    _STOPPED = "stopped"            # process was running and is now gone
    _ALREADY_DEAD = "already_dead"  # PID file present but no such process
    _FAILED = "failed"              # could not confirm the process is gone

    def __init__(self, pid_dir="pids", log_dir="logs"):
        """Create the working directories and configure logging.

        Args:
            pid_dir: Directory scanned for ``*.pid`` files.
            log_dir: Directory that receives ``restart_daemon.log``.
        """
        self.pid_dir = Path(pid_dir)
        self.log_dir = Path(log_dir)

        # The directories must exist BEFORE logging is configured: the file
        # handler opens a file inside log_dir and raises if it is missing.
        self.pid_dir.mkdir(parents=True, exist_ok=True)
        self.log_dir.mkdir(parents=True, exist_ok=True)

        self.logger = self._setup_logger()
        self.running = False
        self.thread = None
        # Set by stop() so the scheduler thread wakes up immediately instead
        # of finishing its current sleep.
        self._stop_event = threading.Event()

    def _setup_logger(self):
        """Configure root logging (file + stdout) and return the daemon logger."""
        log_file = self.log_dir / "restart_daemon.log"

        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s [%(levelname)s] %(message)s',
            handlers=[
                logging.FileHandler(log_file, encoding='utf-8'),
                logging.StreamHandler(sys.stdout),
            ],
            force=True,  # replace any handlers installed earlier
        )

        logger = logging.getLogger("RestartDaemon")
        logger.info("=" * 80)
        logger.info("📝 日志系统初始化完成")
        logger.info("📂 日志文件: %s", log_file.absolute())
        return logger

    def start(self):
        """Start the scheduler thread and block until stopped or Ctrl+C.

        Calling it while the daemon is already running only logs a warning.
        """
        if self.running:
            self.logger.warning("⚠️ 守护进程已在运行")
            return

        self.running = True
        self._stop_event.clear()  # allow start() again after a stop()
        self.thread = threading.Thread(target=self._check_loop, daemon=True)
        self.thread.start()

        self.logger.info("=" * 80)
        self.logger.info("✅ 重启守护进程已启动")
        self.logger.info("⏰ 监控时间点: %s", ", ".join(self.RESTART_TIMES))
        self.logger.info("📂 PID目录: %s", self.pid_dir.absolute())
        self.logger.info("=" * 80)

        # Block the main thread so the process stays alive.
        try:
            self.logger.info("📌 进程已常驻,按 Ctrl+C 退出...")
            while self.running:
                time.sleep(1)
        except KeyboardInterrupt:
            self.logger.info("\n⏹️ 收到退出信号,正在停止...")
            self.stop()

    def stop(self):
        """Signal the scheduler thread to exit and wait up to 5 s for it."""
        self.running = False
        self._stop_event.set()
        if self.thread:
            # May still time out if a restart is in progress at this moment.
            self.thread.join(timeout=5)
        self.logger.info("✅ 重启守护进程已停止")

    def _check_loop(self):
        """Poll the clock and trigger a restart once per scheduled time per day."""
        # Date on which each scheduled time last fired (None = never).
        last_restart_date = dict.fromkeys(self.RESTART_TIMES)

        while self.running:
            try:
                now = datetime.now()
                current_time = now.strftime("%H:%M")
                current_date = now.date()

                # Fire only if this slot has not already run today; the loop
                # samples each minute more than once.
                if (
                    current_time in last_restart_date
                    and last_restart_date[current_time] != current_date
                ):
                    # Marked before running, so a failing restart is not
                    # retried within the same day.
                    last_restart_date[current_time] = current_date
                    self._perform_restart(current_time)
            except Exception as e:
                # Boundary of the worker thread: log and keep the loop alive.
                self.logger.error("❌ 检查循环出错: %s", e, exc_info=True)
                self.logger.error("=" * 80)

            # Interruptible sleep: returns early as soon as stop() is called.
            self._stop_event.wait(self.CHECK_INTERVAL_SECONDS)

    def _perform_restart(self, time_point: str):
        """Stop every strategy that has a PID file, then launch it again.

        A strategy is relaunched only when its old process is confirmed gone
        (stopped here, or already dead).  If stopping raised an error the
        strategy is skipped, so a second instance is never started next to a
        process that may still be running.  Nothing is launched at all when
        no running process was stopped.

        Args:
            time_point: The scheduled "HH:MM" that triggered this restart
                (used for logging only).
        """
        self.logger.info("\n" + "=" * 80)
        self.logger.info("⏰ 到达重启时间: %s", time_point)
        self.logger.info("=" * 80)

        # 1. Discover strategies through their PID files.
        pid_files = sorted(self.pid_dir.glob("*.pid"))
        if not pid_files:
            self.logger.info("⚠️ 未发现运行中的策略")
            return

        self.logger.info("📋 发现 %d 个策略需要重启", len(pid_files))

        # 2. Stop them, remembering which ones are safe to relaunch.
        stopped_count = 0
        restartable = []
        for pid_file in pid_files:
            outcome = self._stop_strategy(pid_file)
            if outcome == self._FAILED:
                continue
            if outcome == self._STOPPED:
                stopped_count += 1
            restartable.append(pid_file)

        if stopped_count == 0:
            self.logger.warning("⚠️ 未成功停止任何策略")
            return

        # 3. Give the OS a moment to release resources held by old processes.
        self.logger.info("\n⏳ 等待2秒资源释放...")
        time.sleep(2)

        # 4. Relaunch.
        self.logger.info("\n🚀 重新启动所有策略...")
        restarted_count = sum(
            1 for pid_file in restartable if self._start_strategy(pid_file)
        )

        # 5. Summary.  Success means every discovered strategy was relaunched;
        #    comparing against stopped_count would misreport a relaunched
        #    already-dead strategy as a failure.
        self.logger.info("\n" + "=" * 80)
        self.logger.info("📊 重启统计:")
        self.logger.info(" 停止成功: %d个", stopped_count)
        self.logger.info(" 启动成功: %d个", restarted_count)

        if restarted_count == len(pid_files):
            self.logger.info("✅ 所有策略重启成功")
        else:
            self.logger.warning("⚠️ 部分策略重启失败")
        self.logger.info("=" * 80)

    def _stop_strategy(self, pid_file):
        """Stop the process named in ``pid_file``; return one of the outcome constants.

        Sends terminate(), waits STOP_TIMEOUT_SECONDS, then falls back to
        kill() and waits KILL_TIMEOUT_SECONDS.  Any error (unreadable PID
        file, access denied, process surviving kill) is logged and reported
        as ``_FAILED``.
        """
        # NOTE(review): a stale PID file may hold a PID the OS has since
        # reassigned to an unrelated process; the process identity is not
        # verified before it is terminated.
        try:
            pid = int(pid_file.read_text().strip())

            if not psutil.pid_exists(pid):
                self.logger.warning("⚠️ PID文件存在但进程已死: %d", pid)
                return self._ALREADY_DEAD

            try:
                proc = psutil.Process(pid)
                self.logger.info("⏹️ 停止策略 PID %d: %s", pid, proc.name())
                proc.terminate()
            except psutil.NoSuchProcess:
                # Exited between the existence check and terminate().
                self.logger.warning("⚠️ PID文件存在但进程已死: %d", pid)
                return self._ALREADY_DEAD

            try:
                proc.wait(timeout=self.STOP_TIMEOUT_SECONDS)
                self.logger.info("✅ 已优雅停止 PID %d", pid)
            except psutil.TimeoutExpired:
                try:
                    proc.kill()
                    # Confirm it is really gone before it gets relaunched; a
                    # second timeout propagates to the handler below.
                    proc.wait(timeout=self.KILL_TIMEOUT_SECONDS)
                except psutil.NoSuchProcess:
                    pass  # exited on its own just before kill()
                self.logger.info("🔥 强制终止 PID %d", pid)
            return self._STOPPED
        except Exception as e:
            self.logger.error("❌ 停止失败 %s: %s", pid_file, e, exc_info=True)
            return self._FAILED

    def _start_strategy(self, pid_file):
        """Launch the strategy that ``pid_file`` belongs to; return True on success."""
        try:
            # Derive the config path from the PID file name, e.g.
            # DualModeTrendlineHawkesStrategy2_FG.pid
            #   -> strategies/DualModeTrendlineHawkesStrategy2/FG.py
            # The name is split at the FIRST underscore.
            name = pid_file.stem
            if '_' not in name:
                self.logger.error("❌ PID文件名格式错误: %s", name)
                return False

            strategy_name, symbol = name.split('_', 1)
            config_file = Path("strategies") / strategy_name / f"{symbol}.py"

            if not config_file.exists():
                self.logger.error("❌ 配置文件不存在: %s", config_file)
                return False

            # Fire and forget: Popen returns immediately.  Output is discarded
            # here; presumably launcher.py does its own logging — confirm.
            process = subprocess.Popen(
                [sys.executable, "launcher.py", "--config", str(config_file)],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                cwd=Path.cwd(),
            )

            self.logger.info("✅ 启动新进程 PID %d: %s", process.pid, config_file.name)
            return True
        except Exception as e:
            self.logger.error("❌ 启动失败: %s", e, exc_info=True)
            return False


def main():
    """Script entry point: build the daemon with default directories and run it."""
    daemon = RestartDaemon()
    daemon.start()


if __name__ == "__main__":
    main()