#!/usr/bin/env python3
"""
Minimal restart daemon.

The daemon holds no strategy state: it only watches the clock and restarts
the strategy processes. A crash of the daemon therefore does not affect the
running strategies, and a freshly started daemon can take over seamlessly.
"""

import sys
import time
import psutil
import logging
from pathlib import Path
from datetime import datetime
import threading
import subprocess


class RestartDaemon:
    """Restart the strategy child processes at fixed times of day.

    Strategies are discovered through ``*.pid`` files in ``pid_dir``. A file
    named ``<Strategy>_<Symbol>.pid`` maps to the configuration file
    ``strategies/<Strategy>/<Symbol>.config`` (relative to the current
    working directory), which is passed to ``launcher.py`` on restart.
    """

    # Daily restart times, "HH:MM", compared against datetime.now().
    RESTART_TIMES = ["08:50", "20:50"]

    # Seconds between clock checks. Kept well below 60 so that scheduling
    # drift can never step over the single minute in which a restart time
    # matches; the per-day guard in _check_loop prevents double firing.
    CHECK_INTERVAL = 20

    def __init__(self, pid_dir="pids", log_dir="logs"):
        """Create the daemon and make sure its directories exist.

        Args:
            pid_dir: Directory scanned for ``*.pid`` files.
            log_dir: Directory that receives ``restart_daemon.log``.
        """
        self.pid_dir = Path(pid_dir)
        self.log_dir = Path(log_dir)
        self.logger = self._setup_logger()
        self.running = False
        self.thread = None
        # Set by stop() so the checker thread wakes up immediately instead
        # of sitting out the remainder of its sleep.
        self._stop_event = threading.Event()

        # Make sure the PID directory exists (including missing parents).
        self.pid_dir.mkdir(parents=True, exist_ok=True)

    def _setup_logger(self):
        """Configure logging to file and stdout and return the daemon logger."""
        self.log_dir.mkdir(parents=True, exist_ok=True)
        log_file = self.log_dir / "restart_daemon.log"

        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s [%(levelname)s] %(message)s',
            handlers=[
                logging.FileHandler(log_file, encoding='utf-8'),
                logging.StreamHandler(sys.stdout)
            ]
        )
        return logging.getLogger("RestartDaemon")

    def start(self):
        """Start the checker thread and block until the daemon is stopped.

        Returns immediately (with a warning) if the daemon is already
        running. Ctrl+C in the calling thread triggers stop().
        """
        if self.running:
            self.logger.warning("⚠️ 守护进程已在运行")
            return

        self.running = True
        self._stop_event.clear()
        self.thread = threading.Thread(target=self._check_loop, daemon=True)
        self.thread.start()

        self.logger.info("=" * 80)
        self.logger.info("✅ 重启守护进程已启动")
        self.logger.info("⏰ 监控时间点: {}".format(", ".join(self.RESTART_TIMES)))
        self.logger.info("📂 PID目录: {}".format(self.pid_dir.absolute()))
        self.logger.info("=" * 80)

        # Block the calling thread so the process stays alive.
        try:
            self.logger.info("📌 进程已常驻,按 Ctrl+C 退出...")
            while self.running:
                time.sleep(1)
        except KeyboardInterrupt:
            self.logger.info("\n⏹️ 收到退出信号,正在停止...")
            self.stop()

    def stop(self):
        """Stop the daemon and wait up to 5 seconds for the checker thread."""
        self.running = False
        self._stop_event.set()  # wake the checker out of its wait
        if self.thread:
            self.thread.join(timeout=5)
        self.logger.info("✅ 重启守护进程已停止")

    def _check_loop(self):
        """Poll the clock and trigger a restart at each configured time.

        Each time point fires at most once per calendar day.
        """
        last_restart_date = {t: None for t in self.RESTART_TIMES}

        while self.running:
            try:
                now = datetime.now()
                current_time = now.strftime("%H:%M")
                current_date = now.date()

                # Has a restart time been reached, and not yet handled today?
                if (current_time in last_restart_date
                        and last_restart_date[current_time] != current_date):
                    last_restart_date[current_time] = current_date
                    self._perform_restart(current_time)
            except Exception as e:
                # Keep the daemon alive whatever happens in one iteration.
                self.logger.error("❌ 检查循环出错: {}".format(e), exc_info=True)
                self.logger.error("=" * 80)

            # Interruptible sleep: returns early as soon as stop() is called.
            self._stop_event.wait(self.CHECK_INTERVAL)

    def _stop_strategy(self, pid_file):
        """Stop the process whose PID is recorded in ``pid_file``.

        Sends a terminate request, waits up to 30 seconds, then kills.

        Returns:
            True if a live process was stopped, False if it was already gone.

        Raises:
            Exception: if the PID file cannot be read or parsed, or the
                process cannot be signalled (e.g. ``psutil.AccessDenied``).
        """
        with open(pid_file, 'r') as f:
            pid = int(f.read().strip())

        if not psutil.pid_exists(pid):
            self.logger.warning("⚠️ PID文件存在但进程已死: {}".format(pid))
            return False

        try:
            proc = psutil.Process(pid)
            self.logger.info("⏹️ 停止策略 PID {}: {}".format(pid, proc.name()))
            proc.terminate()
        except psutil.NoSuchProcess:
            # The process exited between the existence check and the signal.
            self.logger.warning("⚠️ PID文件存在但进程已死: {}".format(pid))
            return False

        try:
            proc.wait(timeout=30)
            self.logger.info("✅ 已优雅停止 PID {}".format(pid))
        except psutil.TimeoutExpired:
            try:
                proc.kill()
            except psutil.NoSuchProcess:
                pass  # exited on its own right after the timeout
            self.logger.info("🔥 强制终止 PID {}".format(pid))
        return True

    def _launch_strategy(self, pid_file):
        """Start a new strategy process for ``pid_file``.

        The configuration path is derived from the PID file name, e.g.
        ``DualModeTrendlineHawkesStrategy2_FG.pid`` ->
        ``strategies/DualModeTrendlineHawkesStrategy2/FG.config``.

        Returns:
            True if the process was spawned, False if the file name is
            malformed or the configuration file does not exist.
        """
        name = pid_file.stem
        if '_' not in name:
            self.logger.error("❌ PID文件名格式错误: {}".format(name))
            return False

        strategy_name, symbol = name.split('_', 1)
        config_file = Path("strategies") / strategy_name / "{}.config".format(symbol)

        if not config_file.exists():
            self.logger.error("❌ 配置文件不存在: {}".format(config_file))
            return False

        # Spawn without blocking; Popen returns immediately.
        process = subprocess.Popen(
            [sys.executable, "launcher.py", "--config", str(config_file)],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            cwd=Path.cwd()
        )
        self.logger.info("✅ 启动新进程 PID {}: {}".format(process.pid, config_file.name))
        return True

    def _perform_restart(self, time_point: str):
        """Stop every strategy that has a PID file, then launch it again.

        Args:
            time_point: The "HH:MM" restart time that triggered this run
                (used for logging only).
        """
        self.logger.info("\n" + "=" * 80)
        self.logger.info("⏰ 到达重启时间: {}".format(time_point))
        self.logger.info("=" * 80)

        # 1. Scan all PID files.
        pid_files = list(self.pid_dir.glob("*.pid"))
        if not pid_files:
            self.logger.info("⚠️ 未发现运行中的策略")
            return

        self.logger.info("📋 发现 {} 个策略需要重启".format(len(pid_files)))

        # 2. Stop all strategies.
        stopped_count = 0
        restartable = []  # PID files whose old process is known to be gone
        for pid_file in pid_files:
            try:
                if self._stop_strategy(pid_file):
                    stopped_count += 1
                restartable.append(pid_file)
            except Exception as e:
                # The old process may still be alive; relaunching now could
                # leave two copies of the same strategy running, so skip it.
                self.logger.error("❌ 停止失败 {}: {}".format(pid_file, e))

        if stopped_count == 0:
            self.logger.warning("⚠️ 未成功停止任何策略")
            return

        # 3. Give the stopped processes time to release their resources.
        self.logger.info("\n⏳ 等待2秒资源释放...")
        time.sleep(2)

        # 4. Launch the strategies again.
        self.logger.info("\n🚀 重新启动所有策略...")
        restarted_count = 0
        for pid_file in restartable:
            try:
                if self._launch_strategy(pid_file):
                    restarted_count += 1
            except Exception as e:
                self.logger.error("❌ 启动失败: {}".format(e))

        # 5. Summary.
        self.logger.info("\n" + "=" * 80)
        self.logger.info("📊 重启统计:")
        self.logger.info(" 停止成功: {}个".format(stopped_count))
        self.logger.info(" 启动成功: {}个".format(restarted_count))

        # Success means every discovered strategy is running again; comparing
        # against stopped_count would miscount strategies that were already
        # dead but relaunched, or launches that failed.
        if restarted_count == len(pid_files):
            self.logger.info("✅ 所有策略重启成功")
        else:
            self.logger.warning("⚠️ 部分策略重启失败")
        self.logger.info("=" * 80)


def main():
    """Script entry point: run the restart daemon with default directories."""
    daemon = RestartDaemon()
    daemon.start()


if __name__ == "__main__":
    main()