fix(dashboard): stop server event loop on correct thread
All checks were successful
CI / Lint (push) Successful in 5s
CI / Type Check (push) Successful in 20s
CI / Test (push) Successful in 1m3s
CI / Release (push) Has been skipped

When idle shutdown triggered _stop_server(), the function created a new
event loop and ran server.stop() on it, while the daemon thread was still
running loop.run_forever() on the original event loop. The original loop
therefore kept its sockets bound, causing "address already in use" on
restart.

Fix by storing references to the server's event loop and thread, then
using call_soon_threadsafe(loop.stop) to signal the correct loop to exit.
Joining the thread (with a 5-second timeout) waits for that loop to finish
so the sockets are released before the next server starts.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-30 00:16:47 +00:00
parent 1ec05ea289
commit aba2cabbbc

View File

@@ -35,6 +35,8 @@ IDLE_SHUTDOWN_SECONDS = int(os.environ.get("IDLE_SHUTDOWN_SECONDS", "300"))
_last_activity_time: float = time.time() _last_activity_time: float = time.time()
_idle_checker_started = False _idle_checker_started = False
_server_ref: SimulationServer | None = None # Reference for idle checker thread _server_ref: SimulationServer | None = None # Reference for idle checker thread
_server_loop: asyncio.AbstractEventLoop | None = None # Event loop running the server
_server_thread: threading.Thread | None = None # Thread running the server event loop
def _idle_checker() -> None: def _idle_checker() -> None:
@@ -54,16 +56,19 @@ def _idle_checker() -> None:
def _stop_server() -> None: def _stop_server() -> None:
"""Stop the server and clear caches for fresh restart.""" """Stop the server and clear caches for fresh restart."""
global _server_ref, _idle_checker_started global _server_ref, _idle_checker_started, _server_loop, _server_thread
if _server_ref is not None:
# Stop the server if _server_loop is not None and _server_thread is not None:
loop = asyncio.new_event_loop() # Schedule stop on the correct event loop (the one actually running the server)
try: # This causes loop.run_forever() to exit in the daemon thread
loop.run_until_complete(_server_ref.stop()) _server_loop.call_soon_threadsafe(_server_loop.stop)
except Exception:
pass # Wait for thread to exit (with timeout to avoid hanging)
finally: _server_thread.join(timeout=5.0)
loop.close()
_server_loop = None
_server_thread = None
_server_ref = None _server_ref = None
# Clear Streamlit's cached server so next visitor gets fresh instance # Clear Streamlit's cached server so next visitor gets fresh instance
@@ -136,6 +141,8 @@ def get_or_create_server() -> SimulationServer:
Returns: Returns:
The simulation server instance. The simulation server instance.
""" """
global _server_loop, _server_thread
server = SimulationServer( server = SimulationServer(
ServerConfig( ServerConfig(
host="127.0.0.1", host="127.0.0.1",
@@ -151,7 +158,9 @@ def get_or_create_server() -> SimulationServer:
def run_server() -> None: def run_server() -> None:
"""Run the async server in a new event loop.""" """Run the async server in a new event loop."""
global _server_loop
loop = asyncio.new_event_loop() loop = asyncio.new_event_loop()
_server_loop = loop # Store reference for _stop_server to use
asyncio.set_event_loop(loop) asyncio.set_event_loop(loop)
try: try:
loop.run_until_complete(server.start()) loop.run_until_complete(server.start())
@@ -171,6 +180,7 @@ def get_or_create_server() -> SimulationServer:
thread = threading.Thread(target=run_server, daemon=True) thread = threading.Thread(target=run_server, daemon=True)
thread.start() thread.start()
_server_thread = thread # Store reference for _stop_server to use
# Wait for server to be fully started (up to 5 seconds) # Wait for server to be fully started (up to 5 seconds)
if not server_ready.wait(timeout=5.0): if not server_ready.wait(timeout=5.0):