fix(dashboard): use st.cache_resource for server singleton

Replace module-level singleton with @st.cache_resource decorator. This properly survives Streamlit reruns without losing the server reference, preventing "port already in use" errors when refreshing the browser in Docker. The cache is tied to the Streamlit process lifecycle, so when the process restarts, both the cache and daemon threads are cleared together.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-29 23:44:37 +00:00
parent 235d668d9f
commit b826337b36
1 changed files with 16 additions and 75 deletions
@@ -6,7 +6,6 @@ thermal-electrical coupling in real-time using instrument interfaces.
 """

 import asyncio
-import atexit
 import os
 import threading
 import time
@@ -28,11 +27,6 @@ from py_dvt_ate.tests.thermal.tempco import TempCoTest
 # Thread pool for background test execution
 _test_executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix="test_runner")

-# Module-level singleton for simulation server
-# This ensures only one server instance exists across all Streamlit reruns,
-# preventing "address already in use" errors when the page refreshes
-_simulation_server: SimulationServer | None = None
-_server_thread: threading.Thread | None = None

 # Idle pause configuration
 # Physics engine pauses after IDLE_PAUSE_SECONDS of no activity (default: 30s)
@@ -107,56 +101,18 @@ class TestProgress:
        return time.time() - self.started_at


-def _is_server_responsive(host: str = "127.0.0.1", port: int = 5001) -> bool:
-    """Check if a server is actually responding on the given port."""
-    import socket
-    try:
-        with socket.create_connection((host, port), timeout=0.5):
-            return True
-    except (OSError, ConnectionRefusedError, TimeoutError):
-        return False
-
-
+@st.cache_resource
 def get_or_create_server() -> SimulationServer:
    """Get or create the simulation server singleton.

-    Uses module-level singleton to ensure only one server instance exists
+    Uses st.cache_resource to ensure only one server instance exists
    across all Streamlit reruns, preventing "address already in use" errors.
+    The cache survives page refreshes and is only invalidated when the
+    Streamlit process restarts.

    Returns:
        The simulation server instance.
    """
-    global _simulation_server, _server_thread
-
-    # FIRST: Check if ports are already in use (regardless of singleton state)
-    # This catches orphan servers from previous processes (e.g., Docker restarts)
-    ports_in_use = _is_server_responsive()
-
-    if ports_in_use:
-        # Ports are in use - either our singleton or an orphan process
-        if _simulation_server is not None and _simulation_server.is_running:
-            # We have a reference - reuse it
-            return _simulation_server
-        else:
-            # Orphan server from previous process - wait for it to die
-            st.warning("Waiting for previous server to shut down...")
-            for _ in range(10):  # Wait up to 5 seconds
-                time.sleep(0.5)
-                if not _is_server_responsive():
-                    break
-            else:
-                st.error(
-                    "Port still in use. Please wait a moment and refresh, "
-                    "or restart the container."
-                )
-                st.stop()
-
-    # Clean up stale singleton reference if ports are free but singleton exists
-    if _simulation_server is not None and not ports_in_use:
-        _simulation_server = None
-        _server_thread = None
-
-    # Create new server
    server = SimulationServer(
        ServerConfig(
            host="127.0.0.1",
@@ -195,26 +151,22 @@ def get_or_create_server() -> SimulationServer:

    # Wait for server to be fully started (up to 5 seconds)
    if not server_ready.wait(timeout=5.0):
-        st.error("Server failed to start within timeout")
+        raise RuntimeError("Server failed to start within timeout")

    # Check if there was an error during startup
    if server_error:
-        st.error(f"Server startup error: {server_error[0]}")
-
-    # Store in module-level singleton
-    _simulation_server = server
-    _server_thread = thread
+        raise server_error[0]

    return server


 def _update_activity() -> None:
    """Update activity timestamp and resume physics if paused."""
-    global _last_activity_time, _simulation_server
+    global _last_activity_time
    _last_activity_time = time.time()

-    # Resume physics if it was paused (use singleton as fallback)
-    server = st.session_state.get("server") or _simulation_server
+    # Resume physics if it was paused
+    server = st.session_state.get("server")
    if server is not None and server.paused:
        server.paused = False
        print("Physics engine resumed (user activity detected)")
@@ -222,13 +174,15 @@ def _update_activity() -> None:

 def init_session_state() -> None:
    """Initialise Streamlit session state."""
-    global _simulation_server
-
-    # Get or create the server singleton (survives Streamlit reruns)
+    # Get or create the server singleton (survives Streamlit reruns via st.cache_resource)
    if "server" not in st.session_state or st.session_state.server is None:
        with st.spinner("Starting simulation server..."):
-            server = get_or_create_server()
-            st.session_state.server = server
+            try:
+                server = get_or_create_server()
+                st.session_state.server = server
+            except Exception as e:
+                st.error(f"Failed to start simulation server: {e}")
+                st.stop()

        # Verify server started correctly
        if st.session_state.server.physics_engine is None:
@@ -238,19 +192,6 @@ def init_session_state() -> None:
        # Start idle checker to pause physics when no one's viewing
        _start_idle_checker(st.session_state.server)

-        # Register cleanup (only once, for the singleton)
-        def cleanup() -> None:
-            global _simulation_server
-            if _simulation_server is not None:
-                loop = asyncio.new_event_loop()
-                try:
-                    loop.run_until_complete(_simulation_server.stop())
-                except Exception:
-                    pass
-                loop.close()
-                _simulation_server = None
-        atexit.register(cleanup)
-
    if "instruments" not in st.session_state:
        # Create instruments via HAL using factory
        config = InstrumentConfig(