fix(dashboard): use st.cache_resource for server singleton

Replace the module-level singleton with the @st.cache_resource decorator. The cached server reference survives Streamlit reruns, preventing "port already in use" errors when refreshing the browser in Docker. The cache is tied to the Streamlit process lifecycle, so when the process restarts, both the cache and the daemon threads are cleared together.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-29 23:44:37 +00:00
parent 235d668d9f
commit b826337b36
1 changed file with 16 additions and 75 deletions
@@ -6,7 +6,6 @@ thermal-electrical coupling in real-time using instrument interfaces.
 """
 import asyncio
 import atexit
 import os
 import threading
 import time
@@ -28,11 +27,6 @@ from py_dvt_ate.tests.thermal.tempco import TempCoTest
 # Thread pool for background test execution
 _test_executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix="test_runner")
 # Module-level singleton for simulation server
 # This ensures only one server instance exists across all Streamlit reruns,
 # preventing "address already in use" errors when the page refreshes
 _simulation_server: SimulationServer | None = None
 _server_thread: threading.Thread | None = None
 # Idle pause configuration
 # Physics engine pauses after IDLE_PAUSE_SECONDS of no activity (default: 30s)
@@ -107,56 +101,18 @@ class TestProgress:
        return time.time() - self.started_at
-def _is_server_responsive(host: str = "127.0.0.1", port: int = 5001) -> bool:
+@st.cache_resource
    """Check if a server is actually responding on the given port."""
    import socket
    try:
        with socket.create_connection((host, port), timeout=0.5):
            return True
    except (OSError, ConnectionRefusedError, TimeoutError):
        return False
 def get_or_create_server() -> SimulationServer:
    """Get or create the simulation server singleton.
-    Uses module-level singleton to ensure only one server instance exists
+    Uses st.cache_resource to ensure only one server instance exists
    across all Streamlit reruns, preventing "address already in use" errors.
    The cache survives page refreshes and is only invalidated when the
    Streamlit process restarts.
    Returns:
        The simulation server instance.
    """
    global _simulation_server, _server_thread
    # FIRST: Check if ports are already in use (regardless of singleton state)
    # This catches orphan servers from previous processes (e.g., Docker restarts)
    ports_in_use = _is_server_responsive()
    if ports_in_use:
        # Ports are in use - either our singleton or an orphan process
        if _simulation_server is not None and _simulation_server.is_running:
            # We have a reference - reuse it
            return _simulation_server
        else:
            # Orphan server from previous process - wait for it to die
            st.warning("Waiting for previous server to shut down...")
            for _ in range(10):  # Wait up to 5 seconds
                time.sleep(0.5)
                if not _is_server_responsive():
                    break
            else:
                st.error(
                    "Port still in use. Please wait a moment and refresh, "
                    "or restart the container."
                )
                st.stop()
    # Clean up stale singleton reference if ports are free but singleton exists
    if _simulation_server is not None and not ports_in_use:
        _simulation_server = None
        _server_thread = None
    # Create new server
    server = SimulationServer(
        ServerConfig(
            host="127.0.0.1",
@@ -195,26 +151,22 @@ def get_or_create_server() -> SimulationServer:
    # Wait for server to be fully started (up to 5 seconds)
    if not server_ready.wait(timeout=5.0):
-        st.error("Server failed to start within timeout")
+        raise RuntimeError("Server failed to start within timeout")
    # Check if there was an error during startup
    if server_error:
-        st.error(f"Server startup error: {server_error[0]}")
+        raise server_error[0]
    # Store in module-level singleton
    _simulation_server = server
    _server_thread = thread
    return server
 def _update_activity() -> None:
    """Update activity timestamp and resume physics if paused."""
-    global _last_activity_time, _simulation_server
+    global _last_activity_time
    _last_activity_time = time.time()
-    # Resume physics if it was paused (use singleton as fallback)
+    # Resume physics if it was paused
-    server = st.session_state.get("server") or _simulation_server
+    server = st.session_state.get("server")
    if server is not None and server.paused:
        server.paused = False
        print("Physics engine resumed (user activity detected)")
@@ -222,13 +174,15 @@ def _update_activity() -> None:
 def init_session_state() -> None:
    """Initialise Streamlit session state."""
-    global _simulation_server
+    # Get or create the server singleton (survives Streamlit reruns via st.cache_resource)
    # Get or create the server singleton (survives Streamlit reruns)
    if "server" not in st.session_state or st.session_state.server is None:
        with st.spinner("Starting simulation server..."):
-            server = get_or_create_server()
+            try:
-            st.session_state.server = server
+                server = get_or_create_server()
                st.session_state.server = server
            except Exception as e:
                st.error(f"Failed to start simulation server: {e}")
                st.stop()
        # Verify server started correctly
        if st.session_state.server.physics_engine is None:
@@ -238,19 +192,6 @@ def init_session_state() -> None:
        # Start idle checker to pause physics when no one's viewing
        _start_idle_checker(st.session_state.server)
        # Register cleanup (only once, for the singleton)
        def cleanup() -> None:
            global _simulation_server
            if _simulation_server is not None:
                loop = asyncio.new_event_loop()
                try:
                    loop.run_until_complete(_simulation_server.stop())
                except Exception:
                    pass
                loop.close()
                _simulation_server = None
        atexit.register(cleanup)
    if "instruments" not in st.session_state:
        # Create instruments via HAL using factory
        config = InstrumentConfig(