fix(dashboard): use st.cache_resource for server singleton
All checks were successful
CI / Lint (push) Successful in 5s
CI / Type Check (push) Successful in 19s
CI / Release (push) Has been skipped
CI / Test (push) Successful in 54s

Replace the module-level singleton with the @st.cache_resource decorator.
The cached server instance survives Streamlit reruns, so the server
reference is no longer lost, preventing "port already in use" errors
when refreshing the browser in Docker.

The cache is tied to the Streamlit process lifecycle, so when the
process restarts, both the cache and daemon threads are cleared
together.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-29 23:44:37 +00:00
parent 235d668d9f
commit b826337b36

View File

@@ -6,7 +6,6 @@ thermal-electrical coupling in real-time using instrument interfaces.
""" """
import asyncio import asyncio
import atexit
import os import os
import threading import threading
import time import time
@@ -28,11 +27,6 @@ from py_dvt_ate.tests.thermal.tempco import TempCoTest
# Thread pool for background test execution # Thread pool for background test execution
_test_executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix="test_runner") _test_executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix="test_runner")
# Module-level singleton for simulation server
# This ensures only one server instance exists across all Streamlit reruns,
# preventing "address already in use" errors when the page refreshes
_simulation_server: SimulationServer | None = None
_server_thread: threading.Thread | None = None
# Idle pause configuration # Idle pause configuration
# Physics engine pauses after IDLE_PAUSE_SECONDS of no activity (default: 30s) # Physics engine pauses after IDLE_PAUSE_SECONDS of no activity (default: 30s)
@@ -107,56 +101,18 @@ class TestProgress:
return time.time() - self.started_at return time.time() - self.started_at
def _is_server_responsive(host: str = "127.0.0.1", port: int = 5001) -> bool: @st.cache_resource
"""Check if a server is actually responding on the given port."""
import socket
try:
with socket.create_connection((host, port), timeout=0.5):
return True
except (OSError, ConnectionRefusedError, TimeoutError):
return False
def get_or_create_server() -> SimulationServer: def get_or_create_server() -> SimulationServer:
"""Get or create the simulation server singleton. """Get or create the simulation server singleton.
Uses module-level singleton to ensure only one server instance exists Uses st.cache_resource to ensure only one server instance exists
across all Streamlit reruns, preventing "address already in use" errors. across all Streamlit reruns, preventing "address already in use" errors.
The cache survives page refreshes and is only invalidated when the
Streamlit process restarts.
Returns: Returns:
The simulation server instance. The simulation server instance.
""" """
global _simulation_server, _server_thread
# FIRST: Check if ports are already in use (regardless of singleton state)
# This catches orphan servers from previous processes (e.g., Docker restarts)
ports_in_use = _is_server_responsive()
if ports_in_use:
# Ports are in use - either our singleton or an orphan process
if _simulation_server is not None and _simulation_server.is_running:
# We have a reference - reuse it
return _simulation_server
else:
# Orphan server from previous process - wait for it to die
st.warning("Waiting for previous server to shut down...")
for _ in range(10): # Wait up to 5 seconds
time.sleep(0.5)
if not _is_server_responsive():
break
else:
st.error(
"Port still in use. Please wait a moment and refresh, "
"or restart the container."
)
st.stop()
# Clean up stale singleton reference if ports are free but singleton exists
if _simulation_server is not None and not ports_in_use:
_simulation_server = None
_server_thread = None
# Create new server
server = SimulationServer( server = SimulationServer(
ServerConfig( ServerConfig(
host="127.0.0.1", host="127.0.0.1",
@@ -195,26 +151,22 @@ def get_or_create_server() -> SimulationServer:
# Wait for server to be fully started (up to 5 seconds) # Wait for server to be fully started (up to 5 seconds)
if not server_ready.wait(timeout=5.0): if not server_ready.wait(timeout=5.0):
st.error("Server failed to start within timeout") raise RuntimeError("Server failed to start within timeout")
# Check if there was an error during startup # Check if there was an error during startup
if server_error: if server_error:
st.error(f"Server startup error: {server_error[0]}") raise server_error[0]
# Store in module-level singleton
_simulation_server = server
_server_thread = thread
return server return server
def _update_activity() -> None: def _update_activity() -> None:
"""Update activity timestamp and resume physics if paused.""" """Update activity timestamp and resume physics if paused."""
global _last_activity_time, _simulation_server global _last_activity_time
_last_activity_time = time.time() _last_activity_time = time.time()
# Resume physics if it was paused (use singleton as fallback) # Resume physics if it was paused
server = st.session_state.get("server") or _simulation_server server = st.session_state.get("server")
if server is not None and server.paused: if server is not None and server.paused:
server.paused = False server.paused = False
print("Physics engine resumed (user activity detected)") print("Physics engine resumed (user activity detected)")
@@ -222,13 +174,15 @@ def _update_activity() -> None:
def init_session_state() -> None: def init_session_state() -> None:
"""Initialise Streamlit session state.""" """Initialise Streamlit session state."""
global _simulation_server # Get or create the server singleton (survives Streamlit reruns via st.cache_resource)
# Get or create the server singleton (survives Streamlit reruns)
if "server" not in st.session_state or st.session_state.server is None: if "server" not in st.session_state or st.session_state.server is None:
with st.spinner("Starting simulation server..."): with st.spinner("Starting simulation server..."):
server = get_or_create_server() try:
st.session_state.server = server server = get_or_create_server()
st.session_state.server = server
except Exception as e:
st.error(f"Failed to start simulation server: {e}")
st.stop()
# Verify server started correctly # Verify server started correctly
if st.session_state.server.physics_engine is None: if st.session_state.server.physics_engine is None:
@@ -238,19 +192,6 @@ def init_session_state() -> None:
# Start idle checker to pause physics when no one's viewing # Start idle checker to pause physics when no one's viewing
_start_idle_checker(st.session_state.server) _start_idle_checker(st.session_state.server)
# Register cleanup (only once, for the singleton)
def cleanup() -> None:
global _simulation_server
if _simulation_server is not None:
loop = asyncio.new_event_loop()
try:
loop.run_until_complete(_simulation_server.stop())
except Exception:
pass
loop.close()
_simulation_server = None
atexit.register(cleanup)
if "instruments" not in st.session_state: if "instruments" not in st.session_state:
# Create instruments via HAL using factory # Create instruments via HAL using factory
config = InstrumentConfig( config = InstrumentConfig(