fix(dashboard): use st.cache_resource for server singleton
All checks were successful
CI / Lint (push) Successful in 5s
CI / Type Check (push) Successful in 19s
CI / Release (push) Has been skipped
CI / Test (push) Successful in 54s

Replace the module-level singleton with the @st.cache_resource decorator.
The cached server instance survives Streamlit reruns, so the server
reference is no longer lost; this prevents "port already in use" errors
when refreshing the browser in Docker.

The cache is tied to the Streamlit process lifecycle, so when the
process restarts, both the cache and daemon threads are cleared
together.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-29 23:44:37 +00:00
parent 235d668d9f
commit b826337b36

View File

@@ -6,7 +6,6 @@ thermal-electrical coupling in real-time using instrument interfaces.
"""
import asyncio
import atexit
import os
import threading
import time
@@ -28,11 +27,6 @@ from py_dvt_ate.tests.thermal.tempco import TempCoTest
# Thread pool for background test execution
_test_executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix="test_runner")
# Module-level singleton for simulation server
# This ensures only one server instance exists across all Streamlit reruns,
# preventing "address already in use" errors when the page refreshes
_simulation_server: SimulationServer | None = None
_server_thread: threading.Thread | None = None
# Idle pause configuration
# Physics engine pauses after IDLE_PAUSE_SECONDS of no activity (default: 30s)
@@ -107,56 +101,18 @@ class TestProgress:
return time.time() - self.started_at
def _is_server_responsive(
    host: str = "127.0.0.1", port: int = 5001, timeout: float = 0.5
) -> bool:
    """Check if a server is actually responding on the given port.

    Opens a TCP connection to ``(host, port)`` and closes it immediately;
    no data is sent.

    Args:
        host: Address to connect to.
        port: TCP port to probe.
        timeout: Seconds to wait for the connection attempt before giving up.

    Returns:
        True if the connection was established, False if the attempt failed.
    """
    import socket

    try:
        # The context manager closes the probe socket as soon as it connects.
        with socket.create_connection((host, port), timeout=timeout):
            return True
    except OSError:
        # ConnectionRefusedError and TimeoutError are OSError subclasses, so
        # one clause covers refused, timed-out and unreachable targets.
        return False
@st.cache_resource
def get_or_create_server() -> SimulationServer:
"""Get or create the simulation server singleton.
Uses module-level singleton to ensure only one server instance exists
Uses st.cache_resource to ensure only one server instance exists
across all Streamlit reruns, preventing "address already in use" errors.
The cache survives page refreshes and is only invalidated when the
Streamlit process restarts.
Returns:
The simulation server instance.
"""
global _simulation_server, _server_thread
# FIRST: Check if ports are already in use (regardless of singleton state)
# This catches orphan servers from previous processes (e.g., Docker restarts)
ports_in_use = _is_server_responsive()
if ports_in_use:
# Ports are in use - either our singleton or an orphan process
if _simulation_server is not None and _simulation_server.is_running:
# We have a reference - reuse it
return _simulation_server
else:
# Orphan server from previous process - wait for it to die
st.warning("Waiting for previous server to shut down...")
for _ in range(10): # Wait up to 5 seconds
time.sleep(0.5)
if not _is_server_responsive():
break
else:
st.error(
"Port still in use. Please wait a moment and refresh, "
"or restart the container."
)
st.stop()
# Clean up stale singleton reference if ports are free but singleton exists
if _simulation_server is not None and not ports_in_use:
_simulation_server = None
_server_thread = None
# Create new server
server = SimulationServer(
ServerConfig(
host="127.0.0.1",
@@ -195,26 +151,22 @@ def get_or_create_server() -> SimulationServer:
# Wait for server to be fully started (up to 5 seconds)
if not server_ready.wait(timeout=5.0):
st.error("Server failed to start within timeout")
raise RuntimeError("Server failed to start within timeout")
# Check if there was an error during startup
if server_error:
st.error(f"Server startup error: {server_error[0]}")
# Store in module-level singleton
_simulation_server = server
_server_thread = thread
raise server_error[0]
return server
def _update_activity() -> None:
    """Update activity timestamp and resume physics if paused.

    Stores the current time in the module-level ``_last_activity_time`` and,
    when the server held in the Streamlit session state reports ``paused``,
    clears that flag.
    """
    global _last_activity_time

    _last_activity_time = time.time()

    # Resume physics if it was paused. Only the session-state server is
    # consulted; a missing or None entry means there is nothing to resume.
    server = st.session_state.get("server")
    if server is not None and server.paused:
        server.paused = False
        print("Physics engine resumed (user activity detected)")
@@ -222,13 +174,15 @@ def _update_activity() -> None:
def init_session_state() -> None:
"""Initialise Streamlit session state."""
global _simulation_server
# Get or create the server singleton (survives Streamlit reruns)
# Get or create the server singleton (survives Streamlit reruns via st.cache_resource)
if "server" not in st.session_state or st.session_state.server is None:
with st.spinner("Starting simulation server..."):
server = get_or_create_server()
st.session_state.server = server
try:
server = get_or_create_server()
st.session_state.server = server
except Exception as e:
st.error(f"Failed to start simulation server: {e}")
st.stop()
# Verify server started correctly
if st.session_state.server.physics_engine is None:
@@ -238,19 +192,6 @@ def init_session_state() -> None:
# Start idle checker to pause physics when no one's viewing
_start_idle_checker(st.session_state.server)
# Register cleanup (only once, for the singleton)
def cleanup() -> None:
global _simulation_server
if _simulation_server is not None:
loop = asyncio.new_event_loop()
try:
loop.run_until_complete(_simulation_server.stop())
except Exception:
pass
loop.close()
_simulation_server = None
atexit.register(cleanup)
if "instruments" not in st.session_state:
# Create instruments via HAL using factory
config = InstrumentConfig(