@@ -67,17 +67,20 @@ class HudServer(private val context: Context) {
6767 /* * Server bind port. Matches [HudDiscovery.DEFAULT_PORT] so testers
6868 * don't need to change anything in the phone settings. */
6969 private val PORT = HudDiscovery .DEFAULT_PORT
70- /* * How often the self-heal watchdog ticks (ms). 30 s is small
71- * enough to catch a dead listener within a single missed
72- * reconnect window, large enough that it never noticeably
73- * loads the device. */
74- private const val WATCHDOG_INTERVAL_MS : Long = 30_000L
70+ /* * How often the self-heal watchdog ticks (ms). 10 s so a wedged
71+ * listener or a stalled mDNS/beacon is caught and restarted within
72+ * ~20-30 s -- snappy enough that the rider rarely sees it, but not
73+ * so tight it churns. The CPU wake lock (see [wakeLock]) keeps this
74+ * loop running even if the device tries to suspend. */
75+ private const val WATCHDOG_INTERVAL_MS : Long = 10_000L
7576 /* * Consecutive failed localhost probes before forcing a server
76- * restart. Two misses (= ~60 s) covers an Android Doze blip
77- * without acting on a transient . */
77+ * restart. Two misses (~20-30 s) rides out a transient localhost
78+ * blip without acting on it . */
7879 private const val WATCHDOG_FAIL_THRESHOLD : Int = 2
7980 /* * Per-attempt timeout for the watchdog's localhost TCP probe (ms). */
8081 private const val TCP_PROBE_TIMEOUT_MS : Int = 1_500
82+ /* * Wake-lock tag; shows up in `dumpsys power` for debugging. */
83+ private const val WAKE_LOCK_TAG = " eucplanet-hud:server"
8184 }
8285
8386 /* * Connection state surfaced to the UI status banner. */
@@ -129,6 +132,23 @@ class HudServer(private val context: Context) {
129132 private var server: io.ktor.server.engine.ApplicationEngine ? = null
130133 private var jmdns: JmDNS ? = null
131134 private var multicastLock: WifiManager .MulticastLock ? = null
135+ /* * CPU wake lock held while the link is up. The HUD keeps the screen on
136+ * (FLAG_KEEP_SCREEN_ON) which usually keeps the CPU running, but some
137+ * aftermarket HUDs power the panel independently or background the app;
138+ * a partial wake lock guarantees the Ktor server, beacon and watchdog
139+ * coroutines keep ticking so the link can't silently freeze the way a
140+ * tester saw it die after ~2 min. */
141+ private var wakeLock: android.os.PowerManager .WakeLock ? = null
142+
143+ // --- Diagnostics (surfaced on the HUD stats card + logcat) so the next
144+ // "link died, had to reboot" report carries the cause, not just the
145+ // symptom. ---
146+ /* * How many times the watchdog has force-restarted the server this run. */
147+ @Volatile var watchdogRestarts: Int = 0 ; private set
148+ /* * Why the last phone connection ended (clean close / timeout / exception). */
149+ @Volatile var lastEndReason: String = " " ; private set
150+ /* * Wall-clock time (ms, from System.currentTimeMillis) of the last disconnect; 0 until a connection has ended. */
151+ @Volatile var lastDisconnectMs: Long = 0L ; private set
132152 /* * Self-heal watchdog. Polls the local listener and the current IPv4 every
133153 * [WATCHDOG_INTERVAL_MS] so a hotspot IP-renew or a silently-dead Ktor
134154 * socket gets fixed without the rider having to reboot the HUD. */
@@ -137,6 +157,10 @@ class HudServer(private val context: Context) {
137157 * We require [WATCHDOG_FAIL_THRESHOLD] in a row before a full restart so
138158 * a transient localhost blip doesn't churn the server. */
139159 @Volatile private var watchdogFailStreak: Int = 0
160+ /* * Monotonic connection counter so a stale, slowly-timing-out WebSocket
161+ * handler doesn't clobber the status/peer of a newer connection that
162+ * arrived during a fast reconnect. */
163+ private val connSeq = java.util.concurrent.atomic.AtomicInteger (0 )
140164 // Always-on UDP broadcast beacon so the phone can find us on networks
141165 // where mDNS multicast is blocked (most phone hotspots, every carrier
142166 // mobile hotspot). Runs in parallel with the mDNS advertise -- whichever
@@ -163,21 +187,31 @@ class HudServer(private val context: Context) {
163187
164188 private suspend fun doStart () {
165189 if (server != null ) return
190+ acquireWakeLock()
166191 _localIp .value = pickLocalIp()
167192 val s = embeddedServer(CIO , port = PORT , host = " 0.0.0.0" ) {
168193 install(WebSockets ) {
169- // Heartbeat: phone or HUD network can go away silently. Ping
170- // every 15s so the OS surfaces a broken TCP connection
171- // within a reasonable window without spamming the wire.
194+ // Heartbeat: phone or HUD network can go away silently. On the
195+ // local hotspot link (sub-10ms RTT) we can afford an aggressive
196+ // beat: ping every 5s and declare the peer dead after 12s of
197+ // silence. This frees [_peer]/[_status] and unblocks the
198+ // `incoming` loop within ~12s of the phone vanishing instead of
199+ // 30s -- so a fresh dial from the phone is accepted promptly and
200+ // the rider isn't stuck on a stale "connected" HUD.
172201 // Millis form because the Duration property accessor names
173202 // moved across Ktor 2.x; millis is stable.
174- pingPeriodMillis = 15_000L
175- timeoutMillis = 30_000L
203+ pingPeriodMillis = 5_000L
204+ timeoutMillis = 12_000L
176205 }
177206 routing {
178207 webSocket(HudDiscovery .PATH_STATE ) {
179208 val remote = call.request.local.remoteHost
180- Log .i(TAG , " phone connected: $remote " )
209+ // Token for THIS connection. With fast heartbeats a phone
210+ // can redial while a previous handler is still unwinding its
211+ // timeout; we must not let the stale handler's finally clear
212+ // the status/peer that the newer connection already set.
213+ val myConn = connSeq.incrementAndGet()
214+ Log .i(TAG , " phone connected: $remote (#$myConn )" )
181215 _peer .value = remote
182216 _status .value = Status .CONNECTED
183217 // Diagnostic: remember the phone IP across reconnects
@@ -259,9 +293,17 @@ class HudServer(private val context: Context) {
259293 endReason = " exception: ${t::class .simpleName} ${t.message} "
260294 } finally {
261295 sender.cancel()
262- Log .i(TAG , " phone disconnected: $remote ($endReason )" )
263- _peer .value = null
264- _status .value = Status .LISTENING
296+ Log .i(TAG , " phone disconnected: $remote (#$myConn , $endReason )" )
297+ lastEndReason = endReason
298+ lastDisconnectMs = System .currentTimeMillis()
299+ // Only surrender the connection state if no newer
300+ // connection has taken over since we opened. Otherwise
301+ // a slow-timing-out stale handler would wrongly flip the
302+ // HUD back to LISTENING while a fresh phone is connected.
303+ if (connSeq.get() == myConn) {
304+ _peer .value = null
305+ _status .value = Status .LISTENING
306+ }
265307 }
266308 }
267309 }
@@ -303,6 +345,30 @@ class HudServer(private val context: Context) {
303345 server = null
304346 _peer .value = null
305347 _status .value = Status .LISTENING
348+ releaseWakeLock()
349+ }
350+
/**
 * Takes a partial (CPU-only) wake lock tagged [WAKE_LOCK_TAG] and stores it
 * in [wakeLock].
 *
 * Returns immediately when the currently stored lock is already held, so it
 * can be called on every (re)start. The lock is created non-reference-counted.
 * Any failure while looking up the power service or acquiring the lock is
 * logged as a warning and otherwise ignored; in that case [wakeLock] keeps
 * its previous value.
 */
private fun acquireWakeLock() {
    val current = wakeLock
    if (current != null && current.isHeld) return
    try {
        val powerManager = context.applicationContext
            .getSystemService(Context.POWER_SERVICE) as? android.os.PowerManager
        val freshLock = powerManager?.newWakeLock(
            android.os.PowerManager.PARTIAL_WAKE_LOCK,
            WAKE_LOCK_TAG
        )
        if (freshLock != null) {
            freshLock.setReferenceCounted(false)
            freshLock.acquire()
        }
        // Publish only after acquire() succeeded (or the service was
        // unavailable, in which case this stores null).
        wakeLock = freshLock
    } catch (t: Throwable) {
        Log.w(TAG, " wake lock acquire failed: ${t.message} ")
    }
}
368+
/**
 * Releases the stored wake lock if it is currently held, then clears
 * [wakeLock]. Best-effort: any error thrown while releasing is ignored,
 * and the reference is cleared regardless.
 */
private fun releaseWakeLock() {
    try {
        wakeLock?.takeIf { it.isHeld }?.release()
    } catch (_: Throwable) {
        // Deliberately ignored: releasing is best-effort during teardown.
    }
    wakeLock = null
}
307373
308374 /* * Watchdog loop. Two jobs per tick:
@@ -334,6 +400,12 @@ class HudServer(private val context: Context) {
334400 try { multicastLock?.release() } catch (_: Throwable ) {}
335401 multicastLock = null
336402 startMdnsAdvertise()
403+ } else if (jmdns == null && ! current.isNullOrBlank()) {
404+ // mDNS advertise failed at startup (or got torn down) but the
405+ // IP is fine -- revive it so phones browsing _eucplanet._tcp
406+ // can still resolve us without a server restart.
407+ Log .i(TAG , " watchdog: mDNS advertise was down; reviving" )
408+ startMdnsAdvertise()
337409 }
338410 // 2. Server-liveness probe.
339411 val alive = tcpProbe(" 127.0.0.1" , PORT )
@@ -346,6 +418,7 @@ class HudServer(private val context: Context) {
346418 Log .w(TAG , " watchdog: server appears wedged after $watchdogFailStreak " +
347419 " consecutive misses; forcing restart" )
348420 watchdogFailStreak = 0
421+ watchdogRestarts++
349422 // Restart in its own coroutine so the watchdog loop itself
350423 // doesn't suspend on the start/stop lock that doStart()
351424 // takes. If the watchdog held its own lifecycle-lock here
0 commit comments