@@ -48,6 +48,7 @@ import java.net.URI
4848import java.time.Duration
4949import java.time.LocalDateTime
5050import java.time.format.DateTimeFormatter
51+ import java.util.concurrent.TimeUnit
5152import java.util.concurrent.TimeoutException
5253import kotlin.coroutines.resume
5354import kotlin.coroutines.resumeWithException
@@ -227,16 +228,9 @@ class CoderRemoteConnectionHandle {
227228
228229 // Wait for the IDE to come up.
229230 indicator.text = " Waiting for ${workspace.ideName} backend..."
230- var status: UnattendedHostStatus ? = null
231231 val remoteProjectPath = accessor.makeRemotePath(ShellArgument .PlainText (workspace.projectPath))
232232 val logsDir = accessor.getLogsDir(workspace.ideProduct.productCode, remoteProjectPath)
233- while (lifetime.status == LifetimeStatus .Alive ) {
234- status = ensureIDEBackend(accessor, workspace, ideDir, remoteProjectPath, logsDir, lifetime, null )
235- if (! status?.joinLink.isNullOrBlank()) {
236- break
237- }
238- delay(5000 )
239- }
233+ var status = ensureIDEBackend(accessor, workspace, ideDir, remoteProjectPath, logsDir, lifetime, null )
240234
241235 // We wait for non-null, so this only happens on cancellation.
242236 val joinLink = status?.joinLink
@@ -302,6 +296,7 @@ class CoderRemoteConnectionHandle {
302296 }
303297 // Continue once the client is present.
304298 handle.onClientPresenceChanged.advise(lifetime) {
299+ logger.info(" ${workspace.ideName} client to ${workspace.hostname} presence: ${handle.clientPresent} " )
305300 if (handle.clientPresent && continuation.isActive) {
306301 continuation.resume(true )
307302 }
@@ -437,8 +432,8 @@ class CoderRemoteConnectionHandle {
437432 }
438433
439434 /* *
440- * Ensure the backend is started. Status and/or links may be null if the
441- * backend has not started .
435+ * Ensure the backend is started. If currentStatus is non-null and its PID is alive it is returned as-is; otherwise a
436+ * backend is spawned and this polls until a join link is found. Returns null once the lifetime is no longer alive.
442437 */
443438 private suspend fun ensureIDEBackend (
444439 accessor : HighLevelHostAccessor ,
@@ -449,41 +444,74 @@ class CoderRemoteConnectionHandle {
449444 lifetime : LifetimeDefinition ,
450445 currentStatus : UnattendedHostStatus ? ,
451446 ): UnattendedHostStatus ? {
452- val details = " ${workspace.hostname} :${ideDir.toRawString()} , project=${remoteProjectPath.toRawString()} "
453- return try {
454- if (currentStatus?.appPid != null &&
455- ! currentStatus.joinLink.isNullOrBlank() &&
456- accessor.isPidAlive(currentStatus.appPid.toInt())
457- ) {
458- // If the PID is alive, assume the join link we have is still
459- // valid. The join link seems to change even if it is the same
460- // backend running, so if we always fetched the link the client
461- // would relaunch over and over.
462- return currentStatus
463- }
447+ val details = " ${workspace.hostname} :${ideDir.toRawString()} , project=${remoteProjectPath.toRawString()} "
448+ val wait = TimeUnit .SECONDS .toMillis(5 )
464449
465- // See if there is already a backend running. Weirdly, there is
466- // always a PID, even if there is no backend running, and
467- // backendUnresponsive is always false, but the links are null so
468- // hopefully that is an accurate indicator that the IDE is up.
469- val status = accessor.getHostIdeStatus(ideDir, remoteProjectPath)
470- if (! status.joinLink.isNullOrBlank()) {
471- logger.info(" Found existing ${workspace.ideName} backend on $details " )
472- return status
450+ // Check if the current IDE is alive; a failed check is retried every $wait ms while the lifetime is alive.
451+ if (currentStatus != null ) {
452+ while (lifetime.status == LifetimeStatus .Alive ) {
453+ try {
454+ val isAlive = accessor.isPidAlive(currentStatus.appPid.toInt())
455+ logger.info(" ${workspace.ideName} status: pid=${currentStatus.appPid} , alive=$isAlive " )
456+ if (isAlive) {
457+ // Use the current status and join link.
458+ return currentStatus
459+ } else {
460+ logger.info(" Relaunching ${workspace.ideName} since it is not alive..." )
461+ break
462+ }
463+ } catch (ex: Exception ) {
464+ logger.info(" Failed to check if ${workspace.ideName} is alive on $details ; waiting $wait ms to try again: pid=${currentStatus.appPid} " , ex)
465+ }
466+ delay(wait)
473467 }
468+ } else {
469+ logger.info(" Launching ${workspace.ideName} for the first time on ${workspace.hostname} ..." )
470+ }
471+
472+ // This means we broke out because the user canceled or closed the IDE.
473+ if (lifetime.status != LifetimeStatus .Alive ) {
474+ return null
475+ }
474476
475- // Otherwise, spawn a new backend. This does not seem to spawn a
476- // second backend if one is already running, yet it does somehow
477- // cause a second client to launch. So only run this if we are
478- // really sure we have to launch a new backend.
479- logger.info(" Starting ${workspace.ideName} backend on $details " )
480- accessor.startHostIdeInBackgroundAndDetach(lifetime, ideDir, remoteProjectPath, logsDir)
481- // Get the newly spawned PID and join link.
482- return accessor.getHostIdeStatus(ideDir, remoteProjectPath)
483- } catch (ex: Exception ) {
484- logger.info(" Failed to get ${workspace.ideName} status from $details " , ex)
485- currentStatus
477+ // If the PID is not alive, spawn a new backend. This may not be
478+ // idempotent, so only call if we are really sure we need to.
479+ accessor.startHostIdeInBackgroundAndDetach(lifetime, ideDir, remoteProjectPath, logsDir)
480+
481+ // Poll for the newly spawned PID and join link; respawn if the PID is still dead after maxAttempts checks.
482+ var attempts = 0
483+ val maxAttempts = 6
484+ while (lifetime.status == LifetimeStatus .Alive ) {
485+ try {
486+ attempts++
487+ val status = accessor.getHostIdeStatus(ideDir, remoteProjectPath)
488+ if (! status.joinLink.isNullOrBlank()) {
489+ logger.info(" Found join link for ${workspace.ideName} ; proceeding to connect: pid=${status.appPid} " )
490+ return status
491+ }
492+ // If we did not get a join link, see if the IDE is alive in
493+ // case it died and we need to respawn.
494+ val isAlive = status.appPid > 0 && accessor.isPidAlive(status.appPid.toInt())
495+ logger.info(" ${workspace.ideName} status: pid=${status.appPid} , alive=$isAlive , unresponsive=${status.backendUnresponsive} , attempt=$attempts " )
496+ // It is not clear whether the PID can be trusted because we get
497+ // one even when there is no backend at all. For now give it
498+ // some time and if it is still dead, only then try to respawn.
499+ if (! isAlive && attempts >= maxAttempts) {
500+ logger.info(" ${workspace.ideName} is still not alive after $attempts checks, respawning backend and waiting $wait ms to try again" )
501+ accessor.startHostIdeInBackgroundAndDetach(lifetime, ideDir, remoteProjectPath, logsDir)
502+ attempts = 0
503+ } else {
504+ logger.info(" No join link found in status; waiting $wait ms to try again" )
505+ }
506+ } catch (ex: Exception ) {
507+ logger.info(" Failed to get ${workspace.ideName} status from $details ; waiting $wait ms to try again" , ex)
508+ }
509+ delay(wait)
486510 }
511+
512+ // This means the lifetime is no longer alive.
513+ logger.info(" Connection to ${workspace.ideName} on $details aborted by user" )
514+ return null
487515 }
488516
489517 companion object {
0 commit comments