Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix websocket connection retries logic #771

Merged
Merged
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
145 changes: 111 additions & 34 deletions src/main/java/hudson/remoting/Engine.java
Original file line number Diff line number Diff line change
Expand Up @@ -200,8 +200,6 @@ public Thread newThread(@NonNull final Runnable r) {

private Duration noReconnectAfter;

private Instant firstAttempt;

/**
* Determines whether the socket will have {@link Socket#setKeepAlive(boolean)} set or not.
*
Expand Down Expand Up @@ -593,7 +591,7 @@ public void run() {
}

@SuppressFBWarnings(
value = {"REC_CATCH_EXCEPTION", "URLCONNECTION_SSRF_FD"},
value = {"REC_CATCH_EXCEPTION"},
justification = "checked exceptions were a mistake to begin with; connecting to Jenkins from agent")
private void runWebSocket() {
try {
Expand Down Expand Up @@ -785,12 +783,20 @@ public void closeRead() throws IOException {
client.getProperties().put(ClientProperties.SSL_ENGINE_CONFIGURATOR, sslEngineConfigurator);
}
}
container.connectToServer(
new AgentEndpoint(),
ClientEndpointConfig.Builder.create()
.configurator(headerHandler)
.build(),
URI.create(wsUrl + "wsagents/"));
if (!succeedsWithRetries(this::pingSuccessful)) {
return;
}
if (!succeedsWithRetries(() -> {
container.connectToServer(
new AgentEndpoint(),
ClientEndpointConfig.Builder.create()
.configurator(headerHandler)
.build(),
URI.create(wsUrl + "wsagents/"));
return true;
})) {
return;
}
while (ch.get() == null) {
Thread.sleep(100);
}
Expand All @@ -801,38 +807,109 @@ public void closeRead() throws IOException {
if (noReconnect) {
return;
}
firstAttempt = Instant.now();
events.onDisconnect();
while (true) {
// TODO refactor various sleep statements into a common method
if (Util.shouldBailOut(firstAttempt, noReconnectAfter)) {
events.status("Bailing out after " + DurationFormatter.format(noReconnectAfter));
return;
}
TimeUnit.SECONDS.sleep(10);
// Unlike JnlpAgentEndpointResolver, we do not use $jenkins/tcpSlaveAgentListener/, as that will be
// a 404 if the TCP port is disabled.
URL ping = new URL(hudsonUrl, "login");
try {
HttpURLConnection conn = (HttpURLConnection) ping.openConnection();
int status = conn.getResponseCode();
conn.disconnect();
if (status == 200) {
break;
} else {
events.status(ping + " is not ready: " + status);
}
} catch (IOException x) {
events.status(ping + " is not ready", x);
}
}
reconnect();
}
} catch (Exception e) {
events.error(e);
}
}

/**
* Evaluates a condition with exponential backoff until it succeeds or the timeout is reached.
* @param condition the condition to attempt to succeed with exponential backoff
* @return true if the condition succeeded, false if the condition failed and the timeout was reached
* @throws InterruptedException if the thread was interrupted while waiting.
*/
private boolean succeedsWithRetries(java.util.concurrent.Callable<Boolean> condition) throws InterruptedException {
var exponentialRetry = new ExponentialRetry(noReconnectAfter);
while (exponentialRetry != null) {
try {
if (condition.call()) {
return true;
}
} catch (Exception x) {
events.status("Failed to connect: " + x.getMessage());
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We are losing the stack trace here; is that potentially important for diagnosis?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In my experience these stacktraces are noisy. I could eventually log them with standard JUL logger (FINE) in parallel, but I don't really see the point of keeping them in status.

}
exponentialRetry = exponentialRetry.next(events);
}
return false;
}

@SuppressFBWarnings(
value = {"URLCONNECTION_SSRF_FD"},
justification = "url is provided by the user, and we are trying to connect to it")
private Boolean pingSuccessful() throws MalformedURLException {
// Unlike JnlpAgentEndpointResolver, we do not use $jenkins/tcpSlaveAgentListener/, as that will be
// a 404 if the TCP port is disabled.
URL ping = new URL(hudsonUrl, "login");
try {
HttpURLConnection conn = (HttpURLConnection) ping.openConnection();
int status = conn.getResponseCode();
conn.disconnect();
if (status == 200) {
return true;
} else {
events.status(ping + " is not ready: " + status);
}
} catch (IOException x) {
events.status(ping + " is not ready", x);
}
return false;
}

private static class ExponentialRetry {
final int factor;
final Instant beginning;
final Duration delay;
final Duration timeout;
final Duration incrementDelay;
final Duration maxDelay;

ExponentialRetry(Duration timeout) {
this(Duration.ofSeconds(0), timeout, 2, Duration.ofSeconds(1), Duration.ofSeconds(10));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

compare user-configurable version in #676

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

jitter could be useful, but exposing these as user-level settings seem overkill to me.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed, just noting for reference.

}

ExponentialRetry(
Duration initialDelay, Duration timeout, int factor, Duration incrementDelay, Duration maxDelay) {
this.beginning = Instant.now();
this.delay = initialDelay;
this.timeout = timeout;
this.factor = factor;
this.incrementDelay = incrementDelay;
this.maxDelay = maxDelay;
}

ExponentialRetry(ExponentialRetry previous) {
beginning = previous.beginning;
factor = previous.factor;
timeout = previous.timeout;
incrementDelay = previous.incrementDelay;
maxDelay = previous.maxDelay;
delay = min(maxDelay, previous.delay.multipliedBy(previous.factor).plus(incrementDelay));
}

private static Duration min(Duration a, Duration b) {
return a.compareTo(b) < 0 ? a : b;
}

boolean timeoutExceeded() {
return Util.shouldBailOut(beginning, timeout);
}

ExponentialRetry next(EngineListenerSplitter events) throws InterruptedException {
var next = new ExponentialRetry(this);
if (next.timeoutExceeded()) {
events.status("Bailing out after " + DurationFormatter.format(next.timeout));
return null;
} else {
events.status("Waiting " + DurationFormatter.format(next.delay) + " before retry");
TimeUnit.SECONDS.sleep(next.delay.toSeconds());
Vlatombe marked this conversation as resolved.
Show resolved Hide resolved
}
return next;
}
}

private void reconnect() {
try {
events.status("Performing onReconnect operation.");
Expand Down Expand Up @@ -862,7 +939,7 @@ private void innerRun(IOHub hub, SSLContext context, ExecutorService service) {

try {
boolean first = true;
firstAttempt = Instant.now();
var firstAttempt = Instant.now();
while (true) {
if (first) {
first = false;
Expand Down
Loading