-
-
Notifications
You must be signed in to change notification settings - Fork 273
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix websocket connection retries logic #771
Changes from 4 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -200,8 +200,6 @@ public Thread newThread(@NonNull final Runnable r) { | |
|
||
private Duration noReconnectAfter; | ||
|
||
private Instant firstAttempt; | ||
|
||
/** | ||
* Determines whether the socket will have {@link Socket#setKeepAlive(boolean)} set or not. | ||
* | ||
|
@@ -593,7 +591,7 @@ public void run() { | |
} | ||
|
||
@SuppressFBWarnings( | ||
value = {"REC_CATCH_EXCEPTION", "URLCONNECTION_SSRF_FD"}, | ||
value = {"REC_CATCH_EXCEPTION"}, | ||
justification = "checked exceptions were a mistake to begin with; connecting to Jenkins from agent") | ||
private void runWebSocket() { | ||
try { | ||
|
@@ -785,12 +783,20 @@ public void closeRead() throws IOException { | |
client.getProperties().put(ClientProperties.SSL_ENGINE_CONFIGURATOR, sslEngineConfigurator); | ||
} | ||
} | ||
container.connectToServer( | ||
new AgentEndpoint(), | ||
ClientEndpointConfig.Builder.create() | ||
.configurator(headerHandler) | ||
.build(), | ||
URI.create(wsUrl + "wsagents/")); | ||
if (!succeedsWithRetries(this::pingSuccessful)) { | ||
return; | ||
} | ||
if (!succeedsWithRetries(() -> { | ||
container.connectToServer( | ||
new AgentEndpoint(), | ||
ClientEndpointConfig.Builder.create() | ||
.configurator(headerHandler) | ||
.build(), | ||
URI.create(wsUrl + "wsagents/")); | ||
return true; | ||
})) { | ||
return; | ||
} | ||
while (ch.get() == null) { | ||
Thread.sleep(100); | ||
} | ||
|
@@ -801,38 +807,109 @@ public void closeRead() throws IOException { | |
if (noReconnect) { | ||
return; | ||
} | ||
firstAttempt = Instant.now(); | ||
events.onDisconnect(); | ||
while (true) { | ||
// TODO refactor various sleep statements into a common method | ||
if (Util.shouldBailOut(firstAttempt, noReconnectAfter)) { | ||
events.status("Bailing out after " + DurationFormatter.format(noReconnectAfter)); | ||
return; | ||
} | ||
TimeUnit.SECONDS.sleep(10); | ||
// Unlike JnlpAgentEndpointResolver, we do not use $jenkins/tcpSlaveAgentListener/, as that will be | ||
// a 404 if the TCP port is disabled. | ||
URL ping = new URL(hudsonUrl, "login"); | ||
try { | ||
HttpURLConnection conn = (HttpURLConnection) ping.openConnection(); | ||
int status = conn.getResponseCode(); | ||
conn.disconnect(); | ||
if (status == 200) { | ||
break; | ||
} else { | ||
events.status(ping + " is not ready: " + status); | ||
} | ||
} catch (IOException x) { | ||
events.status(ping + " is not ready", x); | ||
} | ||
} | ||
reconnect(); | ||
} | ||
} catch (Exception e) { | ||
events.error(e); | ||
} | ||
} | ||
|
||
/** | ||
* Evaluates a condition with exponential backoff until it succeeds or the timeout is reached. | ||
* @param condition the condition to attempt to succeed with exponential backoff | ||
* @return true if the condition succeeded, false if the condition failed and the timeout was reached | ||
* @throws InterruptedException if the thread was interrupted while waiting. | ||
*/ | ||
private boolean succeedsWithRetries(java.util.concurrent.Callable<Boolean> condition) throws InterruptedException { | ||
var exponentialRetry = new ExponentialRetry(noReconnectAfter); | ||
while (exponentialRetry != null) { | ||
try { | ||
if (condition.call()) { | ||
return true; | ||
} | ||
} catch (Exception x) { | ||
events.status("Failed to connect: " + x.getMessage()); | ||
} | ||
exponentialRetry = exponentialRetry.next(events); | ||
} | ||
return false; | ||
} | ||
|
||
@SuppressFBWarnings( | ||
value = {"URLCONNECTION_SSRF_FD"}, | ||
justification = "url is provided by the user, and we are trying to connect to it") | ||
private Boolean pingSuccessful() throws MalformedURLException { | ||
// Unlike JnlpAgentEndpointResolver, we do not use $jenkins/tcpSlaveAgentListener/, as that will be | ||
// a 404 if the TCP port is disabled. | ||
URL ping = new URL(hudsonUrl, "login"); | ||
try { | ||
HttpURLConnection conn = (HttpURLConnection) ping.openConnection(); | ||
int status = conn.getResponseCode(); | ||
conn.disconnect(); | ||
if (status == 200) { | ||
return true; | ||
} else { | ||
events.status(ping + " is not ready: " + status); | ||
} | ||
} catch (IOException x) { | ||
events.status(ping + " is not ready", x); | ||
} | ||
return false; | ||
} | ||
|
||
private static class ExponentialRetry { | ||
final int factor; | ||
final Instant beginning; | ||
final Duration delay; | ||
final Duration timeout; | ||
final Duration incrementDelay; | ||
final Duration maxDelay; | ||
|
||
ExponentialRetry(Duration timeout) { | ||
this(Duration.ofSeconds(0), timeout, 2, Duration.ofSeconds(1), Duration.ofSeconds(10)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. compare user-configurable version in #676 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. jitter could be useful, but exposing these as user-level settings seem overkill to me. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agreed, just noting for reference. |
||
} | ||
|
||
ExponentialRetry( | ||
Duration initialDelay, Duration timeout, int factor, Duration incrementDelay, Duration maxDelay) { | ||
this.beginning = Instant.now(); | ||
this.delay = initialDelay; | ||
this.timeout = timeout; | ||
this.factor = factor; | ||
this.incrementDelay = incrementDelay; | ||
this.maxDelay = maxDelay; | ||
} | ||
|
||
ExponentialRetry(ExponentialRetry previous) { | ||
beginning = previous.beginning; | ||
factor = previous.factor; | ||
timeout = previous.timeout; | ||
incrementDelay = previous.incrementDelay; | ||
maxDelay = previous.maxDelay; | ||
delay = min(maxDelay, previous.delay.multipliedBy(previous.factor).plus(incrementDelay)); | ||
} | ||
|
||
private static Duration min(Duration a, Duration b) { | ||
return a.compareTo(b) < 0 ? a : b; | ||
} | ||
|
||
boolean timeoutExceeded() { | ||
return Util.shouldBailOut(beginning, timeout); | ||
} | ||
|
||
ExponentialRetry next(EngineListenerSplitter events) throws InterruptedException { | ||
var next = new ExponentialRetry(this); | ||
if (next.timeoutExceeded()) { | ||
events.status("Bailing out after " + DurationFormatter.format(next.timeout)); | ||
return null; | ||
} else { | ||
events.status("Waiting " + DurationFormatter.format(next.delay) + " before retry"); | ||
TimeUnit.SECONDS.sleep(next.delay.toSeconds()); | ||
Vlatombe marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
return next; | ||
} | ||
} | ||
|
||
private void reconnect() { | ||
try { | ||
events.status("Performing onReconnect operation."); | ||
|
@@ -862,7 +939,7 @@ private void innerRun(IOHub hub, SSLContext context, ExecutorService service) { | |
|
||
try { | ||
boolean first = true; | ||
firstAttempt = Instant.now(); | ||
var firstAttempt = Instant.now(); | ||
while (true) { | ||
if (first) { | ||
first = false; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We are losing the stack trace here; is that potentially important for diagnosis?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In my experience these stacktraces are noisy. I could eventually log them with standard JUL logger (FINE) in parallel, but I don't really see the point of keeping them in status.