Skip to content

Commit

Permalink
Configurable retry delay and jitter
Browse files Browse the repository at this point in the history
  • Loading branch information
basil committed Oct 11, 2023
1 parent b7efaad commit a923bc2
Show file tree
Hide file tree
Showing 5 changed files with 216 additions and 13 deletions.
54 changes: 45 additions & 9 deletions src/main/java/hudson/remoting/Engine.java
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.security.interfaces.RSAPublicKey;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
Expand Down Expand Up @@ -93,7 +94,10 @@
import org.jenkinsci.remoting.protocol.cert.PublicKeyMatchingX509ExtendedTrustManager;
import org.jenkinsci.remoting.protocol.impl.ConnectionRefusalException;
import org.jenkinsci.remoting.util.KeyUtils;
import org.jenkinsci.remoting.util.RetryUtils;
import org.jenkinsci.remoting.util.VersionNumber;
import org.kohsuke.accmod.Restricted;
import org.kohsuke.accmod.restrictions.NoExternalUse;

/**
* Agent engine that proactively connects to Jenkins controller.
Expand Down Expand Up @@ -178,6 +182,12 @@ public Thread newThread(@NonNull final Runnable r) {

private boolean noReconnect = false;

private int delay = 10;

private double jitterFactor = 0;

private int jitter = 0;

/**
* Determines whether the socket will have {@link Socket#setKeepAlive(boolean)} set or not.
*
Expand Down Expand Up @@ -397,6 +407,21 @@ public void setNoReconnect(boolean noReconnect) {
this.noReconnect = noReconnect;
}

/**
 * Sets the base delay between retries, in seconds (the field defaults to 10).
 * The value is handed to {@code RetryUtils.getDuration} whenever this engine
 * waits before retrying, and is copied to the {@code JnlpAgentEndpointResolver}
 * built by {@code createEndpointResolver}. No validation is performed here.
 *
 * @param delay the retry delay in seconds
 */
@Restricted(NoExternalUse.class)
public void setDelay(int delay) {
this.delay = delay;
}

/**
 * Sets the jitter factor used to randomly scale each retry delay (the field
 * defaults to 0, i.e. no proportional jitter). The value is handed to
 * {@code RetryUtils.getDuration}, where a non-zero factor takes precedence
 * over the absolute jitter. No validation is performed here.
 *
 * @param jitterFactor the fraction of the delay by which a retry may vary
 */
@Restricted(NoExternalUse.class)
public void setJitterFactor(double jitterFactor) {
this.jitterFactor = jitterFactor;
}

/**
 * Sets the absolute jitter, in seconds, by which each retry delay may randomly
 * vary (the field defaults to 0, i.e. no absolute jitter). The value is handed
 * to {@code RetryUtils.getDuration}, which only consults it when the jitter
 * factor is 0. No validation is performed here.
 *
 * @param jitter the maximum random offset in seconds
 */
@Restricted(NoExternalUse.class)
public void setJitter(int jitter) {
this.jitter = jitter;
}

/**
* Determines if JNLPAgentEndpointResolver will not perform certificate validation in the HTTPs mode.
*
Expand Down Expand Up @@ -694,7 +719,8 @@ public void closeRead() throws IOException {
}
events.onDisconnect();
while (true) {
TimeUnit.SECONDS.sleep(10);
Duration duration = RetryUtils.getDuration(delay, jitterFactor, jitter);
Thread.sleep(duration.toMillis());
// Unlike JnlpAgentEndpointResolver, we do not use $jenkins/tcpSlaveAgentListener/, as that will be a 404 if the TCP port is disabled.
URL ping = new URL(hudsonUrl, "login");
try {
Expand Down Expand Up @@ -873,25 +899,34 @@ private void innerRun(IOHub hub, SSLContext context, ExecutorService service) {
}

/**
 * Builds the endpoint resolver used to locate the controller.
 *
 * <p>When no direct connection is configured, a {@link JnlpAgentEndpointResolver} is created
 * for the given URLs and inherits this engine's retry settings (delay, jitter factor and
 * jitter). Otherwise a {@link JnlpAgentEndpointConfigurator} for the direct connection is
 * returned.
 *
 * @param jenkinsUrls the candidate controller URLs passed to the resolver
 * @return the resolver matching the configured connection mode
 */
private JnlpEndpointResolver createEndpointResolver(List<String> jenkinsUrls) {
    if (directConnection != null) {
        return new JnlpAgentEndpointConfigurator(directConnection, instanceIdentity, protocols);
    }

    SSLSocketFactory sslSocketFactory = null;
    try {
        sslSocketFactory = getSSLSocketFactory();
    } catch (Exception e) {
        // Best effort: report the failure and continue with a null socket factory.
        events.error(e);
    }

    JnlpAgentEndpointResolver jnlpAgentEndpointResolver =
            new JnlpAgentEndpointResolver(
                    jenkinsUrls,
                    credentials,
                    proxyCredentials,
                    tunnel,
                    sslSocketFactory,
                    disableHttpsCertValidation);
    // Keep the resolver's own retry loop consistent with the engine's retry settings.
    jnlpAgentEndpointResolver.setDelay(delay);
    jnlpAgentEndpointResolver.setJitterFactor(jitterFactor);
    jnlpAgentEndpointResolver.setJitter(jitter);
    return jnlpAgentEndpointResolver;
}

/**
 * Reports that the server rejected the connection and then waits for the configured
 * retry duration (delay plus any jitter) before returning.
 *
 * @param greeting the greeting received from the server, included in the reported exception
 * @throws InterruptedException if the thread is interrupted while sleeping
 */
private void onConnectionRejected(String greeting) throws InterruptedException {
    // Compute the duration once so the logged value and the actual sleep agree.
    Duration duration = RetryUtils.getDuration(delay, jitterFactor, jitter);
    events.status(
            "reconnect rejected, sleeping " + duration.getSeconds() + "s: ",
            new Exception("The server rejected the connection: " + greeting));
    Thread.sleep(duration.toMillis());
}

/**
Expand All @@ -913,7 +948,8 @@ private Socket connectTcp(@NonNull JnlpAgentEndpoint endpoint) throws IOExceptio
if(retry++>10) {
throw e;
}
TimeUnit.SECONDS.sleep(10);
Duration duration = RetryUtils.getDuration(delay, jitterFactor, jitter);
Thread.sleep(duration.toMillis());
events.status(msg+" (retrying:"+retry+")",e);
}
}
Expand Down
52 changes: 49 additions & 3 deletions src/main/java/hudson/remoting/Launcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import hudson.remoting.Channel.Mode;
import org.jenkinsci.remoting.engine.WorkDirManager;
import org.jenkinsci.remoting.util.PathUtils;
import org.jenkinsci.remoting.util.RetryUtils;
import org.jenkinsci.remoting.util.https.NoCheckHostnameVerifier;
import org.jenkinsci.remoting.util.https.NoCheckTrustManager;
import org.kohsuke.args4j.CmdLineException;
Expand Down Expand Up @@ -84,14 +85,14 @@
import java.security.SecureRandom;
import java.security.cert.CertificateException;
import java.security.cert.CertificateFactory;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Base64;
import java.util.List;
import java.util.Locale;
import java.util.Properties;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level;
import java.util.logging.Logger;

Expand Down Expand Up @@ -228,6 +229,25 @@ public void setNoCertificateCheck(boolean ignored) throws NoSuchAlgorithmExcepti
@Option(name="-noReconnect",usage="Doesn't try to reconnect when a communication fail, and exit instead")
public boolean noReconnect = false;

@Option(name = "-delay",
usage = "The fixed delay to occur between retries. Default is 10 seconds.")
public int delay = 10;

@Option(name = "-jitterFactor",
usage = "The jitter factor to randomly vary retry delays by. For each retry delay, a"
+ " random portion of the delay multiplied by the jitter factor will be"
+ " added or subtracted to the delay. For example: a retry delay of 10"
+ " seconds and a jitter factor of .25 will result in a random retry delay"
+ " between 7.5 and 12.5 seconds.")
public double jitterFactor = 0;

@Option(name = "-jitter",
usage = "The jitter to randomly vary retry delays by. For each retry delay, a random"
+ " portion of the jitter will be added or subtracted to the delay. For"
+ " example: a jitter of 5 seconds will randomly add between -5 and 5"
+ " seconds to each retry delay.")
public int jitter = 0;

@Option(name = "-noKeepAlive",
usage = "Disable TCP socket keep alive on connection to the controller.")
public boolean noKeepAlive = false;
Expand Down Expand Up @@ -313,6 +333,19 @@ public void run() throws Exception {
return;
}

if (jitterFactor != 0 && jitter != 0) {
throw new CmdLineException("Jitter factor and jitter are mutually exclusive");
}
if (jitterFactor < 0 || jitterFactor > 1) {
throw new CmdLineException("Jitter factor must be >= 0 and <= 1");
}
if (jitter < 0) {
throw new CmdLineException("Jitter must be >= 0");
}
if (jitter > 0 && jitter >= delay) {
throw new CmdLineException("Jitter must be < delay");
}

// Create and verify working directory and logging
// TODO: The pass-through for the JNLP mode has been added in JENKINS-39817. But we still need to keep this parameter in
// consideration for other modes (TcpServer, TcpClient, etc.) to retain the legacy behavior.
Expand Down Expand Up @@ -351,6 +384,18 @@ public void run() throws Exception {
if (this.noReconnect) {
jnlpArgs.add("-noreconnect");
}
if (this.delay != 10) {
jnlpArgs.add("-delay");
jnlpArgs.add(Integer.toString(this.delay));
}
if (this.jitterFactor != 0) {
jnlpArgs.add("-jitterFactor");
jnlpArgs.add(Double.toString(this.jitterFactor));
}
if (this.jitter != 0) {
jnlpArgs.add("-jitter");
jnlpArgs.add(Integer.toString(this.jitter));
}
if (this.noKeepAlive) {
jnlpArgs.add("-noKeepAlive");
}
Expand Down Expand Up @@ -568,8 +613,9 @@ public List<String> parseJnlpArguments() throws ParserConfigurationException, SA

System.err.println("Failed to obtain "+ agentJnlpURL);
e.printStackTrace(System.err);
System.err.println("Waiting 10 seconds before retry");
TimeUnit.SECONDS.sleep(10);
Duration duration = RetryUtils.getDuration(delay, jitterFactor, jitter);
System.err.println("Waiting " + duration.getSeconds() + " seconds before retry");
Thread.sleep(duration.toMillis());
// retry
} finally {
if (con instanceof HttpURLConnection) {
Expand Down
34 changes: 34 additions & 0 deletions src/main/java/hudson/remoting/jnlp/Main.java
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,25 @@ public class Main {
usage="If the connection ends, don't retry and just exit.")
public boolean noReconnect = false;

@Option(name = "-delay",
usage = "The fixed delay to occur between retries. Default is 10 seconds.")
public int delay = 10;

@Option(name = "-jitterFactor",
usage = "The jitter factor to randomly vary retry delays by. For each retry delay, a"
+ " random portion of the delay multiplied by the jitter factor will be"
+ " added or subtracted to the delay. For example: a retry delay of 10"
+ " seconds and a jitter factor of .25 will result in a random retry delay"
+ " between 7.5 and 12.5 seconds.")
public double jitterFactor = 0;

@Option(name = "-jitter",
usage = "The jitter to randomly vary retry delays by. For each retry delay, a random"
+ " portion of the jitter will be added or subtracted to the delay. For"
+ " example: a jitter of 5 seconds will randomly add between -5 and 5"
+ " seconds to each retry delay.")
public int jitter = 0;

@Option(name="-noKeepAlive",
usage="Disable TCP socket keep alive on connection to the controller.")
public boolean noKeepAlive = false;
Expand Down Expand Up @@ -268,6 +287,18 @@ public static void _main(String[] args) throws IOException, InterruptedException
throw new CmdLineException(p, "-webSocket supports only a single -url", null);
}
}
if (m.jitterFactor != 0 && m.jitter != 0) {
throw new CmdLineException("Jitter factor and jitter are mutually exclusive");
}
if (m.jitterFactor < 0 || m.jitterFactor > 1) {
throw new CmdLineException("Jitter factor must be >= 0 and <= 1");
}
if (m.jitter < 0) {
throw new CmdLineException("Jitter must be >= 0");
}
if (m.jitter > 0 && m.jitter >= m.delay) {
throw new CmdLineException("Jitter must be < delay");
}
m.main();
}

Expand Down Expand Up @@ -304,6 +335,9 @@ public Engine createEngine() {
if(jarCache!=null)
engine.setJarCache(new FileSystemJarCache(jarCache,true));
engine.setNoReconnect(noReconnect);
engine.setDelay(delay);
engine.setJitterFactor(jitterFactor);
engine.setJitter(jitter);
engine.setKeepAlive(!noKeepAlive);

if (disableHttpsCertValidation) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,12 @@
import hudson.remoting.Engine;
import hudson.remoting.Launcher;
import hudson.remoting.NoProxyEvaluator;
import org.jenkinsci.remoting.util.RetryUtils;
import org.jenkinsci.remoting.util.VersionNumber;
import org.jenkinsci.remoting.util.https.NoCheckHostnameVerifier;
import org.jenkinsci.remoting.util.https.NoCheckTrustManager;
import org.kohsuke.accmod.Restricted;
import org.kohsuke.accmod.restrictions.NoExternalUse;

import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLContext;
Expand All @@ -57,6 +60,7 @@
import java.security.SecureRandom;
import java.security.interfaces.RSAPublicKey;
import java.security.spec.InvalidKeySpecException;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Base64;
Expand Down Expand Up @@ -97,6 +101,12 @@ public class JnlpAgentEndpointResolver extends JnlpEndpointResolver {

private boolean disableHttpsCertValidation;

private int delay = 10;

private double jitterFactor = 0;

private int jitter = 0;

/**
* If specified, only the protocols from the list will be tried during the connection.
* The option provides protocol names, but the order of the check is defined internally and cannot be changed.
Expand Down Expand Up @@ -184,6 +194,21 @@ public void setDisableHttpsCertValidation(boolean disableHttpsCertValidation) {
this.disableHttpsCertValidation = disableHttpsCertValidation;
}

/**
 * Sets the base delay, in seconds, that {@code waitForReady} sleeps between
 * attempts (the field defaults to 10). The value is handed to
 * {@code RetryUtils.getDuration}. No validation is performed here.
 *
 * @param delay the retry delay in seconds
 */
@Restricted(NoExternalUse.class)
public void setDelay(int delay) {
this.delay = delay;
}

/**
 * Sets the jitter factor used to randomly scale each retry delay (the field
 * defaults to 0). The value is handed to {@code RetryUtils.getDuration},
 * where a non-zero factor takes precedence over the absolute jitter.
 * No validation is performed here.
 *
 * @param jitterFactor the fraction of the delay by which a retry may vary
 */
@Restricted(NoExternalUse.class)
public void setJitterFactor(double jitterFactor) {
this.jitterFactor = jitterFactor;
}

/**
 * Sets the absolute jitter, in seconds, by which each retry delay may randomly
 * vary (the field defaults to 0). The value is handed to
 * {@code RetryUtils.getDuration}, which only consults it when the jitter
 * factor is 0. No validation is performed here.
 *
 * @param jitter the maximum random offset in seconds
 */
@Restricted(NoExternalUse.class)
public void setJitter(int jitter) {
this.jitter = jitter;
}

@CheckForNull
@Override
public JnlpAgentEndpoint resolve() throws IOException {
Expand Down Expand Up @@ -378,7 +403,8 @@ public void waitForReady() throws InterruptedException {
try {
int retries = 0;
while (true) {
Thread.sleep(1000 * 10);
Duration duration = RetryUtils.getDuration(delay, jitterFactor, jitter);
Thread.sleep(duration.toMillis());
try {
// Jenkins top page might be read-protected. see http://www.nabble
// .com/more-lenient-retry-logic-in-Engine.waitForServerToBack-td24703172.html
Expand Down
61 changes: 61 additions & 0 deletions src/main/java/org/jenkinsci/remoting/util/RetryUtils.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
/*
* The MIT License
*
* Copyright (c) 2023, CloudBees, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/

package org.jenkinsci.remoting.util;

import java.security.SecureRandom;
import java.time.Duration;
import java.util.Random;
import org.kohsuke.accmod.Restricted;
import org.kohsuke.accmod.restrictions.NoExternalUse;

/**
* Retry-related utility methods. Used in place of a library like <a
* href="https://failsafe.dev/">Failsafe</a> to minimize external third-party dependencies.
*/
@Restricted(NoExternalUse.class)
public class RetryUtils {

    private static final Random RANDOM = new SecureRandom();

    // Suppress default constructor for noninstantiability
    private RetryUtils() {
        throw new AssertionError();
    }

    /**
     * Computes how long to wait before the next retry, based on the CLI arguments.
     *
     * <ul>
     *   <li>If {@code jitterFactor} is non-zero, the delay is scaled by a random factor in
     *       roughly {@code [1 - jitterFactor, 1 + jitterFactor]}; {@code jitter} is ignored.
     *   <li>Otherwise, if {@code jitter} is non-zero, a random offset of up to {@code jitter}
     *       seconds is added to or subtracted from the delay.
     *   <li>Otherwise the fixed delay is returned.
     * </ul>
     *
     * <p>The result is never negative: callers pass it straight to {@link Thread#sleep(long)},
     * which rejects negative values, so inputs that would yield a negative wait (a negative
     * delay, a jitter larger than the delay, or a jitter factor above 1) are clamped to zero.
     *
     * @param delay the base delay in seconds
     * @param jitterFactor the fraction of the delay by which the result may vary; 0 disables it
     * @param jitter the maximum random offset in seconds; 0 disables it
     * @return the non-negative duration to wait before retrying
     */
    public static Duration getDuration(int delay, double jitterFactor, int jitter) {
        long baseMillis = Duration.ofSeconds(delay).toMillis();
        long millis;
        if (jitterFactor != 0) {
            // (1 - r * 2) maps r in [0, 1) onto (-1, 1], giving a symmetric spread around 1.
            double randomFactor = 1 + (1 - RANDOM.nextDouble() * 2) * jitterFactor;
            millis = (long) (baseMillis * randomFactor);
        } else if (jitter != 0) {
            double randomAddend = (1 - RANDOM.nextDouble() * 2) * Duration.ofSeconds(jitter).toMillis();
            millis = (long) (baseMillis + randomAddend);
        } else {
            millis = baseMillis;
        }
        return Duration.ofMillis(Math.max(0L, millis));
    }
}

0 comments on commit a923bc2

Please sign in to comment.