Spring batch Spring XD中自动重试失败的作业
标签:spring-batch, spring-xd
我正在寻找一种标准模式,用于在 Spring XD 中自动重试失败的作业:重试次数可配置,并且在指定的延迟之后再重试。具体来说,我有一个 HTTP 条目读取器(item reader)作业,它由 cron 流定期触发。有时,HTTP 条目读取器会因为网络故障而失败,因此希望作业能够自动重试。
我曾经尝试过一种当作业失败时会自动恢复的方法,但最棘手的是重新运行失败的作业。我可以通过向 XD 管理端(admin)控制器发送 HTTP PUT 请求来触发重启,例如:
http://xd-server:9393/jobs/executions/2?restart=true
它成功地重试了该作业。但是,我希望能够:
- 在重试之前指定延迟
- 在XD中进行某种审核,以指示作业将在X秒内重试
有人能提出一种模式吗?
回答:最终,我选择了以下解决方案。首先,创建一个作业执行侦听器:
/**
 * Job execution listener that re-launches a failed job a limited number of times.
 *
 * <p>When a job finishes with {@link ExitStatus#FAILED} and one of its failure
 * exceptions (or that exception's direct cause) is of a configured type, the
 * listener launches the job registered under {@link #JOB_RESTARTER_NAME},
 * passing it the failed execution id, the job name and the configured delay as
 * job parameters. Once {@code maxRestartAttempts} re-launches have been made for
 * a job instance, the execution is marked {@link BatchStatus#ABANDONED} instead.
 *
 * <p>Restart counts are held in memory in a plain {@code HashMap}; they are not
 * persisted and no synchronisation is applied.
 *
 * <p>NOTE(review): the {@code @see} on {@link #afterJob} points at
 * {@code org.springframework.batch.core.JobExecutionListener}, which is an
 * interface in Spring Batch. If that is the type in scope here, this class must
 * use {@code implements} (and drop the {@code super} call) or extend
 * {@code JobExecutionListenerSupport} - confirm against the project's imports.
 */
public class RestartableBatchJobExecutionListener extends JobExecutionListener {

    private Logger logger = LoggerFactory.getLogger(this.getClass());

    /** Name under which the delayed-restart job is looked up in the job locator. */
    public final static String JOB_RESTARTER_NAME = "jobRestarter";

    /**
     * Exception types that are permissible to restart the job on. Matching is by
     * exact class, against the failure exception and its direct cause.
     */
    private List<Class<? extends Throwable>> exceptionsToRestartOn = new ArrayList<Class<? extends Throwable>>();

    /**
     * The maximum number of times the job can be re-launched before failing
     */
    private int maxRestartAttempts = 0;

    /**
     * The amount of time to wait in milliseconds before restarting a job
     */
    private long restartDelayMs = 0;

    /**
     * Map of job instance id to the number of re-launches already triggered for it
     */
    private HashMap<Long,Integer> jobInstanceRestartCount = new HashMap<Long,Integer>();

    @Autowired(required=false)
    @Qualifier("aynchJobLauncher")
    JobLauncher aynchJobLauncher;

    @Autowired(required=false)
    @Qualifier("jobRegistry")
    JobLocator jobLocator;

    /**
     * Sets the exception types that make a failed job eligible for restart.
     *
     * @param exceptionsToRestartOn exception classes to restart on; {@code null} clears the list
     */
    public void setExceptionsToRestartOn(List<Class<? extends Throwable>> exceptionsToRestartOn)
    {
        // Defensive copy so later changes to the caller's list do not alter the policy.
        this.exceptionsToRestartOn = new ArrayList<Class<? extends Throwable>>();
        if( exceptionsToRestartOn != null )
        {
            this.exceptionsToRestartOn.addAll(exceptionsToRestartOn);
        }
    }

    /**
     * @param maxRestartAttempts maximum number of re-launches per job instance; values
     *                           of zero or less disable the restart policy
     */
    public void setMaxRestartAttempts(int maxRestartAttempts)
    {
        this.maxRestartAttempts = maxRestartAttempts;
    }

    /**
     * @param restartDelayMs delay in milliseconds handed to the restarter job as the
     *                       {@code delay} job parameter
     */
    public void setRestartDelayMs(long restartDelayMs)
    {
        this.restartDelayMs = restartDelayMs;
    }

    /*
     * (non-Javadoc)
     * @see org.springframework.batch.core.JobExecutionListener#afterJob(org.springframework.batch.core.JobExecution)
     */
    @Override
    public void afterJob(JobExecution jobExecution) {
        super.afterJob(jobExecution);
        // Check if we can restart if the job has failed
        if( jobExecution.getExitStatus().equals(ExitStatus.FAILED) )
        {
            applyRetryPolicy(jobExecution);
        }
    }

    /**
     * Executes the restart policy if one has been specified.
     *
     * <p>Only the first failure exception that matches the configured types is
     * acted upon; it either triggers a re-launch or, when the limit has been
     * reached, marks the execution as abandoned.
     *
     * @param jobExecution the failed execution
     * @throws RuntimeException if the re-launch or the status change fails
     */
    private void applyRetryPolicy(JobExecution jobExecution)
    {
        if( exceptionsToRestartOn.isEmpty() || maxRestartAttempts <= 0 )
        {
            // No policy configured.
            return;
        }

        String jobName = jobExecution.getJobInstance().getJobName();
        Long instanceId = jobExecution.getJobInstance().getInstanceId();

        for( Throwable reason : jobExecution.getAllFailureExceptions() )
        {
            if( !isRestartable(reason) )
            {
                continue;
            }

            // Number of the attempt we are about to make (1-based).
            Integer previousRestarts = jobInstanceRestartCount.get(instanceId);
            int attempt = ( previousRestarts == null ? 0 : previousRestarts ) + 1;

            // Allow attempts 1..maxRestartAttempts inclusive, so the configured
            // maximum is the number of re-launches actually performed.
            if( attempt <= maxRestartAttempts )
            {
                try
                {
                    reLaunchJob(jobExecution, reason, attempt);
                    // Record the attempt only once the restarter has been launched.
                    jobInstanceRestartCount.put(instanceId, attempt);
                }
                catch (Exception e)
                {
                    String message = "The following error occurred while attempting to re-run job " + jobName + ":" + e.getMessage();
                    logger.error(message,e);
                    throw new RuntimeException( message,e);
                }
            }
            else
            {
                logger.error("Failed to successfully execute jobInstanceId {} of job {} after reaching the maximum restart limit of {}. Abandoning job",instanceId,jobName,maxRestartAttempts );
                try
                {
                    jobExecution.setStatus(BatchStatus.ABANDONED);
                }
                catch (Exception e)
                {
                    throw new RuntimeException( "The following error occurred while attempting to abandon job " + jobName + ":" + e.getMessage(),e);
                }
            }
            // The first matching exception decides the outcome.
            return;
        }
    }

    /**
     * Tells whether a failure exception, or its direct cause, is of a configured type.
     *
     * @param reason a failure exception reported by the job execution
     * @return {@code true} if the exception's class or its cause's class is configured
     */
    private boolean isRestartable(Throwable reason)
    {
        if( exceptionsToRestartOn.contains(reason.getClass()) )
        {
            return true;
        }
        // A failure exception need not have a cause; guard against null.
        Throwable cause = reason.getCause();
        return cause != null && exceptionsToRestartOn.contains(cause.getClass());
    }

    /**
     * Launches the restarter job with the details of the failed execution.
     *
     * @param jobExecution the failed execution to be restarted
     * @param reason the exception that made the execution eligible for restart (logged only)
     * @param restartCount 1-based number of this restart attempt (logged only)
     * @throws JobParametersInvalidException propagated from the job launcher
     * @throws JobInstanceAlreadyCompleteException propagated from the job launcher
     * @throws JobRestartException propagated from the job launcher
     * @throws JobExecutionAlreadyRunningException propagated from the job launcher
     * @throws IllegalStateException if the job locator or job launcher was not injected
     * @throws RuntimeException if no job named {@link #JOB_RESTARTER_NAME} is registered
     */
    private void reLaunchJob( JobExecution jobExecution, Throwable reason, int restartCount ) throws JobExecutionAlreadyRunningException, JobRestartException, JobInstanceAlreadyCompleteException, JobParametersInvalidException
    {
        // Both collaborators are autowired with required=false, so they may be absent.
        if( jobLocator == null || aynchJobLauncher == null )
        {
            throw new IllegalStateException("Cannot re-launch job: jobRegistry and/or aynchJobLauncher bean is not available");
        }
        try
        {
            Job jobRestarter = jobLocator.getJob(JOB_RESTARTER_NAME);
            String failedJobName = jobExecution.getJobInstance().getJobName();
            JobParameters jobParameters = new JobParametersBuilder()
                    .addLong("delay", restartDelayMs)
                    .addLong("jobExecutionId", jobExecution.getId())
                    .addString("jobName", failedJobName)
                    .toJobParameters();
            logger.info("Re-launching job with name {} due to exception {}. Attempt {} of {}", failedJobName, reason, restartCount, maxRestartAttempts);
            aynchJobLauncher.run(jobRestarter, jobParameters);
        }
        catch (NoSuchJobException e)
        {
            throw new RuntimeException("Failed to find the job restarter with name=" + JOB_RESTARTER_NAME + " in container context",e);
        }
    }
}
然后,我有一个单独的处理器模块,用于在延迟后重新启动作业(这允许我们从spring XD ui或db进行审计):
delayedJobRestart.xml:
<!-- Two-step job: wait for the configured delay, then ask the XD admin server to restart the failed execution. -->
<batch:job id="delayedRestartJob">
<!-- Step 1: pause via the sleepTasklet bean. -->
<batch:step id="sleep" next="restartJob">
<batch:tasklet ref="sleepTasklet" />
</batch:step>
<!-- Step 2: issue the restart request via the jobRestarter bean. -->
<batch:step id="restartJob">
<batch:tasklet ref="jobRestarter" />
</batch:step>
</batch:job>
<!-- delayMs comes from the 'delay' job parameter when present, otherwise from the ${delay} placeholder. -->
<bean id="sleepTasklet" class="com.mycorp.SleepTasklet" scope="step">
<property name="delayMs" value="#{jobParameters['delay'] != null ? jobParameters['delay'] : '${delay}'}" />
</bean>
<!-- Sends HTTP PUT to /jobs/executions/{id}?restart=true on the XD admin host; the id comes from the
     'jobExecutionId' job parameter when present, otherwise from the ${jobExecutionId} placeholder. -->
<bean id="jobRestarter" class="com.mycorp.HttpRequestTasklet" init-method="init" scope="step">
<property name="uri" value="http://${xd.admin.ui.host}:${xd.admin.ui.port}/jobs/executions/#{jobParameters['jobExecutionId'] != null ? jobParameters['jobExecutionId'] : '${jobExecutionId}'}?restart=true" />
<property name="method" value="PUT" />
</bean>
和附带的助手bean:
SleepTasklet:
/**
 * Tasklet that blocks the executing thread for a configurable number of
 * milliseconds and then reports the step as finished.
 */
public class SleepTasklet implements Tasklet
{
    private static final Logger logger = LoggerFactory.getLogger(SleepTasklet.class);

    /** How long {@link #execute} sleeps, in milliseconds. */
    private long delayMs;

    /**
     * Sleeps for {@code delayMs} milliseconds.
     *
     * @param contribution not used
     * @param chunkContext not used
     * @return {@link RepeatStatus#FINISHED} once the delay has elapsed
     * @throws InterruptedException if the thread is interrupted while sleeping; the
     *                              interrupt flag is restored before rethrowing
     */
    @Override
    public RepeatStatus execute(StepContribution contribution, ChunkContext chunkContext) throws Exception
    {
        logger.debug("Pausing current job for {}ms",delayMs);
        try
        {
            Thread.sleep( delayMs );
        }
        catch (InterruptedException e)
        {
            // Thread.sleep clears the interrupt flag when it throws; restore it so
            // code further up the stack can still observe the interruption.
            Thread.currentThread().interrupt();
            throw e;
        }
        return RepeatStatus.FINISHED;
    }

    public long getDelayMs()
    {
        return delayMs;
    }

    public void setDelayMs(long delayMs)
    {
        this.delayMs = delayMs;
    }
}
HttpRequestTasklet:
public class HttpRequestTasklet implements Tasklet
{
private HttpClient httpClient = null;
private static final Logger LOGGER = LoggerFactory.getLogger(HttpRequestTasklet.class);
private String uri;
private String method;
/**
* Initialise HTTP connection.
* @throws Exception
*/
public void init() throws Exception
{
// Create client
RequestConfig config = RequestConfig.custom()
.setCircularRedirectsAllowed(true)
.setRedirectsEnabled(true)
.setExpectContinueEnabled(true)
.setRelativeRedirectsAllowed(true)
.build();
httpClient = HttpClientBuilder.create()
.setRedirectStrategy(new LaxRedirectStrategy())
.setDefaultRequestConfig(config)
.setMaxConnTotal(1)
.build();
}
@Override
public RepeatStatus execute(StepContribution contribution, ChunkContext chunkContext) throws Exception
{
if (LOGGER.isDebugEnabled()) LOGGER.debug("Attempt HTTP {} from '" + uri + "'...",method);
HttpUriRequest request = null;
switch( method.toUpperCase() )
{
case "GET":
request = new HttpGet(uri);
break;
case "POST":
request = new HttpPost(uri);
break;
case "PUT":
request = new HttpPut(uri);
break;
default:
throw new RuntimeException("Http request method " + method + " not supported");
}
HttpResponse response = httpClient.execute(request);
// Check response status and, if valid wrap with InputStreamReader
StatusLine status = response.getStatusLine();
if (status.getStatusCode() != HttpStatus.SC_OK)
{
throw new Exception("Failed to get data from '" + uri + "': " + status.getReasonPhrase());
}
if (LOGGER.isDebugEnabled()) LOGGER.debug("Successfully issued request");
return RepeatStatus.FINISHED;
}
public String getUri()
{
return uri;
}
public void setUri(String uri)
{
this.uri = uri;
}
public String getMethod()
{
return method;
}
public void setMethod(String method)
{
this.method = method;
}
public HttpClient getHttpClient()
{
return httpClient;
}
public void setHttpClient(HttpClient httpClient)
{
this.httpClient = httpClient;
}
}
最后,在构建并部署所有作业模块之后,成对创建作业(注意:负责重启的作业必须命名为“jobRestarter”,因为侦听器正是按这个名称查找它的):
这个方案有点绕,但似乎行之有效。
# Module options for the delayed-restart job.

# Job execution ID (numeric ID of the failed execution that should be restarted)
options.jobExecutionId.type=Long
options.jobExecutionId.description=The job execution ID of the job to be restarted
# Job name (informational only, per the description below)
options.jobName.type=String
options.jobName.description=The name of the job to be restarted. This is more for monitoring purposes
# Delay in milliseconds before the restart is triggered; defaults to 10000
options.delay.type=Long
options.delay.description=The delay in milliseconds this job will wait until triggering the restart
options.delay.default=10000
/**
 * Tasklet that blocks the executing thread for a configurable number of
 * milliseconds and then reports the step as finished.
 */
public class SleepTasklet implements Tasklet
{
    private static final Logger logger = LoggerFactory.getLogger(SleepTasklet.class);

    /** How long {@link #execute} sleeps, in milliseconds. */
    private long delayMs;

    /**
     * Sleeps for {@code delayMs} milliseconds.
     *
     * @param contribution not used
     * @param chunkContext not used
     * @return {@link RepeatStatus#FINISHED} once the delay has elapsed
     * @throws InterruptedException if the thread is interrupted while sleeping; the
     *                              interrupt flag is restored before rethrowing
     */
    @Override
    public RepeatStatus execute(StepContribution contribution, ChunkContext chunkContext) throws Exception
    {
        logger.debug("Pausing current job for {}ms",delayMs);
        try
        {
            Thread.sleep( delayMs );
        }
        catch (InterruptedException e)
        {
            // Thread.sleep clears the interrupt flag when it throws; restore it so
            // code further up the stack can still observe the interruption.
            Thread.currentThread().interrupt();
            throw e;
        }
        return RepeatStatus.FINISHED;
    }

    public long getDelayMs()
    {
        return delayMs;
    }

    public void setDelayMs(long delayMs)
    {
        this.delayMs = delayMs;
    }
}
public class HttpRequestTasklet implements Tasklet
{
private HttpClient httpClient = null;
private static final Logger LOGGER = LoggerFactory.getLogger(HttpRequestTasklet.class);
private String uri;
private String method;
/**
* Initialise HTTP connection.
* @throws Exception
*/
public void init() throws Exception
{
// Create client
RequestConfig config = RequestConfig.custom()
.setCircularRedirectsAllowed(true)
.setRedirectsEnabled(true)
.setExpectContinueEnabled(true)
.setRelativeRedirectsAllowed(true)
.build();
httpClient = HttpClientBuilder.create()
.setRedirectStrategy(new LaxRedirectStrategy())
.setDefaultRequestConfig(config)
.setMaxConnTotal(1)
.build();
}
@Override
public RepeatStatus execute(StepContribution contribution, ChunkContext chunkContext) throws Exception
{
if (LOGGER.isDebugEnabled()) LOGGER.debug("Attempt HTTP {} from '" + uri + "'...",method);
HttpUriRequest request = null;
switch( method.toUpperCase() )
{
case "GET":
request = new HttpGet(uri);
break;
case "POST":
request = new HttpPost(uri);
break;
case "PUT":
request = new HttpPut(uri);
break;
default:
throw new RuntimeException("Http request method " + method + " not supported");
}
HttpResponse response = httpClient.execute(request);
// Check response status and, if valid wrap with InputStreamReader
StatusLine status = response.getStatusLine();
if (status.getStatusCode() != HttpStatus.SC_OK)
{
throw new Exception("Failed to get data from '" + uri + "': " + status.getReasonPhrase());
}
if (LOGGER.isDebugEnabled()) LOGGER.debug("Successfully issued request");
return RepeatStatus.FINISHED;
}
public String getUri()
{
return uri;
}
public void setUri(String uri)
{
this.uri = uri;
}
public String getMethod()
{
return method;
}
public void setMethod(String method)
{
this.method = method;
}
public HttpClient getHttpClient()
{
return httpClient;
}
public void setHttpClient(HttpClient httpClient)
{
this.httpClient = httpClient;
}
}
job create --name myJob --definition "MyJobModule " --deploy true
job create --name jobRestarter --definition "delayedRestartJob" --deploy true