001 /** 002 * Copyright (c) 2010 Yahoo! Inc. All rights reserved. 003 * Licensed under the Apache License, Version 2.0 (the "License"); 004 * you may not use this file except in compliance with the License. 005 * You may obtain a copy of the License at 006 * 007 * http://www.apache.org/licenses/LICENSE-2.0 008 * 009 * Unless required by applicable law or agreed to in writing, software 010 * distributed under the License is distributed on an "AS IS" BASIS, 011 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 012 * See the License for the specific language governing permissions and 013 * limitations under the License. See accompanying LICENSE file. 014 */ 015 package org.apache.oozie.command.wf; 016 017 import java.io.IOException; 018 import java.io.StringReader; 019 import java.net.URI; 020 import java.net.URISyntaxException; 021 import java.util.Date; 022 import java.util.Properties; 023 024 import org.apache.hadoop.conf.Configuration; 025 import org.apache.hadoop.fs.FileSystem; 026 import org.apache.hadoop.fs.Path; 027 import org.apache.oozie.DagELFunctions; 028 import org.apache.oozie.WorkflowActionBean; 029 import org.apache.oozie.WorkflowJobBean; 030 import org.apache.oozie.action.ActionExecutor; 031 import org.apache.oozie.client.WorkflowAction; 032 import org.apache.oozie.client.WorkflowJob; 033 import org.apache.oozie.command.CommandException; 034 import org.apache.oozie.service.CallbackService; 035 import org.apache.oozie.service.ELService; 036 import org.apache.oozie.service.HadoopAccessorException; 037 import org.apache.oozie.service.HadoopAccessorService; 038 import org.apache.oozie.service.Services; 039 import org.apache.oozie.store.StoreException; 040 import org.apache.oozie.store.WorkflowStore; 041 import org.apache.oozie.util.ELEvaluator; 042 import org.apache.oozie.util.Instrumentation; 043 import org.apache.oozie.util.XConfiguration; 044 import org.apache.oozie.util.XLog; 045 import org.apache.oozie.workflow.WorkflowException; 046 import org.apache.oozie.workflow.WorkflowInstance; 047 import org.apache.oozie.workflow.lite.LiteWorkflowInstance; 048 049 /** 050 * Base class for Action execution commands. Provides common functionality to handle different types of errors while 051 * attempting to start or end an action. 052 */ 053 public abstract class ActionCommand<T> extends WorkflowCommand<Void> { 054 private static final String INSTRUMENTATION_GROUP = "action.executors"; 055 056 protected static final String INSTR_FAILED_JOBS_COUNTER = "failed"; 057 058 protected static final String RECOVERY_ID_SEPARATOR = "@"; 059 060 public ActionCommand(String name, String type, int priority) { 061 super(name, type, priority, XLog.STD); 062 } 063 064 /** 065 * Takes care of Transient failures. Sets the action status to retry and increments the retry count if not enough 066 * attempts have been made. Otherwise returns false. 067 * 068 * @param context the execution context. 069 * @param executor the executor instance being used. 070 * @param status the status to be set for the action. 071 * @return true if the action is scheduled for another retry. false if the number of retries has exceeded the 072 * maximum number of configured retries. 073 * @throws StoreException 074 * @throws org.apache.oozie.command.CommandException 075 */ 076 protected boolean handleTransient(ActionExecutor.Context context, ActionExecutor executor, WorkflowAction.Status status) 077 throws StoreException, CommandException { 078 XLog.getLog(getClass()).debug("Attempting to retry"); 079 ActionExecutorContext aContext = (ActionExecutorContext) context; 080 WorkflowActionBean action = (WorkflowActionBean) aContext.getAction(); 081 incrActionErrorCounter(action.getType(), "transient", 1); 082 083 int actionRetryCount = action.getRetries(); 084 if (actionRetryCount >= executor.getMaxRetries()) { 085 XLog.getLog(getClass()).warn("Exceeded max retry count [{0}]. Suspending Job", executor.getMaxRetries()); 086 return false; 087 } 088 else { 089 action.setStatus(status); 090 action.setPending(); 091 action.incRetries(); 092 long retryDelayMillis = executor.getRetryInterval() * 1000; 093 action.setPendingAge(new Date(System.currentTimeMillis() + retryDelayMillis)); 094 XLog.getLog(getClass()).info("Next Retry, Attempt Number [{0}] in [{1}] milliseconds", 095 actionRetryCount + 1, retryDelayMillis); 096 queueCallable(this, retryDelayMillis); 097 return true; 098 } 099 } 100 101 /** 102 * Takes care of non transient failures. The job is suspended, and the state of the action is changed to *MANUAL 103 * and set pending flag of action to false 104 * 105 * @param store WorkflowStore 106 * @param context the execution context. 107 * @param executor the executor instance being used. 108 * @param status the status to be set for the action. 109 * @throws StoreException 110 * @throws CommandException 111 */ 112 protected void handleNonTransient(WorkflowStore store, ActionExecutor.Context context, ActionExecutor executor, 113 WorkflowAction.Status status) 114 throws StoreException, CommandException { 115 ActionExecutorContext aContext = (ActionExecutorContext) context; 116 WorkflowActionBean action = (WorkflowActionBean) aContext.getAction(); 117 incrActionErrorCounter(action.getType(), "nontransient", 1); 118 WorkflowJobBean workflow = (WorkflowJobBean) context.getWorkflow(); 119 String id = workflow.getId(); 120 action.setStatus(status); 121 action.resetPendingOnly(); 122 XLog.getLog(getClass()).warn("Suspending Workflow Job id=" + id); 123 try { 124 SuspendCommand.suspendJob(store, workflow, id, action.getId()); 125 } 126 catch (WorkflowException e) { 127 throw new CommandException(e); 128 } 129 } 130 131 /** 132 * Takes care of errors. </p> For errors while attempting to start the action, the job state is updated and an 133 * {@link ActionEndCommand} is queued. </p> For errors while attempting to end the action, the job state is updated. 134 * </p> 135 * 136 * @param context the execution context. 137 * @param executor the executor instance being used. 138 * @param message 139 * @param isStart whether the error was generated while starting or ending an action. 140 * @param status the status to be set for the action. 141 * @throws org.apache.oozie.command.CommandException 142 */ 143 protected void handleError(ActionExecutor.Context context, ActionExecutor executor, String message, 144 boolean isStart, WorkflowAction.Status status) throws CommandException { 145 XLog.getLog(getClass()).warn("Setting Action Status to [{0}]", status); 146 ActionExecutorContext aContext = (ActionExecutorContext) context; 147 WorkflowActionBean action = (WorkflowActionBean) aContext.getAction(); 148 incrActionErrorCounter(action.getType(), "error", 1); 149 action.setPending(); 150 if (isStart) { 151 action.setExecutionData(message, null); 152 queueCallable(new ActionEndCommand(action.getId(), action.getType())); 153 } 154 else { 155 action.setEndData(status, WorkflowAction.Status.ERROR.toString()); 156 } 157 } 158 159 public void failJob(ActionExecutor.Context context) throws CommandException { 160 ActionExecutorContext aContext = (ActionExecutorContext) context; 161 WorkflowActionBean action = (WorkflowActionBean) aContext.getAction(); 162 incrActionErrorCounter(action.getType(), "failed", 1); 163 WorkflowJobBean workflow = (WorkflowJobBean) context.getWorkflow(); 164 XLog.getLog(getClass()).warn("Failing Job due to failed action [{0}]", action.getName()); 165 try { 166 workflow.getWorkflowInstance().fail(action.getName()); 167 WorkflowInstance wfInstance = workflow.getWorkflowInstance(); 168 ((LiteWorkflowInstance) wfInstance).setStatus(WorkflowInstance.Status.FAILED); 169 workflow.setWorkflowInstance(wfInstance); 170 workflow.setStatus(WorkflowJob.Status.FAILED); 171 action.setStatus(WorkflowAction.Status.FAILED); 172 action.resetPending(); 173 queueCallable(new NotificationCommand(workflow, action)); 174 queueCallable(new KillCommand(workflow.getId())); 175 incrJobCounter(INSTR_FAILED_JOBS_COUNTER, 1); 176 } 177 catch (WorkflowException ex) { 178 throw new CommandException(ex); 179 } 180 } 181 182 private void incrActionErrorCounter(String type, String error, int count) { 183 getInstrumentation().incr(INSTRUMENTATION_GROUP, type + "#ex." + error, count); 184 } 185 186 protected void incrActionCounter(String type, int count) { 187 getInstrumentation().incr(INSTRUMENTATION_GROUP, type + "#" + getName(), count); 188 } 189 190 protected void addActionCron(String type, Instrumentation.Cron cron) { 191 getInstrumentation().addCron(INSTRUMENTATION_GROUP, type + "#" + getName(), cron); 192 } 193 194 public static class ActionExecutorContext implements ActionExecutor.Context { 195 private WorkflowJobBean workflow; 196 private Configuration protoConf; 197 private WorkflowActionBean action; 198 private boolean isRetry; 199 private boolean started; 200 private boolean ended; 201 private boolean executed; 202 203 public ActionExecutorContext(WorkflowJobBean workflow, WorkflowActionBean action, boolean isRetry) { 204 this.workflow = workflow; 205 this.action = action; 206 this.isRetry = isRetry; 207 try { 208 protoConf = new XConfiguration(new StringReader(workflow.getProtoActionConf())); 209 } 210 catch (IOException ex) { 211 throw new RuntimeException("It should not happen", ex); 212 } 213 } 214 215 public String getCallbackUrl(String externalStatusVar) { 216 return Services.get().get(CallbackService.class).createCallBackUrl(action.getId(), externalStatusVar); 217 } 218 219 public Configuration getProtoActionConf() { 220 return protoConf; 221 } 222 223 public WorkflowJob getWorkflow() { 224 return workflow; 225 } 226 227 public WorkflowAction getAction() { 228 return action; 229 } 230 231 public ELEvaluator getELEvaluator() { 232 ELEvaluator evaluator = Services.get().get(ELService.class).createEvaluator("workflow"); 233 DagELFunctions.configureEvaluator(evaluator, workflow, action); 234 return evaluator; 235 } 236 237 public void setVar(String name, String value) { 238 name = action.getName() + WorkflowInstance.NODE_VAR_SEPARATOR + name; 239 WorkflowInstance wfInstance = workflow.getWorkflowInstance(); 240 wfInstance.setVar(name, value); 241 //workflow.getWorkflowInstance().setVar(name, value); 242 workflow.setWorkflowInstance(wfInstance); 243 } 244 245 public String getVar(String name) { 246 name = action.getName() + WorkflowInstance.NODE_VAR_SEPARATOR + name; 247 return workflow.getWorkflowInstance().getVar(name); 248 } 249 250 public void setStartData(String externalId, String trackerUri, String consoleUrl) { 251 action.setStartData(externalId, trackerUri, consoleUrl); 252 started = true; 253 } 254 255 public void setExecutionData(String externalStatus, Properties actionData) { 256 action.setExecutionData(externalStatus, actionData); 257 executed = true; 258 } 259 260 public void setEndData(WorkflowAction.Status status, String signalValue) { 261 action.setEndData(status, signalValue); 262 ended = true; 263 } 264 265 public boolean isRetry() { 266 return isRetry; 267 } 268 269 /** 270 * Returns whether setStartData has been called or not. 271 * 272 * @return true if start completion info has been set. 273 */ 274 public boolean isStarted() { 275 return started; 276 } 277 278 /** 279 * Returns whether setExecutionData has been called or not. 280 * 281 * @return true if execution completion info has been set, otherwise false. 282 */ 283 public boolean isExecuted() { 284 return executed; 285 } 286 287 288 /** 289 * Returns whether setEndData has been called or not. 290 * 291 * @return true if end completion info has been set. 292 */ 293 public boolean isEnded() { 294 return ended; 295 } 296 297 public void setExternalStatus(String externalStatus) { 298 action.setExternalStatus(externalStatus); 299 } 300 301 @Override 302 public String getRecoveryId() { 303 return action.getId() + RECOVERY_ID_SEPARATOR + workflow.getRun(); 304 } 305 306 /* (non-Javadoc) 307 * @see org.apache.oozie.action.ActionExecutor.Context#getActionDir() 308 */ 309 public Path getActionDir() throws HadoopAccessorException, IOException, URISyntaxException { 310 String name = getWorkflow().getId() + "/" + action.getName() + "--" + action.getType(); 311 FileSystem fs = getAppFileSystem(); 312 String actionDirPath = Services.get().getSystemId() + "/" + name; 313 Path fqActionDir = new Path(fs.getHomeDirectory(), actionDirPath); 314 return fqActionDir; 315 } 316 317 /* (non-Javadoc) 318 * @see org.apache.oozie.action.ActionExecutor.Context#getAppFileSystem() 319 */ 320 public FileSystem getAppFileSystem() throws HadoopAccessorException, IOException, URISyntaxException { 321 WorkflowJob workflow = getWorkflow(); 322 XConfiguration jobConf = new XConfiguration(new StringReader(workflow.getConf())); 323 Configuration fsConf = new Configuration(); 324 XConfiguration.copy(jobConf, fsConf); 325 return Services.get().get(HadoopAccessorService.class).createFileSystem(workflow.getUser(), 326 workflow.getGroup(), new URI(getWorkflow().getAppPath()), fsConf); 327 328 } 329 330 @Override 331 public void setErrorInfo(String str, String exMsg) { 332 action.setErrorInfo(str, exMsg); 333 } 334 } 335 336 }