001 /** 002 * Copyright (c) 2010 Yahoo! Inc. All rights reserved. 003 * Licensed under the Apache License, Version 2.0 (the "License"); 004 * you may not use this file except in compliance with the License. 005 * You may obtain a copy of the License at 006 * 007 * http://www.apache.org/licenses/LICENSE-2.0 008 * 009 * Unless required by applicable law or agreed to in writing, software 010 * distributed under the License is distributed on an "AS IS" BASIS, 011 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 012 * See the License for the specific language governing permissions and 013 * limitations under the License. See accompanying LICENSE file. 014 */ 015 package org.apache.oozie.command.wf; 016 017 import java.util.Date; 018 019 import javax.servlet.jsp.el.ELException; 020 021 import org.apache.hadoop.conf.Configuration; 022 import org.apache.oozie.ErrorCode; 023 import org.apache.oozie.FaultInjection; 024 import org.apache.oozie.WorkflowActionBean; 025 import org.apache.oozie.WorkflowJobBean; 026 import org.apache.oozie.XException; 027 import org.apache.oozie.action.ActionExecutor; 028 import org.apache.oozie.action.ActionExecutorException; 029 import org.apache.oozie.client.OozieClient; 030 import org.apache.oozie.client.WorkflowAction; 031 import org.apache.oozie.client.WorkflowJob; 032 import org.apache.oozie.client.SLAEvent.SlaAppType; 033 import org.apache.oozie.client.SLAEvent.Status; 034 import org.apache.oozie.command.CommandException; 035 import org.apache.oozie.command.coord.CoordActionUpdateCommand; 036 import org.apache.oozie.service.ActionService; 037 import org.apache.oozie.service.Services; 038 import org.apache.oozie.service.UUIDService; 039 import org.apache.oozie.store.StoreException; 040 import org.apache.oozie.store.WorkflowStore; 041 import org.apache.oozie.util.ELEvaluationException; 042 import org.apache.oozie.util.Instrumentation; 043 import org.apache.oozie.util.XLog; 044 import org.apache.oozie.util.XmlUtils; 045 import org.apache.oozie.util.db.SLADbOperations; 046 047 public class ActionStartCommand extends ActionCommand<Void> { 048 public static final String EL_ERROR = "EL_ERROR"; 049 public static final String EL_EVAL_ERROR = "EL_EVAL_ERROR"; 050 public static final String COULD_NOT_START = "COULD_NOT_START"; 051 public static final String START_DATA_MISSING = "START_DATA_MISSING"; 052 public static final String EXEC_DATA_MISSING = "EXEC_DATA_MISSING"; 053 054 private String id; 055 private String jobId; 056 057 public ActionStartCommand(String id, String type) { 058 super("action.start", type, 0); 059 this.id = id; 060 } 061 062 @Override 063 protected Void call(WorkflowStore store) throws StoreException, CommandException { 064 WorkflowJobBean workflow = store.getWorkflow(jobId, false); 065 setLogInfo(workflow); 066 WorkflowActionBean action = store.getAction(id, false); 067 XLog.getLog(getClass()).warn(XLog.STD, 068 "[***" + action.getId() + "***]" + "In call()....status=" + action.getStatusStr()); 069 setLogInfo(action); 070 if (action.isPending() 071 && (action.getStatus() == WorkflowActionBean.Status.PREP 072 || action.getStatus() == WorkflowActionBean.Status.START_RETRY || action.getStatus() == WorkflowActionBean.Status.START_MANUAL)) { 073 if (workflow.getStatus() == WorkflowJob.Status.RUNNING) { 074 075 ActionExecutor executor = Services.get().get(ActionService.class).getExecutor(action.getType()); 076 Configuration conf = workflow.getWorkflowInstance().getConf(); 077 078 int maxRetries = conf.getInt(OozieClient.ACTION_MAX_RETRIES, executor.getMaxRetries()); 079 long retryInterval = conf.getLong(OozieClient.ACTION_RETRY_INTERVAL, executor.getRetryInterval()); 080 executor.setMaxRetries(maxRetries); 081 executor.setRetryInterval(retryInterval); 082 083 if (executor != null) { 084 ActionExecutorContext context = null; 085 try { 086 boolean isRetry = false; 087 if (action.getStatus() == WorkflowActionBean.Status.START_RETRY 088 || action.getStatus() == WorkflowActionBean.Status.START_MANUAL) { 089 isRetry = true; 090 } 091 context = new ActionCommand.ActionExecutorContext(workflow, action, isRetry); 092 try { 093 String tmpActionConf = XmlUtils.removeComments(action.getConf()); 094 String actionConf = context.getELEvaluator().evaluate(tmpActionConf, String.class); 095 action.setConf(actionConf); 096 097 XLog.getLog(getClass()).debug("Start, name [{0}] type [{1}] configuration{E}{E}{2}{E}", 098 action.getName(), action.getType(), actionConf); 099 100 } 101 catch (ELEvaluationException ex) { 102 throw new ActionExecutorException(ActionExecutorException.ErrorType.TRANSIENT, 103 EL_EVAL_ERROR, ex.getMessage(), ex); 104 } 105 catch (ELException ex) { 106 context.setErrorInfo(EL_ERROR, ex.getMessage()); 107 XLog.getLog(getClass()).warn("ELException in ActionStartCommand ", ex.getMessage(), ex); 108 handleError(context, store, workflow, action); 109 return null; 110 } 111 catch (org.jdom.JDOMException je) { 112 context.setErrorInfo("ParsingError", je.getMessage()); 113 XLog.getLog(getClass()).warn("JDOMException in ActionStartCommand ", je.getMessage(), je); 114 handleError(context, store, workflow, action); 115 return null; 116 } 117 catch (Exception ex) { 118 context.setErrorInfo(EL_ERROR, ex.getMessage()); 119 XLog.getLog(getClass()).warn("Exception in ActionStartCommand ", ex.getMessage(), ex); 120 handleError(context, store, workflow, action); 121 return null; 122 } 123 action.setErrorInfo(null, null); 124 incrActionCounter(action.getType(), 1); 125 126 Instrumentation.Cron cron = new Instrumentation.Cron(); 127 cron.start(); 128 executor.start(context, action); 129 cron.stop(); 130 FaultInjection.activate("org.apache.oozie.command.SkipCommitFaultInjection"); 131 addActionCron(action.getType(), cron); 132 133 action.setRetries(0); 134 if (action.isExecutionComplete()) { 135 if (!context.isExecuted()) { 136 XLog.getLog(getClass()).warn(XLog.OPS, 137 "Action Completed, ActionExecutor [{0}] must call setExecutionData()", 138 executor.getType()); 139 action.setErrorInfo(EXEC_DATA_MISSING, 140 "Execution Complete, but Execution Data Missing from Action"); 141 failJob(context); 142 store.updateAction(action); 143 store.updateWorkflow(workflow); 144 return null; 145 } 146 action.setPending(); 147 queueCallable(new ActionEndCommand(action.getId(), action.getType())); 148 } 149 else { 150 if (!context.isStarted()) { 151 XLog.getLog(getClass()).warn(XLog.OPS, 152 "Action Started, ActionExecutor [{0}] must call setStartData()", 153 executor.getType()); 154 action.setErrorInfo(START_DATA_MISSING, 155 "Execution Started, but Start Data Missing from Action"); 156 failJob(context); 157 store.updateAction(action); 158 store.updateWorkflow(workflow); 159 return null; 160 } 161 queueCallable(new NotificationCommand(workflow, action)); 162 } 163 164 XLog.getLog(getClass()).warn(XLog.STD, 165 "[***" + action.getId() + "***]" + "Action status=" + action.getStatusStr()); 166 167 store.updateAction(action); 168 store.updateWorkflow(workflow); 169 // Add SLA status event (STARTED) for WF_ACTION 170 // SLADbOperations.writeSlaStatusEvent(eSla, 171 // action.getId(), Status.STARTED, store); 172 SLADbOperations.writeStausEvent(action.getSlaXml(), action.getId(), store, Status.STARTED, 173 SlaAppType.WORKFLOW_ACTION); 174 XLog.getLog(getClass()).warn(XLog.STD, 175 "[***" + action.getId() + "***]" + "Action updated in DB!"); 176 177 } 178 catch (ActionExecutorException ex) { 179 XLog.getLog(getClass()).warn( 180 "Error starting action [{0}]. ErrorType [{1}], ErrorCode [{2}], Message [{3}]", 181 action.getName(), ex.getErrorType(), ex.getErrorCode(), ex.getMessage(), ex); 182 action.setErrorInfo(ex.getErrorCode(), ex.getMessage()); 183 switch (ex.getErrorType()) { 184 case TRANSIENT: 185 if (!handleTransient(context, executor, WorkflowAction.Status.START_RETRY)) { 186 handleNonTransient(store, context, executor, WorkflowAction.Status.START_MANUAL); 187 action.setPendingAge(new Date()); 188 action.setRetries(0); 189 action.setStartTime(null); 190 } 191 break; 192 case NON_TRANSIENT: 193 handleNonTransient(store, context, executor, WorkflowAction.Status.START_MANUAL); 194 break; 195 case ERROR: 196 handleError(context, executor, WorkflowAction.Status.ERROR.toString(), true, 197 WorkflowAction.Status.DONE); 198 break; 199 case FAILED: 200 try { 201 failJob(context); 202 queueCallable(new CoordActionUpdateCommand(workflow)); 203 new WfEndCommand(jobId).call(); //To delete the WF temp dir 204 SLADbOperations.writeStausEvent(action.getSlaXml(), action.getId(), store, 205 Status.FAILED, SlaAppType.WORKFLOW_ACTION); 206 SLADbOperations.writeStausEvent(workflow.getSlaXml(), workflow.getId(), store, 207 Status.FAILED, SlaAppType.WORKFLOW_JOB); 208 } 209 catch (XException x) { 210 XLog.getLog(getClass()).warn("ActionStartCommand - case:FAILED ", x.getMessage()); 211 } 212 break; 213 } 214 store.updateAction(action); 215 store.updateWorkflow(workflow); 216 } 217 } 218 else { 219 throw new CommandException(ErrorCode.E0802, action.getType()); 220 } 221 222 } 223 else { 224 XLog.getLog(getClass()).warn("Job state is not {0}. Skipping Action Execution", 225 WorkflowJob.Status.RUNNING.toString()); 226 } 227 } 228 return null; 229 } 230 231 private void handleError(ActionExecutorContext context, WorkflowStore store, WorkflowJobBean workflow, 232 WorkflowActionBean action) throws CommandException, StoreException { 233 failJob(context); 234 store.updateAction(action); 235 store.updateWorkflow(workflow); 236 SLADbOperations.writeStausEvent(action.getSlaXml(), action.getId(), store, Status.FAILED, 237 SlaAppType.WORKFLOW_ACTION); 238 SLADbOperations.writeStausEvent(workflow.getSlaXml(), workflow.getId(), store, Status.FAILED, 239 SlaAppType.WORKFLOW_JOB); 240 queueCallable(new CoordActionUpdateCommand(workflow)); 241 new WfEndCommand(jobId).call(); //To delete the WF temp dir 242 return; 243 } 244 245 @Override 246 protected Void execute(WorkflowStore store) throws CommandException, StoreException { 247 try { 248 XLog.getLog(getClass()).debug("STARTED ActionStartCommand for wf actionId=" + id); 249 jobId = Services.get().get(UUIDService.class).getId(id); 250 if (lock(jobId)) { 251 call(store); 252 } 253 else { 254 queueCallable(new ActionStartCommand(id, type), LOCK_FAILURE_REQUEUE_INTERVAL); 255 XLog.getLog(getClass()).warn("ActionStartCommand lock was not acquired - failed {0}", id); 256 } 257 } 258 catch (InterruptedException e) { 259 queueCallable(new ActionStartCommand(id, type), LOCK_FAILURE_REQUEUE_INTERVAL); 260 XLog.getLog(getClass()).warn("ActionStartCommand lock was not acquired - interrupted exception failed {0}", 261 id); 262 } 263 XLog.getLog(getClass()).debug("ENDED ActionStartCommand for wf actionId=" + id + ", jobId=" + jobId); 264 return null; 265 } 266 267 }