001 /** 002 * Copyright (c) 2010 Yahoo! Inc. All rights reserved. 003 * Licensed under the Apache License, Version 2.0 (the "License"); 004 * you may not use this file except in compliance with the License. 005 * You may obtain a copy of the License at 006 * 007 * http://www.apache.org/licenses/LICENSE-2.0 008 * 009 * Unless required by applicable law or agreed to in writing, software 010 * distributed under the License is distributed on an "AS IS" BASIS, 011 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 012 * See the License for the specific language governing permissions and 013 * limitations under the License. See accompanying LICENSE file. 014 */ 015 package org.apache.oozie.command.wf; 016 017 import java.util.Date; 018 019 import javax.servlet.jsp.el.ELException; 020 021 import org.apache.hadoop.conf.Configuration; 022 import org.apache.oozie.ErrorCode; 023 import org.apache.oozie.FaultInjection; 024 import org.apache.oozie.WorkflowActionBean; 025 import org.apache.oozie.WorkflowJobBean; 026 import org.apache.oozie.XException; 027 import org.apache.oozie.action.ActionExecutor; 028 import org.apache.oozie.action.ActionExecutorException; 029 import org.apache.oozie.client.OozieClient; 030 import org.apache.oozie.client.WorkflowAction; 031 import org.apache.oozie.client.WorkflowJob; 032 import org.apache.oozie.client.SLAEvent.SlaAppType; 033 import org.apache.oozie.client.SLAEvent.Status; 034 import org.apache.oozie.command.CommandException; 035 import org.apache.oozie.command.coord.CoordActionUpdateCommand; 036 import org.apache.oozie.service.ActionService; 037 import org.apache.oozie.service.Services; 038 import org.apache.oozie.service.UUIDService; 039 import org.apache.oozie.store.StoreException; 040 import org.apache.oozie.store.WorkflowStore; 041 import org.apache.oozie.util.ELEvaluationException; 042 import org.apache.oozie.util.Instrumentation; 043 import org.apache.oozie.util.XLog; 044 import org.apache.oozie.util.XmlUtils; 045 import org.apache.oozie.util.db.SLADbOperations; 046 047 public class ActionStartCommand extends ActionCommand<Void> { 048 public static final String EL_ERROR = "EL_ERROR"; 049 public static final String EL_EVAL_ERROR = "EL_EVAL_ERROR"; 050 public static final String COULD_NOT_START = "COULD_NOT_START"; 051 public static final String START_DATA_MISSING = "START_DATA_MISSING"; 052 public static final String EXEC_DATA_MISSING = "EXEC_DATA_MISSING"; 053 054 private String id; 055 private String jobId; 056 057 public ActionStartCommand(String id, String type) { 058 super("action.start", type, 0); 059 this.id = id; 060 } 061 062 @Override 063 protected Void call(WorkflowStore store) throws StoreException, CommandException { 064 WorkflowJobBean workflow = store.getWorkflow(jobId, false); 065 setLogInfo(workflow); 066 WorkflowActionBean action = store.getAction(id, false); 067 XLog.getLog(getClass()).warn(XLog.STD, 068 "[***" + action.getId() + "***]" + "In call()....status=" + action.getStatusStr()); 069 setLogInfo(action); 070 if (action.isPending() 071 && (action.getStatus() == WorkflowActionBean.Status.PREP 072 || action.getStatus() == WorkflowActionBean.Status.START_RETRY || action.getStatus() == WorkflowActionBean.Status.START_MANUAL)) { 073 if (workflow.getStatus() == WorkflowJob.Status.RUNNING) { 074 075 ActionExecutor executor = Services.get().get(ActionService.class).getExecutor(action.getType()); 076 Configuration conf = workflow.getWorkflowInstance().getConf(); 077 078 int maxRetries = conf.getInt(OozieClient.ACTION_MAX_RETRIES, executor.getMaxRetries()); 079 long retryInterval = conf.getLong(OozieClient.ACTION_RETRY_INTERVAL, executor.getRetryInterval()); 080 executor.setMaxRetries(maxRetries); 081 executor.setRetryInterval(retryInterval); 082 083 if (executor != null) { 084 ActionExecutorContext context = null; 085 try { 086 boolean isRetry = false; 087 if (action.getStatus() == WorkflowActionBean.Status.START_RETRY 088 || action.getStatus() == WorkflowActionBean.Status.START_MANUAL) { 089 isRetry = true; 090 } 091 context = new ActionCommand.ActionExecutorContext(workflow, action, isRetry); 092 try { 093 String tmpActionConf = XmlUtils.removeComments(action.getConf()); 094 String actionConf = context.getELEvaluator().evaluate(tmpActionConf, String.class); 095 action.setConf(actionConf); 096 097 XLog.getLog(getClass()).debug("Start, name [{0}] type [{1}] configuration{E}{E}{2}{E}", 098 action.getName(), action.getType(), actionConf); 099 100 } 101 catch (ELEvaluationException ex) { 102 throw new ActionExecutorException(ActionExecutorException.ErrorType.TRANSIENT, 103 EL_EVAL_ERROR, ex.getMessage(), ex); 104 } 105 catch (ELException ex) { 106 context.setErrorInfo(EL_ERROR, ex.getMessage()); 107 XLog.getLog(getClass()).warn("ELException in ActionStartCommand ", ex.getMessage(), ex); 108 handleError(context, store, workflow, action); 109 return null; 110 } 111 catch (org.jdom.JDOMException je) { 112 context.setErrorInfo("ParsingError", je.getMessage()); 113 XLog.getLog(getClass()).warn("JDOMException in ActionStartCommand ", je.getMessage(), je); 114 handleError(context, store, workflow, action); 115 return null; 116 } 117 catch (Exception ex) { 118 context.setErrorInfo(EL_ERROR, ex.getMessage()); 119 XLog.getLog(getClass()).warn("Exception in ActionStartCommand ", ex.getMessage(), ex); 120 handleError(context, store, workflow, action); 121 return null; 122 } 123 action.setErrorInfo(null, null); 124 incrActionCounter(action.getType(), 1); 125 126 Instrumentation.Cron cron = new Instrumentation.Cron(); 127 cron.start(); 128 executor.start(context, action); 129 cron.stop(); 130 FaultInjection.activate("org.apache.oozie.command.SkipCommitFaultInjection"); 131 addActionCron(action.getType(), cron); 132 133 action.setRetries(0); 134 if (action.isExecutionComplete()) { 135 if (!context.isExecuted()) { 136 XLog.getLog(getClass()).warn(XLog.OPS, 137 "Action Completed, ActionExecutor [{0}] must call setExecutionData()", 138 executor.getType()); 139 action.setErrorInfo(EXEC_DATA_MISSING, 140 "Execution Complete, but Execution Data Missing from Action"); 141 failJob(context); 142 store.updateAction(action); 143 store.updateWorkflow(workflow); 144 return null; 145 } 146 action.setPending(); 147 queueCallable(new ActionEndCommand(action.getId(), action.getType())); 148 } 149 else { 150 if (!context.isStarted()) { 151 XLog.getLog(getClass()).warn(XLog.OPS, 152 "Action Started, ActionExecutor [{0}] must call setStartData()", 153 executor.getType()); 154 action.setErrorInfo(START_DATA_MISSING, 155 "Execution Started, but Start Data Missing from Action"); 156 failJob(context); 157 store.updateAction(action); 158 store.updateWorkflow(workflow); 159 return null; 160 } 161 queueCallable(new NotificationCommand(workflow, action)); 162 } 163 164 XLog.getLog(getClass()).warn(XLog.STD, 165 "[***" + action.getId() + "***]" + "Action status=" + action.getStatusStr()); 166 167 store.updateAction(action); 168 store.updateWorkflow(workflow); 169 // Add SLA status event (STARTED) for WF_ACTION 170 // SLADbOperations.writeSlaStatusEvent(eSla, 171 // action.getId(), Status.STARTED, store); 172 SLADbOperations.writeStausEvent(action.getSlaXml(), action.getId(), store, Status.STARTED, 173 SlaAppType.WORKFLOW_ACTION); 174 XLog.getLog(getClass()).warn(XLog.STD, 175 "[***" + action.getId() + "***]" + "Action updated in DB!"); 176 177 } 178 catch (ActionExecutorException ex) { 179 XLog.getLog(getClass()).warn( 180 "Error starting action [{0}]. ErrorType [{1}], ErrorCode [{2}], Message [{3}]", 181 action.getName(), ex.getErrorType(), ex.getErrorCode(), ex.getMessage(), ex); 182 action.setErrorInfo(ex.getErrorCode(), ex.getMessage()); 183 switch (ex.getErrorType()) { 184 case TRANSIENT: 185 if (!handleTransient(context, executor, WorkflowAction.Status.START_RETRY)) { 186 handleNonTransient(store, context, executor, WorkflowAction.Status.START_MANUAL); 187 action.setPendingAge(new Date()); 188 action.setRetries(0); 189 action.setStartTime(null); 190 } 191 break; 192 case NON_TRANSIENT: 193 handleNonTransient(store, context, executor, WorkflowAction.Status.START_MANUAL); 194 break; 195 case ERROR: 196 handleError(context, executor, WorkflowAction.Status.ERROR.toString(), true, 197 WorkflowAction.Status.DONE); 198 break; 199 case FAILED: 200 try { 201 failJob(context); 202 queueCallable(new CoordActionUpdateCommand(workflow)); 203 SLADbOperations.writeStausEvent(action.getSlaXml(), action.getId(), store, 204 Status.FAILED, SlaAppType.WORKFLOW_ACTION); 205 SLADbOperations.writeStausEvent(workflow.getSlaXml(), workflow.getId(), store, 206 Status.FAILED, SlaAppType.WORKFLOW_JOB); 207 } 208 catch (XException x) { 209 XLog.getLog(getClass()).warn("ActionStartCommand - case:FAILED ", x.getMessage()); 210 } 211 break; 212 } 213 store.updateAction(action); 214 store.updateWorkflow(workflow); 215 } 216 } 217 else { 218 throw new CommandException(ErrorCode.E0802, action.getType()); 219 } 220 221 } 222 else { 223 XLog.getLog(getClass()).warn("Job state is not {0}. Skipping Action Execution", 224 WorkflowJob.Status.RUNNING.toString()); 225 } 226 } 227 return null; 228 } 229 230 private void handleError(ActionExecutorContext context, WorkflowStore store, WorkflowJobBean workflow, 231 WorkflowActionBean action) throws CommandException, StoreException { 232 failJob(context); 233 store.updateAction(action); 234 store.updateWorkflow(workflow); 235 SLADbOperations.writeStausEvent(action.getSlaXml(), action.getId(), store, Status.FAILED, 236 SlaAppType.WORKFLOW_ACTION); 237 SLADbOperations.writeStausEvent(workflow.getSlaXml(), workflow.getId(), store, Status.FAILED, 238 SlaAppType.WORKFLOW_JOB); 239 queueCallable(new CoordActionUpdateCommand(workflow)); 240 return; 241 } 242 243 @Override 244 protected Void execute(WorkflowStore store) throws CommandException, StoreException { 245 try { 246 XLog.getLog(getClass()).debug("STARTED ActionStartCommand for wf actionId=" + id); 247 jobId = Services.get().get(UUIDService.class).getId(id); 248 if (lock(jobId)) { 249 call(store); 250 } 251 else { 252 queueCallable(new ActionStartCommand(id, type), LOCK_FAILURE_REQUEUE_INTERVAL); 253 XLog.getLog(getClass()).warn("ActionStartCommand lock was not acquired - failed {0}", id); 254 } 255 } 256 catch (InterruptedException e) { 257 queueCallable(new ActionStartCommand(id, type), LOCK_FAILURE_REQUEUE_INTERVAL); 258 XLog.getLog(getClass()).warn("ActionStartCommand lock was not acquired - interrupted exception failed {0}", 259 id); 260 } 261 XLog.getLog(getClass()).debug("ENDED ActionStartCommand for wf actionId=" + id + ", jobId=" + jobId); 262 return null; 263 } 264 265 }