001 /** 002 * Copyright (c) 2010 Yahoo! Inc. All rights reserved. 003 * Licensed under the Apache License, Version 2.0 (the "License"); 004 * you may not use this file except in compliance with the License. 005 * You may obtain a copy of the License at 006 * 007 * http://www.apache.org/licenses/LICENSE-2.0 008 * 009 * Unless required by applicable law or agreed to in writing, software 010 * distributed under the License is distributed on an "AS IS" BASIS, 011 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 012 * See the License for the specific language governing permissions and 013 * limitations under the License. See accompanying LICENSE file. 014 */ 015 package org.apache.oozie.command.wf; 016 017 import java.util.HashSet; 018 import java.util.Set; 019 import java.io.IOException; 020 import java.util.Collection; 021 import java.util.HashMap; 022 import java.util.List; 023 import java.util.Map; 024 025 import org.apache.hadoop.conf.Configuration; 026 import org.apache.hadoop.fs.FileSystem; 027 import org.apache.hadoop.fs.Path; 028 import org.apache.oozie.client.WorkflowAction; 029 import org.apache.oozie.client.WorkflowJob; 030 import org.apache.oozie.client.OozieClient; 031 import org.apache.oozie.WorkflowActionBean; 032 import org.apache.oozie.WorkflowJobBean; 033 import org.apache.oozie.ErrorCode; 034 import org.apache.oozie.service.HadoopAccessorException; 035 import org.apache.oozie.service.WorkflowAppService; 036 import org.apache.oozie.service.Services; 037 import org.apache.oozie.service.DagXLogInfoService; 038 import org.apache.oozie.service.WorkflowStoreService; 039 import org.apache.oozie.service.HadoopAccessorService; 040 import org.apache.oozie.util.ParamChecker; 041 import org.apache.oozie.util.PropertiesUtils; 042 import org.apache.oozie.util.XLog; 043 import org.apache.oozie.util.XConfiguration; 044 import org.apache.oozie.util.XmlUtils; 045 import org.apache.oozie.command.Command; 046 import 
org.apache.oozie.command.CommandException;
import org.apache.oozie.store.StoreException;
import org.apache.oozie.store.WorkflowStore;
import org.apache.oozie.workflow.WorkflowApp;
import org.apache.oozie.workflow.WorkflowException;
import org.apache.oozie.workflow.WorkflowInstance;
import org.apache.oozie.workflow.WorkflowLib;
import org.apache.oozie.workflow.lite.NodeHandler;

/**
 * Command that re-runs a terminated (SUCCEEDED, FAILED or KILLED) workflow job,
 * optionally skipping nodes that completed successfully in the previous run.
 * Skipped nodes keep their workflow-instance variables from the old run and are
 * flagged with {@link #TO_SKIP} so downstream signal handling bypasses them.
 */
public class ReRunCommand extends WorkflowCommand<Void> {

    private String jobId;
    private Configuration conf;
    private String authToken;
    // Names of workflow nodes that must NOT be re-executed (parsed from OozieClient.RERUN_SKIP_NODES).
    private Set<String> nodesToSkip = new HashSet<String>();
    // Instance-variable suffix marking a node as skipped; read by SignalCommand / LiteNodeHandler.
    public static final String TO_SKIP = "TO_SKIP";

    private static final Set<String> DISALLOWED_DEFAULT_PROPERTIES = new HashSet<String>();
    private static final Set<String> DISALLOWED_USER_PROPERTIES = new HashSet<String>();

    static {
        String[] badUserProps = {PropertiesUtils.DAYS, PropertiesUtils.HOURS, PropertiesUtils.MINUTES,
                PropertiesUtils.KB, PropertiesUtils.MB, PropertiesUtils.GB, PropertiesUtils.TB, PropertiesUtils.PB,
                PropertiesUtils.RECORDS, PropertiesUtils.MAP_IN, PropertiesUtils.MAP_OUT, PropertiesUtils.REDUCE_IN,
                PropertiesUtils.REDUCE_OUT, PropertiesUtils.GROUPS};
        PropertiesUtils.createPropertySet(badUserProps, DISALLOWED_USER_PROPERTIES);

        String[] badDefaultProps = {PropertiesUtils.HADOOP_USER, PropertiesUtils.HADOOP_UGI,
                WorkflowAppService.HADOOP_JT_KERBEROS_NAME, WorkflowAppService.HADOOP_NN_KERBEROS_NAME};
        // Intentional: the default-properties blacklist is a superset of the user one.
        PropertiesUtils.createPropertySet(badUserProps, DISALLOWED_DEFAULT_PROPERTIES);
        PropertiesUtils.createPropertySet(badDefaultProps, DISALLOWED_DEFAULT_PROPERTIES);
    }

    /**
     * Creates a re-run command for the given job.
     *
     * @param jobId workflow job id to re-run, must not be empty
     * @param conf re-run configuration (may carry {@code OozieClient.RERUN_SKIP_NODES}), must not be null
     * @param authToken authorization token, must not be empty
     */
    public ReRunCommand(String jobId, Configuration conf, String authToken) {
        super("rerun", "rerun", 1, XLog.STD);
        this.jobId = ParamChecker.notEmpty(jobId, "jobId");
        this.conf = ParamChecker.notNull(conf, "conf");
        this.authToken = ParamChecker.notEmpty(authToken, "authToken");
    }

    /**
     * Checks the pre-conditions that are required for workflow to recover - Last run of Workflow should be completed -
     * The nodes that are to be skipped are to be completed successfully in the base run.
     *
     * @param wfBean Workflow bean
     * @param actions List of actions of Workflow
     * @throws org.apache.oozie.command.CommandException On failure of pre-conditions
     */
    private void checkPreConditions(WorkflowJobBean wfBean, List<WorkflowActionBean> actions) throws CommandException {
        // Only a finished job (failed, killed or succeeded) may be re-run.
        if (!(wfBean.getStatus().equals(WorkflowJob.Status.FAILED)
                || wfBean.getStatus().equals(WorkflowJob.Status.KILLED) || wfBean.getStatus().equals(
                WorkflowJob.Status.SUCCEEDED))) {
            throw new CommandException(ErrorCode.E0805, wfBean.getStatus());
        }
        // Every node requested for skipping must exist in the previous run and have
        // finished in OK or ERROR state.
        Set<String> unmatchedNodes = new HashSet<String>(nodesToSkip);
        for (WorkflowActionBean action : actions) {
            if (nodesToSkip.contains(action.getName())) {
                if (!action.getStatus().equals(WorkflowAction.Status.OK)
                        && !action.getStatus().equals(WorkflowAction.Status.ERROR)) {
                    throw new CommandException(ErrorCode.E0806, action.getName());
                }
                unmatchedNodes.remove(action.getName());
            }
        }
        if (unmatchedNodes.size() > 0) {
            StringBuilder sb = new StringBuilder();
            String separator = "";
            for (String s : unmatchedNodes) {
                sb.append(separator).append(s);
                separator = ",";
            }
            throw new CommandException(ErrorCode.E0807, sb);
        }
    }

    /**
     * Parses the config and adds the nodes that are to be skipped to the skipped node list.
     */
    private void parseSkippedNodeConf() {
        if (conf != null) {
            Collection<String> skipNodes = conf.getStringCollection(OozieClient.RERUN_SKIP_NODES);
            for (String str : skipNodes) {
                // trimming is required
                nodesToSkip.add(str.trim());
            }
        }
    }

    /**
     * Performs the re-run: validates pre-conditions, re-parses the workflow definition,
     * creates a fresh workflow instance, deletes the actions that will re-execute,
     * copies variables for skipped nodes, and resets the job bean for a new run.
     *
     * @param store workflow store
     * @return always null
     * @throws StoreException on store access/update failures (including instance creation errors)
     * @throws CommandException on failed pre-conditions or definition/configuration errors
     */
    protected Void call(WorkflowStore store) throws StoreException, CommandException {
        incrJobCounter(1);
        WorkflowJobBean wfBean = store.getWorkflow(jobId, false);
        setLogInfo(wfBean);
        List<WorkflowActionBean> actions = store.getActionsForWorkflow(jobId, false);
        WorkflowInstance oldWfInstance = wfBean.getWorkflowInstance();
        WorkflowInstance newWfInstance;
        XLog log = XLog.getLog(getClass());
        parseSkippedNodeConf();
        checkPreConditions(wfBean, actions);

        WorkflowAppService wps = Services.get().get(WorkflowAppService.class);
        try {
            XLog.Info.get().setParameter(DagXLogInfoService.TOKEN, conf.get(OozieClient.LOG_TOKEN));
            WorkflowApp app = wps.parseDef(conf, authToken);
            XConfiguration protoActionConf = wps.createProtoActionConf(conf, authToken, true);
            WorkflowLib workflowLib = Services.get().get(WorkflowStoreService.class).getWorkflowLibWithNoDB();

            // Merge the app's config-default.xml (if present) into the re-run configuration,
            // rejecting disallowed properties first.
            Path configDefault = new Path(new Path(conf.get(OozieClient.APP_PATH)).getParent(),
                    SubmitCommand.CONFIG_DEFAULT);
            FileSystem fs = Services.get().get(HadoopAccessorService.class).
                    createFileSystem(wfBean.getUser(), wfBean.getGroup(), configDefault.toUri(), protoActionConf);

            if (fs.exists(configDefault)) {
                Configuration defaultConf = new XConfiguration(fs.open(configDefault));
                PropertiesUtils.checkDisallowedProperties(defaultConf, DISALLOWED_DEFAULT_PROPERTIES);
                XConfiguration.injectDefaults(defaultConf, conf);
            }

            PropertiesUtils.checkDisallowedProperties(conf, DISALLOWED_USER_PROPERTIES);

            try {
                newWfInstance = workflowLib.createInstance(app, conf, jobId);
            }
            catch (WorkflowException e) {
                throw new StoreException(e);
            }
            wfBean.setAppName(app.getName());
            wfBean.setProtoActionConf(protoActionConf.toXmlString());
        }
        catch (WorkflowException ex) {
            throw new CommandException(ex);
        }
        catch (IOException ex) {
            throw new CommandException(ErrorCode.E0803, ex);
        }
        catch (HadoopAccessorException e) {
            throw new CommandException(e);
        }

        // Delete the actions that will re-execute; remember whether any node is skipped.
        boolean hasSkippedAction = false;
        for (WorkflowActionBean action : actions) {
            if (!nodesToSkip.contains(action.getName())) {
                store.deleteAction(action.getId());
                log.info("Deleting Action[{0}] for re-run", action.getId());
            }
            else {
                hasSkippedAction = true;
            }
        }
        // Fix: previously copyActionData() was invoked once per skipped action inside the
        // loop, recomputing and re-setting the full variable map each time; one call with
        // the same inputs produces the identical final state.
        if (hasSkippedAction) {
            copyActionData(newWfInstance, oldWfInstance);
        }

        // Reset the job bean for the new run.
        wfBean.setAppPath(conf.get(OozieClient.APP_PATH));
        wfBean.setConf(XmlUtils.prettyPrint(conf).toString());
        wfBean.setLogToken(conf.get(OozieClient.LOG_TOKEN, ""));
        wfBean.setUser(conf.get(OozieClient.USER_NAME));
        wfBean.setGroup(conf.get(OozieClient.GROUP_NAME));
        wfBean.setExternalId(conf.get(OozieClient.EXTERNAL_ID));
        wfBean.setEndTime(null);
        wfBean.setRun(wfBean.getRun() + 1);
        wfBean.setStatus(WorkflowJob.Status.PREP);
        wfBean.setWorkflowInstance(newWfInstance);
        store.updateWorkflow(wfBean);
        return null;
    }

    /**
     * Copies the variables for skipped nodes from the old wfInstance to the new one and
     * flags each skipped node with {@link #TO_SKIP}.
     *
     * @param newWfInstance instance for the new run, receives the copied variables
     * @param oldWfInstance instance of the previous run, source of the variables
     */
    private void copyActionData(WorkflowInstance newWfInstance, WorkflowInstance oldWfInstance) {
        Map<String, String> newVars = new HashMap<String, String>();
        // Fix: oldVars no longer pre-initialized with a throwaway HashMap that was
        // immediately overwritten.
        Map<String, String> oldVars = oldWfInstance.getAllVars();
        for (String var : oldVars.keySet()) {
            // Variable names are "<nodeName><sep><varName>"; keep only skipped nodes' vars.
            String actionName = var.split(WorkflowInstance.NODE_VAR_SEPARATOR)[0];
            if (nodesToSkip.contains(actionName)) {
                newVars.put(var, oldVars.get(var));
            }
        }
        for (String node : nodesToSkip) {
            // Setting the TO_SKIP variable to true. This will be used by
            // SignalCommand and LiteNodeHandler to skip the action.
            newVars.put(node + WorkflowInstance.NODE_VAR_SEPARATOR + TO_SKIP, "true");
            String visitedFlag = NodeHandler.getLoopFlag(node);
            // Removing the visited flag so that the action won't be considered
            // a loop.
            if (newVars.containsKey(visitedFlag)) {
                newVars.remove(visitedFlag);
            }
        }
        newWfInstance.setAllVars(newVars);
    }

    /**
     * Acquires the job lock and runs the re-run; requeues this command if the lock
     * cannot be acquired or acquisition is interrupted.
     * NOTE(review): the InterruptedException handler requeues without re-interrupting
     * the thread; this follows the surrounding command framework's convention.
     *
     * @param store workflow store
     * @return always null
     */
    @Override
    protected Void execute(WorkflowStore store) throws CommandException, StoreException {
        try {
            XLog.getLog(getClass()).debug("STARTED ReRunCommand for job " + jobId);
            if (lock(jobId)) {
                call(store);
            }
            else {
                queueCallable(new ReRunCommand(jobId, conf, authToken), LOCK_FAILURE_REQUEUE_INTERVAL);
                XLog.getLog(getClass()).warn("ReRunCommand lock was not acquired - failed {0}", jobId);
            }
        }
        catch (InterruptedException e) {
            queueCallable(new ReRunCommand(jobId, conf, authToken), LOCK_FAILURE_REQUEUE_INTERVAL);
            XLog.getLog(getClass())
                    .warn("ReRunCommand lock was not acquired - interrupted exception failed {0}", jobId);
        }
        XLog.getLog(getClass()).debug("ENDED ReRunCommand for job " + jobId);
        return null;
    }
}