001    /**
002     * Copyright (c) 2010 Yahoo! Inc. All rights reserved.
003     * Licensed under the Apache License, Version 2.0 (the "License");
004     * you may not use this file except in compliance with the License.
005     * You may obtain a copy of the License at
006     *
007     *   http://www.apache.org/licenses/LICENSE-2.0
008     *
009     *  Unless required by applicable law or agreed to in writing, software
010     *  distributed under the License is distributed on an "AS IS" BASIS,
011     *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
012     *  See the License for the specific language governing permissions and
013     *  limitations under the License. See accompanying LICENSE file.
014     */
015    package org.apache.oozie.command.wf;
016    
017    import java.util.HashSet;
018    import java.util.Set;
019    import java.io.IOException;
020    import java.util.Collection;
021    import java.util.HashMap;
022    import java.util.List;
023    import java.util.Map;
024    
025    import org.apache.hadoop.conf.Configuration;
026    import org.apache.hadoop.fs.FileSystem;
027    import org.apache.hadoop.fs.Path;
028    import org.apache.oozie.client.WorkflowAction;
029    import org.apache.oozie.client.WorkflowJob;
030    import org.apache.oozie.client.OozieClient;
031    import org.apache.oozie.WorkflowActionBean;
032    import org.apache.oozie.WorkflowJobBean;
033    import org.apache.oozie.ErrorCode;
034    import org.apache.oozie.service.HadoopAccessorException;
035    import org.apache.oozie.service.WorkflowAppService;
036    import org.apache.oozie.service.Services;
037    import org.apache.oozie.service.DagXLogInfoService;
038    import org.apache.oozie.service.WorkflowStoreService;
039    import org.apache.oozie.service.HadoopAccessorService;
040    import org.apache.oozie.util.ParamChecker;
041    import org.apache.oozie.util.PropertiesUtils;
042    import org.apache.oozie.util.XLog;
043    import org.apache.oozie.util.XConfiguration;
044    import org.apache.oozie.util.XmlUtils;
045    import org.apache.oozie.command.Command;
046    import org.apache.oozie.command.CommandException;
047    import org.apache.oozie.store.StoreException;
048    import org.apache.oozie.store.WorkflowStore;
049    import org.apache.oozie.workflow.WorkflowApp;
050    import org.apache.oozie.workflow.WorkflowException;
051    import org.apache.oozie.workflow.WorkflowInstance;
052    import org.apache.oozie.workflow.WorkflowLib;
053    import org.apache.oozie.workflow.lite.NodeHandler;
054    
055    public class ReRunCommand extends WorkflowCommand<Void> {
056    
057        private String jobId;
058        private Configuration conf;
059        private String authToken;
060        private Set<String> nodesToSkip = new HashSet<String>();
061        public static final String TO_SKIP = "TO_SKIP";
062    
063        private static final Set<String> DISALLOWED_DEFAULT_PROPERTIES = new HashSet<String>();
064        private static final Set<String> DISALLOWED_USER_PROPERTIES = new HashSet<String>();
065    
066        static {
067            String[] badUserProps = {PropertiesUtils.DAYS, PropertiesUtils.HOURS, PropertiesUtils.MINUTES,
068                    PropertiesUtils.KB, PropertiesUtils.MB, PropertiesUtils.GB, PropertiesUtils.TB, PropertiesUtils.PB,
069                    PropertiesUtils.RECORDS, PropertiesUtils.MAP_IN, PropertiesUtils.MAP_OUT, PropertiesUtils.REDUCE_IN,
070                    PropertiesUtils.REDUCE_OUT, PropertiesUtils.GROUPS};
071            PropertiesUtils.createPropertySet(badUserProps, DISALLOWED_USER_PROPERTIES);
072    
073            String[] badDefaultProps = {PropertiesUtils.HADOOP_USER, PropertiesUtils.HADOOP_UGI,
074                    WorkflowAppService.HADOOP_JT_KERBEROS_NAME, WorkflowAppService.HADOOP_NN_KERBEROS_NAME};
075            PropertiesUtils.createPropertySet(badUserProps, DISALLOWED_DEFAULT_PROPERTIES);
076            PropertiesUtils.createPropertySet(badDefaultProps, DISALLOWED_DEFAULT_PROPERTIES);
077        }
078    
079        public ReRunCommand(String jobId, Configuration conf, String authToken) {
080            super("rerun", "rerun", 1, XLog.STD);
081            this.jobId = ParamChecker.notEmpty(jobId, "jobId");
082            this.conf = ParamChecker.notNull(conf, "conf");
083            this.authToken = ParamChecker.notEmpty(authToken, "authToken");
084        }
085    
086        /**
087         * Checks the pre-conditions that are required for workflow to recover - Last run of Workflow should be completed -
088         * The nodes that are to be skipped are to be completed successfully in the base run.
089         *
090         * @param wfBean Workflow bean
091         * @param actions List of actions of Workflow
092         * @throws org.apache.oozie.command.CommandException On failure of pre-conditions
093         */
094        private void checkPreConditions(WorkflowJobBean wfBean, List<WorkflowActionBean> actions) throws CommandException {
095            if (!(wfBean.getStatus().equals(WorkflowJob.Status.FAILED)
096                    || wfBean.getStatus().equals(WorkflowJob.Status.KILLED) || wfBean.getStatus().equals(
097                    WorkflowJob.Status.SUCCEEDED))) {
098                throw new CommandException(ErrorCode.E0805, wfBean.getStatus());
099            }
100            Set<String> unmachedNodes = new HashSet<String>(nodesToSkip);
101            for (WorkflowActionBean action : actions) {
102                if (nodesToSkip.contains(action.getName())) {
103                    if (!action.getStatus().equals(WorkflowAction.Status.OK)
104                            && !action.getStatus().equals(WorkflowAction.Status.ERROR)) {
105                        throw new CommandException(ErrorCode.E0806, action.getName());
106                    }
107                    unmachedNodes.remove(action.getName());
108                }
109            }
110            if (unmachedNodes.size() > 0) {
111                StringBuilder sb = new StringBuilder();
112                String separator = "";
113                for (String s : unmachedNodes) {
114                    sb.append(separator).append(s);
115                    separator = ",";
116                }
117                throw new CommandException(ErrorCode.E0807, sb);
118            }
119        }
120    
121        /**
122         * Parses the config and adds the nodes that are to be skipped to the skipped node list
123         */
124        private void parseSkippedNodeConf() {
125            if (conf != null) {
126                Collection<String> skipNodes = conf.getStringCollection(OozieClient.RERUN_SKIP_NODES);
127                for (String str : skipNodes) {
128                    // trimming is required
129                    nodesToSkip.add(str.trim());
130                }
131            }
132        }
133    
134        protected Void call(WorkflowStore store) throws StoreException, CommandException {
135            incrJobCounter(1);
136            WorkflowJobBean wfBean = store.getWorkflow(jobId, false);
137            setLogInfo(wfBean);
138            List<WorkflowActionBean> actions = store.getActionsForWorkflow(jobId, false);
139            WorkflowInstance oldWfInstance = wfBean.getWorkflowInstance();
140            WorkflowInstance newWfInstance;
141            XLog log = XLog.getLog(getClass());
142            parseSkippedNodeConf();
143            checkPreConditions(wfBean, actions);
144    
145            WorkflowAppService wps = Services.get().get(WorkflowAppService.class);
146            try {
147                XLog.Info.get().setParameter(DagXLogInfoService.TOKEN, conf.get(OozieClient.LOG_TOKEN));
148                WorkflowApp app = wps.parseDef(conf, authToken);
149                XConfiguration protoActionConf = wps.createProtoActionConf(conf, authToken, true);
150                WorkflowLib workflowLib = Services.get().get(WorkflowStoreService.class).getWorkflowLibWithNoDB();
151    
152                Path configDefault = new Path(new Path(conf.get(OozieClient.APP_PATH)).getParent(), SubmitCommand.CONFIG_DEFAULT);
153                FileSystem fs = Services.get().get(HadoopAccessorService.class).
154                        createFileSystem(wfBean.getUser(), wfBean.getGroup(), configDefault.toUri(), protoActionConf);
155    
156                if (fs.exists(configDefault)) {
157                    Configuration defaultConf = new XConfiguration(fs.open(configDefault));
158                    PropertiesUtils.checkDisallowedProperties(defaultConf, DISALLOWED_DEFAULT_PROPERTIES);
159                    XConfiguration.injectDefaults(defaultConf, conf);
160                }
161    
162                PropertiesUtils.checkDisallowedProperties(conf, DISALLOWED_USER_PROPERTIES);
163    
164                try {
165                    newWfInstance = workflowLib.createInstance(app, conf, jobId);
166                }
167                catch (WorkflowException e) {
168                    throw new StoreException(e);
169                }
170                wfBean.setAppName(app.getName());
171                wfBean.setProtoActionConf(protoActionConf.toXmlString());
172            }
173            catch (WorkflowException ex) {
174                throw new CommandException(ex);
175            }
176            catch (IOException ex) {
177                throw new CommandException(ErrorCode.E0803, ex);
178            }
179            catch (HadoopAccessorException e) {
180                throw new CommandException(e);
181            }
182    
183            for (int i = 0; i < actions.size(); i++) {
184                if (!nodesToSkip.contains(actions.get(i).getName())) {
185                    store.deleteAction(actions.get(i).getId());
186                    log.info("Deleting Action[{0}] for re-run", actions.get(i).getId());
187                }
188                else {
189                    copyActionData(newWfInstance, oldWfInstance);
190                }
191            }
192    
193            wfBean.setAppPath(conf.get(OozieClient.APP_PATH));
194            wfBean.setConf(XmlUtils.prettyPrint(conf).toString());
195            wfBean.setLogToken(conf.get(OozieClient.LOG_TOKEN, ""));
196            wfBean.setUser(conf.get(OozieClient.USER_NAME));
197            wfBean.setGroup(conf.get(OozieClient.GROUP_NAME));
198            wfBean.setExternalId(conf.get(OozieClient.EXTERNAL_ID));
199            wfBean.setEndTime(null);
200            wfBean.setRun(wfBean.getRun() + 1);
201            wfBean.setStatus(WorkflowJob.Status.PREP);
202            wfBean.setWorkflowInstance(newWfInstance);
203            store.updateWorkflow(wfBean);
204            return null;
205        }
206    
207        /**
208         * Copys the variables for skipped nodes from the old wfInstance to new one.
209         *
210         * @param newWfInstance
211         * @param oldWfInstance
212         */
213        private void copyActionData(WorkflowInstance newWfInstance, WorkflowInstance oldWfInstance) {
214            Map<String, String> oldVars = new HashMap<String, String>();
215            Map<String, String> newVars = new HashMap<String, String>();
216            oldVars = oldWfInstance.getAllVars();
217            for (String var : oldVars.keySet()) {
218                String actionName = var.split(WorkflowInstance.NODE_VAR_SEPARATOR)[0];
219                if (nodesToSkip.contains(actionName)) {
220                    newVars.put(var, oldVars.get(var));
221                }
222            }
223            for (String node : nodesToSkip) {
224                // Setting the TO_SKIP variable to true. This will be used by
225                // SignalCommand and LiteNodeHandler to skip the action.
226                newVars.put(node + WorkflowInstance.NODE_VAR_SEPARATOR + TO_SKIP, "true");
227                String visitedFlag = NodeHandler.getLoopFlag(node);
228                // Removing the visited flag so that the action won't be considered
229                // a loop.
230                if (newVars.containsKey(visitedFlag)) {
231                    newVars.remove(visitedFlag);
232                }
233            }
234            newWfInstance.setAllVars(newVars);
235        }
236    
237        @Override
238        protected Void execute(WorkflowStore store) throws CommandException, StoreException {
239            try {
240                XLog.getLog(getClass()).debug("STARTED ReRunCommand for job " + jobId);
241                if (lock(jobId)) {
242                    call(store);
243                }
244                else {
245                    queueCallable(new ReRunCommand(jobId, conf, authToken), LOCK_FAILURE_REQUEUE_INTERVAL);
246                    XLog.getLog(getClass()).warn("ReRunCommand lock was not acquired - failed {0}", jobId);
247                }
248            }
249            catch (InterruptedException e) {
250                queueCallable(new ReRunCommand(jobId, conf, authToken), LOCK_FAILURE_REQUEUE_INTERVAL);
251                XLog.getLog(getClass())
252                        .warn("ReRunCommand lock was not acquired - interrupted exception failed {0}", jobId);
253            }
254            XLog.getLog(getClass()).debug("ENDED ReRunCommand for job " + jobId);
255            return null;
256        }
257    }