001 /** 002 * Copyright (c) 2010 Yahoo! Inc. All rights reserved. 003 * Licensed under the Apache License, Version 2.0 (the "License"); 004 * you may not use this file except in compliance with the License. 005 * You may obtain a copy of the License at 006 * 007 * http://www.apache.org/licenses/LICENSE-2.0 008 * 009 * Unless required by applicable law or agreed to in writing, software 010 * distributed under the License is distributed on an "AS IS" BASIS, 011 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 012 * See the License for the specific language governing permissions and 013 * limitations under the License. See accompanying LICENSE file. 014 */ 015 package org.apache.oozie.command.coord; 016 017 import java.io.IOException; 018 import java.io.StringReader; 019 import java.util.ArrayList; 020 import java.util.Date; 021 import java.util.HashSet; 022 import java.util.List; 023 import java.util.Set; 024 025 import org.apache.hadoop.conf.Configuration; 026 import org.apache.hadoop.fs.Path; 027 import org.apache.oozie.CoordinatorActionBean; 028 import org.apache.oozie.CoordinatorActionInfo; 029 import org.apache.oozie.CoordinatorJobBean; 030 import org.apache.oozie.ErrorCode; 031 import org.apache.oozie.XException; 032 import org.apache.oozie.action.ActionExecutorException; 033 import org.apache.oozie.action.hadoop.FsActionExecutor; 034 import org.apache.oozie.client.CoordinatorAction; 035 import org.apache.oozie.client.CoordinatorJob; 036 import org.apache.oozie.client.SLAEvent.SlaAppType; 037 import org.apache.oozie.client.rest.RestConstants; 038 import org.apache.oozie.command.CommandException; 039 import org.apache.oozie.coord.CoordELFunctions; 040 import org.apache.oozie.store.CoordinatorStore; 041 import org.apache.oozie.store.StoreException; 042 import org.apache.oozie.util.DateUtils; 043 import org.apache.oozie.util.ParamChecker; 044 import org.apache.oozie.util.XConfiguration; 045 import org.apache.oozie.util.XLog; 046 import org.apache.oozie.util.XmlUtils; 047 import org.apache.oozie.util.db.SLADbOperations; 048 import org.jdom.Element; 049 import org.jdom.JDOMException; 050 051 public class CoordRerunCommand extends CoordinatorCommand<CoordinatorActionInfo> { 052 053 private String jobId; 054 private String rerunType; 055 private String scope; 056 private boolean refresh; 057 private boolean noCleanup; 058 private final XLog log = XLog.getLog(getClass()); 059 060 public CoordRerunCommand(String jobId, String rerunType, String scope, boolean refresh, boolean noCleanup) { 061 super("coord_rerun", "coord_rerun", 1, XLog.STD); 062 this.jobId = ParamChecker.notEmpty(jobId, "jobId"); 063 this.rerunType = ParamChecker.notEmpty(rerunType, "rerunType"); 064 this.scope = ParamChecker.notEmpty(scope, "scope"); 065 this.refresh = refresh; 066 this.noCleanup = noCleanup; 067 } 068 069 @Override 070 protected CoordinatorActionInfo call(CoordinatorStore store) throws StoreException, CommandException { 071 try { 072 CoordinatorJobBean coordJob = store.getCoordinatorJob(jobId, false); 073 CoordinatorActionInfo coordInfo = null; 074 setLogInfo(coordJob); 075 if (coordJob.getStatus() != CoordinatorJob.Status.KILLED 076 && coordJob.getStatus() != CoordinatorJob.Status.FAILED) { 077 incrJobCounter(1); 078 079 List<CoordinatorActionBean> coordActions; 080 if (rerunType.equals(RestConstants.JOB_COORD_RERUN_DATE)) { 081 coordActions = getCoordActionsFromDates(jobId, scope, store); 082 } 083 else if (rerunType.equals(RestConstants.JOB_COORD_RERUN_ACTION)) { 084 coordActions = getCoordActionsFromIds(jobId, scope, store); 085 } 086 else { 087 throw new CommandException(ErrorCode.E1018, "date or action expected."); 088 } 089 if (checkAllActionsRunnable(coordActions)) { 090 for (CoordinatorActionBean coordAction : coordActions) { 091 String actionXml = coordAction.getActionXml(); 092 if (!noCleanup) { 093 Element eAction = XmlUtils.parseXml(actionXml); 094 cleanupOutputEvents(eAction, coordJob.getUser(), coordJob.getGroup()); 095 } 096 if (refresh) { 097 refreshAction(coordJob, coordAction, store); 098 } 099 updateAction(coordJob, coordAction, actionXml, store); 100 101 // TODO: time 100s should be configurable 102 queueCallable(new CoordActionNotification(coordAction), 100); 103 queueCallable(new CoordActionInputCheckCommand(coordAction.getId()), 100); 104 } 105 } 106 else { 107 throw new CommandException(ErrorCode.E1018, "part or all actions are not eligible to rerun!"); 108 } 109 coordInfo = new CoordinatorActionInfo(coordActions); 110 } 111 else { 112 log.info("CoordRerunCommand is not able to run, job status=" + coordJob.getStatus() + ", jobid=" 113 + jobId); 114 throw new CommandException(ErrorCode.E1018, 115 "coordinator job is killed or failed so all actions are not eligible to rerun!"); 116 } 117 return coordInfo; 118 } 119 catch (XException xex) { 120 throw new CommandException(xex); 121 } 122 catch (JDOMException jex) { 123 throw new CommandException(ErrorCode.E0700, jex); 124 } 125 catch (Exception ex) { 126 throw new CommandException(ErrorCode.E1018, ex); 127 } 128 } 129 130 /** 131 * Get the list of actions for given id ranges 132 * 133 * @param jobId 134 * @param scope 135 * @param store 136 * @return the list of all actions to rerun 137 * @throws CommandException 138 * @throws StoreException 139 */ 140 private List<CoordinatorActionBean> getCoordActionsFromIds(String jobId, String scope, CoordinatorStore store) 141 throws CommandException, StoreException { 142 ParamChecker.notEmpty(jobId, "jobId"); 143 ParamChecker.notEmpty(scope, "scope"); 144 145 Set<String> actions = new HashSet<String>(); 146 String[] list = scope.split(","); 147 for (String s : list) { 148 s = s.trim(); 149 if (s.contains("-")) { 150 String[] range = s.split("-"); 151 if (range.length != 2) { 152 throw new CommandException(ErrorCode.E0302, "format is wrong for action's range '" + s + "'"); 153 } 154 int start; 155 int end; 156 try { 157 start = Integer.parseInt(range[0].trim()); 158 end = Integer.parseInt(range[1].trim()); 159 if (start > end) { 160 throw new CommandException(ErrorCode.E0302, "format is wrong for action's range '" + s + "'"); 161 } 162 } 163 catch (NumberFormatException ne) { 164 throw new CommandException(ErrorCode.E0302, ne); 165 } 166 for (int i = start; i <= end; i++) { 167 actions.add(jobId + "@" + i); 168 } 169 } 170 else { 171 try { 172 Integer.parseInt(s); 173 } 174 catch (NumberFormatException ne) { 175 throw new CommandException(ErrorCode.E0302, "format is wrong for action id'" + s 176 + "'. Integer only."); 177 } 178 actions.add(jobId + "@" + s); 179 } 180 } 181 182 List<CoordinatorActionBean> coordActions = new ArrayList<CoordinatorActionBean>(); 183 for (String id : actions) { 184 CoordinatorActionBean coordAction = store.getCoordinatorAction(id, false); 185 coordActions.add(coordAction); 186 log.debug("Rerun coordinator for actionId='" + id + "'"); 187 } 188 return coordActions; 189 } 190 191 /** 192 * Get the list of actions for given date ranges 193 * 194 * @param jobId 195 * @param scope 196 * @param store 197 * @return the list of dates to rerun 198 * @throws CommandException 199 * @throws StoreException 200 */ 201 private List<CoordinatorActionBean> getCoordActionsFromDates(String jobId, String scope, CoordinatorStore store) 202 throws CommandException, StoreException { 203 ParamChecker.notEmpty(jobId, "jobId"); 204 ParamChecker.notEmpty(scope, "scope"); 205 206 Set<CoordinatorActionBean> actionSet = new HashSet<CoordinatorActionBean>(); 207 String[] list = scope.split(","); 208 for (String s : list) { 209 s = s.trim(); 210 if (s.contains("::")) { 211 String[] dateRange = s.split("::"); 212 if (dateRange.length != 2) { 213 throw new CommandException(ErrorCode.E0302, "format is wrong for date's range '" + s + "'"); 214 } 215 Date start; 216 Date end; 217 try { 218 start = DateUtils.parseDateUTC(dateRange[0].trim()); 219 end = DateUtils.parseDateUTC(dateRange[1].trim()); 220 if (start.after(end)) { 221 throw new CommandException(ErrorCode.E0302, "start date is older than end date: '" + s + "'"); 222 } 223 } 224 catch (Exception e) { 225 throw new CommandException(ErrorCode.E0302, e); 226 } 227 228 List<CoordinatorActionBean> listOfActions = getActionIdsFromDateRange(jobId, start, end, store); 229 actionSet.addAll(listOfActions); 230 } 231 else { 232 Date date; 233 try { 234 date = DateUtils.parseDateUTC(s.trim()); 235 } 236 catch (Exception e) { 237 throw new CommandException(ErrorCode.E0302, e); 238 } 239 240 CoordinatorActionBean coordAction = store.getCoordActionForNominalTime(jobId, date); 241 actionSet.add(coordAction); 242 } 243 } 244 245 List<CoordinatorActionBean> coordActions = new ArrayList<CoordinatorActionBean>(); 246 for (CoordinatorActionBean coordAction : actionSet) { 247 coordActions.add(coordAction); 248 log.debug("Rerun coordinator for actionId='" + coordAction.getId() + "'"); 249 } 250 return coordActions; 251 } 252 253 private List<CoordinatorActionBean> getActionIdsFromDateRange(String jobId, Date start, Date end, 254 CoordinatorStore store) 255 throws StoreException { 256 List<CoordinatorActionBean> list = store.getCoordActionsForDates(jobId, start, end); 257 return list; 258 } 259 260 /** 261 * Check if all given actions are eligible to rerun. 262 * 263 * @param actions list of CoordinatorActionBean 264 * @return true if all actions are eligible to rerun 265 */ 266 private boolean checkAllActionsRunnable(List<CoordinatorActionBean> coordActions) { 267 for (CoordinatorActionBean coordAction : coordActions) { 268 if (!coordAction.isTerminalStatus()) { 269 return false; 270 } 271 } 272 return true; 273 } 274 275 /** 276 * Cleanup output-events directories 277 * 278 * @param eAction 279 * @param workflow 280 * @param action 281 */ 282 @SuppressWarnings("unchecked") 283 private void cleanupOutputEvents(Element eAction, String user, String group) { 284 Element outputList = eAction.getChild("output-events", eAction.getNamespace()); 285 for (Element data : (List<Element>) outputList.getChildren("data-out", eAction.getNamespace())) { 286 if (data.getChild("uris", data.getNamespace()) != null) { 287 String uris = data.getChild("uris", data.getNamespace()).getTextTrim(); 288 if (uris != null) { 289 String[] uriArr = uris.split(CoordELFunctions.INSTANCE_SEPARATOR); 290 FsActionExecutor fsAe = new FsActionExecutor(); 291 for (String uri : uriArr) { 292 Path path = new Path(uri); 293 try { 294 fsAe.delete(user, group, path); 295 log.debug("Cleanup the output dir " + path); 296 } 297 catch (ActionExecutorException ae) { 298 log.warn("Failed to cleanup the output dir " + uri, ae); 299 } 300 } 301 } 302 303 } 304 } 305 } 306 307 /** 308 * Refresh an Action 309 * 310 * @param coordJob 311 * @param coordAction 312 * @param store 313 * @throws Exception 314 */ 315 private void refreshAction(CoordinatorJobBean coordJob, CoordinatorActionBean coordAction, CoordinatorStore store) 316 throws Exception { 317 Configuration jobConf = null; 318 try { 319 jobConf = new XConfiguration(new StringReader(coordJob.getConf())); 320 } 321 catch (IOException ioe) { 322 log.warn("Configuration parse error. read from DB :" + coordJob.getConf(), ioe); 323 throw new CommandException(ErrorCode.E1005, ioe); 324 } 325 String jobXml = coordJob.getJobXml(); 326 Element eJob = XmlUtils.parseXml(jobXml); 327 String actionXml = CoordCommandUtils.materializeOneInstance(jobId, dryrun, (Element) eJob.clone(), coordAction 328 .getNominalTime(), coordAction.getActionNumber(), jobConf, coordAction); 329 log.debug("Refresh Action actionId=" + coordAction.getId() + ", actionXml=" 330 + XmlUtils.prettyPrint(actionXml).toString()); 331 coordAction.setActionXml(actionXml); 332 } 333 334 /** 335 * Update an Action into database table 336 * 337 * @param coordJob 338 * @param coordAction 339 * @param actionXml 340 * @param store 341 * @throws Exception 342 */ 343 private void updateAction(CoordinatorJobBean coordJob, CoordinatorActionBean coordAction, String actionXml, 344 CoordinatorStore store) throws Exception { 345 log.debug("updateAction for actionId=" + coordAction.getId()); 346 coordAction.setStatus(CoordinatorAction.Status.WAITING); 347 coordAction.setExternalId(""); 348 coordAction.setExternalStatus(""); 349 coordAction.setRerunTime(new Date()); 350 store.updateCoordinatorAction(coordAction); 351 writeActionRegistration(coordAction.getActionXml(), coordAction, store, coordJob.getUser(), coordJob.getGroup()); 352 } 353 354 /** 355 * Create SLA RegistrationEvent 356 * 357 * @param actionXml 358 * @param actionBean 359 * @param store 360 * @param user 361 * @param group 362 * @throws Exception 363 */ 364 private void writeActionRegistration(String actionXml, CoordinatorActionBean actionBean, CoordinatorStore store, 365 String user, String group) 366 throws Exception { 367 Element eAction = XmlUtils.parseXml(actionXml); 368 Element eSla = eAction.getChild("action", eAction.getNamespace()).getChild("info", eAction.getNamespace("sla")); 369 SLADbOperations.writeSlaRegistrationEvent(eSla, store, actionBean.getId(), SlaAppType.COORDINATOR_ACTION, user, 370 group); 371 } 372 373 @Override 374 protected CoordinatorActionInfo execute(CoordinatorStore store) throws StoreException, CommandException { 375 log.info("STARTED CoordRerunCommand for jobId=" + jobId + ", scope=" + scope); 376 CoordinatorActionInfo coordInfo = null; 377 try { 378 if (lock(jobId)) { 379 coordInfo = call(store); 380 } 381 else { 382 queueCallable(new CoordResumeCommand(jobId), LOCK_FAILURE_REQUEUE_INTERVAL); 383 log.warn("CoordRerunCommand lock was not acquired - " + " failed " + jobId + ". Requeing the same."); 384 } 385 } 386 catch (InterruptedException e) { 387 queueCallable(new CoordResumeCommand(jobId), LOCK_FAILURE_REQUEUE_INTERVAL); 388 log.warn("CoordRerunCommand lock acquiring failed " + " with exception " + e.getMessage() + " for job id " 389 + jobId + ". Requeing the same."); 390 } 391 finally { 392 log.info("ENDED CoordRerunCommand for jobId=" + jobId + ", scope=" + scope); 393 } 394 return coordInfo; 395 } 396 397 }