001 /** 002 * Copyright (c) 2010 Yahoo! Inc. All rights reserved. 003 * Licensed under the Apache License, Version 2.0 (the "License"); 004 * you may not use this file except in compliance with the License. 005 * You may obtain a copy of the License at 006 * 007 * http://www.apache.org/licenses/LICENSE-2.0 008 * 009 * Unless required by applicable law or agreed to in writing, software 010 * distributed under the License is distributed on an "AS IS" BASIS, 011 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 012 * See the License for the specific language governing permissions and 013 * limitations under the License. See accompanying LICENSE file. 014 */ 015 package org.apache.oozie.command.coord; 016 017 import java.io.IOException; 018 import java.io.StringReader; 019 import java.util.ArrayList; 020 import java.util.Date; 021 import java.util.HashSet; 022 import java.util.List; 023 import java.util.Set; 024 025 import org.apache.hadoop.conf.Configuration; 026 import org.apache.hadoop.fs.FileSystem; 027 import org.apache.hadoop.fs.Path; 028 import org.apache.oozie.CoordinatorActionBean; 029 import org.apache.oozie.CoordinatorActionInfo; 030 import org.apache.oozie.CoordinatorJobBean; 031 import org.apache.oozie.ErrorCode; 032 import org.apache.oozie.XException; 033 import org.apache.oozie.client.CoordinatorAction; 034 import org.apache.oozie.client.CoordinatorJob; 035 import org.apache.oozie.client.SLAEvent.SlaAppType; 036 import org.apache.oozie.client.rest.RestConstants; 037 import org.apache.oozie.command.CommandException; 038 import org.apache.oozie.coord.CoordELFunctions; 039 import org.apache.oozie.service.HadoopAccessorService; 040 import org.apache.oozie.service.Services; 041 import org.apache.oozie.store.CoordinatorStore; 042 import org.apache.oozie.store.StoreException; 043 import org.apache.oozie.util.DateUtils; 044 import org.apache.oozie.util.ParamChecker; 045 import org.apache.oozie.util.XConfiguration; 046 import org.apache.oozie.util.XLog; 047 import org.apache.oozie.util.XmlUtils; 048 import org.apache.oozie.util.db.SLADbOperations; 049 import org.jdom.Element; 050 import org.jdom.JDOMException; 051 052 public class CoordRerunCommand extends CoordinatorCommand<CoordinatorActionInfo> { 053 054 private String jobId; 055 private String rerunType; 056 private String scope; 057 private boolean refresh; 058 private boolean noCleanup; 059 private final XLog log = XLog.getLog(getClass()); 060 061 public CoordRerunCommand(String jobId, String rerunType, String scope, boolean refresh, boolean noCleanup) { 062 super("coord_rerun", "coord_rerun", 1, XLog.STD); 063 this.jobId = ParamChecker.notEmpty(jobId, "jobId"); 064 this.rerunType = ParamChecker.notEmpty(rerunType, "rerunType"); 065 this.scope = ParamChecker.notEmpty(scope, "scope"); 066 this.refresh = refresh; 067 this.noCleanup = noCleanup; 068 } 069 070 @Override 071 protected CoordinatorActionInfo call(CoordinatorStore store) throws StoreException, CommandException { 072 try { 073 CoordinatorJobBean coordJob = store.getCoordinatorJob(jobId, false); 074 CoordinatorActionInfo coordInfo = null; 075 setLogInfo(coordJob); 076 if (coordJob.getStatus() != CoordinatorJob.Status.KILLED 077 && coordJob.getStatus() != CoordinatorJob.Status.FAILED) { 078 incrJobCounter(1); 079 080 List<CoordinatorActionBean> coordActions; 081 if (rerunType.equals(RestConstants.JOB_COORD_RERUN_DATE)) { 082 coordActions = getCoordActionsFromDates(jobId, scope, store); 083 } 084 else if (rerunType.equals(RestConstants.JOB_COORD_RERUN_ACTION)) { 085 coordActions = getCoordActionsFromIds(jobId, scope, store); 086 } 087 else { 088 throw new CommandException(ErrorCode.E1018, "date or action expected."); 089 } 090 if (checkAllActionsRunnable(coordActions)) { 091 Configuration conf = new XConfiguration(new StringReader(coordJob.getConf())); 092 for (CoordinatorActionBean coordAction : coordActions) { 093 String actionXml = coordAction.getActionXml(); 094 if (!noCleanup) { 095 Element eAction = XmlUtils.parseXml(actionXml); 096 cleanupOutputEvents(eAction, coordJob.getUser(), coordJob.getGroup(), conf); 097 } 098 if (refresh) { 099 refreshAction(coordJob, coordAction, store); 100 } 101 updateAction(coordJob, coordAction, actionXml, store); 102 103 // TODO: time 100s should be configurable 104 queueCallable(new CoordActionNotification(coordAction), 100); 105 CoordActionInputCheckCommand.queue(new CoordActionInputCheckCommand(coordAction.getId()), 100); 106 } 107 } 108 else { 109 throw new CommandException(ErrorCode.E1018, "part or all actions are not eligible to rerun!"); 110 } 111 coordInfo = new CoordinatorActionInfo(coordActions); 112 } 113 else { 114 log.info("CoordRerunCommand is not able to run, job status=" + coordJob.getStatus() + ", jobid=" 115 + jobId); 116 throw new CommandException(ErrorCode.E1018, 117 "coordinator job is killed or failed so all actions are not eligible to rerun!"); 118 } 119 return coordInfo; 120 } 121 catch (XException xex) { 122 throw new CommandException(xex); 123 } 124 catch (JDOMException jex) { 125 throw new CommandException(ErrorCode.E0700, jex); 126 } 127 catch (Exception ex) { 128 throw new CommandException(ErrorCode.E1018, ex); 129 } 130 } 131 132 /** 133 * Get the list of actions for given id ranges 134 * 135 * @param jobId 136 * @param scope 137 * @param store 138 * @return the list of all actions to rerun 139 * @throws CommandException 140 * @throws StoreException 141 */ 142 private List<CoordinatorActionBean> getCoordActionsFromIds(String jobId, String scope, CoordinatorStore store) 143 throws CommandException, StoreException { 144 ParamChecker.notEmpty(jobId, "jobId"); 145 ParamChecker.notEmpty(scope, "scope"); 146 147 Set<String> actions = new HashSet<String>(); 148 String[] list = scope.split(","); 149 for (String s : list) { 150 s = s.trim(); 151 if (s.contains("-")) { 152 String[] range = s.split("-"); 153 if (range.length != 2) { 154 throw new CommandException(ErrorCode.E0302, "format is wrong for action's range '" + s + "'"); 155 } 156 int start; 157 int end; 158 try { 159 start = Integer.parseInt(range[0].trim()); 160 end = Integer.parseInt(range[1].trim()); 161 if (start > end) { 162 throw new CommandException(ErrorCode.E0302, "format is wrong for action's range '" + s + "'"); 163 } 164 } 165 catch (NumberFormatException ne) { 166 throw new CommandException(ErrorCode.E0302, ne); 167 } 168 for (int i = start; i <= end; i++) { 169 actions.add(jobId + "@" + i); 170 } 171 } 172 else { 173 try { 174 Integer.parseInt(s); 175 } 176 catch (NumberFormatException ne) { 177 throw new CommandException(ErrorCode.E0302, "format is wrong for action id'" + s 178 + "'. Integer only."); 179 } 180 actions.add(jobId + "@" + s); 181 } 182 } 183 184 List<CoordinatorActionBean> coordActions = new ArrayList<CoordinatorActionBean>(); 185 for (String id : actions) { 186 CoordinatorActionBean coordAction = store.getCoordinatorAction(id, false); 187 coordActions.add(coordAction); 188 log.debug("Rerun coordinator for actionId='" + id + "'"); 189 } 190 return coordActions; 191 } 192 193 /** 194 * Get the list of actions for given date ranges 195 * 196 * @param jobId 197 * @param scope 198 * @param store 199 * @return the list of dates to rerun 200 * @throws CommandException 201 * @throws StoreException 202 */ 203 private List<CoordinatorActionBean> getCoordActionsFromDates(String jobId, String scope, CoordinatorStore store) 204 throws CommandException, StoreException { 205 ParamChecker.notEmpty(jobId, "jobId"); 206 ParamChecker.notEmpty(scope, "scope"); 207 208 Set<CoordinatorActionBean> actionSet = new HashSet<CoordinatorActionBean>(); 209 String[] list = scope.split(","); 210 for (String s : list) { 211 s = s.trim(); 212 if (s.contains("::")) { 213 String[] dateRange = s.split("::"); 214 if (dateRange.length != 2) { 215 throw new CommandException(ErrorCode.E0302, "format is wrong for date's range '" + s + "'"); 216 } 217 Date start; 218 Date end; 219 try { 220 start = DateUtils.parseDateUTC(dateRange[0].trim()); 221 end = DateUtils.parseDateUTC(dateRange[1].trim()); 222 if (start.after(end)) { 223 throw new CommandException(ErrorCode.E0302, "start date is older than end date: '" + s + "'"); 224 } 225 } 226 catch (Exception e) { 227 throw new CommandException(ErrorCode.E0302, e); 228 } 229 230 List<CoordinatorActionBean> listOfActions = getActionIdsFromDateRange(jobId, start, end, store); 231 actionSet.addAll(listOfActions); 232 } 233 else { 234 Date date; 235 try { 236 date = DateUtils.parseDateUTC(s.trim()); 237 } 238 catch (Exception e) { 239 throw new CommandException(ErrorCode.E0302, e); 240 } 241 242 CoordinatorActionBean coordAction = store.getCoordActionForNominalTime(jobId, date); 243 actionSet.add(coordAction); 244 } 245 } 246 247 List<CoordinatorActionBean> coordActions = new ArrayList<CoordinatorActionBean>(); 248 for (CoordinatorActionBean coordAction : actionSet) { 249 coordActions.add(coordAction); 250 log.debug("Rerun coordinator for actionId='" + coordAction.getId() + "'"); 251 } 252 return coordActions; 253 } 254 255 private List<CoordinatorActionBean> getActionIdsFromDateRange(String jobId, Date start, Date end, 256 CoordinatorStore store) 257 throws StoreException { 258 List<CoordinatorActionBean> list = store.getCoordActionsForDates(jobId, start, end); 259 return list; 260 } 261 262 /** 263 * Check if all given actions are eligible to rerun. 264 * 265 * @param actions list of CoordinatorActionBean 266 * @return true if all actions are eligible to rerun 267 */ 268 private boolean checkAllActionsRunnable(List<CoordinatorActionBean> coordActions) { 269 for (CoordinatorActionBean coordAction : coordActions) { 270 if (!coordAction.isTerminalStatus()) { 271 return false; 272 } 273 } 274 return true; 275 } 276 277 /** 278 * Cleanup output-events directories 279 * 280 * @param eAction 281 * @param workflow 282 * @param action 283 */ 284 @SuppressWarnings("unchecked") 285 private void cleanupOutputEvents(Element eAction, String user, String group, Configuration conf) { 286 Element outputList = eAction.getChild("output-events", eAction.getNamespace()); 287 if (outputList != null) { 288 for (Element data : (List<Element>) outputList.getChildren("data-out", eAction.getNamespace())) { 289 if (data.getChild("uris", data.getNamespace()) != null) { 290 String uris = data.getChild("uris", data.getNamespace()).getTextTrim(); 291 if (uris != null) { 292 String[] uriArr = uris.split(CoordELFunctions.INSTANCE_SEPARATOR); 293 for (String uri : uriArr) { 294 Path path = new Path(uri); 295 try { 296 FileSystem fs = Services.get().get(HadoopAccessorService.class). 297 createFileSystem(user, group, path.toUri(), conf); 298 if (fs.exists(path)) { 299 if (!fs.delete(path, true)) { 300 throw new IOException(); 301 } 302 } 303 log.debug("Cleanup the output dir " + path); 304 } 305 catch (Exception ex) { 306 log.warn("Failed to cleanup the output dir " + uri, ex); 307 } 308 } 309 } 310 311 } 312 } 313 } 314 else { 315 log.info("No output-events defined in coordinator xml. Therefore nothing to cleanup"); 316 } 317 } 318 319 /** 320 * Refresh an Action 321 * 322 * @param coordJob 323 * @param coordAction 324 * @param store 325 * @throws Exception 326 */ 327 private void refreshAction(CoordinatorJobBean coordJob, CoordinatorActionBean coordAction, CoordinatorStore store) 328 throws Exception { 329 Configuration jobConf = null; 330 try { 331 jobConf = new XConfiguration(new StringReader(coordJob.getConf())); 332 } 333 catch (IOException ioe) { 334 log.warn("Configuration parse error. read from DB :" + coordJob.getConf(), ioe); 335 throw new CommandException(ErrorCode.E1005, ioe); 336 } 337 String jobXml = coordJob.getJobXml(); 338 Element eJob = XmlUtils.parseXml(jobXml); 339 String actionXml = CoordCommandUtils.materializeOneInstance(jobId, dryrun, (Element) eJob.clone(), coordAction 340 .getNominalTime(), coordAction.getActionNumber(), jobConf, coordAction); 341 log.debug("Refresh Action actionId=" + coordAction.getId() + ", actionXml=" 342 + XmlUtils.prettyPrint(actionXml).toString()); 343 coordAction.setActionXml(actionXml); 344 } 345 346 /** 347 * Update an Action into database table 348 * 349 * @param coordJob 350 * @param coordAction 351 * @param actionXml 352 * @param store 353 * @throws Exception 354 */ 355 private void updateAction(CoordinatorJobBean coordJob, CoordinatorActionBean coordAction, String actionXml, 356 CoordinatorStore store) throws Exception { 357 log.debug("updateAction for actionId=" + coordAction.getId()); 358 coordAction.setStatus(CoordinatorAction.Status.WAITING); 359 coordAction.setExternalId(""); 360 coordAction.setExternalStatus(""); 361 coordAction.setRerunTime(new Date()); 362 store.updateCoordinatorAction(coordAction); 363 writeActionRegistration(coordAction.getActionXml(), coordAction, store, coordJob.getUser(), coordJob.getGroup()); 364 } 365 366 /** 367 * Create SLA RegistrationEvent 368 * 369 * @param actionXml 370 * @param actionBean 371 * @param store 372 * @param user 373 * @param group 374 * @throws Exception 375 */ 376 private void writeActionRegistration(String actionXml, CoordinatorActionBean actionBean, CoordinatorStore store, 377 String user, String group) 378 throws Exception { 379 Element eAction = XmlUtils.parseXml(actionXml); 380 Element eSla = eAction.getChild("action", eAction.getNamespace()).getChild("info", eAction.getNamespace("sla")); 381 SLADbOperations.writeSlaRegistrationEvent(eSla, store, actionBean.getId(), SlaAppType.COORDINATOR_ACTION, user, 382 group); 383 } 384 385 @Override 386 protected CoordinatorActionInfo execute(CoordinatorStore store) throws StoreException, CommandException { 387 log.info("STARTED CoordRerunCommand for jobId=" + jobId + ", scope=" + scope); 388 CoordinatorActionInfo coordInfo = null; 389 try { 390 if (lock(jobId)) { 391 coordInfo = call(store); 392 } 393 else { 394 queueCallable(new CoordResumeCommand(jobId), LOCK_FAILURE_REQUEUE_INTERVAL); 395 log.warn("CoordRerunCommand lock was not acquired - " + " failed " + jobId + ". Requeing the same."); 396 } 397 } 398 catch (InterruptedException e) { 399 queueCallable(new CoordResumeCommand(jobId), LOCK_FAILURE_REQUEUE_INTERVAL); 400 log.warn("CoordRerunCommand lock acquiring failed " + " with exception " + e.getMessage() + " for job id " 401 + jobId + ". Requeing the same."); 402 } 403 finally { 404 log.info("ENDED CoordRerunCommand for jobId=" + jobId + ", scope=" + scope); 405 } 406 return coordInfo; 407 } 408 409 }