1 /**
2 * Copyright 2010 The Apache Software Foundation
3 *
4 * Licensed to the Apache Software Foundation (ASF) under one
5 * or more contributor license agreements. See the NOTICE file
6 * distributed with this work for additional information
7 * regarding copyright ownership. The ASF licenses this file
8 * to you under the Apache License, Version 2.0 (the
9 * "License"); you may not use this file except in compliance
10 * with the License. You may obtain a copy of the License at
11 *
12 * http://www.apache.org/licenses/LICENSE-2.0
13 *
14 * Unless required by applicable law or agreed to in writing, software
15 * distributed under the License is distributed on an "AS IS" BASIS,
16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 * See the License for the specific language governing permissions and
18 * limitations under the License.
19 */
20 package org.apache.hadoop.hbase.master;
21
22 import java.io.IOException;
23
24 import org.apache.commons.logging.Log;
25 import org.apache.commons.logging.LogFactory;
26 import org.apache.hadoop.hbase.HBaseTestingUtility;
27 import org.apache.hadoop.hbase.HConstants;
28 import org.apache.hadoop.hbase.HRegionInfo;
29 import org.apache.hadoop.hbase.client.HTable;
30 import org.apache.hadoop.hbase.client.Put;
31 import org.apache.hadoop.hbase.client.Result;
32 import org.apache.hadoop.hbase.client.ResultScanner;
33 import org.apache.hadoop.hbase.client.Scan;
34 import org.apache.hadoop.hbase.util.Bytes;
35 import org.apache.hadoop.hbase.util.Writables;
36 import org.junit.AfterClass;
37 import org.junit.Assert;
38 import org.junit.Before;
39 import org.junit.BeforeClass;
40 import org.junit.Ignore;
41 import org.junit.Test;
42
43 /**
44 * Test transitions of state across the master. Sets up the cluster once and
45 * then runs a couple of tests.
46 */
47 public class TestMasterTransitions {
48 private static final Log LOG = LogFactory.getLog(TestMasterTransitions.class);
49 private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
50 private static final String TABLENAME = "master_transitions";
51 private static final byte [][] FAMILIES = new byte [][] {Bytes.toBytes("a"),
52 Bytes.toBytes("b"), Bytes.toBytes("c")};
53
54 /**
55 * Start up a mini cluster and put a small table of many empty regions into it.
56 * @throws Exception
57 */
58 @BeforeClass public static void beforeAllTests() throws Exception {
59 TEST_UTIL.getConfiguration().setBoolean("dfs.support.append", true);
60 TEST_UTIL.startMiniCluster(2);
61 // Create a table of three families. This will assign a region.
62 TEST_UTIL.createTable(Bytes.toBytes(TABLENAME), FAMILIES);
63 HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
64 int countOfRegions = TEST_UTIL.createMultiRegions(t, getTestFamily());
65 TEST_UTIL.waitUntilAllRegionsAssigned(countOfRegions);
66 addToEachStartKey(countOfRegions);
67 }
68
69 @AfterClass public static void afterAllTests() throws Exception {
70 TEST_UTIL.shutdownMiniCluster();
71 }
72
73 @Before public void setup() throws IOException {
74 TEST_UTIL.ensureSomeRegionServersAvailable(2);
75 }
76
77 /**
78 * Listener for regionserver events testing hbase-2428 (Infinite loop of
79 * region closes if META region is offline). In particular, listen
80 * for the close of the 'metaServer' and when it comes in, requeue it with a
81 * delay as though there were an issue processing the shutdown. As part of
82 * the requeuing, send over a close of a region on 'otherServer' so it comes
83 * into a master that has its meta region marked as offline.
84 */
85 /*
86 static class HBase2428Listener implements RegionServerOperationListener {
87 // Map of what we've delayed so we don't do repeated delays.
88 private final Set<RegionServerOperation> postponed =
89 new CopyOnWriteArraySet<RegionServerOperation>();
90 private boolean done = false;;
91 private boolean metaShutdownReceived = false;
92 private final HServerAddress metaAddress;
93 private final MiniHBaseCluster cluster;
94 private final int otherServerIndex;
95 private final HRegionInfo hri;
96 private int closeCount = 0;
97 static final int SERVER_DURATION = 3 * 1000;
98 static final int CLOSE_DURATION = 1 * 1000;
99
100 HBase2428Listener(final MiniHBaseCluster c, final HServerAddress metaAddress,
101 final HRegionInfo closingHRI, final int otherServerIndex) {
102 this.cluster = c;
103 this.metaAddress = metaAddress;
104 this.hri = closingHRI;
105 this.otherServerIndex = otherServerIndex;
106 }
107
108 @Override
109 public boolean process(final RegionServerOperation op) throws IOException {
110 // If this is a regionserver shutdown and it's of the meta server, then we want to
111 // delay the processing of the shutdown and send off a close of a region on
112 // the 'otherServer'.
113 boolean result = true;
114 if (op instanceof ProcessServerShutdown) {
115 ProcessServerShutdown pss = (ProcessServerShutdown)op;
116 if (pss.getDeadServerAddress().equals(this.metaAddress)) {
117 // Don't postpone more than once.
118 if (!this.postponed.contains(pss)) {
119 // Close some region.
120 this.cluster.addMessageToSendRegionServer(this.otherServerIndex,
121 new HMsg(HMsg.Type.MSG_REGION_CLOSE, hri,
122 Bytes.toBytes("Forcing close in test")));
123 this.postponed.add(pss);
124 // Put off the processing of the regionserver shutdown processing.
125 pss.setDelay(SERVER_DURATION);
126 this.metaShutdownReceived = true;
127 // Return false. This will add this op to the delayed queue.
128 result = false;
129 }
130 }
131 } else {
132 // Have the close run frequently.
133 if (isWantedCloseOperation(op) != null) {
134 op.setDelay(CLOSE_DURATION);
135 // Count how many times it comes through here.
136 this.closeCount++;
137 }
138 }
139 return result;
140 }
141
142 public void processed(final RegionServerOperation op) {
143 if (isWantedCloseOperation(op) != null) return;
144 this.done = true;
145 }
146 */
147 /*
148 * @param op
149 * @return Null if not the wanted ProcessRegionClose, else <code>op</code>
150 * cast as a ProcessRegionClose.
151 */
152 /*
153 private ProcessRegionClose isWantedCloseOperation(final RegionServerOperation op) {
154 // Count every time we get a close operation.
155 if (op instanceof ProcessRegionClose) {
156 ProcessRegionClose c = (ProcessRegionClose)op;
157 if (c.regionInfo.equals(hri)) {
158 return c;
159 }
160 }
161 return null;
162 }
163
164 boolean isDone() {
165 return this.done;
166 }
167
168 boolean isMetaShutdownReceived() {
169 return metaShutdownReceived;
170 }
171
172 int getCloseCount() {
173 return this.closeCount;
174 }
175
176 @Override
177 public boolean process(HServerInfo serverInfo, HMsg incomingMsg) {
178 return true;
179 }
180 }
181 */
182 /**
183 * In 2428, the meta region has just been set offline and then a close comes
184 * in.
185 * @see <a href="https://issues.apache.org/jira/browse/HBASE-2428">HBASE-2428</a>
186 */
187 @Ignore @Test (timeout=300000) public void testRegionCloseWhenNoMetaHBase2428()
188 throws Exception {
189 /*
190 LOG.info("Running testRegionCloseWhenNoMetaHBase2428");
191 MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
192 final HMaster master = cluster.getMaster();
193 int metaIndex = cluster.getServerWithMeta();
194 // Figure the index of the server that is not server the .META.
195 int otherServerIndex = -1;
196 for (int i = 0; i < cluster.getRegionServerThreads().size(); i++) {
197 if (i == metaIndex) continue;
198 otherServerIndex = i;
199 break;
200 }
201 final HRegionServer otherServer = cluster.getRegionServer(otherServerIndex);
202 final HRegionServer metaHRS = cluster.getRegionServer(metaIndex);
203
204 // Get a region out on the otherServer.
205 final HRegionInfo hri =
206 otherServer.getOnlineRegions().iterator().next().getRegionInfo();
207
208 // Add our RegionServerOperationsListener
209 HBase2428Listener listener = new HBase2428Listener(cluster,
210 metaHRS.getHServerInfo().getServerAddress(), hri, otherServerIndex);
211 master.getRegionServerOperationQueue().
212 registerRegionServerOperationListener(listener);
213 try {
214 // Now close the server carrying meta.
215 cluster.abortRegionServer(metaIndex);
216
217 // First wait on receipt of meta server shutdown message.
218 while(!listener.metaShutdownReceived) Threads.sleep(100);
219 while(!listener.isDone()) Threads.sleep(10);
220 // We should not have retried the close more times than it took for the
221 // server shutdown message to exit the delay queue and get processed
222 // (Multiple by two to add in some slop in case of GC or something).
223 assertTrue(listener.getCloseCount() > 1);
224 assertTrue(listener.getCloseCount() <
225 ((HBase2428Listener.SERVER_DURATION/HBase2428Listener.CLOSE_DURATION) * 2));
226
227 // Assert the closed region came back online
228 assertRegionIsBackOnline(hri);
229 } finally {
230 master.getRegionServerOperationQueue().
231 unregisterRegionServerOperationListener(listener);
232 }
233 */
234 }
235
236 /**
237 * Test adding in a new server before old one on same host+port is dead.
238 * Make the test more onerous by having the server under test carry the meta.
239 * If confusion between old and new, purportedly meta never comes back. Test
240 * that meta gets redeployed.
241 */
242 @Ignore @Test (timeout=300000) public void testAddingServerBeforeOldIsDead2413()
243 throws IOException {
244 /*
245 LOG.info("Running testAddingServerBeforeOldIsDead2413");
246 MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
247 int count = count();
248 int metaIndex = cluster.getServerWithMeta();
249 MiniHBaseClusterRegionServer metaHRS =
250 (MiniHBaseClusterRegionServer)cluster.getRegionServer(metaIndex);
251 int port = metaHRS.getServerInfo().getServerAddress().getPort();
252 Configuration c = TEST_UTIL.getConfiguration();
253 String oldPort = c.get(HConstants.REGIONSERVER_PORT, "0");
254 try {
255 LOG.info("KILLED=" + metaHRS);
256 metaHRS.kill();
257 c.set(HConstants.REGIONSERVER_PORT, Integer.toString(port));
258 // Try and start new regionserver. It might clash with the old
259 // regionserver port so keep trying to get past the BindException.
260 HRegionServer hrs = null;
261 while (true) {
262 try {
263 hrs = cluster.startRegionServer().getRegionServer();
264 break;
265 } catch (IOException e) {
266 if (e.getCause() != null && e.getCause() instanceof InvocationTargetException) {
267 InvocationTargetException ee = (InvocationTargetException)e.getCause();
268 if (ee.getCause() != null && ee.getCause() instanceof BindException) {
269 LOG.info("BindException; retrying: " + e.toString());
270 }
271 }
272 }
273 }
274 LOG.info("STARTED=" + hrs);
275 // Wait until he's been given at least 3 regions before we go on to try
276 // and count rows in table.
277 while (hrs.getOnlineRegions().size() < 3) Threads.sleep(100);
278 LOG.info(hrs.toString() + " has " + hrs.getOnlineRegions().size() +
279 " regions");
280 assertEquals(count, count());
281 } finally {
282 c.set(HConstants.REGIONSERVER_PORT, oldPort);
283 }
284 */
285 }
286
287 /**
288 * HBase2482 is about outstanding region openings. If any are outstanding
289 * when a regionserver goes down, then they'll never deploy. They'll be
290 * stuck in the regions-in-transition list for ever. This listener looks
291 * for a region opening HMsg and if it's from the server passed on construction,
292 * then we kill it. It also looks out for a close message on the victim
293 * server because that signifies start of the fireworks.
294 */
295 /*
296 static class HBase2482Listener implements RegionServerOperationListener {
297 private final HRegionServer victim;
298 private boolean abortSent = false;
299 // We closed regions on new server.
300 private volatile boolean closed = false;
301 // Copy of regions on new server
302 private final Collection<HRegion> copyOfOnlineRegions;
303 // This is the region that was in transition on the server we aborted. Test
304 // passes if this region comes back online successfully.
305 private HRegionInfo regionToFind;
306
307 HBase2482Listener(final HRegionServer victim) {
308 this.victim = victim;
309 // Copy regions currently open on this server so I can notice when
310 // there is a close.
311 this.copyOfOnlineRegions =
312 this.victim.getCopyOfOnlineRegionsSortedBySize().values();
313 }
314
315 @Override
316 public boolean process(HServerInfo serverInfo, HMsg incomingMsg) {
317 if (!victim.getServerInfo().equals(serverInfo) ||
318 this.abortSent || !this.closed) {
319 return true;
320 }
321 if (!incomingMsg.isType(HMsg.Type.MSG_REPORT_PROCESS_OPEN)) return true;
322 // Save the region that is in transition so can test later it came back.
323 this.regionToFind = incomingMsg.getRegionInfo();
324 String msg = "ABORTING " + this.victim + " because got a " +
325 HMsg.Type.MSG_REPORT_PROCESS_OPEN + " on this server for " +
326 incomingMsg.getRegionInfo().getRegionNameAsString();
327 this.victim.abort(msg);
328 this.abortSent = true;
329 return true;
330 }
331
332 @Override
333 public boolean process(RegionServerOperation op) throws IOException {
334 return true;
335 }
336
337 @Override
338 public void processed(RegionServerOperation op) {
339 if (this.closed || !(op instanceof ProcessRegionClose)) return;
340 ProcessRegionClose close = (ProcessRegionClose)op;
341 for (HRegion r: this.copyOfOnlineRegions) {
342 if (r.getRegionInfo().equals(close.regionInfo)) {
343 // We've closed one of the regions that was on the victim server.
344 // Now can start testing for when all regions are back online again
345 LOG.info("Found close of " +
346 r.getRegionInfo().getRegionNameAsString() +
347 "; setting close happened flag");
348 this.closed = true;
349 break;
350 }
351 }
352 }
353 }
354 */
355 /**
356 * In 2482, a RS with an opening region on it dies. The said region is then
357 * stuck in the master's regions-in-transition and never leaves it. This
358 * test works by bringing up a new regionserver, waiting for the load
359 * balancer to give it some regions. Then, we close all on the new server.
360 * After sending all the close messages, we send the new regionserver the
361 * special blocking message so it can not process any more messages.
362 * Meantime reopening of the just-closed regions is backed up on the new
363 * server. Soon as master gets an opening region from the new regionserver,
364 * we kill it. We then wait on all regions to come back on line. If bug
365 * is fixed, this should happen soon as the processing of the killed server is
366 * done.
367 * @see <a href="https://issues.apache.org/jira/browse/HBASE-2482">HBASE-2482</a>
368 */
369 @Ignore @Test (timeout=300000) public void testKillRSWithOpeningRegion2482()
370 throws Exception {
371 /*
372 LOG.info("Running testKillRSWithOpeningRegion2482");
373 MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
374 if (cluster.getLiveRegionServerThreads().size() < 2) {
375 // Need at least two servers.
376 cluster.startRegionServer();
377 }
378 // Count how many regions are online. They need to be all back online for
379 // this test to succeed.
380 int countOfMetaRegions = countOfMetaRegions();
381 // Add a listener on the server.
382 HMaster m = cluster.getMaster();
383 // Start new regionserver.
384 MiniHBaseClusterRegionServer hrs =
385 (MiniHBaseClusterRegionServer)cluster.startRegionServer().getRegionServer();
386 LOG.info("Started new regionserver: " + hrs.toString());
387 // Wait until has some regions before proceeding. Balancer will give it some.
388 int minimumRegions =
389 countOfMetaRegions/(cluster.getRegionServerThreads().size() * 2);
390 while (hrs.getOnlineRegions().size() < minimumRegions) Threads.sleep(100);
391 // Set the listener only after some regions have been opened on new server.
392 HBase2482Listener listener = new HBase2482Listener(hrs);
393 m.getRegionServerOperationQueue().
394 registerRegionServerOperationListener(listener);
395 try {
396 // Go close all non-catalog regions on this new server
397 closeAllNonCatalogRegions(cluster, hrs);
398 // After all closes, add blocking message before the region opens start to
399 // come in.
400 cluster.addMessageToSendRegionServer(hrs,
401 new HMsg(HMsg.Type.TESTING_BLOCK_REGIONSERVER));
402 // Wait till one of the above close messages has an effect before we start
403 // wait on all regions back online.
404 while (!listener.closed) Threads.sleep(100);
405 LOG.info("Past close");
406 // Make sure the abort server message was sent.
407 while(!listener.abortSent) Threads.sleep(100);
408 LOG.info("Past abort send; waiting on all regions to redeploy");
409 // Now wait for regions to come back online.
410 assertRegionIsBackOnline(listener.regionToFind);
411 } finally {
412 m.getRegionServerOperationQueue().
413 unregisterRegionServerOperationListener(listener);
414 }
415 */
416 }
417
418 /*
419 * @return Count of all non-catalog regions on the designated server
420 */
421 /*
422 private int closeAllNonCatalogRegions(final MiniHBaseCluster cluster,
423 final MiniHBaseCluster.MiniHBaseClusterRegionServer hrs)
424 throws IOException {
425 int countOfRegions = 0;
426 for (HRegion r: hrs.getOnlineRegions()) {
427 if (r.getRegionInfo().isMetaRegion()) continue;
428 cluster.addMessageToSendRegionServer(hrs,
429 new HMsg(HMsg.Type.MSG_REGION_CLOSE, r.getRegionInfo()));
430 LOG.info("Sent close of " + r.getRegionInfo().getRegionNameAsString() +
431 " on " + hrs.toString());
432 countOfRegions++;
433 }
434 return countOfRegions;
435 }
436
437 private void assertRegionIsBackOnline(final HRegionInfo hri)
438 throws IOException {
439 // Region should have an entry in its startkey because of addToEachStartKey.
440 byte [] row = getStartKey(hri);
441 HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
442 Get g = new Get(row);
443 assertTrue((t.get(g)).size() > 0);
444 }
445
446 /*
447 * @return Count of regions in meta table.
448 * @throws IOException
449 */
450 /*
451 private static int countOfMetaRegions()
452 throws IOException {
453 HTable meta = new HTable(TEST_UTIL.getConfiguration(),
454 HConstants.META_TABLE_NAME);
455 int rows = 0;
456 Scan scan = new Scan();
457 scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
458 ResultScanner s = meta.getScanner(scan);
459 for (Result r = null; (r = s.next()) != null;) {
460 byte [] b =
461 r.getValue(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
462 if (b == null || b.length <= 0) break;
463 rows++;
464 }
465 s.close();
466 return rows;
467 }
468 */
469 /*
470 * Add to each of the regions in .META. a value. Key is the startrow of the
471 * region (except its 'aaa' for first region). Actual value is the row name.
472 * @param expected
473 * @return
474 * @throws IOException
475 */
476 private static int addToEachStartKey(final int expected) throws IOException {
477 HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
478 HTable meta = new HTable(TEST_UTIL.getConfiguration(),
479 HConstants.META_TABLE_NAME);
480 int rows = 0;
481 Scan scan = new Scan();
482 scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
483 ResultScanner s = meta.getScanner(scan);
484 for (Result r = null; (r = s.next()) != null;) {
485 byte [] b =
486 r.getValue(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
487 if (b == null || b.length <= 0) break;
488 HRegionInfo hri = Writables.getHRegionInfo(b);
489 // If start key, add 'aaa'.
490 byte [] row = getStartKey(hri);
491 Put p = new Put(row);
492 p.setWriteToWAL(false);
493 p.add(getTestFamily(), getTestQualifier(), row);
494 t.put(p);
495 rows++;
496 }
497 s.close();
498 Assert.assertEquals(expected, rows);
499 return rows;
500 }
501
502 /*
503 * @return Count of rows in TABLENAME
504 * @throws IOException
505 */
506 private static int count() throws IOException {
507 HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
508 int rows = 0;
509 Scan scan = new Scan();
510 ResultScanner s = t.getScanner(scan);
511 for (Result r = null; (r = s.next()) != null;) {
512 rows++;
513 }
514 s.close();
515 LOG.info("Counted=" + rows);
516 return rows;
517 }
518
519 /*
520 * @param hri
521 * @return Start key for hri (If start key is '', then return 'aaa'.
522 */
523 private static byte [] getStartKey(final HRegionInfo hri) {
524 return Bytes.equals(HConstants.EMPTY_START_ROW, hri.getStartKey())?
525 Bytes.toBytes("aaa"): hri.getStartKey();
526 }
527
528 private static byte [] getTestFamily() {
529 return FAMILIES[0];
530 }
531
532 private static byte [] getTestQualifier() {
533 return getTestFamily();
534 }
535 }