1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 package org.apache.hadoop.hbase.master;
21
22 import static org.junit.Assert.*;
23
24 import java.io.IOException;
25 import java.util.List;
26 import java.util.NavigableSet;
27 import java.util.Set;
28 import java.util.TreeSet;
29
30 import org.apache.commons.logging.Log;
31 import org.apache.commons.logging.LogFactory;
32 import org.apache.hadoop.conf.Configuration;
33 import org.apache.hadoop.hbase.HBaseConfiguration;
34 import org.apache.hadoop.hbase.HBaseTestingUtility;
35 import org.apache.hadoop.hbase.HRegionInfo;
36 import org.apache.hadoop.hbase.MiniHBaseCluster;
37 import org.apache.hadoop.hbase.ServerName;
38 import org.apache.hadoop.hbase.client.HTable;
39 import org.apache.hadoop.hbase.util.Bytes;
40 import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
41 import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
42 import org.apache.hadoop.hbase.zookeeper.ZKAssign;
43 import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
44 import org.apache.zookeeper.KeeperException;
45 import org.junit.Test;
46
47
48
49
50 public class TestRollingRestart {
51 private static final Log LOG = LogFactory.getLog(TestRollingRestart.class);
52
53 @Test (timeout=300000)
54 public void testBasicRollingRestart() throws Exception {
55
56
57 final int NUM_MASTERS = 2;
58 final int NUM_RS = 3;
59 final int NUM_REGIONS_TO_CREATE = 20;
60
61 int expectedNumRS = 3;
62
63
64 log("Starting cluster");
65 Configuration conf = HBaseConfiguration.create();
66 conf.setInt("hbase.master.assignment.timeoutmonitor.period", 2000);
67 conf.setInt("hbase.master.assignment.timeoutmonitor.timeout", 5000);
68 HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf);
69 TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
70 MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
71 log("Waiting for active/ready master");
72 cluster.waitForActiveAndReadyMaster();
73 ZooKeeperWatcher zkw = new ZooKeeperWatcher(conf, "testRollingRestart",
74 null);
75 HMaster master = cluster.getMaster();
76
77
78 byte [] table = Bytes.toBytes("tableRestart");
79 byte [] family = Bytes.toBytes("family");
80 log("Creating table with " + NUM_REGIONS_TO_CREATE + " regions");
81 HTable ht = TEST_UTIL.createTable(table, family);
82 int numRegions = TEST_UTIL.createMultiRegions(conf, ht, family,
83 NUM_REGIONS_TO_CREATE);
84 numRegions += 2;
85 log("Waiting for no more RIT\n");
86 blockUntilNoRIT(zkw, master);
87 log("Disabling table\n");
88 TEST_UTIL.getHBaseAdmin().disableTable(table);
89 log("Waiting for no more RIT\n");
90 blockUntilNoRIT(zkw, master);
91 NavigableSet<String> regions = getAllOnlineRegions(cluster);
92 log("Verifying only catalog regions are assigned\n");
93 if (regions.size() != 2) {
94 for (String oregion : regions) log("Region still online: " + oregion);
95 }
96 assertEquals(2, regions.size());
97 log("Enabling table\n");
98 TEST_UTIL.getHBaseAdmin().enableTable(table);
99 log("Waiting for no more RIT\n");
100 blockUntilNoRIT(zkw, master);
101 log("Verifying there are " + numRegions + " assigned on cluster\n");
102 regions = getAllOnlineRegions(cluster);
103 assertRegionsAssigned(cluster, regions);
104 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
105
106
107 log("Adding a fourth RS");
108 RegionServerThread restarted = cluster.startRegionServer();
109 expectedNumRS++;
110 restarted.waitForServerOnline();
111 log("Additional RS is online");
112 log("Waiting for no more RIT");
113 blockUntilNoRIT(zkw, master);
114 log("Verifying there are " + numRegions + " assigned on cluster");
115 assertRegionsAssigned(cluster, regions);
116 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
117
118
119 List<MasterThread> masterThreads = cluster.getMasterThreads();
120 MasterThread activeMaster = null;
121 MasterThread backupMaster = null;
122 assertEquals(2, masterThreads.size());
123 if (masterThreads.get(0).getMaster().isActiveMaster()) {
124 activeMaster = masterThreads.get(0);
125 backupMaster = masterThreads.get(1);
126 } else {
127 activeMaster = masterThreads.get(1);
128 backupMaster = masterThreads.get(0);
129 }
130
131
132 log("Stopping backup master\n\n");
133 backupMaster.getMaster().stop("Stop of backup during rolling restart");
134 cluster.hbaseCluster.waitOnMaster(backupMaster);
135
136
137 log("Stopping primary master\n\n");
138 activeMaster.getMaster().stop("Stop of active during rolling restart");
139 cluster.hbaseCluster.waitOnMaster(activeMaster);
140
141
142 log("Restarting primary master\n\n");
143 activeMaster = cluster.startMaster();
144 cluster.waitForActiveAndReadyMaster();
145 master = activeMaster.getMaster();
146
147
148 log("Restarting backup master\n\n");
149 backupMaster = cluster.startMaster();
150
151 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
152
153
154
155
156 List<RegionServerThread> regionServers =
157 cluster.getLiveRegionServerThreads();
158 int num = 1;
159 int total = regionServers.size();
160 for (RegionServerThread rst : regionServers) {
161 ServerName serverName = rst.getRegionServer().getServerName();
162 log("Stopping region server " + num + " of " + total + " [ " +
163 serverName + "]");
164 rst.getRegionServer().stop("Stopping RS during rolling restart");
165 cluster.hbaseCluster.waitOnRegionServer(rst);
166 log("Waiting for RS shutdown to be handled by master");
167 waitForRSShutdownToStartAndFinish(activeMaster, serverName);
168 log("RS shutdown done, waiting for no more RIT");
169 blockUntilNoRIT(zkw, master);
170 log("Verifying there are " + numRegions + " assigned on cluster");
171 assertRegionsAssigned(cluster, regions);
172 expectedNumRS--;
173 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
174 log("Restarting region server " + num + " of " + total);
175 restarted = cluster.startRegionServer();
176 restarted.waitForServerOnline();
177 expectedNumRS++;
178 log("Region server " + num + " is back online");
179 log("Waiting for no more RIT");
180 blockUntilNoRIT(zkw, master);
181 log("Verifying there are " + numRegions + " assigned on cluster");
182 assertRegionsAssigned(cluster, regions);
183 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
184 num++;
185 }
186 Thread.sleep(2000);
187 assertRegionsAssigned(cluster, regions);
188
189
190 RegionServerThread rootServer = getServerHostingRoot(cluster);
191 RegionServerThread metaServer = getServerHostingMeta(cluster);
192 if (rootServer == metaServer) {
193 log("ROOT and META on the same server so killing another random server");
194 int i=0;
195 while (rootServer == metaServer) {
196 metaServer = cluster.getRegionServerThreads().get(i);
197 i++;
198 }
199 }
200 log("Stopping server hosting ROOT");
201 rootServer.getRegionServer().stop("Stopping ROOT server");
202 log("Stopping server hosting META #1");
203 metaServer.getRegionServer().stop("Stopping META server");
204 cluster.hbaseCluster.waitOnRegionServer(rootServer);
205 log("Root server down");
206 cluster.hbaseCluster.waitOnRegionServer(metaServer);
207 log("Meta server down #1");
208 expectedNumRS -= 2;
209 log("Waiting for meta server #1 RS shutdown to be handled by master");
210 waitForRSShutdownToStartAndFinish(activeMaster,
211 metaServer.getRegionServer().getServerName());
212 log("Waiting for no more RIT");
213 long start = System.currentTimeMillis();
214 do {
215 blockUntilNoRIT(zkw, master);
216 } while (getNumberOfOnlineRegions(cluster) < numRegions
217 && System.currentTimeMillis()-start < 60000);
218 log("Verifying there are " + numRegions + " assigned on cluster");
219 assertRegionsAssigned(cluster, regions);
220 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
221
222
223 metaServer = getServerHostingMeta(cluster);
224 log("Stopping server hosting META #2");
225 metaServer.getRegionServer().stop("Stopping META server");
226 cluster.hbaseCluster.waitOnRegionServer(metaServer);
227 log("Meta server down");
228 expectedNumRS--;
229 log("Waiting for RS shutdown to be handled by master");
230 waitForRSShutdownToStartAndFinish(activeMaster,
231 metaServer.getRegionServer().getServerName());
232 log("RS shutdown done, waiting for no more RIT");
233 blockUntilNoRIT(zkw, master);
234 log("Verifying there are " + numRegions + " assigned on cluster");
235 assertRegionsAssigned(cluster, regions);
236 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
237
238
239 cluster.startRegionServer().waitForServerOnline();
240 cluster.startRegionServer().waitForServerOnline();
241 cluster.startRegionServer().waitForServerOnline();
242 Thread.sleep(1000);
243 log("Waiting for no more RIT");
244 blockUntilNoRIT(zkw, master);
245 log("Verifying there are " + numRegions + " assigned on cluster");
246 assertRegionsAssigned(cluster, regions);
247
248 metaServer = getServerHostingMeta(cluster);
249 log("Stopping server hosting META (1 of 3)");
250 metaServer.getRegionServer().stop("Stopping META server");
251 cluster.hbaseCluster.waitOnRegionServer(metaServer);
252 log("Meta server down (1 of 3)");
253 log("Waiting for RS shutdown to be handled by master");
254 waitForRSShutdownToStartAndFinish(activeMaster,
255 metaServer.getRegionServer().getServerName());
256 log("RS shutdown done, waiting for no more RIT");
257 blockUntilNoRIT(zkw, master);
258 log("Verifying there are " + numRegions + " assigned on cluster");
259 assertRegionsAssigned(cluster, regions);
260
261
262 metaServer = getServerHostingMeta(cluster);
263 log("Stopping server hosting META (2 of 3)");
264 metaServer.getRegionServer().stop("Stopping META server");
265 cluster.hbaseCluster.waitOnRegionServer(metaServer);
266 log("Meta server down (2 of 3)");
267 log("Waiting for RS shutdown to be handled by master");
268 waitForRSShutdownToStartAndFinish(activeMaster,
269 metaServer.getRegionServer().getServerName());
270 log("RS shutdown done, waiting for no more RIT");
271 blockUntilNoRIT(zkw, master);
272 log("Verifying there are " + numRegions + " assigned on cluster");
273 assertRegionsAssigned(cluster, regions);
274
275
276 metaServer = getServerHostingMeta(cluster);
277 log("Stopping server hosting META (3 of 3)");
278 metaServer.getRegionServer().stop("Stopping META server");
279 cluster.hbaseCluster.waitOnRegionServer(metaServer);
280 log("Meta server down (3 of 3)");
281 log("Waiting for RS shutdown to be handled by master");
282 waitForRSShutdownToStartAndFinish(activeMaster,
283 metaServer.getRegionServer().getServerName());
284 log("RS shutdown done, waiting for no more RIT");
285 blockUntilNoRIT(zkw, master);
286 log("Verifying there are " + numRegions + " assigned on cluster");
287 assertRegionsAssigned(cluster, regions);
288
289 if (cluster.getRegionServerThreads().size() != 1) {
290 log("Online regionservers:");
291 for (RegionServerThread rst : cluster.getRegionServerThreads()) {
292 log("RS: " + rst.getRegionServer().getServerName());
293 }
294 }
295 assertEquals(1, cluster.getRegionServerThreads().size());
296
297
298
299
300
301
302 TEST_UTIL.shutdownMiniCluster();
303 }
304
305 private void blockUntilNoRIT(ZooKeeperWatcher zkw, HMaster master)
306 throws KeeperException, InterruptedException {
307 ZKAssign.blockUntilNoRIT(zkw);
308 master.assignmentManager.waitUntilNoRegionsInTransition(60000);
309 }
310
311 private void waitForRSShutdownToStartAndFinish(MasterThread activeMaster,
312 ServerName serverName) throws InterruptedException {
313 ServerManager sm = activeMaster.getMaster().getServerManager();
314
315 while (!sm.getDeadServers().contains(serverName)) {
316 log("Waiting for [" + serverName + "] to be listed as dead in master");
317 Thread.sleep(1);
318 }
319 log("Server [" + serverName + "] marked as dead, waiting for it to " +
320 "finish dead processing");
321 while (sm.areDeadServersInProgress()) {
322 log("Server [" + serverName + "] still being processed, waiting");
323 Thread.sleep(100);
324 }
325 log("Server [" + serverName + "] done with server shutdown processing");
326 }
327
328 private void log(String msg) {
329 LOG.debug("\n\nTRR: " + msg + "\n");
330 }
331
332 private RegionServerThread getServerHostingMeta(MiniHBaseCluster cluster)
333 throws IOException {
334 return getServerHosting(cluster, HRegionInfo.FIRST_META_REGIONINFO);
335 }
336
337 private RegionServerThread getServerHostingRoot(MiniHBaseCluster cluster)
338 throws IOException {
339 return getServerHosting(cluster, HRegionInfo.ROOT_REGIONINFO);
340 }
341
342 private RegionServerThread getServerHosting(MiniHBaseCluster cluster,
343 HRegionInfo region) throws IOException {
344 for (RegionServerThread rst : cluster.getRegionServerThreads()) {
345 if (rst.getRegionServer().getOnlineRegions().contains(region)) {
346 return rst;
347 }
348 }
349 return null;
350 }
351
352 private int getNumberOfOnlineRegions(MiniHBaseCluster cluster) {
353 int numFound = 0;
354 for (RegionServerThread rst : cluster.getLiveRegionServerThreads()) {
355 numFound += rst.getRegionServer().getNumberOfOnlineRegions();
356 }
357 return numFound;
358 }
359
360 private void assertRegionsAssigned(MiniHBaseCluster cluster,
361 Set<String> expectedRegions) throws IOException {
362 int numFound = getNumberOfOnlineRegions(cluster);
363 if (expectedRegions.size() > numFound) {
364 log("Expected to find " + expectedRegions.size() + " but only found"
365 + " " + numFound);
366 NavigableSet<String> foundRegions = getAllOnlineRegions(cluster);
367 for (String region : expectedRegions) {
368 if (!foundRegions.contains(region)) {
369 log("Missing region: " + region);
370 }
371 }
372 assertEquals(expectedRegions.size(), numFound);
373 } else if (expectedRegions.size() < numFound) {
374 int doubled = numFound - expectedRegions.size();
375 log("Expected to find " + expectedRegions.size() + " but found"
376 + " " + numFound + " (" + doubled + " double assignments?)");
377 NavigableSet<String> doubleRegions = getDoubleAssignedRegions(cluster);
378 for (String region : doubleRegions) {
379 log("Region is double assigned: " + region);
380 }
381 assertEquals(expectedRegions.size(), numFound);
382 } else {
383 log("Success! Found expected number of " + numFound + " regions");
384 }
385 }
386
387 private NavigableSet<String> getAllOnlineRegions(MiniHBaseCluster cluster)
388 throws IOException {
389 NavigableSet<String> online = new TreeSet<String>();
390 for (RegionServerThread rst : cluster.getLiveRegionServerThreads()) {
391 for (HRegionInfo region : rst.getRegionServer().getOnlineRegions()) {
392 online.add(region.getRegionNameAsString());
393 }
394 }
395 return online;
396 }
397
398 private NavigableSet<String> getDoubleAssignedRegions(
399 MiniHBaseCluster cluster) throws IOException {
400 NavigableSet<String> online = new TreeSet<String>();
401 NavigableSet<String> doubled = new TreeSet<String>();
402 for (RegionServerThread rst : cluster.getLiveRegionServerThreads()) {
403 for (HRegionInfo region : rst.getRegionServer().getOnlineRegions()) {
404 if(!online.add(region.getRegionNameAsString())) {
405 doubled.add(region.getRegionNameAsString());
406 }
407 }
408 }
409 return doubled;
410 }
411
412 }