/**
 * Copyright 2010 The Apache Software Foundation
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.util;

import java.io.IOException;
import java.math.BigInteger;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.NoServerForRegionException;
import org.apache.hadoop.hbase.regionserver.Store;
import org.apache.hadoop.hbase.regionserver.StoreFile;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;

/**
 * The {@link RegionSplitter} class provides several utilities to help in the
 * administration lifecycle for developers who choose to manually split regions
 * instead of having HBase handle that automatically. The most useful utilities
 * are:
 * <p>
 * <ul>
 * <li>Create a table with a specified number of pre-split regions
 * <li>Execute a rolling split of all regions on an existing table
 * </ul>
 * <p>
 * Both operations can be safely done on a live server.
 * <p>
 * <b>Question:</b> How do I turn off automatic splitting? <br>
 * <b>Answer:</b> Automatic splitting is determined by the configuration value
 * <i>"hbase.hregion.max.filesize"</i>. It is not recommended that you set this
 * to Long.MAX_VALUE, in case you forget about manual splits. A suggested
 * setting is 100GB, which would result in major compactions taking more than
 * an hour if that size is ever reached.
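 * <p>
 * For example, a minimal sketch of raising the threshold in code (the same
 * key can also be set in hbase-site.xml):
 * <pre>
 * Configuration conf = HBaseConfiguration.create();
 * // 100GB; a region will not be automatically split before it grows this large
 * conf.setLong("hbase.hregion.max.filesize", 100L * 1024 * 1024 * 1024);
 * </pre>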
 * <p>
 * <b>Question:</b> Why did the original authors decide to manually split? <br>
 * <b>Answer:</b> Specific workload characteristics of our use case allowed us
 * to benefit from a manual split system.
 * <p>
 * <ul>
 * <li>Data (~1k) that would grow instead of being replaced
 * <li>Data growth was roughly uniform across all regions
 * <li>OLTP workload. Data loss is a big deal.
 * </ul>
 * <p>
 * <b>Question:</b> Why is manual splitting good for this workload? <br>
 * <b>Answer:</b> Although automated splitting is not a bad option, there are
 * benefits to manual splitting.
 * <p>
 * <ul>
 * <li>With growing amounts of data, splits will continually be needed. Since
 * you always know exactly what regions you have, long-term debugging and
 * profiling is much easier with manual splits. It is hard to trace the logs to
 * understand region-level problems if regions keep splitting and getting
 * renamed.
 * <li>Data offlining bugs + unknown number of split regions == oh crap! If an
 * HLog or StoreFile was mistakenly unprocessed by HBase due to a weird bug and
 * you notice it a day or so later, you can be assured that the regions
 * specified in these files are the same as the current regions, and you have
 * fewer headaches trying to restore/replay your data.
 * <li>You can finely tune your compaction algorithm. With roughly uniform data
 * growth, it's easy to cause split / compaction storms as the regions all
 * roughly hit the same data size at the same time. With manual splits, you can
 * let staggered, time-based major compactions spread out your network IO load.
 * </ul>
 * <p>
 * <b>Question:</b> What's the optimal number of pre-split regions to create? <br>
 * <b>Answer:</b> Mileage will vary depending upon your application.
 * <p>
 * The short answer for our application is that we started with 10 pre-split
 * regions / server and watched our data growth over time. It's better to err on
 * the side of too few regions and perform rolling splits later.
 * <p>
 * The more complicated answer is that this depends upon the largest storefile
 * in your region. With a growing data size, this will get larger over time. You
 * want the largest region to be just big enough that the {@link Store}
 * compaction selection algorithm only compacts it due to a timed major
 * compaction. If you don't, your cluster can be prone to compaction storms as
 * the algorithm decides to run major compactions on a large series of regions
 * all at once. Note that compaction storms are due to the uniform data growth,
 * not the manual split decision.
 * <p>
 * If you pre-split your regions too thin, you can increase the major compaction
 * interval by configuring HConstants.MAJOR_COMPACTION_PERIOD. If your data size
 * grows too large, use this script to perform a network-IO-safe rolling split
 * of all regions.
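 * <p>
 * For instance, a sketch of raising the major compaction interval to one week
 * (the period is specified in milliseconds):
 * <pre>
 * Configuration conf = HBaseConfiguration.create();
 * conf.setLong(HConstants.MAJOR_COMPACTION_PERIOD, 7L * 24 * 60 * 60 * 1000);
 * </pre>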
 */
public class RegionSplitter {
  static final Log LOG = LogFactory.getLog(RegionSplitter.class);

  /**
   * A generic interface for the RegionSplitter code to use for all of its
   * functionality. Note that the original authors of this code use
   * {@link MD5StringSplit} to partition their table, and it is set as the
   * default; this interface is provided so you can plug in a custom algorithm.
   * To use, create a new class derived from this interface and call the
   * RegionSplitter class with the argument: <br>
   * <b>-D split.algorithm=&lt;your_class_path&gt;</b>
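   * <p>
   * A minimal sketch of a custom algorithm (a hypothetical class, shown only
   * for illustration) that splits fixed-width, zero-padded decimal row keys:
   * <pre>
   * public class DecimalStringSplit implements RegionSplitter.SplitAlgorithm {
   *   static final long MAX = 99999999L; // largest 8-digit row value
   *
   *   public byte[] split(byte[] start, byte[] end) {
   *     return toRow((toLong(start) + toLong(end)) / 2); // midpoint of range
   *   }
   *
   *   public byte[][] split(int n) {
   *     byte[][] splits = new byte[n - 1][];
   *     for (int i = 1; i &lt; n; i++) {
   *       splits[i - 1] = toRow(MAX / n * i);
   *     }
   *     return splits;
   *   }
   *
   *   public byte[] firstRow() { return toRow(0); }
   *   public byte[] lastRow() { return toRow(MAX); }
   *   public byte[] strToRow(String in) { return toRow(Long.parseLong(in)); }
   *   public String rowToStr(byte[] row) { return Bytes.toString(row); }
   *   public String separator() { return " "; }
   *
   *   static byte[] toRow(long v) {
   *     // zero-pad so lexicographic order matches numeric order
   *     return Bytes.toBytes(String.format("%08d", v));
   *   }
   *   static long toLong(byte[] row) {
   *     return row.length == 0 ? 0 : Long.parseLong(Bytes.toString(row));
   *   }
   * }
   * </pre>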
   */
  public static interface SplitAlgorithm {
    /**
     * Split a pre-existing region into 2 regions.
     *
     * @param start
     *          row
     * @param end
     *          row
     * @return the split row to use
     */
    byte[] split(byte[] start, byte[] end);

    /**
     * Split an entire table.
     *
     * @param numberOfSplits
     *          number of regions to split the table into
     *
     * @return array of split keys for the initial regions of the table
     */
    byte[][] split(int numberOfSplits);

    /**
     * In HBase, the first row is represented by an empty byte array. This might
     * cause problems with your split algorithm or row printing. All your APIs
     * will be passed firstRow() instead of an empty array.
     *
     * @return your representation of your first row
     */
    byte[] firstRow();

    /**
     * In HBase, the last row is represented by an empty byte array. This might
     * cause problems with your split algorithm or row printing. All your APIs
     * will be passed lastRow() instead of an empty array.
     *
     * @return your representation of your last row
     */
    byte[] lastRow();

    /**
     * @param input
     *          user or file input for row
     * @return byte array representation of this row for HBase
     */
    byte[] strToRow(String input);

    /**
     * @param row
     *          byte array representing a row in HBase
     * @return String to use for debug &amp; file printing
     */
    String rowToStr(byte[] row);

    /**
     * @return the separator character to use when storing / printing the row
     */
    String separator();
  }

  /**
   * The main function for the RegionSplitter application. Common uses:
   * <p>
   * <ul>
   * <li>create a table named 'myTable' with 60 pre-split regions containing 2
   * column families 'test' &amp; 'rs':
   * <ul>
   * <li>bin/hbase org.apache.hadoop.hbase.util.RegionSplitter -c 60 -f test:rs
   * myTable
   * </ul>
   * <li>perform a rolling split of 'myTable' (i.e. 60 =&gt; 120 regions), with
   * at most 2 outstanding splits at a time:
   * <ul>
   * <li>bin/hbase org.apache.hadoop.hbase.util.RegionSplitter -r -o 2 myTable
   * </ul>
   * </ul>
   *
   * @param args
   *          Usage: RegionSplitter &lt;TABLE&gt; &lt;-c &lt;# regions&gt; -f
   *          &lt;family:family:...&gt; | -r [-o &lt;# outstanding
   *          splits&gt;]&gt; [-D &lt;conf.param=value&gt;]
   * @throws IOException
   *           HBase IO problem
   * @throws InterruptedException
   *           user requested exit
   * @throws ParseException
   *           problem parsing user input
   */
  @SuppressWarnings("static-access")
  public static void main(String[] args) throws IOException,
      InterruptedException, ParseException {
    Configuration conf = HBaseConfiguration.create();

    // parse user input
    Options opt = new Options();
    opt.addOption(OptionBuilder.withArgName("property=value").hasArg()
        .withDescription("Override HBase Configuration Settings").create("D"));
    opt.addOption(OptionBuilder.withArgName("region count").hasArg()
        .withDescription(
            "Create a new table with a pre-split number of regions")
        .create("c"));
    opt.addOption(OptionBuilder.withArgName("family:family:...").hasArg()
        .withDescription(
            "Column Families to create with new table.  Required with -c")
        .create("f"));
    opt.addOption("h", false, "Print this usage help");
    opt.addOption("r", false, "Perform a rolling split of an existing region");
    opt.addOption(OptionBuilder.withArgName("count").hasArg().withDescription(
        "Max outstanding splits that have unfinished major compactions")
        .create("o"));
    opt.addOption(null, "risky", false,
        "Skip verification steps to complete quickly. "
            + "STRONGLY DISCOURAGED for production systems.");
    CommandLine cmd = new GnuParser().parse(opt, args);

    if (cmd.hasOption("D")) {
      for (String confOpt : cmd.getOptionValues("D")) {
        String[] kv = confOpt.split("=", 2);
        if (kv.length == 2) {
          conf.set(kv[0], kv[1]);
          LOG.debug("-D configuration override: " + kv[0] + "=" + kv[1]);
        } else {
          throw new ParseException("-D option format invalid: " + confOpt);
        }
      }
    }

    if (cmd.hasOption("risky")) {
      conf.setBoolean("split.verify", false);
    }

    boolean createTable = cmd.hasOption("c") && cmd.hasOption("f");
    boolean rollingSplit = cmd.hasOption("r");
    // exactly one of the two operations may be requested per invocation
    boolean oneOperOnly = createTable ^ rollingSplit;

    if (1 != cmd.getArgList().size() || !oneOperOnly || cmd.hasOption("h")) {
      new HelpFormatter().printHelp("RegionSplitter <TABLE>", opt);
      return;
    }
    String tableName = cmd.getArgs()[0];

    if (createTable) {
      conf.set("split.count", cmd.getOptionValue("c"));
      createPresplitTable(tableName, cmd.getOptionValue("f").split(":"), conf);
    }

    if (rollingSplit) {
      if (cmd.hasOption("o")) {
        conf.set("split.outstanding", cmd.getOptionValue("o"));
      }
      rollingSplit(tableName, conf);
    }
  }

  static void createPresplitTable(String tableName, String[] columnFamilies,
      Configuration conf) throws IOException, InterruptedException {
    Class<? extends SplitAlgorithm> splitClass = conf.getClass(
        "split.algorithm", MD5StringSplit.class, SplitAlgorithm.class);
    SplitAlgorithm splitAlgo;
    try {
      splitAlgo = splitClass.newInstance();
    } catch (Exception e) {
      throw new IOException("Problem loading split algorithm: ", e);
    }
    final int splitCount = conf.getInt("split.count", 0);
    Preconditions.checkArgument(splitCount > 1, "Split count must be > 1");

    Preconditions.checkArgument(columnFamilies.length > 0,
        "Must specify at least one column family. ");
    LOG.debug("Creating table " + tableName + " with " + columnFamilies.length
        + " column families.  Presplitting to " + splitCount + " regions");

    HTableDescriptor desc = new HTableDescriptor(tableName);
    for (String cf : columnFamilies) {
      desc.addFamily(new HColumnDescriptor(Bytes.toBytes(cf)));
    }
    HBaseAdmin admin = new HBaseAdmin(conf);
    Preconditions.checkArgument(!admin.tableExists(tableName),
        "Table already exists: " + tableName);
    admin.createTable(desc, splitAlgo.split(splitCount));
    LOG.debug("Table created!  Waiting for regions to show online in META...");

    if (conf.getBoolean("split.verify", true)) {
      // NOTE: createTable is synchronous on the table, but not on the regions
      HTable table = new HTable(conf, tableName);
      int onlineRegions = 0;
      while (onlineRegions < splitCount) {
        onlineRegions = table.getRegionsInfo().size();
        LOG.debug(onlineRegions + " of " + splitCount + " regions online...");
        if (onlineRegions < splitCount) {
          Thread.sleep(10 * 1000); // wait for the remaining regions to open
        }
      }
    }

    LOG.debug("Finished creating table with " + splitCount + " regions");
  }

  static void rollingSplit(String tableName, Configuration conf)
      throws IOException, InterruptedException {
    Class<? extends SplitAlgorithm> splitClass = conf.getClass(
        "split.algorithm", MD5StringSplit.class, SplitAlgorithm.class);
    SplitAlgorithm splitAlgo;
    try {
      splitAlgo = splitClass.newInstance();
    } catch (Exception e) {
      throw new IOException("Problem loading split algorithm: ", e);
    }
    final int minOS = conf.getInt("split.outstanding", 2);

    HTable table = new HTable(conf, tableName);

    // max outstanding splits. default == 50% of servers
    final int MAX_OUTSTANDING =
        Math.max(table.getConnection().getCurrentNrHRS() / 2, minOS);

    Path hbDir = new Path(conf.get(HConstants.HBASE_DIR));
    Path tableDir = HTableDescriptor.getTableDir(hbDir, table.getTableName());
    Path splitFile = new Path(tableDir, "_balancedSplit");
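    // The split file is a line-oriented log: "+ <start> <split>" records a
    // planned daughter region and "- <start> <split>" marks it as finished,
    // so an interrupted run can be resumed by replaying the file (see
    // getSplits below).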
    FileSystem fs = FileSystem.get(conf);

    // get a list of daughter regions to create
    LinkedList<Pair<byte[], byte[]>> tmpRegionSet = getSplits(table, splitAlgo);
    LinkedList<Pair<byte[], byte[]>> outstanding = Lists.newLinkedList();
    int splitCount = 0;
    final int origCount = tmpRegionSet.size();

    // Every split triggers a compaction, and a regionserver has a single
    // compaction thread, so two split requests to the same RS can stall the
    // outstanding split queue. To avoid this, bucket the regions by
    // regionserver and round-robin through the buckets.
    LOG.debug("Bucketing regions by regionserver...");
    TreeMap<String, LinkedList<Pair<byte[], byte[]>>> daughterRegions =
      Maps.newTreeMap();
    for (Pair<byte[], byte[]> dr : tmpRegionSet) {
      String rsLocation = table.getRegionLocation(dr.getSecond()).
        getHostnamePort();
      if (!daughterRegions.containsKey(rsLocation)) {
        LinkedList<Pair<byte[], byte[]>> entry = Lists.newLinkedList();
        daughterRegions.put(rsLocation, entry);
      }
      daughterRegions.get(rsLocation).add(dr);
    }
    LOG.debug("Done with bucketing.  Split time!");
    long startTime = System.currentTimeMillis();

    // open the split file and modify it as splits finish
    FSDataInputStream tmpIn = fs.open(splitFile);
    byte[] rawData = new byte[tmpIn.available()];
    tmpIn.readFully(rawData);
    tmpIn.close();
    FSDataOutputStream splitOut = fs.create(splitFile);
    splitOut.write(rawData);

    try {
      // *** split code ***
      while (!daughterRegions.isEmpty()) {
        LOG.debug(daughterRegions.size() + " RS have regions to split.");

        // Get RegionServer : region count mapping, keyed by hostname:port so
        // the keys line up with the daughterRegions buckets
        final TreeMap<String, Integer> rsSizes = Maps.newTreeMap();
        Map<HRegionInfo, ServerName> regionsInfo = table.getRegionLocations();
        for (ServerName rs : regionsInfo.values()) {
          String rsLocation = rs.getHostname() + ":" + rs.getPort();
          if (rsSizes.containsKey(rsLocation)) {
            rsSizes.put(rsLocation, rsSizes.get(rsLocation) + 1);
          } else {
            rsSizes.put(rsLocation, 1);
          }
        }

        // sort the RS by the number of regions they have
        List<String> serversLeft = Lists.newArrayList(daughterRegions.keySet());
        Collections.sort(serversLeft, new Comparator<String>() {
          public int compare(String o1, String o2) {
            return rsSizes.get(o1).compareTo(rsSizes.get(o2));
          }
        });

        // round-robin through the RS list. Choose the lightest-loaded servers
        // first to keep the master from load-balancing regions as we split.
        for (String rsLoc : serversLeft) {
          Pair<byte[], byte[]> dr = null;

          // find a region in the RS list that hasn't been moved
          LOG.debug("Finding a region on " + rsLoc);
          LinkedList<Pair<byte[], byte[]>> regionList = daughterRegions
              .get(rsLoc);
          while (!regionList.isEmpty()) {
            dr = regionList.pop();

            // get current region info
            byte[] split = dr.getSecond();
            HRegionLocation regionLoc = table.getRegionLocation(split);

            // if this region moved locations
            String newRs = regionLoc.getHostnamePort();
            if (newRs.compareTo(rsLoc) != 0) {
              LOG.debug("Region with " + splitAlgo.rowToStr(split)
                  + " moved to " + newRs + ". Relocating...");
              // relocate it, don't use it right now
              if (!daughterRegions.containsKey(newRs)) {
                LinkedList<Pair<byte[], byte[]>> entry = Lists.newLinkedList();
                daughterRegions.put(newRs, entry);
              }
              daughterRegions.get(newRs).add(dr);
              dr = null;
              continue;
            }

            // make sure this region wasn't already split
            byte[] sk = regionLoc.getRegionInfo().getStartKey();
            if (sk.length != 0) {
              if (Bytes.equals(split, sk)) {
                LOG.debug("Region already split on "
                    + splitAlgo.rowToStr(split) + ".  Skipping this region...");
                ++splitCount;
                dr = null;
                continue;
              }
              byte[] start = dr.getFirst();
              Preconditions.checkArgument(Bytes.equals(start, sk), splitAlgo
                  .rowToStr(start) + " != " + splitAlgo.rowToStr(sk));
            }

            // passed all checks! found a good region
            break;
          }
          if (regionList.isEmpty()) {
            daughterRegions.remove(rsLoc);
          }
          if (dr == null)
            continue;

          // we have a good region, time to split!
          byte[] split = dr.getSecond();
          LOG.debug("Splitting at " + splitAlgo.rowToStr(split));
          HBaseAdmin admin = new HBaseAdmin(table.getConfiguration());
          admin.split(table.getTableName(), split);

          LinkedList<Pair<byte[], byte[]>> finished = Lists.newLinkedList();
          if (conf.getBoolean("split.verify", true)) {
            // we need to verify and rate-limit our splits
            outstanding.addLast(dr);
            // with too many outstanding splits, wait for some to finish
            while (outstanding.size() >= MAX_OUTSTANDING) {
              finished = splitScan(outstanding, table, splitAlgo);
              if (finished.isEmpty()) {
                Thread.sleep(30 * 1000);
              } else {
                outstanding.removeAll(finished);
              }
            }
          } else {
            finished.add(dr);
          }

          // mark each finished region as successfully split. Use the
          // algorithm's separator so the split log stays parseable.
          for (Pair<byte[], byte[]> region : finished) {
            splitOut.writeChars("-" + splitAlgo.separator()
                + splitAlgo.rowToStr(region.getFirst()) + splitAlgo.separator()
                + splitAlgo.rowToStr(region.getSecond()) + "\n");
            splitCount++;
            if (splitCount % 10 == 0) {
              long tDiff = (System.currentTimeMillis() - startTime)
                  / splitCount;
              LOG.debug("STATUS UPDATE: " + splitCount + " / " + origCount
                  + ". Avg Time / Split = "
                  + org.apache.hadoop.util.StringUtils.formatTime(tDiff));
            }
          }
        }
      }
      if (conf.getBoolean("split.verify", true)) {
        while (!outstanding.isEmpty()) {
          LinkedList<Pair<byte[], byte[]>> finished = splitScan(outstanding,
              table, splitAlgo);
          if (finished.isEmpty()) {
            Thread.sleep(30 * 1000);
          } else {
            outstanding.removeAll(finished);
            for (Pair<byte[], byte[]> region : finished) {
              splitOut.writeChars("-" + splitAlgo.separator()
                  + splitAlgo.rowToStr(region.getFirst()) + splitAlgo.separator()
                  + splitAlgo.rowToStr(region.getSecond()) + "\n");
            }
          }
        }
      }
      LOG.debug("All regions have been successfully split!");
    } finally {
      long tDiff = System.currentTimeMillis() - startTime;
      LOG.debug("TOTAL TIME = "
          + org.apache.hadoop.util.StringUtils.formatTime(tDiff));
      LOG.debug("Splits = " + splitCount);
      if (splitCount > 0) {
        // guard against division by zero when no splits were performed
        LOG.debug("Avg Time / Split = "
            + org.apache.hadoop.util.StringUtils.formatTime(tDiff / splitCount));
      }

      splitOut.close();
    }
    fs.delete(splitFile, false);
  }

  static LinkedList<Pair<byte[], byte[]>> splitScan(
      LinkedList<Pair<byte[], byte[]>> regionList, HTable table,
      SplitAlgorithm splitAlgo)
      throws IOException, InterruptedException {
    LinkedList<Pair<byte[], byte[]>> finished = Lists.newLinkedList();
    LinkedList<Pair<byte[], byte[]>> logicalSplitting = Lists.newLinkedList();
    LinkedList<Pair<byte[], byte[]>> physicalSplitting = Lists.newLinkedList();

    // get table info
    Path hbDir = new Path(table.getConfiguration().get(HConstants.HBASE_DIR));
    Path tableDir = HTableDescriptor.getTableDir(hbDir, table.getTableName());
    Path splitFile = new Path(tableDir, "_balancedSplit");
    FileSystem fs = FileSystem.get(table.getConfiguration());

    // clear the cache to forcibly refresh region information
    table.clearRegionCache();

    // for every region that hasn't been verified as a finished split
    for (Pair<byte[], byte[]> region : regionList) {
      byte[] start = region.getFirst();
      byte[] split = region.getSecond();

      // see if the new split daughter region has come online
      HRegionInfo dri = table.getRegionLocation(split).getRegionInfo();
      if (dri.isOffline() || !Bytes.equals(dri.getStartKey(), split)) {
        logicalSplitting.add(region);
        continue;
      }

      try {
        // when a daughter region is opened, a compaction is triggered
        // wait until compaction completes for both daughter regions
        LinkedList<HRegionInfo> check = Lists.newLinkedList();
        check.add(table.getRegionLocation(start).getRegionInfo());
        check.add(table.getRegionLocation(split).getRegionInfo());
        for (HRegionInfo hri : check.toArray(new HRegionInfo[] {})) {
          boolean refFound = false;
          byte[] sk = hri.getStartKey();
          if (sk.length == 0)
            sk = splitAlgo.firstRow();
          String startKey = splitAlgo.rowToStr(sk);
          HTableDescriptor htd = table.getTableDescriptor();
          // check every Column Family for that region
          for (HColumnDescriptor c : htd.getFamilies()) {
            Path cfDir = Store.getStoreHomedir(tableDir, hri.getEncodedName(),
                c.getName());
            if (fs.exists(cfDir)) {
              for (FileStatus file : fs.listStatus(cfDir)) {
                refFound |= StoreFile.isReference(file.getPath());
                if (refFound)
                  break;
              }
            }
            if (refFound)
              break;
          }
          // compaction is completed when all reference files are gone
          if (!refFound) {
            check.remove(hri);
          }
        }
        if (check.isEmpty()) {
          finished.add(region);
        } else {
          physicalSplitting.add(region);
        }
      } catch (NoServerForRegionException nsfre) {
        LOG.debug("No Server Exception thrown for: "
            + splitAlgo.rowToStr(start));
        physicalSplitting.add(region);
        table.clearRegionCache();
      }
    }

    LOG.debug("Split Scan: " + finished.size() + " finished / "
        + logicalSplitting.size() + " split wait / "
        + physicalSplitting.size() + " reference wait");

    return finished;
  }

  static LinkedList<Pair<byte[], byte[]>> getSplits(HTable table,
      SplitAlgorithm splitAlgo) throws IOException {
    Path hbDir = new Path(table.getConfiguration().get(HConstants.HBASE_DIR));
    Path tableDir = HTableDescriptor.getTableDir(hbDir, table.getTableName());
    Path splitFile = new Path(tableDir, "_balancedSplit");
    FileSystem fs = FileSystem.get(table.getConfiguration());

    // using strings because (new byte[]{0}).equals(new byte[]{0}) == false
    Set<Pair<String, String>> daughterRegions = Sets.newHashSet();

    // does a split file exist?
    if (!fs.exists(splitFile)) {
      // NO = fresh start. calculate splits to make
      LOG.debug("No _balancedSplit file.  Calculating splits...");

      // query meta for all regions in the table
      Set<Pair<byte[], byte[]>> rows = Sets.newHashSet();
      Pair<byte[][], byte[][]> tmp = table.getStartEndKeys();
      Preconditions.checkArgument(
          tmp.getFirst().length == tmp.getSecond().length,
          "Start and end key arrays must be the same length");
      for (int i = 0; i < tmp.getFirst().length; ++i) {
        byte[] start = tmp.getFirst()[i], end = tmp.getSecond()[i];
        if (start.length == 0)
          start = splitAlgo.firstRow();
        if (end.length == 0)
          end = splitAlgo.lastRow();
        rows.add(Pair.newPair(start, end));
      }
      LOG.debug("Table " + Bytes.toString(table.getTableName()) + " has "
          + rows.size() + " regions that will be split.");

      // prepare the split file
      Path tmpFile = new Path(tableDir, "_balancedSplit_prepare");
      FSDataOutputStream tmpOut = fs.create(tmpFile);

      // calculate all the splits == [daughterRegions] = [(start, splitPoint)]
      for (Pair<byte[], byte[]> r : rows) {
        byte[] splitPoint = splitAlgo.split(r.getFirst(), r.getSecond());
        String startStr = splitAlgo.rowToStr(r.getFirst());
        String splitStr = splitAlgo.rowToStr(splitPoint);
        daughterRegions.add(Pair.newPair(startStr, splitStr));
        LOG.debug("Will Split [" + startStr + " , "
            + splitAlgo.rowToStr(r.getSecond()) + ") at " + splitStr);
        tmpOut.writeChars("+" + splitAlgo.separator() + startStr
            + splitAlgo.separator() + splitStr + "\n");
      }
      tmpOut.close();
      fs.rename(tmpFile, splitFile);
    } else {
      LOG.debug("_balancedSplit file found. Replaying log to restore state...");
      FSUtils.getInstance(fs, table.getConfiguration())
        .recoverFileLease(fs, splitFile, table.getConfiguration());

      // parse split file and process remaining splits
      FSDataInputStream tmpIn = fs.open(splitFile);
      StringBuilder sb = new StringBuilder(tmpIn.available());
      while (tmpIn.available() > 0) {
        sb.append(tmpIn.readChar());
      }
      tmpIn.close();
      for (String line : sb.toString().split("\n")) {
        String[] cmd = line.split(splitAlgo.separator());
        Preconditions.checkArgument(3 == cmd.length,
            "Unrecognized split log line: " + line);
        byte[] start = splitAlgo.strToRow(cmd[1]);
        String startStr = splitAlgo.rowToStr(start);
        byte[] splitPoint = splitAlgo.strToRow(cmd[2]);
        String splitStr = splitAlgo.rowToStr(splitPoint);
        Pair<String, String> r = Pair.newPair(startStr, splitStr);
        if (cmd[0].equals("+")) {
          LOG.debug("Adding: " + r);
          daughterRegions.add(r);
        } else {
          LOG.debug("Removing: " + r);
          Preconditions.checkArgument(cmd[0].equals("-"),
              "Unknown option: " + cmd[0]);
          Preconditions.checkState(daughterRegions.contains(r),
              "Missing row: " + r);
          daughterRegions.remove(r);
        }
      }
      LOG.debug("Done reading. " + daughterRegions.size() + " regions left.");
    }
    LinkedList<Pair<byte[], byte[]>> ret = Lists.newLinkedList();
    for (Pair<String, String> r : daughterRegions) {
      ret.add(Pair.newPair(splitAlgo.strToRow(r.getFirst()), splitAlgo
          .strToRow(r.getSecond())));
    }
    return ret;
  }

  /**
   * MD5StringSplit is the default {@link SplitAlgorithm} for creating pre-split
   * tables. The format of MD5StringSplit is the ASCII representation of an MD5
   * checksum. Rows are long values in the range <b>"00000000" =&gt; "7FFFFFFF"</b>
   * and are left-padded with zeros to keep the same order lexicographically as
   * if they were binary.
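   * <p>
   * For example, {@code split(4)} yields the three split points "1fffffff",
   * "3ffffffe", and "5ffffffd" (note that BigInteger renders hexadecimal in
   * lower case), producing four regions of roughly equal key range.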
   */
  public static class MD5StringSplit implements SplitAlgorithm {
    final static String MAXMD5 = "7FFFFFFF";
    final static BigInteger MAXMD5_INT = new BigInteger(MAXMD5, 16);
    final static int rowComparisonLength = MAXMD5.length();

    public byte[] split(byte[] start, byte[] end) {
      BigInteger s = convertToBigInteger(start);
      BigInteger e = convertToBigInteger(end);
      Preconditions.checkArgument(!e.equals(BigInteger.ZERO));
      return convertToByte(split2(s, e));
    }

    public byte[][] split(int n) {
      BigInteger[] splits = new BigInteger[n - 1];
      BigInteger sizeOfEachSplit = MAXMD5_INT.divide(BigInteger.valueOf(n));
      for (int i = 1; i < n; i++) {
        // NOTE: this means the last region gets all the slop.
        // This is not a big deal if we're assuming n << MAXMD5
        splits[i - 1] = sizeOfEachSplit.multiply(BigInteger.valueOf(i));
      }
      return convertToBytes(splits);
    }

    public byte[] firstRow() {
      return convertToByte(BigInteger.ZERO);
    }

    public byte[] lastRow() {
      return convertToByte(MAXMD5_INT);
    }

    public byte[] strToRow(String in) {
      return convertToByte(new BigInteger(in, 16));
    }

    public String rowToStr(byte[] row) {
      return Bytes.toStringBinary(row);
    }

    public String separator() {
      return " ";
    }

    static BigInteger split2(BigInteger minValue, BigInteger maxValue) {
      return maxValue.add(minValue).divide(BigInteger.valueOf(2));
    }

    /**
     * Returns an array of byte arrays corresponding to an array of BigIntegers
     *
     * @param bigIntegers
     * @return byte arrays corresponding to the bigIntegers
     */
    static byte[][] convertToBytes(BigInteger[] bigIntegers) {
      byte[][] returnBytes = new byte[bigIntegers.length][];
      for (int i = 0; i < bigIntegers.length; i++) {
        returnBytes[i] = convertToByte(bigIntegers[i]);
      }
      return returnBytes;
    }

    /**
     * Returns the bytes corresponding to the BigInteger
     *
     * @param bigInteger
     * @return byte array corresponding to input BigInteger
     */
    static byte[] convertToByte(BigInteger bigInteger) {
      String bigIntegerString = bigInteger.toString(16);
      bigIntegerString = StringUtils.leftPad(bigIntegerString,
          rowComparisonLength, '0');
      return Bytes.toBytes(bigIntegerString);
    }

    /**
     * Returns the BigInteger represented by the byte array
     *
     * @param row
     * @return the corresponding BigInteger
     */
    static BigInteger convertToBigInteger(byte[] row) {
      return (row.length > 0) ? new BigInteger(Bytes.toString(row), 16)
          : BigInteger.ZERO;
    }
  }

}