package org.apache.hadoop.hbase.mapreduce;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.io.hfile.CacheConfig;
import org.apache.hadoop.hbase.io.hfile.Compression;
import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.regionserver.StoreFile;
import org.apache.hadoop.hbase.regionserver.TimeRangeTracker;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
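
/**
 * Writes HFiles for bulk loading into an HBase table. Cells handed to this
 * output format must arrive sorted; {@link #configureIncrementalLoad(Job, HTable)}
 * wires up a total order partitioner plus {@link KeyValueSortReducer} or
 * {@link PutSortReducer} to guarantee that. Cells are written into one HFile
 * per column family, files are rolled once they reach the configured region
 * size (at the next row boundary), and calling {@code write(null, null)}
 * forces a roll of all open writers. Bulk-load metadata (load time, writing
 * task, time range of the cells, major-compaction marker) is recorded in each
 * file's file info on close.
 */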
public class HFileOutputFormat extends FileOutputFormat<ImmutableBytesWritable, KeyValue> {
  static final Log LOG = LogFactory.getLog(HFileOutputFormat.class);
  static final String COMPRESSION_CONF_KEY = "hbase.hfileoutputformat.families.compression";
  TimeRangeTracker trt = new TimeRangeTracker();

  @Override
  public RecordWriter<ImmutableBytesWritable, KeyValue> getRecordWriter(final TaskAttemptContext context)
      throws IOException, InterruptedException {

    // Write into the committer's work (task attempt) directory under the job output path
    final Path outputPath = FileOutputFormat.getOutputPath(context);
    final Path outputdir = new FileOutputCommitter(outputPath, context).getWorkPath();
    final Configuration conf = context.getConfiguration();
    final FileSystem fs = outputdir.getFileSystem(conf);

    // Roll HFiles once they reach the configured region size; use the configured HFile block size
    final long maxsize = conf.getLong("hbase.hregion.max.filesize",
        HConstants.DEFAULT_MAX_FILE_SIZE);
    final int blocksize = conf.getInt("hbase.mapreduce.hfileoutputformat.blocksize",
        HFile.DEFAULT_BLOCKSIZE);

    // Default compression for families without an explicit override
    final String defaultCompression = conf.get("hfile.compression",
        Compression.Algorithm.NONE.getName());

    // Per-family compression overrides recorded by configureCompression(), if any
    final Map<byte[], String> compressionMap = createFamilyCompressionMap(conf);

    return new RecordWriter<ImmutableBytesWritable, KeyValue>() {

      // Map of column family -> the writer (and bytes written) currently open for it
      private final Map<byte [], WriterLength> writers =
          new TreeMap<byte [], WriterLength>(Bytes.BYTES_COMPARATOR);
      private byte [] previousRow = HConstants.EMPTY_BYTE_ARRAY;
      private final byte [] now = Bytes.toBytes(System.currentTimeMillis());
      private boolean rollRequested = false;

      public void write(ImmutableBytesWritable row, KeyValue kv)
          throws IOException {

        // A null row and KeyValue forces a roll of all open writers
        if (row == null && kv == null) {
          rollWriters();
          return;
        }

        byte [] rowKey = kv.getRow();
        long length = kv.getLength();
        byte [] family = kv.getFamily();
        WriterLength wl = this.writers.get(family);

        // If this is a new column family, make sure its output directory exists
        if (wl == null) {
          fs.mkdirs(new Path(outputdir, Bytes.toString(family)));
        }

        // If this family's HFile has reached the maximum size, request a roll
        // of all writers ...
        if (wl != null && wl.written + length >= maxsize) {
          this.rollRequested = true;
        }

        // ... but only roll once the current row is finished
        if (rollRequested && Bytes.compareTo(this.previousRow, rowKey) != 0) {
          rollWriters();
        }

        // Create a new HFile writer for this family if we do not have one yet
        if (wl == null || wl.writer == null) {
          wl = getNewWriter(family, conf);
        }

        // Replace LATEST_TIMESTAMP with the writer creation time, track the
        // time range of the cells written, and append the cell
        kv.updateLatestStamp(this.now);
        trt.includeTimestamp(kv);
        wl.writer.append(kv);
        wl.written += length;

        // Remember the row so we know when a new one starts
        this.previousRow = rowKey;
      }

      private void rollWriters() throws IOException {
        for (WriterLength wl : this.writers.values()) {
          if (wl.writer != null) {
            LOG.info("Writer=" + wl.writer.getPath() +
                ((wl.written == 0)? "": ", wrote=" + wl.written));
            close(wl.writer);
          }
          wl.writer = null;
          wl.written = 0;
        }
        this.rollRequested = false;
      }
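
      /*
       * Create a new HFile.Writer for the given column family, writing into
       * that family's subdirectory of the task output directory and using the
       * family's configured compression (or the default if none is set).
       */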
      private WriterLength getNewWriter(byte[] family, Configuration conf)
          throws IOException {
        WriterLength wl = new WriterLength();
        Path familydir = new Path(outputdir, Bytes.toString(family));
        String compression = compressionMap.get(family);
        compression = compression == null ? defaultCompression : compression;
        wl.writer =
            HFile.getWriterFactory(conf).createWriter(fs,
                StoreFile.getUniqueFile(fs, familydir), blocksize,
                compression, KeyValue.KEY_COMPARATOR);
        this.writers.put(family, wl);
        return wl;
      }

      private void close(final HFile.Writer w) throws IOException {
        if (w != null) {
          // Record bulk-load metadata so the files can be adopted by a region
          // server: load time, writing task, cell time range, and a marker that
          // the file counts as major-compacted.
          w.appendFileInfo(StoreFile.BULKLOAD_TIME_KEY,
              Bytes.toBytes(System.currentTimeMillis()));
          w.appendFileInfo(StoreFile.BULKLOAD_TASK_KEY,
              Bytes.toBytes(context.getTaskAttemptID().toString()));
          w.appendFileInfo(StoreFile.MAJOR_COMPACTION_KEY,
              Bytes.toBytes(true));
          w.appendFileInfo(StoreFile.TIMERANGE_KEY,
              WritableUtils.toByteArray(trt));
          w.close();
        }
      }

      public void close(TaskAttemptContext c)
          throws IOException, InterruptedException {
        for (WriterLength wl: this.writers.values()) {
          close(wl.writer);
        }
      }
    };
  }
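
  /*
   * Data structure holding an open HFile.Writer and the number of bytes
   * written to it so far.
   */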
  static class WriterLength {
    long written = 0;
    HFile.Writer writer = null;
  }
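
  /**
   * Return the start keys of all of the regions in this table, as a list of
   * ImmutableBytesWritable.
   */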
  private static List<ImmutableBytesWritable> getRegionStartKeys(HTable table)
      throws IOException {
    byte[][] byteKeys = table.getStartKeys();
    ArrayList<ImmutableBytesWritable> ret =
        new ArrayList<ImmutableBytesWritable>(byteKeys.length);
    for (byte[] byteKey : byteKeys) {
      ret.add(new ImmutableBytesWritable(byteKey));
    }
    return ret;
  }
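
  /**
   * Write out a SequenceFile that can be read by TotalOrderPartitioner and
   * that contains the split points in startKeys.
   */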
  private static void writePartitions(Configuration conf, Path partitionsPath,
      List<ImmutableBytesWritable> startKeys) throws IOException {
    if (startKeys.isEmpty()) {
      throw new IllegalArgumentException("No regions passed");
    }
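
    // Sort the start keys and remove any duplicates: TotalOrderPartitioner
    // expects sorted, unique split points. The first region's empty start key
    // is dropped below because it is not a valid split point.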
    TreeSet<ImmutableBytesWritable> sorted =
        new TreeSet<ImmutableBytesWritable>(startKeys);

    ImmutableBytesWritable first = sorted.first();
    if (!first.equals(HConstants.EMPTY_BYTE_ARRAY)) {
      throw new IllegalArgumentException(
          "First region of table should have empty start key. Instead has: "
          + Bytes.toStringBinary(first.get()));
    }
    sorted.remove(first);

    // Write the remaining split points out as a SequenceFile keyed by start key
    FileSystem fs = partitionsPath.getFileSystem(conf);
    SequenceFile.Writer writer = SequenceFile.createWriter(fs,
        conf, partitionsPath, ImmutableBytesWritable.class, NullWritable.class);

    try {
      for (ImmutableBytesWritable startKey : sorted) {
        writer.append(startKey, NullWritable.get());
      }
    } finally {
      writer.close();
    }
  }
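
  /**
   * Configure a MapReduce Job to perform an incremental load into the given
   * table. This
   * <ul>
   *   <li>Inspects the table to configure a total order partitioner</li>
   *   <li>Writes the partitions file and adds it to the DistributedCache</li>
   *   <li>Sets the number of reduce tasks to match the current number of regions</li>
   *   <li>Sets the output key/value class to match HFileOutputFormat's requirements</li>
   *   <li>Sets the reducer up to perform the appropriate sorting (either
   *       {@link KeyValueSortReducer} or {@link PutSortReducer})</li>
   * </ul>
   * The caller must set the map output value class to either KeyValue or Put
   * before calling this method. A minimal usage sketch, assuming a hypothetical
   * MyMapper that emits row keys and Puts, plus an existing Configuration conf
   * and output Path outputDir:
   * <pre>
   *   Job job = new Job(conf, "bulkload");
   *   job.setMapperClass(MyMapper.class);
   *   job.setMapOutputKeyClass(ImmutableBytesWritable.class);
   *   job.setMapOutputValueClass(Put.class);
   *   FileOutputFormat.setOutputPath(job, outputDir);
   *   HFileOutputFormat.configureIncrementalLoad(job, new HTable(conf, "mytable"));
   * </pre>
   */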
  public static void configureIncrementalLoad(Job job, HTable table)
      throws IOException {
    Configuration conf = job.getConfiguration();
    Class<? extends Partitioner> topClass;
    try {
      topClass = getTotalOrderPartitionerClass();
    } catch (ClassNotFoundException e) {
      throw new IOException("Failed getting TotalOrderPartitioner", e);
    }
    job.setPartitionerClass(topClass);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(KeyValue.class);
    job.setOutputFormatClass(HFileOutputFormat.class);
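
    // Based on the configured map output value class, pick a reducer that
    // sorts the cells before they reach the HFile writers.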
    if (KeyValue.class.equals(job.getMapOutputValueClass())) {
      job.setReducerClass(KeyValueSortReducer.class);
    } else if (Put.class.equals(job.getMapOutputValueClass())) {
      job.setReducerClass(PutSortReducer.class);
    } else {
      LOG.warn("Unknown map output value type: " + job.getMapOutputValueClass());
    }

    LOG.info("Looking up current regions for table " + table);
    List<ImmutableBytesWritable> startKeys = getRegionStartKeys(table);
    LOG.info("Configuring " + startKeys.size() + " reduce partitions " +
        "to match current region count");
    job.setNumReduceTasks(startKeys.size());

    Path partitionsPath = new Path(job.getWorkingDirectory(),
        "partitions_" + System.currentTimeMillis());
    LOG.info("Writing partition information to " + partitionsPath);

    FileSystem fs = partitionsPath.getFileSystem(conf);
    writePartitions(conf, partitionsPath, startKeys);
    // makeQualified returns a new Path; keep the result so the cache URI below
    // is fully qualified
    partitionsPath = partitionsPath.makeQualified(fs);

    URI cacheUri;
    try {
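      // Attach the partitions file to the DistributedCache with a URI fragment
      // so that a symlink with the name the TotalOrderPartitioner expects is
      // created in the task working directory.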
      cacheUri = new URI(partitionsPath.toString() + "#" +
          org.apache.hadoop.hbase.mapreduce.hadoopbackport.TotalOrderPartitioner.DEFAULT_PATH);
    } catch (URISyntaxException e) {
      throw new IOException(e);
    }
    DistributedCache.addCacheFile(cacheUri, conf);
    DistributedCache.createSymlink(conf);

    // Record each family's compression setting so the record writers pick the right codec
    configureCompression(table, conf);

    LOG.info("Incremental table output configured.");
  }
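
  /**
   * Return the TotalOrderPartitioner class to use: the one shipped with newer
   * Hadoop releases (under org.apache.hadoop.mapreduce.lib.partition) if it is
   * on the classpath, otherwise the backport bundled with HBase.
   */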
  @SuppressWarnings("unchecked")
  private static Class<? extends Partitioner> getTotalOrderPartitionerClass()
      throws ClassNotFoundException {
    Class<? extends Partitioner> clazz = null;
    try {
      clazz = (Class<? extends Partitioner>)
          Class.forName("org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner");
    } catch (ClassNotFoundException e) {
      clazz = (Class<? extends Partitioner>)
          Class.forName("org.apache.hadoop.hbase.mapreduce.hadoopbackport.TotalOrderPartitioner");
    }
    return clazz;
  }
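
  /**
   * Runs inside the task to deserialize the column family to compression
   * algorithm map from the configuration.
   *
   * @return a map from column family name (as bytes) to the name of the
   *     configured compression algorithm
   */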
  static Map<byte[], String> createFamilyCompressionMap(Configuration conf) {
    Map<byte[], String> compressionMap = new TreeMap<byte[], String>(Bytes.BYTES_COMPARATOR);
    String compressionConf = conf.get(COMPRESSION_CONF_KEY, "");
    // The value has the form "family1=codec1&family2=codec2", URL-encoded
    for (String familyConf : compressionConf.split("&")) {
      String[] familySplit = familyConf.split("=");
      if (familySplit.length != 2) {
        continue;
      }

      try {
        // Use Bytes.toBytes (UTF-8) rather than the platform default charset
        compressionMap.put(Bytes.toBytes(URLDecoder.decode(familySplit[0], "UTF-8")),
            URLDecoder.decode(familySplit[1], "UTF-8"));
      } catch (UnsupportedEncodingException e) {
        // Cannot happen: UTF-8 is always supported
        throw new AssertionError(e);
      }
    }
    return compressionMap;
  }
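
  /**
   * Serialize the table's column family to compression algorithm map into the
   * given configuration, under {@link #COMPRESSION_CONF_KEY}, so that the
   * record writers created by this format can pick the right codec per family.
   */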
  static void configureCompression(HTable table, Configuration conf) throws IOException {
    StringBuilder compressionConfigValue = new StringBuilder();
    HTableDescriptor tableDescriptor = table.getTableDescriptor();
    if (tableDescriptor == null) {
      // could happen with a mock table instance in tests
      return;
    }
    Collection<HColumnDescriptor> families = tableDescriptor.getFamilies();
    int i = 0;
    for (HColumnDescriptor familyDescriptor : families) {
      if (i++ > 0) {
        compressionConfigValue.append('&');
      }
      compressionConfigValue.append(URLEncoder.encode(familyDescriptor.getNameAsString(), "UTF-8"));
      compressionConfigValue.append('=');
      compressionConfigValue.append(URLEncoder.encode(familyDescriptor.getCompression().getName(), "UTF-8"));
    }

    conf.set(COMPRESSION_CONF_KEY, compressionConfigValue.toString());
  }
}