1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 package org.apache.hadoop.hbase.mapreduce;
21
22 import java.io.ByteArrayInputStream;
23 import java.io.ByteArrayOutputStream;
24 import java.io.DataInputStream;
25 import java.io.DataOutputStream;
26 import java.io.IOException;
27 import java.net.URL;
28 import java.net.URLDecoder;
29 import java.util.ArrayList;
30 import java.util.Enumeration;
31 import java.util.HashSet;
32 import java.util.List;
33 import java.util.Set;
34
35 import org.apache.commons.logging.Log;
36 import org.apache.commons.logging.LogFactory;
37 import org.apache.hadoop.fs.FileSystem;
38 import org.apache.hadoop.fs.Path;
39 import org.apache.hadoop.hbase.HBaseConfiguration;
40 import org.apache.hadoop.hbase.HConstants;
41 import org.apache.hadoop.hbase.client.HTable;
42 import org.apache.hadoop.hbase.client.Scan;
43 import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
44 import org.apache.hadoop.hbase.util.Base64;
45 import org.apache.hadoop.io.Writable;
46 import org.apache.hadoop.io.WritableComparable;
47 import org.apache.hadoop.mapreduce.Job;
48 import org.apache.hadoop.util.StringUtils;
49 import org.apache.hadoop.conf.Configuration;
50
51
52
53
54 @SuppressWarnings("unchecked")
55 public class TableMapReduceUtil {
56 static Log LOG = LogFactory.getLog(TableMapReduceUtil.class);
57
58
59
60
61
62
63
64
65
66
67
68
69
70 public static void initTableMapperJob(String table, Scan scan,
71 Class<? extends TableMapper> mapper,
72 Class<? extends WritableComparable> outputKeyClass,
73 Class<? extends Writable> outputValueClass, Job job) throws IOException {
74 job.setInputFormatClass(TableInputFormat.class);
75 if (outputValueClass != null) job.setMapOutputValueClass(outputValueClass);
76 if (outputKeyClass != null) job.setMapOutputKeyClass(outputKeyClass);
77 job.setMapperClass(mapper);
78 job.getConfiguration().set(TableInputFormat.INPUT_TABLE, table);
79 job.getConfiguration().set(TableInputFormat.SCAN,
80 convertScanToString(scan));
81 addDependencyJars(job);
82 }
83
84
85
86
87
88
89
90
91 static String convertScanToString(Scan scan) throws IOException {
92 ByteArrayOutputStream out = new ByteArrayOutputStream();
93 DataOutputStream dos = new DataOutputStream(out);
94 scan.write(dos);
95 return Base64.encodeBytes(out.toByteArray());
96 }
97
98
99
100
101
102
103
104
105 static Scan convertStringToScan(String base64) throws IOException {
106 ByteArrayInputStream bis = new ByteArrayInputStream(Base64.decode(base64));
107 DataInputStream dis = new DataInputStream(bis);
108 Scan scan = new Scan();
109 scan.readFields(dis);
110 return scan;
111 }
112
113
114
115
116
117
118
119
120
121
122 public static void initTableReducerJob(String table,
123 Class<? extends TableReducer> reducer, Job job)
124 throws IOException {
125 initTableReducerJob(table, reducer, job, null);
126 }
127
128
129
130
131
132
133
134
135
136
137
138
139 public static void initTableReducerJob(String table,
140 Class<? extends TableReducer> reducer, Job job,
141 Class partitioner) throws IOException {
142 initTableReducerJob(table, reducer, job, partitioner, null, null, null);
143 }
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159 public static void initTableReducerJob(String table,
160 Class<? extends TableReducer> reducer, Job job,
161 Class partitioner, String quorumAddress, String serverClass,
162 String serverImpl) throws IOException {
163
164 Configuration conf = job.getConfiguration();
165 job.setOutputFormatClass(TableOutputFormat.class);
166 if (reducer != null) job.setReducerClass(reducer);
167 conf.set(TableOutputFormat.OUTPUT_TABLE, table);
168 if (quorumAddress != null) {
169 if (quorumAddress.split(":").length == 2) {
170 conf.set(TableOutputFormat.QUORUM_ADDRESS, quorumAddress);
171 } else {
172 throw new IOException("Please specify the peer cluster as " +
173 HConstants.ZOOKEEPER_QUORUM+":"+HConstants.ZOOKEEPER_ZNODE_PARENT);
174 }
175 }
176 if (serverClass != null && serverImpl != null) {
177 conf.set(TableOutputFormat.REGION_SERVER_CLASS, serverClass);
178 conf.set(TableOutputFormat.REGION_SERVER_IMPL, serverImpl);
179 }
180 job.setOutputKeyClass(ImmutableBytesWritable.class);
181 job.setOutputValueClass(Writable.class);
182 if (partitioner == HRegionPartitioner.class) {
183 HBaseConfiguration.addHbaseResources(conf);
184 job.setPartitionerClass(HRegionPartitioner.class);
185 HTable outputTable = new HTable(conf, table);
186 int regions = outputTable.getRegionsInfo().size();
187 if (job.getNumReduceTasks() > regions) {
188 job.setNumReduceTasks(outputTable.getRegionsInfo().size());
189 }
190 } else if (partitioner != null) {
191 job.setPartitionerClass(partitioner);
192 }
193 addDependencyJars(job);
194 }
195
196
197
198
199
200
201
202
203
204 public static void limitNumReduceTasks(String table, Job job)
205 throws IOException {
206 HTable outputTable = new HTable(job.getConfiguration(), table);
207 int regions = outputTable.getRegionsInfo().size();
208 if (job.getNumReduceTasks() > regions)
209 job.setNumReduceTasks(regions);
210 }
211
212
213
214
215
216
217
218
219
220 public static void setNumReduceTasks(String table, Job job)
221 throws IOException {
222 HTable outputTable = new HTable(job.getConfiguration(), table);
223 int regions = outputTable.getRegionsInfo().size();
224 job.setNumReduceTasks(regions);
225 }
226
227
228
229
230
231
232
233
234
235
236 public static void setScannerCaching(Job job, int batchSize) {
237 job.getConfiguration().setInt("hbase.client.scanner.caching", batchSize);
238 }
239
240
241
242
243
244
245 public static void addDependencyJars(Job job) throws IOException {
246 try {
247 addDependencyJars(job.getConfiguration(),
248 org.apache.zookeeper.ZooKeeper.class,
249 com.google.common.base.Function.class,
250 job.getMapOutputKeyClass(),
251 job.getMapOutputValueClass(),
252 job.getInputFormatClass(),
253 job.getOutputKeyClass(),
254 job.getOutputValueClass(),
255 job.getOutputFormatClass(),
256 job.getPartitionerClass(),
257 job.getCombinerClass());
258 } catch (ClassNotFoundException e) {
259 throw new IOException(e);
260 }
261 }
262
263
264
265
266
267
268 public static void addDependencyJars(Configuration conf,
269 Class... classes) throws IOException {
270
271 FileSystem localFs = FileSystem.getLocal(conf);
272
273 Set<String> jars = new HashSet<String>();
274
275
276 jars.addAll( conf.getStringCollection("tmpjars") );
277
278
279 for (Class clazz : classes) {
280 if (clazz == null) continue;
281
282 String pathStr = findContainingJar(clazz);
283 if (pathStr == null) {
284 LOG.warn("Could not find jar for class " + clazz +
285 " in order to ship it to the cluster.");
286 continue;
287 }
288 Path path = new Path(pathStr);
289 if (!localFs.exists(path)) {
290 LOG.warn("Could not validate jar file " + path + " for class "
291 + clazz);
292 continue;
293 }
294 jars.add(path.makeQualified(localFs).toString());
295 }
296 if (jars.isEmpty()) return;
297
298 conf.set("tmpjars",
299 StringUtils.arrayToString(jars.toArray(new String[0])));
300 }
301
302
303
304
305
306
307
308
309
310
311
312
313 private static String findContainingJar(Class my_class) {
314 ClassLoader loader = my_class.getClassLoader();
315 String class_file = my_class.getName().replaceAll("\\.", "/") + ".class";
316 try {
317 for(Enumeration itr = loader.getResources(class_file);
318 itr.hasMoreElements();) {
319 URL url = (URL) itr.nextElement();
320 if ("jar".equals(url.getProtocol())) {
321 String toReturn = url.getPath();
322 if (toReturn.startsWith("file:")) {
323 toReturn = toReturn.substring("file:".length());
324 }
325
326
327
328
329
330
331 toReturn = toReturn.replaceAll("\\+", "%2B");
332 toReturn = URLDecoder.decode(toReturn, "UTF-8");
333 return toReturn.replaceAll("!.*$", "");
334 }
335 }
336 } catch (IOException e) {
337 throw new RuntimeException(e);
338 }
339 return null;
340 }
341
342
343 }