View Javadoc

1   /*
2    * Copyright 2011 The Apache Software Foundation
3    *
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *     http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing, software
15   * distributed under the License is distributed on an "AS IS" BASIS,
16   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17   * See the License for the specific language governing permissions and
18   * limitations under the License.
19   */
20  
21  package org.apache.hadoop.hbase.client.coprocessor;
22  
23  import java.io.IOException;
24  import java.util.ArrayList;
25  import java.util.List;
26  import java.util.concurrent.atomic.AtomicLong;
27  
28  import org.apache.commons.logging.Log;
29  import org.apache.commons.logging.LogFactory;
30  import org.apache.hadoop.conf.Configuration;
31  import org.apache.hadoop.hbase.HConstants;
32  import org.apache.hadoop.hbase.client.HTable;
33  import org.apache.hadoop.hbase.client.Scan;
34  import org.apache.hadoop.hbase.coprocessor.AggregateProtocol;
35  import org.apache.hadoop.hbase.coprocessor.ColumnInterpreter;
36  import org.apache.hadoop.hbase.util.Bytes;
37  import org.apache.hadoop.hbase.util.Pair;
38  
39  /**
40   * This client class is for invoking the aggregate functions deployed on the
41   * Region Server side via the AggregateProtocol. This class will implement the
42   * supporting functionality for summing/processing the individual results
43   * obtained from the AggregateProtocol for each region.
44   * <p>
45   * This will serve as the client side handler for invoking the aggregate
46   * functions.
47   * <ul>
48   * For all aggregate functions,
49   * <li>start row < end row is an essential condition (if they are not
50   * {@link HConstants#EMPTY_BYTE_ARRAY})
51   * <li>Column family can't be null. In case where multiple families are
52   * provided, an IOException will be thrown. An optional column qualifier can
53   * also be defined.
54   * <li>For methods to find maximum, minimum, sum, rowcount, it returns the
55   * parameter type. For average and std, it returns a double value. For row
56   * count, it returns a long value.
57   */
58  public class AggregationClient {
59  
60    private static final Log log = LogFactory.getLog(AggregationClient.class);
61    Configuration conf;
62  
63    /**
64     * Constructor with Conf object
65     * @param cfg
66     */
67    public AggregationClient(Configuration cfg) {
68      this.conf = cfg;
69    }
70  
71    /**
72     * It gives the maximum value of a column for a given column family for the
73     * given range. In case qualifier is null, a max of all values for the given
74     * family is returned.
75     * @param tableName
76     * @param ci
77     * @param scan
78     * @return max val <R>
79     * @throws Throwable
80     *           The caller is supposed to handle the exception as they are thrown
81     *           & propagated to it.
82     */
83    public <R, S> R max(final byte[] tableName, final ColumnInterpreter<R, S> ci,
84        final Scan scan) throws Throwable {
85      validateParameters(scan);
86      HTable table = new HTable(conf, tableName);
87  
88      class MaxCallBack implements Batch.Callback<R> {
89        R max = null;
90  
91        R getMax() {
92          return max;
93        }
94  
95        @Override
96        public synchronized void update(byte[] region, byte[] row, R result) {
97          max = ci.compare(max, result) < 0 ? result : max;
98        }
99      }
100     MaxCallBack aMaxCallBack = new MaxCallBack();
101     table.coprocessorExec(AggregateProtocol.class, scan.getStartRow(), scan
102         .getStopRow(), new Batch.Call<AggregateProtocol, R>() {
103       @Override
104       public R call(AggregateProtocol instance) throws IOException {
105         return instance.getMax(ci, scan);
106       }
107     }, aMaxCallBack);
108     return aMaxCallBack.getMax();
109   }
110 
111   private void validateParameters(Scan scan) throws IOException {
112     if (scan == null
113         || (Bytes.equals(scan.getStartRow(), scan.getStopRow()) && !Bytes
114             .equals(scan.getStartRow(), HConstants.EMPTY_START_ROW))
115         || Bytes.compareTo(scan.getStartRow(), scan.getStopRow()) > 0) {
116       throw new IOException(
117           "Agg client Exception: Startrow should be smaller than Stoprow");
118     } else if (scan.getFamilyMap().size() != 1) {
119       throw new IOException("There must be only one family.");
120     }
121   }
122 
123   /**
124    * It gives the minimum value of a column for a given column family for the
125    * given range. In case qualifier is null, a min of all values for the given
126    * family is returned.
127    * @param tableName
128    * @param ci
129    * @param scan
130    * @return min val <R>
131    * @throws Throwable
132    */
133   public <R, S> R min(final byte[] tableName, final ColumnInterpreter<R, S> ci,
134       final Scan scan) throws Throwable {
135     validateParameters(scan);
136     class MinCallBack implements Batch.Callback<R> {
137 
138       private R min = null;
139 
140       public R getMinimum() {
141         return min;
142       }
143 
144       @Override
145       public synchronized void update(byte[] region, byte[] row, R result) {
146         min = (min == null || ci.compare(result, min) < 0) ? result : min;
147       }
148     }
149     HTable table = new HTable(conf, tableName);
150     MinCallBack minCallBack = new MinCallBack();
151     table.coprocessorExec(AggregateProtocol.class, scan.getStartRow(), scan
152         .getStopRow(), new Batch.Call<AggregateProtocol, R>() {
153 
154       @Override
155       public R call(AggregateProtocol instance) throws IOException {
156         return instance.getMin(ci, scan);
157       }
158     }, minCallBack);
159     log.debug("Min fom all regions is: " + minCallBack.getMinimum());
160     return minCallBack.getMinimum();
161   }
162 
163   /**
164    * It gives the row count, by summing up the individual results obtained from
165    * regions. In case the qualifier is null, FirstKEyValueFilter is used to
166    * optimised the operation. In case qualifier is provided, I can't use the
167    * filter as it may set the flag to skip to next row, but the value read is
168    * not of the given filter: in this case, this particular row will not be
169    * counted ==> an error.
170    * @param tableName
171    * @param ci
172    * @param scan
173    * @return <R, S>
174    * @throws Throwable
175    */
176   public <R, S> long rowCount(final byte[] tableName,
177       final ColumnInterpreter<R, S> ci, final Scan scan) throws Throwable {
178     validateParameters(scan);
179     class RowNumCallback implements Batch.Callback<Long> {
180       private final AtomicLong rowCountL = new AtomicLong(0);
181 
182       public long getRowNumCount() {
183         return rowCountL.get();
184       }
185 
186       @Override
187       public void update(byte[] region, byte[] row, Long result) {
188         rowCountL.addAndGet(result.longValue());
189       }
190     }
191     RowNumCallback rowNum = new RowNumCallback();
192     HTable table = new HTable(conf, tableName);
193     table.coprocessorExec(AggregateProtocol.class, scan.getStartRow(), scan
194         .getStopRow(), new Batch.Call<AggregateProtocol, Long>() {
195       @Override
196       public Long call(AggregateProtocol instance) throws IOException {
197         return instance.getRowNum(ci, scan);
198       }
199     }, rowNum);
200     return rowNum.getRowNumCount();
201   }
202 
203   /**
204    * It sums up the value returned from various regions. In case qualifier is
205    * null, summation of all the column qualifiers in the given family is done.
206    * @param tableName
207    * @param ci
208    * @param scan
209    * @return sum <S>
210    * @throws Throwable
211    */
212   public <R, S> S sum(final byte[] tableName, final ColumnInterpreter<R, S> ci,
213       final Scan scan) throws Throwable {
214     validateParameters(scan);
215     class SumCallBack implements Batch.Callback<S> {
216       S sumVal = null;
217 
218       public S getSumResult() {
219         return sumVal;
220       }
221 
222       @Override
223       public synchronized void update(byte[] region, byte[] row, S result) {
224         sumVal = ci.add(sumVal, result);
225       }
226     }
227     SumCallBack sumCallBack = new SumCallBack();
228     HTable table = new HTable(conf, tableName);
229     table.coprocessorExec(AggregateProtocol.class, scan.getStartRow(), scan
230         .getStopRow(), new Batch.Call<AggregateProtocol, S>() {
231       @Override
232       public S call(AggregateProtocol instance) throws IOException {
233         return instance.getSum(ci, scan);
234       }
235     }, sumCallBack);
236     return sumCallBack.getSumResult();
237   }
238 
239   /**
240    * It computes average while fetching sum and row count from all the
241    * corresponding regions. Approach is to compute a global sum of region level
242    * sum and rowcount and then compute the average.
243    * @param tableName
244    * @param scan
245    * @throws Throwable
246    */
247   private <R, S> Pair<S, Long> getAvgArgs(final byte[] tableName,
248       final ColumnInterpreter<R, S> ci, final Scan scan) throws Throwable {
249     validateParameters(scan);
250     class AvgCallBack implements Batch.Callback<Pair<S, Long>> {
251       S sum = null;
252       Long rowCount = 0l;
253 
254       public Pair<S, Long> getAvgArgs() {
255         return new Pair<S, Long>(sum, rowCount);
256       }
257 
258       @Override
259       public synchronized void update(byte[] region, byte[] row, Pair<S, Long> result) {
260         sum = ci.add(sum, result.getFirst());
261         rowCount += result.getSecond();
262       }
263     }
264     AvgCallBack avgCallBack = new AvgCallBack();
265     HTable table = new HTable(conf, tableName);
266     table.coprocessorExec(AggregateProtocol.class, scan.getStartRow(), scan
267         .getStopRow(), new Batch.Call<AggregateProtocol, Pair<S, Long>>() {
268       @Override
269       public Pair<S, Long> call(AggregateProtocol instance) throws IOException {
270         return instance.getAvg(ci, scan);
271       }
272     }, avgCallBack);
273     return avgCallBack.getAvgArgs();
274   }
275 
276   /**
277    * This is the client side interface/handle for calling the average method for
278    * a given cf-cq combination. It was necessary to add one more call stack as
279    * its return type should be a decimal value, irrespective of what
280    * columninterpreter says. So, this methods collects the necessary parameters
281    * to compute the average and returs the double value.
282    * @param tableName
283    * @param ci
284    * @param scan
285    * @return <R, S>
286    * @throws Throwable
287    */
288   public <R, S> double avg(final byte[] tableName,
289       final ColumnInterpreter<R, S> ci, Scan scan) throws Throwable {
290     Pair<S, Long> p = getAvgArgs(tableName, ci, scan);
291     return ci.divideForAvg(p.getFirst(), p.getSecond());
292   }
293 
294   /**
295    * It computes a global standard deviation for a given column and its value.
296    * Standard deviation is square root of (average of squares -
297    * average*average). From individual regions, it obtains sum, square sum and
298    * number of rows. With these, the above values are computed to get the global
299    * std.
300    * @param tableName
301    * @param scan
302    * @return
303    * @throws Throwable
304    */
305   private <R, S> Pair<List<S>, Long> getStdArgs(final byte[] tableName,
306       final ColumnInterpreter<R, S> ci, final Scan scan) throws Throwable {
307     validateParameters(scan);
308     class StdCallback implements Batch.Callback<Pair<List<S>, Long>> {
309       long rowCountVal = 0l;
310       S sumVal = null, sumSqVal = null;
311 
312       public Pair<List<S>, Long> getStdParams() {
313         List<S> l = new ArrayList<S>();
314         l.add(sumVal);
315         l.add(sumSqVal);
316         Pair<List<S>, Long> p = new Pair<List<S>, Long>(l, rowCountVal);
317         return p;
318       }
319 
320       @Override
321       public synchronized void update(byte[] region, byte[] row, Pair<List<S>, Long> result) {
322         sumVal = ci.add(sumVal, result.getFirst().get(0));
323         sumSqVal = ci.add(sumSqVal, result.getFirst().get(1));
324         rowCountVal += result.getSecond();
325       }
326     }
327     StdCallback stdCallback = new StdCallback();
328     HTable table = new HTable(conf, tableName);
329     table.coprocessorExec(AggregateProtocol.class, scan.getStartRow(), scan
330         .getStopRow(),
331         new Batch.Call<AggregateProtocol, Pair<List<S>, Long>>() {
332           @Override
333           public Pair<List<S>, Long> call(AggregateProtocol instance)
334               throws IOException {
335             return instance.getStd(ci, scan);
336           }
337 
338         }, stdCallback);
339     return stdCallback.getStdParams();
340   }
341 
342   /**
343    * This is the client side interface/handle for calling the std method for a
344    * given cf-cq combination. It was necessary to add one more call stack as its
345    * return type should be a decimal value, irrespective of what
346    * columninterpreter says. So, this methods collects the necessary parameters
347    * to compute the std and returns the double value.
348    * @param tableName
349    * @param ci
350    * @param scan
351    * @return <R, S>
352    * @throws Throwable
353    */
354   public <R, S> double std(final byte[] tableName, ColumnInterpreter<R, S> ci,
355       Scan scan) throws Throwable {
356     Pair<List<S>, Long> p = getStdArgs(tableName, ci, scan);
357     double res = 0d;
358     double avg = ci.divideForAvg(p.getFirst().get(0), p.getSecond());
359     double avgOfSumSq = ci.divideForAvg(p.getFirst().get(1), p.getSecond());
360     res = avgOfSumSq - (avg) * (avg); // variance
361     res = Math.pow(res, 0.5);
362     return res;
363   }
364 
365 }