1 /**
2 * Copyright 2011 The Apache Software Foundation
3 *
4 * Licensed to the Apache Software Foundation (ASF) under one
5 * or more contributor license agreements. See the NOTICE file
6 * distributed with this work for additional information
7 * regarding copyright ownership. The ASF licenses this file
8 * to you under the Apache License, Version 2.0 (the
9 * "License"); you may not use this file except in compliance
10 * with the License. You may obtain a copy of the License at
11 *
12 * http://www.apache.org/licenses/LICENSE-2.0
13 *
14 * Unless required by applicable law or agreed to in writing, software
15 * distributed under the License is distributed on an "AS IS" BASIS,
16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 * See the License for the specific language governing permissions and
18 * limitations under the License.
19 */
20 package org.apache.hadoop.hbase.regionserver;
21
22 import java.util.concurrent.atomic.AtomicInteger;
23 import java.util.concurrent.atomic.AtomicReference;
24
25 import org.apache.hadoop.conf.Configuration;
26 import com.google.common.base.Preconditions;
27
28 /**
29 * A memstore-local allocation buffer.
30 * <p>
31 * The MemStoreLAB is basically a bump-the-pointer allocator that allocates
32 * big (2MB) byte[] chunks from and then doles it out to threads that request
33 * slices into the array.
34 * <p>
35 * The purpose of this class is to combat heap fragmentation in the
36 * regionserver. By ensuring that all KeyValues in a given memstore refer
37 * only to large chunks of contiguous memory, we ensure that large blocks
38 * get freed up when the memstore is flushed.
39 * <p>
40 * Without the MSLAB, the byte array allocated during insertion end up
41 * interleaved throughout the heap, and the old generation gets progressively
42 * more fragmented until a stop-the-world compacting collection occurs.
43 * <p>
44 * TODO: we should probably benchmark whether word-aligning the allocations
45 * would provide a performance improvement - probably would speed up the
46 * Bytes.toLong/Bytes.toInt calls in KeyValue, but some of those are cached
47 * anyway
48 */
49 public class MemStoreLAB {
50 private AtomicReference<Chunk> curChunk = new AtomicReference<Chunk>();
51
52 final static String CHUNK_SIZE_KEY = "hbase.hregion.memstore.mslab.chunksize";
53 final static int CHUNK_SIZE_DEFAULT = 2048 * 1024;
54 final int chunkSize;
55
56 final static String MAX_ALLOC_KEY = "hbase.hregion.memstore.mslab.max.allocation";
57 final static int MAX_ALLOC_DEFAULT = 256 * 1024; // allocs bigger than this don't go through allocator
58 final int maxAlloc;
59
60 public MemStoreLAB() {
61 this(new Configuration());
62 }
63
64 public MemStoreLAB(Configuration conf) {
65 chunkSize = conf.getInt(CHUNK_SIZE_KEY, CHUNK_SIZE_DEFAULT);
66 maxAlloc = conf.getInt(MAX_ALLOC_KEY, MAX_ALLOC_DEFAULT);
67
68 // if we don't exclude allocations >CHUNK_SIZE, we'd infiniteloop on one!
69 Preconditions.checkArgument(
70 maxAlloc <= chunkSize,
71 MAX_ALLOC_KEY + " must be less than " + CHUNK_SIZE_KEY);
72 }
73
74 /**
75 * Allocate a slice of the given length.
76 *
77 * If the size is larger than the maximum size specified for this
78 * allocator, returns null.
79 */
80 public Allocation allocateBytes(int size) {
81 Preconditions.checkArgument(size >= 0, "negative size");
82
83 // Callers should satisfy large allocations directly from JVM since they
84 // don't cause fragmentation as badly.
85 if (size > maxAlloc) {
86 return null;
87 }
88
89 while (true) {
90 Chunk c = getOrMakeChunk();
91
92 // Try to allocate from this chunk
93 int allocOffset = c.alloc(size);
94 if (allocOffset != -1) {
95 // We succeeded - this is the common case - small alloc
96 // from a big buffer
97 return new Allocation(c.data, allocOffset);
98 }
99
100 // not enough space!
101 // try to retire this chunk
102 tryRetireChunk(c);
103 }
104 }
105
106 /**
107 * Try to retire the current chunk if it is still
108 * <code>c</code>. Postcondition is that curChunk.get()
109 * != c
110 */
111 private void tryRetireChunk(Chunk c) {
112 @SuppressWarnings("unused")
113 boolean weRetiredIt = curChunk.compareAndSet(c, null);
114 // If the CAS succeeds, that means that we won the race
115 // to retire the chunk. We could use this opportunity to
116 // update metrics on external fragmentation.
117 //
118 // If the CAS fails, that means that someone else already
119 // retired the chunk for us.
120 }
121
122 /**
123 * Get the current chunk, or, if there is no current chunk,
124 * allocate a new one from the JVM.
125 */
126 private Chunk getOrMakeChunk() {
127 while (true) {
128 // Try to get the chunk
129 Chunk c = curChunk.get();
130 if (c != null) {
131 return c;
132 }
133
134 // No current chunk, so we want to allocate one. We race
135 // against other allocators to CAS in an uninitialized chunk
136 // (which is cheap to allocate)
137 c = new Chunk(chunkSize);
138 if (curChunk.compareAndSet(null, c)) {
139 // we won race - now we need to actually do the expensive
140 // allocation step
141 c.init();
142 return c;
143 }
144 // someone else won race - that's fine, we'll try to grab theirs
145 // in the next iteration of the loop.
146 }
147 }
148
149 /**
150 * A chunk of memory out of which allocations are sliced.
151 */
152 private static class Chunk {
153 /** Actual underlying data */
154 private byte[] data;
155
156 private static final int UNINITIALIZED = -1;
157 private static final int OOM = -2;
158 /**
159 * Offset for the next allocation, or the sentinel value -1
160 * which implies that the chunk is still uninitialized.
161 * */
162 private AtomicInteger nextFreeOffset = new AtomicInteger(UNINITIALIZED);
163
164 /** Total number of allocations satisfied from this buffer */
165 private AtomicInteger allocCount = new AtomicInteger();
166
167 /** Size of chunk in bytes */
168 private final int size;
169
170 /**
171 * Create an uninitialized chunk. Note that memory is not allocated yet, so
172 * this is cheap.
173 * @param size in bytes
174 */
175 private Chunk(int size) {
176 this.size = size;
177 }
178
179 /**
180 * Actually claim the memory for this chunk. This should only be called from
181 * the thread that constructed the chunk. It is thread-safe against other
182 * threads calling alloc(), who will block until the allocation is complete.
183 */
184 public void init() {
185 assert nextFreeOffset.get() == UNINITIALIZED;
186 try {
187 data = new byte[size];
188 } catch (OutOfMemoryError e) {
189 boolean failInit = nextFreeOffset.compareAndSet(UNINITIALIZED, OOM);
190 assert failInit; // should be true.
191 throw e;
192 }
193 // Mark that it's ready for use
194 boolean initted = nextFreeOffset.compareAndSet(
195 UNINITIALIZED, 0);
196 // We should always succeed the above CAS since only one thread
197 // calls init()!
198 Preconditions.checkState(initted,
199 "Multiple threads tried to init same chunk");
200 }
201
202 /**
203 * Try to allocate <code>size</code> bytes from the chunk.
204 * @return the offset of the successful allocation, or -1 to indicate not-enough-space
205 */
206 public int alloc(int size) {
207 while (true) {
208 int oldOffset = nextFreeOffset.get();
209 if (oldOffset == UNINITIALIZED) {
210 // The chunk doesn't have its data allocated yet.
211 // Since we found this in curChunk, we know that whoever
212 // CAS-ed it there is allocating it right now. So spin-loop
213 // shouldn't spin long!
214 Thread.yield();
215 continue;
216 }
217 if (oldOffset == OOM) {
218 // doh we ran out of ram. return -1 to chuck this away.
219 return -1;
220 }
221
222 if (oldOffset + size > data.length) {
223 return -1; // alloc doesn't fit
224 }
225
226 // Try to atomically claim this chunk
227 if (nextFreeOffset.compareAndSet(oldOffset, oldOffset + size)) {
228 // we got the alloc
229 allocCount.incrementAndGet();
230 return oldOffset;
231 }
232 // we raced and lost alloc, try again
233 }
234 }
235
236 @Override
237 public String toString() {
238 return "Chunk@" + System.identityHashCode(this) +
239 " allocs=" + allocCount.get() + "waste=" +
240 (data.length - nextFreeOffset.get());
241 }
242 }
243
244 /**
245 * The result of a single allocation. Contains the chunk that the
246 * allocation points into, and the offset in this array where the
247 * slice begins.
248 */
249 public static class Allocation {
250 private final byte[] data;
251 private final int offset;
252
253 private Allocation(byte[] data, int off) {
254 this.data = data;
255 this.offset = off;
256 }
257
258 @Override
259 public String toString() {
260 return "Allocation(data=" + data +
261 " with capacity=" + data.length +
262 ", off=" + offset + ")";
263 }
264
265 byte[] getData() {
266 return data;
267 }
268
269 int getOffset() {
270 return offset;
271 }
272 }
273 }