001 /**
002 *
003 * Copyright (c) 2005, European Commission project OneLab under contract 034819 (http://www.one-lab.org)
004 * All rights reserved.
005 * Redistribution and use in source and binary forms, with or
006 * without modification, are permitted provided that the following
007 * conditions are met:
008 * - Redistributions of source code must retain the above copyright
009 * notice, this list of conditions and the following disclaimer.
010 * - Redistributions in binary form must reproduce the above copyright
011 * notice, this list of conditions and the following disclaimer in
012 * the documentation and/or other materials provided with the distribution.
013 * - Neither the name of the University Catholique de Louvain - UCL
014 * nor the names of its contributors may be used to endorse or
015 * promote products derived from this software without specific prior
016 * written permission.
017 *
018 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
019 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
020 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
021 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
022 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
023 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
024 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
026 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
027 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
028 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
029 * POSSIBILITY OF SUCH DAMAGE.
030 */
031
032 /**
033 * Licensed to the Apache Software Foundation (ASF) under one
034 * or more contributor license agreements. See the NOTICE file
035 * distributed with this work for additional information
036 * regarding copyright ownership. The ASF licenses this file
037 * to you under the Apache License, Version 2.0 (the
038 * "License"); you may not use this file except in compliance
039 * with the License. You may obtain a copy of the License at
040 *
041 * http://www.apache.org/licenses/LICENSE-2.0
042 *
043 * Unless required by applicable law or agreed to in writing, software
044 * distributed under the License is distributed on an "AS IS" BASIS,
045 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
046 * See the License for the specific language governing permissions and
047 * limitations under the License.
048 */
049 package org.apache.hadoop.util.bloom;
050
051 import java.io.DataInput;
052 import java.io.DataOutput;
053 import java.io.IOException;
054 import java.util.ArrayList;
055 import java.util.Collection;
056 import java.util.Collections;
057 import java.util.List;
058 import java.util.Random;
059
060 import org.apache.hadoop.classification.InterfaceAudience;
061 import org.apache.hadoop.classification.InterfaceStability;
062
063 /**
064 * Implements a <i>retouched Bloom filter</i>, as defined in the CoNEXT 2006 paper.
065 * <p>
066 * It allows the removal of selected false positives at the cost of introducing
067 * random false negatives, and with the benefit of eliminating some random false
068 * positives at the same time.
069 *
070 * <p>
071 * Originally created by
072 * <a href="http://www.one-lab.org">European Commission One-Lab Project 034819</a>.
073 *
074 * @see Filter The general behavior of a filter
075 * @see BloomFilter A Bloom filter
076 * @see RemoveScheme The different selective clearing algorithms
077 *
078 * @see <a href="http://www-rp.lip6.fr/site_npa/site_rp/_publications/740-rbf_cameraready.pdf">Retouched Bloom Filters: Allowing Networked Applications to Trade Off Selected False Positives Against False Negatives</a>
079 */
080 @InterfaceAudience.Public
081 @InterfaceStability.Stable
082 public final class RetouchedBloomFilter extends BloomFilter
083 implements RemoveScheme {
084 /**
085 * KeyList vector (or ElementList Vector, as defined in the paper) of false positives.
086 */
087 List<Key>[] fpVector;
088
089 /**
090 * KeyList vector of keys recorded in the filter.
091 */
092 List<Key>[] keyVector;
093
094 /**
095 * Ratio vector.
096 */
097 double[] ratio;
098
099 private Random rand;
100
101 /** Default constructor - use with readFields */
102 public RetouchedBloomFilter() {}
103
104 /**
105 * Constructor
106 * @param vectorSize The vector size of <i>this</i> filter.
107 * @param nbHash The number of hash function to consider.
108 * @param hashType type of the hashing function (see
109 * {@link org.apache.hadoop.util.hash.Hash}).
110 */
111 public RetouchedBloomFilter(int vectorSize, int nbHash, int hashType) {
112 super(vectorSize, nbHash, hashType);
113
114 this.rand = null;
115 createVector();
116 }
117
118 @Override
119 public void add(Key key) {
120 if (key == null) {
121 throw new NullPointerException("key can not be null");
122 }
123
124 int[] h = hash.hash(key);
125 hash.clear();
126
127 for (int i = 0; i < nbHash; i++) {
128 bits.set(h[i]);
129 keyVector[h[i]].add(key);
130 }
131 }
132
133 /**
134 * Adds a false positive information to <i>this</i> retouched Bloom filter.
135 * <p>
136 * <b>Invariant</b>: if the false positive is <code>null</code>, nothing happens.
137 * @param key The false positive key to add.
138 */
139 public void addFalsePositive(Key key) {
140 if (key == null) {
141 throw new NullPointerException("key can not be null");
142 }
143
144 int[] h = hash.hash(key);
145 hash.clear();
146
147 for (int i = 0; i < nbHash; i++) {
148 fpVector[h[i]].add(key);
149 }
150 }
151
152 /**
153 * Adds a collection of false positive information to <i>this</i> retouched Bloom filter.
154 * @param coll The collection of false positive.
155 */
156 public void addFalsePositive(Collection<Key> coll) {
157 if (coll == null) {
158 throw new NullPointerException("Collection<Key> can not be null");
159 }
160
161 for (Key k : coll) {
162 addFalsePositive(k);
163 }
164 }
165
166 /**
167 * Adds a list of false positive information to <i>this</i> retouched Bloom filter.
168 * @param keys The list of false positive.
169 */
170 public void addFalsePositive(List<Key> keys) {
171 if (keys == null) {
172 throw new NullPointerException("ArrayList<Key> can not be null");
173 }
174
175 for (Key k : keys) {
176 addFalsePositive(k);
177 }
178 }
179
180 /**
181 * Adds an array of false positive information to <i>this</i> retouched Bloom filter.
182 * @param keys The array of false positive.
183 */
184 public void addFalsePositive(Key[] keys) {
185 if (keys == null) {
186 throw new NullPointerException("Key[] can not be null");
187 }
188
189 for (int i = 0; i < keys.length; i++) {
190 addFalsePositive(keys[i]);
191 }
192 }
193
194 /**
195 * Performs the selective clearing for a given key.
196 * @param k The false positive key to remove from <i>this</i> retouched Bloom filter.
197 * @param scheme The selective clearing scheme to apply.
198 */
199 public void selectiveClearing(Key k, short scheme) {
200 if (k == null) {
201 throw new NullPointerException("Key can not be null");
202 }
203
204 if (!membershipTest(k)) {
205 throw new IllegalArgumentException("Key is not a member");
206 }
207
208 int index = 0;
209 int[] h = hash.hash(k);
210
211 switch(scheme) {
212
213 case RANDOM:
214 index = randomRemove();
215 break;
216
217 case MINIMUM_FN:
218 index = minimumFnRemove(h);
219 break;
220
221 case MAXIMUM_FP:
222 index = maximumFpRemove(h);
223 break;
224
225 case RATIO:
226 index = ratioRemove(h);
227 break;
228
229 default:
230 throw new AssertionError("Undefined selective clearing scheme");
231
232 }
233
234 clearBit(index);
235 }
236
237 private int randomRemove() {
238 if (rand == null) {
239 rand = new Random();
240 }
241
242 return rand.nextInt(nbHash);
243 }
244
245 /**
246 * Chooses the bit position that minimizes the number of false negative generated.
247 * @param h The different bit positions.
248 * @return The position that minimizes the number of false negative generated.
249 */
250 private int minimumFnRemove(int[] h) {
251 int minIndex = Integer.MAX_VALUE;
252 double minValue = Double.MAX_VALUE;
253
254 for (int i = 0; i < nbHash; i++) {
255 double keyWeight = getWeight(keyVector[h[i]]);
256
257 if (keyWeight < minValue) {
258 minIndex = h[i];
259 minValue = keyWeight;
260 }
261
262 }
263
264 return minIndex;
265 }
266
267 /**
268 * Chooses the bit position that maximizes the number of false positive removed.
269 * @param h The different bit positions.
270 * @return The position that maximizes the number of false positive removed.
271 */
272 private int maximumFpRemove(int[] h) {
273 int maxIndex = Integer.MIN_VALUE;
274 double maxValue = Double.MIN_VALUE;
275
276 for (int i = 0; i < nbHash; i++) {
277 double fpWeight = getWeight(fpVector[h[i]]);
278
279 if (fpWeight > maxValue) {
280 maxValue = fpWeight;
281 maxIndex = h[i];
282 }
283 }
284
285 return maxIndex;
286 }
287
288 /**
289 * Chooses the bit position that minimizes the number of false negative generated while maximizing.
290 * the number of false positive removed.
291 * @param h The different bit positions.
292 * @return The position that minimizes the number of false negative generated while maximizing.
293 */
294 private int ratioRemove(int[] h) {
295 computeRatio();
296 int minIndex = Integer.MAX_VALUE;
297 double minValue = Double.MAX_VALUE;
298
299 for (int i = 0; i < nbHash; i++) {
300 if (ratio[h[i]] < minValue) {
301 minValue = ratio[h[i]];
302 minIndex = h[i];
303 }
304 }
305
306 return minIndex;
307 }
308
309 /**
310 * Clears a specified bit in the bit vector and keeps up-to-date the KeyList vectors.
311 * @param index The position of the bit to clear.
312 */
313 private void clearBit(int index) {
314 if (index < 0 || index >= vectorSize) {
315 throw new ArrayIndexOutOfBoundsException(index);
316 }
317
318 List<Key> kl = keyVector[index];
319 List<Key> fpl = fpVector[index];
320
321 // update key list
322 int listSize = kl.size();
323 for (int i = 0; i < listSize && !kl.isEmpty(); i++) {
324 removeKey(kl.get(0), keyVector);
325 }
326
327 kl.clear();
328 keyVector[index].clear();
329
330 //update false positive list
331 listSize = fpl.size();
332 for (int i = 0; i < listSize && !fpl.isEmpty(); i++) {
333 removeKey(fpl.get(0), fpVector);
334 }
335
336 fpl.clear();
337 fpVector[index].clear();
338
339 //update ratio
340 ratio[index] = 0.0;
341
342 //update bit vector
343 bits.clear(index);
344 }
345
346 /**
347 * Removes a given key from <i>this</i> filer.
348 * @param k The key to remove.
349 * @param vector The counting vector associated to the key.
350 */
351 private void removeKey(Key k, List<Key>[] vector) {
352 if (k == null) {
353 throw new NullPointerException("Key can not be null");
354 }
355 if (vector == null) {
356 throw new NullPointerException("ArrayList<Key>[] can not be null");
357 }
358
359 int[] h = hash.hash(k);
360 hash.clear();
361
362 for (int i = 0; i < nbHash; i++) {
363 vector[h[i]].remove(k);
364 }
365 }
366
367 /**
368 * Computes the ratio A/FP.
369 */
370 private void computeRatio() {
371 for (int i = 0; i < vectorSize; i++) {
372 double keyWeight = getWeight(keyVector[i]);
373 double fpWeight = getWeight(fpVector[i]);
374
375 if (keyWeight > 0 && fpWeight > 0) {
376 ratio[i] = keyWeight / fpWeight;
377 }
378 }
379 }
380
381 private double getWeight(List<Key> keyList) {
382 double weight = 0.0;
383 for (Key k : keyList) {
384 weight += k.getWeight();
385 }
386 return weight;
387 }
388
389 /**
390 * Creates and initialises the various vectors.
391 */
392 @SuppressWarnings("unchecked")
393 private void createVector() {
394 fpVector = new List[vectorSize];
395 keyVector = new List[vectorSize];
396 ratio = new double[vectorSize];
397
398 for (int i = 0; i < vectorSize; i++) {
399 fpVector[i] = Collections.synchronizedList(new ArrayList<Key>());
400 keyVector[i] = Collections.synchronizedList(new ArrayList<Key>());
401 ratio[i] = 0.0;
402 }
403 }
404
405 // Writable
406
407 @Override
408 public void write(DataOutput out) throws IOException {
409 super.write(out);
410 for (int i = 0; i < fpVector.length; i++) {
411 List<Key> list = fpVector[i];
412 out.writeInt(list.size());
413 for (Key k : list) {
414 k.write(out);
415 }
416 }
417 for (int i = 0; i < keyVector.length; i++) {
418 List<Key> list = keyVector[i];
419 out.writeInt(list.size());
420 for (Key k : list) {
421 k.write(out);
422 }
423 }
424 for (int i = 0; i < ratio.length; i++) {
425 out.writeDouble(ratio[i]);
426 }
427 }
428
429 @Override
430 public void readFields(DataInput in) throws IOException {
431 super.readFields(in);
432 createVector();
433 for (int i = 0; i < fpVector.length; i++) {
434 List<Key> list = fpVector[i];
435 int size = in.readInt();
436 for (int j = 0; j < size; j++) {
437 Key k = new Key();
438 k.readFields(in);
439 list.add(k);
440 }
441 }
442 for (int i = 0; i < keyVector.length; i++) {
443 List<Key> list = keyVector[i];
444 int size = in.readInt();
445 for (int j = 0; j < size; j++) {
446 Key k = new Key();
447 k.readFields(in);
448 list.add(k);
449 }
450 }
451 for (int i = 0; i < ratio.length; i++) {
452 ratio[i] = in.readDouble();
453 }
454 }
455 }