001 /**
002 *
003 * Copyright (c) 2005, European Commission project OneLab under contract 034819 (http://www.one-lab.org)
004 * All rights reserved.
005 * Redistribution and use in source and binary forms, with or
006 * without modification, are permitted provided that the following
007 * conditions are met:
008 * - Redistributions of source code must retain the above copyright
009 * notice, this list of conditions and the following disclaimer.
010 * - Redistributions in binary form must reproduce the above copyright
011 * notice, this list of conditions and the following disclaimer in
012 * the documentation and/or other materials provided with the distribution.
013 * - Neither the name of the University Catholique de Louvain - UCL
014 * nor the names of its contributors may be used to endorse or
015 * promote products derived from this software without specific prior
016 * written permission.
017 *
018 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
019 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
020 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
021 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
022 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
023 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
024 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
026 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
027 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
028 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
029 * POSSIBILITY OF SUCH DAMAGE.
030 */
031
032 /**
033 * Licensed to the Apache Software Foundation (ASF) under one
034 * or more contributor license agreements. See the NOTICE file
035 * distributed with this work for additional information
036 * regarding copyright ownership. The ASF licenses this file
037 * to you under the Apache License, Version 2.0 (the
038 * "License"); you may not use this file except in compliance
039 * with the License. You may obtain a copy of the License at
040 *
041 * http://www.apache.org/licenses/LICENSE-2.0
042 *
043 * Unless required by applicable law or agreed to in writing, software
044 * distributed under the License is distributed on an "AS IS" BASIS,
045 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
046 * See the License for the specific language governing permissions and
047 * limitations under the License.
048 */
049
050 package org.apache.hadoop.util.bloom;
051
052 import java.io.DataInput;
053 import java.io.DataOutput;
054 import java.io.IOException;
055
056 import java.util.BitSet;
057
058 import org.apache.hadoop.classification.InterfaceAudience;
059 import org.apache.hadoop.classification.InterfaceStability;
060
061 /**
062 * Implements a <i>Bloom filter</i>, as defined by Bloom in 1970.
063 * <p>
064 * The Bloom filter is a data structure that was introduced in 1970 and that has been adopted by
065 * the networking research community in the past decade thanks to the bandwidth efficiencies that it
066 * offers for the transmission of set membership information between networked hosts. A sender encodes
067 * the information into a bit vector, the Bloom filter, that is more compact than a conventional
068 * representation. Computation and space costs for construction are linear in the number of elements.
069 * The receiver uses the filter to test whether various elements are members of the set. Though the
070 * filter will occasionally return a false positive, it will never return a false negative. When creating
071 * the filter, the sender can choose its desired point in a trade-off between the false positive rate and the size.
072 *
073 * <p>
074 * Originally created by
075 * <a href="http://www.one-lab.org">European Commission One-Lab Project 034819</a>.
076 *
077 * @see Filter The general behavior of a filter
078 *
079 * @see <a href="http://portal.acm.org/citation.cfm?id=362692&dl=ACM&coll=portal">Space/Time Trade-Offs in Hash Coding with Allowable Errors</a>
080 */
081 @InterfaceAudience.Public
082 @InterfaceStability.Stable
083 public class BloomFilter extends Filter {
084 private static final byte[] bitvalues = new byte[] {
085 (byte)0x01,
086 (byte)0x02,
087 (byte)0x04,
088 (byte)0x08,
089 (byte)0x10,
090 (byte)0x20,
091 (byte)0x40,
092 (byte)0x80
093 };
094
095 /** The bit vector. */
096 BitSet bits;
097
098 /** Default constructor - use with readFields */
099 public BloomFilter() {
100 super();
101 }
102
103 /**
104 * Constructor
105 * @param vectorSize The vector size of <i>this</i> filter.
106 * @param nbHash The number of hash function to consider.
107 * @param hashType type of the hashing function (see
108 * {@link org.apache.hadoop.util.hash.Hash}).
109 */
110 public BloomFilter(int vectorSize, int nbHash, int hashType) {
111 super(vectorSize, nbHash, hashType);
112
113 bits = new BitSet(this.vectorSize);
114 }
115
116 @Override
117 public void add(Key key) {
118 if(key == null) {
119 throw new NullPointerException("key cannot be null");
120 }
121
122 int[] h = hash.hash(key);
123 hash.clear();
124
125 for(int i = 0; i < nbHash; i++) {
126 bits.set(h[i]);
127 }
128 }
129
130 @Override
131 public void and(Filter filter) {
132 if(filter == null
133 || !(filter instanceof BloomFilter)
134 || filter.vectorSize != this.vectorSize
135 || filter.nbHash != this.nbHash) {
136 throw new IllegalArgumentException("filters cannot be and-ed");
137 }
138
139 this.bits.and(((BloomFilter) filter).bits);
140 }
141
142 @Override
143 public boolean membershipTest(Key key) {
144 if(key == null) {
145 throw new NullPointerException("key cannot be null");
146 }
147
148 int[] h = hash.hash(key);
149 hash.clear();
150 for(int i = 0; i < nbHash; i++) {
151 if(!bits.get(h[i])) {
152 return false;
153 }
154 }
155 return true;
156 }
157
158 @Override
159 public void not() {
160 bits.flip(0, vectorSize - 1);
161 }
162
163 @Override
164 public void or(Filter filter) {
165 if(filter == null
166 || !(filter instanceof BloomFilter)
167 || filter.vectorSize != this.vectorSize
168 || filter.nbHash != this.nbHash) {
169 throw new IllegalArgumentException("filters cannot be or-ed");
170 }
171 bits.or(((BloomFilter) filter).bits);
172 }
173
174 @Override
175 public void xor(Filter filter) {
176 if(filter == null
177 || !(filter instanceof BloomFilter)
178 || filter.vectorSize != this.vectorSize
179 || filter.nbHash != this.nbHash) {
180 throw new IllegalArgumentException("filters cannot be xor-ed");
181 }
182 bits.xor(((BloomFilter) filter).bits);
183 }
184
185 @Override
186 public String toString() {
187 return bits.toString();
188 }
189
190 /**
191 * @return size of the the bloomfilter
192 */
193 public int getVectorSize() {
194 return this.vectorSize;
195 }
196
197 // Writable
198
199 @Override
200 public void write(DataOutput out) throws IOException {
201 super.write(out);
202 byte[] bytes = new byte[getNBytes()];
203 for(int i = 0, byteIndex = 0, bitIndex = 0; i < vectorSize; i++, bitIndex++) {
204 if (bitIndex == 8) {
205 bitIndex = 0;
206 byteIndex++;
207 }
208 if (bitIndex == 0) {
209 bytes[byteIndex] = 0;
210 }
211 if (bits.get(i)) {
212 bytes[byteIndex] |= bitvalues[bitIndex];
213 }
214 }
215 out.write(bytes);
216 }
217
218 @Override
219 public void readFields(DataInput in) throws IOException {
220 super.readFields(in);
221 bits = new BitSet(this.vectorSize);
222 byte[] bytes = new byte[getNBytes()];
223 in.readFully(bytes);
224 for(int i = 0, byteIndex = 0, bitIndex = 0; i < vectorSize; i++, bitIndex++) {
225 if (bitIndex == 8) {
226 bitIndex = 0;
227 byteIndex++;
228 }
229 if ((bytes[byteIndex] & bitvalues[bitIndex]) != 0) {
230 bits.set(i);
231 }
232 }
233 }
234
235 /* @return number of bytes needed to hold bit vector */
236 private int getNBytes() {
237 return (vectorSize + 7) / 8;
238 }
239 }//end class