001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019 package org.apache.hadoop.io.compress;
020
021 import java.io.IOException;
022 import java.io.InputStream;
023
024 import org.apache.hadoop.classification.InterfaceAudience;
025 import org.apache.hadoop.classification.InterfaceStability;
026
027
028 /**
029 * This interface is meant to be implemented by those compression codecs
030 * which are capable to compress / de-compress a stream starting at any
031 * arbitrary position.
032 *
033 * Especially the process of de-compressing a stream starting at some arbitrary
034 * position is challenging. Most of the codecs are only able to successfully
035 * de-compress a stream, if they start from the very beginning till the end.
036 * One of the reasons is the stored state at the beginning of the stream which
037 * is crucial for de-compression.
038 *
039 * Yet there are few codecs which do not save the whole state at the beginning
040 * of the stream and hence can be used to de-compress stream starting at any
041 * arbitrary points. This interface is meant to be used by such codecs. Such
042 * codecs are highly valuable, especially in the context of Hadoop, because
043 * an input compressed file can be split and hence can be worked on by multiple
044 * machines in parallel.
045 */
046 @InterfaceAudience.Public
047 @InterfaceStability.Evolving
048 public interface SplittableCompressionCodec extends CompressionCodec {
049
050 /**
051 * During decompression, data can be read off from the decompressor in two
052 * modes, namely continuous and blocked. Few codecs (e.g. BZip2) are capable
053 * of compressing data in blocks and then decompressing the blocks. In
054 * Blocked reading mode codecs inform 'end of block' events to its caller.
055 * While in continuous mode, the caller of codecs is unaware about the blocks
056 * and uncompressed data is spilled out like a continuous stream.
057 */
058 public enum READ_MODE {CONTINUOUS, BYBLOCK};
059
060 /**
061 * Create a stream as dictated by the readMode. This method is used when
062 * the codecs wants the ability to work with the underlying stream positions.
063 *
064 * @param seekableIn The seekable input stream (seeks in compressed data)
065 * @param start The start offset into the compressed stream. May be changed
066 * by the underlying codec.
067 * @param end The end offset into the compressed stream. May be changed by
068 * the underlying codec.
069 * @param readMode Controls whether stream position is reported continuously
070 * from the compressed stream only only at block boundaries.
071 * @return a stream to read uncompressed bytes from
072 */
073 SplitCompressionInputStream createInputStream(InputStream seekableIn,
074 Decompressor decompressor, long start, long end, READ_MODE readMode)
075 throws IOException;
076
077 }