001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018 package org.apache.hadoop.io.compress;
019
020 import java.util.*;
021
022 import org.apache.commons.logging.Log;
023 import org.apache.commons.logging.LogFactory;
024 import org.apache.hadoop.classification.InterfaceAudience;
025 import org.apache.hadoop.classification.InterfaceStability;
026 import org.apache.hadoop.conf.Configuration;
027 import org.apache.hadoop.fs.Path;
028 import org.apache.hadoop.util.ReflectionUtils;
029
030 /**
031 * A factory that will find the correct codec for a given filename.
032 */
033 @InterfaceAudience.Public
034 @InterfaceStability.Evolving
035 public class CompressionCodecFactory {
036
037 public static final Log LOG =
038 LogFactory.getLog(CompressionCodecFactory.class.getName());
039
040 private static final ServiceLoader<CompressionCodec> CODEC_PROVIDERS =
041 ServiceLoader.load(CompressionCodec.class);
042
043 /**
044 * A map from the reversed filename suffixes to the codecs.
045 * This is probably overkill, because the maps should be small, but it
046 * automatically supports finding the longest matching suffix.
047 */
048 private SortedMap<String, CompressionCodec> codecs = null;
049
050 /**
051 * A map from the reversed filename suffixes to the codecs.
052 * This is probably overkill, because the maps should be small, but it
053 * automatically supports finding the longest matching suffix.
054 */
055 private Map<String, CompressionCodec> codecsByName = null;
056
057 /**
058 * A map from class names to the codecs
059 */
060 private HashMap<String, CompressionCodec> codecsByClassName = null;
061
062 private void addCodec(CompressionCodec codec) {
063 String suffix = codec.getDefaultExtension();
064 codecs.put(new StringBuilder(suffix).reverse().toString(), codec);
065 codecsByClassName.put(codec.getClass().getCanonicalName(), codec);
066
067 String codecName = codec.getClass().getSimpleName();
068 codecsByName.put(codecName.toLowerCase(), codec);
069 if (codecName.endsWith("Codec")) {
070 codecName = codecName.substring(0, codecName.length() - "Codec".length());
071 codecsByName.put(codecName.toLowerCase(), codec);
072 }
073 }
074
075 /**
076 * Print the extension map out as a string.
077 */
078 @Override
079 public String toString() {
080 StringBuilder buf = new StringBuilder();
081 Iterator<Map.Entry<String, CompressionCodec>> itr =
082 codecs.entrySet().iterator();
083 buf.append("{ ");
084 if (itr.hasNext()) {
085 Map.Entry<String, CompressionCodec> entry = itr.next();
086 buf.append(entry.getKey());
087 buf.append(": ");
088 buf.append(entry.getValue().getClass().getName());
089 while (itr.hasNext()) {
090 entry = itr.next();
091 buf.append(", ");
092 buf.append(entry.getKey());
093 buf.append(": ");
094 buf.append(entry.getValue().getClass().getName());
095 }
096 }
097 buf.append(" }");
098 return buf.toString();
099 }
100
101 /**
102 * Get the list of codecs discovered via a Java ServiceLoader, or
103 * listed in the configuration. Codecs specified in configuration come
104 * later in the returned list, and are considered to override those
105 * from the ServiceLoader.
106 * @param conf the configuration to look in
107 * @return a list of the {@link CompressionCodec} classes
108 */
109 public static List<Class<? extends CompressionCodec>> getCodecClasses(Configuration conf) {
110 List<Class<? extends CompressionCodec>> result
111 = new ArrayList<Class<? extends CompressionCodec>>();
112 // Add codec classes discovered via service loading
113 synchronized (CODEC_PROVIDERS) {
114 // CODEC_PROVIDERS is a lazy collection. Synchronize so it is
115 // thread-safe. See HADOOP-8406.
116 for (CompressionCodec codec : CODEC_PROVIDERS) {
117 result.add(codec.getClass());
118 }
119 }
120 // Add codec classes from configuration
121 String codecsString = conf.get("io.compression.codecs");
122 if (codecsString != null) {
123 StringTokenizer codecSplit = new StringTokenizer(codecsString, ",");
124 while (codecSplit.hasMoreElements()) {
125 String codecSubstring = codecSplit.nextToken().trim();
126 if (codecSubstring.length() != 0) {
127 try {
128 Class<?> cls = conf.getClassByName(codecSubstring);
129 if (!CompressionCodec.class.isAssignableFrom(cls)) {
130 throw new IllegalArgumentException("Class " + codecSubstring +
131 " is not a CompressionCodec");
132 }
133 result.add(cls.asSubclass(CompressionCodec.class));
134 } catch (ClassNotFoundException ex) {
135 throw new IllegalArgumentException("Compression codec " +
136 codecSubstring + " not found.",
137 ex);
138 }
139 }
140 }
141 }
142 return result;
143 }
144
145 /**
146 * Sets a list of codec classes in the configuration. In addition to any
147 * classes specified using this method, {@link CompressionCodec} classes on
148 * the classpath are discovered using a Java ServiceLoader.
149 * @param conf the configuration to modify
150 * @param classes the list of classes to set
151 */
152 public static void setCodecClasses(Configuration conf,
153 List<Class> classes) {
154 StringBuilder buf = new StringBuilder();
155 Iterator<Class> itr = classes.iterator();
156 if (itr.hasNext()) {
157 Class cls = itr.next();
158 buf.append(cls.getName());
159 while(itr.hasNext()) {
160 buf.append(',');
161 buf.append(itr.next().getName());
162 }
163 }
164 conf.set("io.compression.codecs", buf.toString());
165 }
166
167 /**
168 * Find the codecs specified in the config value io.compression.codecs
169 * and register them. Defaults to gzip and deflate.
170 */
171 public CompressionCodecFactory(Configuration conf) {
172 codecs = new TreeMap<String, CompressionCodec>();
173 codecsByClassName = new HashMap<String, CompressionCodec>();
174 codecsByName = new HashMap<String, CompressionCodec>();
175 List<Class<? extends CompressionCodec>> codecClasses = getCodecClasses(conf);
176 if (codecClasses == null || codecClasses.isEmpty()) {
177 addCodec(new GzipCodec());
178 addCodec(new DefaultCodec());
179 } else {
180 for (Class<? extends CompressionCodec> codecClass : codecClasses) {
181 addCodec(ReflectionUtils.newInstance(codecClass, conf));
182 }
183 }
184 }
185
186 /**
187 * Find the relevant compression codec for the given file based on its
188 * filename suffix.
189 * @param file the filename to check
190 * @return the codec object
191 */
192 public CompressionCodec getCodec(Path file) {
193 CompressionCodec result = null;
194 if (codecs != null) {
195 String filename = file.getName();
196 String reversedFilename = new StringBuilder(filename).reverse().toString();
197 SortedMap<String, CompressionCodec> subMap =
198 codecs.headMap(reversedFilename);
199 if (!subMap.isEmpty()) {
200 String potentialSuffix = subMap.lastKey();
201 if (reversedFilename.startsWith(potentialSuffix)) {
202 result = codecs.get(potentialSuffix);
203 }
204 }
205 }
206 return result;
207 }
208
209 /**
210 * Find the relevant compression codec for the codec's canonical class name.
211 * @param classname the canonical class name of the codec
212 * @return the codec object
213 */
214 public CompressionCodec getCodecByClassName(String classname) {
215 if (codecsByClassName == null) {
216 return null;
217 }
218 return codecsByClassName.get(classname);
219 }
220
221 /**
222 * Find the relevant compression codec for the codec's canonical class name
223 * or by codec alias.
224 * <p/>
225 * Codec aliases are case insensitive.
226 * <p/>
227 * The code alias is the short class name (without the package name).
228 * If the short class name ends with 'Codec', then there are two aliases for
229 * the codec, the complete short class name and the short class name without
230 * the 'Codec' ending. For example for the 'GzipCodec' codec class name the
231 * alias are 'gzip' and 'gzipcodec'.
232 *
233 * @param codecName the canonical class name of the codec
234 * @return the codec object
235 */
236 public CompressionCodec getCodecByName(String codecName) {
237 if (codecsByClassName == null) {
238 return null;
239 }
240 CompressionCodec codec = getCodecByClassName(codecName);
241 if (codec == null) {
242 // trying to get the codec by name in case the name was specified instead a class
243 codec = codecsByName.get(codecName.toLowerCase());
244 }
245 return codec;
246 }
247
248 /**
249 * Find the relevant compression codec for the codec's canonical class name
250 * or by codec alias and returns its implemetation class.
251 * <p/>
252 * Codec aliases are case insensitive.
253 * <p/>
254 * The code alias is the short class name (without the package name).
255 * If the short class name ends with 'Codec', then there are two aliases for
256 * the codec, the complete short class name and the short class name without
257 * the 'Codec' ending. For example for the 'GzipCodec' codec class name the
258 * alias are 'gzip' and 'gzipcodec'.
259 *
260 * @param codecName the canonical class name of the codec
261 * @return the codec class
262 */
263 public Class<? extends CompressionCodec> getCodecClassByName(String codecName) {
264 CompressionCodec codec = getCodecByName(codecName);
265 if (codec == null) {
266 return null;
267 }
268 return codec.getClass();
269 }
270
271 /**
272 * Removes a suffix from a filename, if it has it.
273 * @param filename the filename to strip
274 * @param suffix the suffix to remove
275 * @return the shortened filename
276 */
277 public static String removeSuffix(String filename, String suffix) {
278 if (filename.endsWith(suffix)) {
279 return filename.substring(0, filename.length() - suffix.length());
280 }
281 return filename;
282 }
283
284 /**
285 * A little test program.
286 * @param args
287 */
288 public static void main(String[] args) throws Exception {
289 Configuration conf = new Configuration();
290 CompressionCodecFactory factory = new CompressionCodecFactory(conf);
291 boolean encode = false;
292 for(int i=0; i < args.length; ++i) {
293 if ("-in".equals(args[i])) {
294 encode = true;
295 } else if ("-out".equals(args[i])) {
296 encode = false;
297 } else {
298 CompressionCodec codec = factory.getCodec(new Path(args[i]));
299 if (codec == null) {
300 System.out.println("Codec for " + args[i] + " not found.");
301 } else {
302 if (encode) {
303 CompressionOutputStream out = null;
304 java.io.InputStream in = null;
305 try {
306 out = codec.createOutputStream(
307 new java.io.FileOutputStream(args[i]));
308 byte[] buffer = new byte[100];
309 String inFilename = removeSuffix(args[i],
310 codec.getDefaultExtension());
311 in = new java.io.FileInputStream(inFilename);
312 int len = in.read(buffer);
313 while (len > 0) {
314 out.write(buffer, 0, len);
315 len = in.read(buffer);
316 }
317 } finally {
318 if(out != null) { out.close(); }
319 if(in != null) { in.close(); }
320 }
321 } else {
322 CompressionInputStream in = null;
323 try {
324 in = codec.createInputStream(
325 new java.io.FileInputStream(args[i]));
326 byte[] buffer = new byte[100];
327 int len = in.read(buffer);
328 while (len > 0) {
329 System.out.write(buffer, 0, len);
330 len = in.read(buffer);
331 }
332 } finally {
333 if(in != null) { in.close(); }
334 }
335 }
336 }
337 }
338 }
339 }
340 }