Big Data Training: Hands-On Compression Cases

1 Compressing and Decompressing a Data Stream


We will test the following compression codecs:

Table 4-11

Compression format    Codec class
DEFLATE               org.apache.hadoop.io.compress.DefaultCodec
gzip                  org.apache.hadoop.io.compress.GzipCodec
bzip2                 org.apache.hadoop.io.compress.BZip2Codec
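
Each codec also has a default file extension, returned by CompressionCodec.getDefaultExtension(): .deflate for DefaultCodec, .gz for GzipCodec and .bz2 for BZip2Codec. The compress method below uses this extension to name its output file, and CompressionCodecFactory in decompress uses the extension to pick the matching codec.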

package com.atguigu.mapreduce.compress;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.CompressionInputStream;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.util.ReflectionUtils;

public class TestCompress {

    public static void main(String[] args) throws Exception {
        compress("e:/hello.txt", "org.apache.hadoop.io.compress.BZip2Codec");
//      decompress("e:/hello.txt.bz2");
    }

    // 1. Compress
    private static void compress(String filename, String method) throws Exception {

        // (1) Open the input stream
        FileInputStream fis = new FileInputStream(new File(filename));

        // Instantiate the codec from its fully qualified class name
        Class<?> codecClass = Class.forName(method);
        CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, new Configuration());

        // (2) Open the output stream, appending the codec's default extension
        FileOutputStream fos = new FileOutputStream(new File(filename + codec.getDefaultExtension()));
        CompressionOutputStream cos = codec.createOutputStream(fos);

        // (3) Copy the stream
        IOUtils.copyBytes(fis, cos, 1024 * 1024 * 5, false);

        // (4) Close the resources
        cos.close();
        fos.close();
        fis.close();
    }

    // 2. Decompress
    private static void decompress(String filename) throws FileNotFoundException, IOException {

        // (0) Check whether the file can be decompressed
        CompressionCodecFactory factory = new CompressionCodecFactory(new Configuration());
        CompressionCodec codec = factory.getCodec(new Path(filename));
        if (codec == null) {
            System.out.println("cannot find codec for file " + filename);
            return;
        }

        // (1) Open the input stream
        CompressionInputStream cis = codec.createInputStream(new FileInputStream(new File(filename)));

        // (2) Open the output stream
        FileOutputStream fos = new FileOutputStream(new File(filename + ".decoded"));

        // (3) Copy the stream
        IOUtils.copyBytes(cis, fos, 1024 * 1024 * 5, false);

        // (4) Close the resources
        cis.close();
        fos.close();
    }
}
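
Running main as written compresses e:/hello.txt into e:/hello.txt.bz2. If you then uncomment the decompress call and point it at e:/hello.txt.bz2, the restored bytes are written to e:/hello.txt.bz2.decoded, which should match the original file.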

2 Enabling Compression for Map Output

Even if the input and output files of your MapReduce job are uncompressed, you can still compress the intermediate output of the map tasks. Because that intermediate data is written to disk and transferred over the network to the reduce nodes, compressing it can improve performance considerably. Enabling it only requires setting two properties; let's look at how to set them in code.
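
The two properties are mapreduce.map.output.compress, which switches compression of the intermediate map output on, and mapreduce.map.output.compress.codec, which selects the codec. Stripped to its essence, the driver change is just the following excerpt from the full driver shown under step 1 below:

// The two map-output compression settings, set on the Configuration before the Job is created
Configuration configuration = new Configuration();
configuration.setBoolean("mapreduce.map.output.compress", true);
configuration.setClass("mapreduce.map.output.compress.codec", BZip2Codec.class, CompressionCodec.class);
Job job = Job.getInstance(configuration);

Everything else in the job setup stays exactly as in an ordinary WordCount driver.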

1. Compression codecs supported by the Hadoop distribution provided to you include BZip2Codec and DefaultCodec.

package com.atguigu.mapreduce.compress;

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountDriver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        Configuration configuration = new Configuration();

        // Enable compression of map output
        configuration.setBoolean("mapreduce.map.output.compress", true);
        // Set the codec used for map output compression
        configuration.setClass("mapreduce.map.output.compress.codec", BZip2Codec.class, CompressionCodec.class);

        Job job = Job.getInstance(configuration);

        job.setJarByClass(WordCountDriver.class);

        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean result = job.waitForCompletion(true);

        // Exit with 0 on success, 1 on failure
        System.exit(result ? 0 : 1);
    }
}

2. The Mapper stays unchanged

package com.atguigu.mapreduce.compress;

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    Text k = new Text();
    IntWritable v = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

        // 1 Read one line
        String line = value.toString();

        // 2 Split it into words
        String[] words = line.split(" ");

        // 3 Emit each word with a count of 1
        for (String word : words) {
            k.set(word);
            context.write(k, v);
        }
    }
}

3. The Reducer stays unchanged

package com.atguigu.mapreduce.compress;

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    IntWritable v = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values,
            Context context) throws IOException, InterruptedException {

        int sum = 0;

        // 1 Sum the counts for this key
        for (IntWritable value : values) {
            sum += value.get();
        }

        v.set(sum);

        // 2 Write the result
        context.write(key, v);
    }
}

3 Enabling Compression for Reduce Output

This builds on the WordCount example.

1. Modify the driver

package com.atguigu.mapreduce.compress;

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.io.compress.Lz4Codec;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountDriver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        Configuration configuration = new Configuration();

        Job job = Job.getInstance(configuration);

        job.setJarByClass(WordCountDriver.class);

        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Enable compression of the reduce (job) output
        FileOutputFormat.setCompressOutput(job, true);

        // Choose the compression codec
        FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
//      FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
//      FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);

        boolean result = job.waitForCompletion(true);

        // Exit with 0 on success, 1 on failure
        System.exit(result ? 0 : 1);
    }
}
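
With BZip2Codec selected, the part files in the job's output directory should come out as part-r-00000.bz2 rather than plain text; switching to GzipCodec or DefaultCodec would instead produce .gz or .deflate part files.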

2. The Mapper and Reducer stay unchanged (see 4.6.2).

To learn more about the big data training curriculum, follow Atguigu (尚硅谷) big data training; besides technical articles like this one, Atguigu also offers free, high-quality big data training course videos for learners to download and study.

