大数据培训之序列化案例实操

  1. 需求

统计每一个手机号耗费的总上行流量、下行流量、总流量

(1)输入数据

(2)输入数据格式:

7      13560436666     120.196.100.99           1116          954                   200

id      手机号码           网络ip                        上行流量  下行流量     网络状态码

(3)期望输出数据格式

13560436666             1116                954                          2070

手机号码               上行流量        下行流量                  总流量

2.需求分析

大数据培训

3.编写MapReduce程序

(1)编写流量统计的Bean对象

package com.atguigu.mapreduce.flowsum;

import java.io.DataInput;

import java.io.DataOutput;

import java.io.IOException;

import org.apache.hadoop.io.Writable;

 

// 1 实现writable接口

public class FlowBean implements Writable{

 

  private long upFlow;

  private long downFlow;

  private long sumFlow;

 

  //2  反序列化时,需要反射调用空参构造函数,所以必须有

  public FlowBean() {

      super();

  }

 

  public FlowBean(long upFlow, long downFlow) {

      super();

      this.upFlow = upFlow;

      this.downFlow = downFlow;

      this.sumFlow = upFlow + downFlow;

  }

 

  //3  写序列化方法

  @Override

  public void write(DataOutput out) throws IOException {

      out.writeLong(upFlow);

      out.writeLong(downFlow);

      out.writeLong(sumFlow);

  }

 

  //4 反序列化方法

  //5 反序列化方法读顺序必须和写序列化方法的写顺序必须一致

  @Override

  public void readFields(DataInput in) throws IOException {

      this.upFlow  = in.readLong();

      this.downFlow = in.readLong();

      this.sumFlow = in.readLong();

  }

 

  // 6 编写toString方法,方便后续打印到文本

  @Override

  public String toString() {

      return upFlow + “\t” + downFlow + “\t” + sumFlow;

  }

 

  public long getUpFlow() {

      return upFlow;

  }

 

  public void setUpFlow(long upFlow) {

      this.upFlow = upFlow;

  }

 

  public long getDownFlow() {

      return downFlow;

  }

 

  public void setDownFlow(long downFlow) {

      this.downFlow = downFlow;

  }

 

  public long getSumFlow() {

      return sumFlow;

  }

 

  public void setSumFlow(long sumFlow) {

      this.sumFlow = sumFlow;

  }

}

(2)编写Mapper类

package com.atguigu.mapreduce.flowsum;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Mapper;

 

public class FlowCountMapper extends Mapper<LongWritable, Text, Text, FlowBean>{

 

  FlowBean v = new FlowBean();

  Text k = new Text();

 

  @Override

  protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

 

      // 1 获取一行

      String line = value.toString();

 

      // 2 切割字段

      String[] fields = line.split(“\t”);

 

      // 3 封装对象

      // 取出手机号码

      String phoneNum = fields[1];

 

      // 取出上行流量和下行流量

      long upFlow = Long.parseLong(fields[fields.length – 3]);

      long downFlow = Long.parseLong(fields[fields.length – 2]);

 

      k.set(phoneNum);

      v.set(downFlow, upFlow);

 

      // 4 写出

      context.write(k, v);

  }

}

(3)编写Reducer类

package com.atguigu.mapreduce.flowsum;

import java.io.IOException;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Reducer;

 

public class FlowCountReducer extends Reducer<Text, FlowBean, Text, FlowBean> {

 

  @Override

  protected void reduce(Text key, Iterable<FlowBean> values, Context context)throws IOException, InterruptedException {

 

      long sum_upFlow = 0;

      long sum_downFlow = 0;

 

      // 1 遍历所用bean,将其中的上行流量,下行流量分别累加

      for (FlowBean flowBean : values) {

         sum_upFlow += flowBean.getUpFlow();

         sum_downFlow += flowBean.getDownFlow();

      }

 

      // 2 封装对象

      FlowBean resultBean = new FlowBean(sum_upFlow, sum_downFlow);

 

      // 3 写出

      context.write(key, resultBean);

  }

}

(4)编写Driver驱动类

package com.atguigu.mapreduce.flowsum;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

 

public class FlowsumDriver {

 

  public static void main(String[] args) throws IllegalArgumentException, IOException, ClassNotFoundException, InterruptedException {

 

// 输入输出路径需要根据自己电脑上实际的输入输出路径设置

args = new String[] { “e:/input/inputflow”, “e:/output1” };

 

      // 1 获取配置信息,或者job对象实例

      Configuration configuration = new Configuration();

      Job job = Job.getInstance(configuration);

 

      // 6 指定本程序的jar包所在的本地路径

      job.setJarByClass(FlowsumDriver.class);

 

      // 2 指定本业务job要使用的mapper/Reducer业务类

      job.setMapperClass(FlowCountMapper.class);

      job.setReducerClass(FlowCountReducer.class);

 

      // 3 指定mapper输出数据的kv类型

      job.setMapOutputKeyClass(Text.class);

      job.setMapOutputValueClass(FlowBean.class);

 

      // 4 指定最终输出的数据的kv类型

      job.setOutputKeyClass(Text.class);

      job.setOutputValueClass(FlowBean.class);

 

      // 5 指定job的输入原始文件所在目录

      FileInputFormat.setInputPaths(job, new Path(args[0]));

      FileOutputFormat.setOutputPath(job, new Path(args[1]));

 

      // 7 将job中配置的相关参数,以及job所用的java类所在的jar包, 提交给yarn去运行

      boolean result = job.waitForCompletion(true);

      System.exit(result ? 0 : 1);

  }

}

想要了解跟多关于大数据培训课程内容欢迎关注尚硅谷大数据培训,尚硅谷除了这些技术文章外还有免费的高质量大数据培训课程视频供广大学员下载学习


上一篇:
下一篇:
关于尚硅谷
教育理念
名师团队
学员心声
资源下载
视频下载
资料下载
工具下载
加入我们
招聘岗位
岗位介绍
招贤纳师
联系我们
电话:010-56253825
邮箱:info@atguigu.com
地址:北京市昌平区宏福科技园综合楼6层(北京校区)

 深圳市宝安区西部硅谷大厦B座C区一层(深圳校区)

上海市松江区谷阳北路166号大江商厦6层(上海校区)