数据处理过程图
package hadoop_test.data_duplicate_demo_02; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; public class DupDriver { public static void main(String[] args) throws Exception { System.setProperty("HADOOP_USER_NAME", "root"); Configuration conf = new Configuration(); Job job = Job.getInstance(conf); job.setJarByClass(DupDriver.class); job.setMapperClass(DupMapper.class); job.setReducerClass(DupReducer .class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(NullWritable.class); // Value为NULL,具体原因后续会有说明 job.setOutputKeyClass(Text.class); job.setOutputValueClass(NullWritable.class); FileInputFormat.setInputPaths(job, new Path("/hadoop_test/dup/dup.txt")); FileOutputFormat.setOutputPath(job, new Path("/hadoop_test/dup/word_count_result")); job.waitForCompletion(true); } }2. DupMapper
package hadoop_test.data_duplicate_demo_02; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Mapper; import java.io.IOException; public class DupMapper extends Mapper3. DupReducer{ @Override protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { // value : 192.168.70.49 // 因为目标为去重,不用统计数量。因此不用让key为1,可减少IO时间开销 context.write(new Text(value),NullWritable.get()); } }
package hadoop_test.data_duplicate_demo_02;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Reducer for the deduplication job: writes each key it receives exactly once,
 * paired with a {@link NullWritable} value.
 */
public class DupReducer extends Reducer<Text, NullWritable, Text, NullWritable> {

    /**
     * Emits the key once, ignoring the grouped values.
     *
     * @param key     the key passed to this reduce call
     * @param values  the values grouped under {@code key} (not read)
     * @param context job context used to emit the output pair
     * @throws IOException          if writing the output fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<NullWritable> values, Context context)
            throws IOException, InterruptedException {
        // One write per reduce call: the values are never iterated, so each key
        // appears once in the output no matter how many values were grouped under it.
        context.write(key, NullWritable.get());
    }
}
每行代码的详细功能可参考:【Hadoop学习项目】1. wordcount + combine 详解每行代码
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)