Chapter 7: MapReduce in Practice


Level 1: Age Statistics

Programming Requirements

Use MapReduce to compute each student's best score in the class. The input file path is /user/test/input; write the computed results to the /user/test/output/ directory.

Test Notes

The platform creates the input file for you each time you click Evaluate; you do not need to create it yourself. Just start HDFS and write the Java code.

The input file's data format is as follows, one "name score" record per line:

张三 12
李四 13
张三 89
李四 92
...

Given input in the format above, you should output:

李四 92

See the test set on the right for this level's exact expected output.

Because big-data labs consume significant resources and MapReduce jobs take a while to run, evaluation is slow, roughly 60 seconds. Please be patient.
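Before the MapReduce solution below, here is a minimal plain-Java sketch of the same per-student maximum over the sample records above. The class name MaxScoreSketch and the hard-coded data are ours, purely for illustration; they are not part of the graded exercise.

import java.util.Map;
import java.util.TreeMap;

public class MaxScoreSketch {
    public static void main(String[] args) {
        // Sample "name score" records, as in the input format above.
        String[] lines = {"张三 12", "李四 13", "张三 89", "李四 92"};
        Map<String, Integer> best = new TreeMap<>();
        for (String line : lines) {
            String[] parts = line.split(" ");
            // Keep the larger score seen so far for each student.
            best.merge(parts[0], Integer.parseInt(parts[1]), Math::max);
        }
        best.forEach((name, score) -> System.out.println(name + " " + score));
    }
}

In the MapReduce solution that follows, the shuffle plays the role of the grouping step: all scores for one student arrive at a single reduce call, and the reducer keeps the maximum.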

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCount {
	/********** Begin **********/
	public static class TokenizerMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
		private final IntWritable score = new IntWritable();
		private final Text word = new Text();

		@Override
		public void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			// Each line is one "name score" record; emit (name, score).
			StringTokenizer itr = new StringTokenizer(value.toString(), "\n");
			while (itr.hasMoreTokens()) {
				String[] str = itr.nextToken().split(" ");
				word.set(str[0]);
				score.set(Integer.parseInt(str[1]));
				context.write(word, score);
			}
		}
	}

	public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
		private final IntWritable result = new IntWritable();

		@Override
		public void reduce(Text key, Iterable<IntWritable> values, Context context)
				throws IOException, InterruptedException {
			// The shuffle groups every score for one student; keep the maximum.
			int maxScore = 0;
			for (IntWritable value : values) {
				maxScore = Math.max(maxScore, value.get());
			}
			result.set(maxScore);
			context.write(key, result);
		}
	}

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf, "max score");
		job.setJarByClass(WordCount.class);
		job.setMapperClass(TokenizerMapper.class);
		// Max is associative and commutative, so the reducer can double as a combiner.
		job.setCombinerClass(IntSumReducer.class);
		job.setReducerClass(IntSumReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		String inputFile = "/user/test/input";
		String outputFile = "/user/test/output/";
		FileInputFormat.addInputPath(job, new Path(inputFile));
		FileOutputFormat.setOutputPath(job, new Path(outputFile));
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	/********** End **********/
	}
}
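One side note: the driver above hard-codes its input and output paths, which is what the grader expects. Off the platform, the idiomatic Hadoop pattern is a ToolRunner-based driver that takes paths from the command line. A sketch under that assumption follows; the class name MaxScoreDriver is ours, not part of the exercise.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class MaxScoreDriver extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        // Same job wiring as WordCount.main, but paths come from args.
        Job job = Job.getInstance(getConf(), "max score");
        job.setJarByClass(MaxScoreDriver.class);
        job.setMapperClass(WordCount.TokenizerMapper.class);
        job.setCombinerClass(WordCount.IntSumReducer.class);
        job.setReducerClass(WordCount.IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        // ToolRunner also parses generic Hadoop options such as -D key=value.
        System.exit(ToolRunner.run(new Configuration(), new MaxScoreDriver(), args));
    }
}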
Level 2: Merging File Contents and Removing Duplicates

Programming Requirements

Next, let's consolidate the MapReduce knowledge we have learned with an exercise.

Given two input files, file1 and file2, write a MapReduce program that merges them and removes duplicate content, producing a new output file, file3. To complete this merge-and-dedup task, your program must combine different files containing overlapping content into one consolidated file with no duplicates, following these rules:

  • Sort the first column by student ID;
  • For equal student IDs, sort by x, y, z;
  • Input file path: /user/tmp/input/
  • Output path: /user/tmp/output/

Note: the input files have already been created on the back end; you do not need to create them again.

Test Notes

Your code will be tested as follows: the test input data is already specified, and your program must output the merged, deduplicated result. A sample of the input and output files is given below for reference.

A sample of input file file1:

20150101 x
20150102 y
20150103 x
20150104 y
20150105 z
20150106 x

A sample of input file file2:

20150101 y
20150102 y
20150103 x
20150104 z
20150105 y

The output file file3 obtained by merging file1 and file2 looks like:

20150101 x
20150101 y
20150102 y
20150103 x
20150104 y
20150104 z
20150105 y
20150105 z
20150106 x
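Before the MapReduce version, here is a minimal plain-Java sketch of the merge-and-dedup logic on the samples above. The class name MergeDedupSketch and the inlined data are ours, for illustration only. A TreeSet of whole lines removes duplicates and yields exactly the required order, since lexicographic comparison looks at the student ID first and the letter second.

import java.util.Collections;
import java.util.Set;
import java.util.TreeSet;

public class MergeDedupSketch {
    public static void main(String[] args) {
        String[] file1 = {"20150101 x", "20150102 y", "20150103 x",
                          "20150104 y", "20150105 z", "20150106 x"};
        String[] file2 = {"20150101 y", "20150102 y", "20150103 x",
                          "20150104 z", "20150105 y"};
        // A TreeSet drops duplicate lines and keeps them sorted.
        Set<String> merged = new TreeSet<>();
        Collections.addAll(merged, file1);
        Collections.addAll(merged, file2);
        merged.forEach(System.out::println);
    }
}

In the MapReduce solution below, the shuffle sorts and groups by student ID, and a TreeSet inside the reducer orders each ID's letters.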

import java.io.IOException;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.TreeSet;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Merge {

	/**
	 * @param args
	 * Merge files A and B, removing duplicate content, to produce a new output file C.
	 */
	// Override map here: copy each input record straight through.
	// Note that map must declare: throws IOException, InterruptedException
	public static class Map extends Mapper<Object, Text, Text, Text> {

	/********** Begin **********/
		@Override
		public void map(Object key, Text value, Context context)
				throws IOException, InterruptedException {
			Text text1 = new Text();
			Text text2 = new Text();
			// Each record is a "student-ID letter" pair; emit it as (key, value).
			StringTokenizer itr = new StringTokenizer(value.toString());
			while (itr.hasMoreTokens()) {
				text1.set(itr.nextToken());
				text2.set(itr.nextToken());
				context.write(text1, text2);
			}
		}
	/********** End **********/

	}

	// Override reduce here: copy the input key straight through to the output.
	// Note that reduce must declare: throws IOException, InterruptedException
	public static class Reduce extends Reducer<Text, Text, Text, Text> {
		@Override
		public void reduce(Text key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			// A TreeSet drops duplicate values and keeps them in x, y, z order.
			Set<String> set = new TreeSet<>();
			for (Text text : values) {
				set.add(text.toString());
			}
			for (String text : set) {
				context.write(key, new Text(text));
			}
		}
	}

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		// fs.defaultFS is the current name of the deprecated fs.default.name key.
		conf.set("fs.defaultFS", "hdfs://localhost:9000");

		Job job = Job.getInstance(conf, "Merge and duplicate removal");
		job.setJarByClass(Merge.class);
		job.setMapperClass(Map.class);
		// Deduplication is idempotent, so the reducer is safe to reuse as a combiner.
		job.setCombinerClass(Reduce.class);
		job.setReducerClass(Reduce.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		String inputPath = "/user/tmp/input/";    // set the input path here
		String outputPath = "/user/tmp/output/";  // set the output path here

		FileInputFormat.addInputPath(job, new Path(inputPath));
		FileOutputFormat.setOutputPath(job, new Path(outputPath));
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}

}
Level 3: Information Mining - Mining Parent-Child Relationships

Programming Requirements

Your program must mine parent-child relationships and produce a table of grandchild-grandparent relationships. The rules are:

  • Grandchild first, grandparent second;
  • For the same grandchild, list grandparents' names in A-Z order;
  • Input file path: /user/reduce/input
  • Output file path: /user/reduce/output

Test Notes

Your code will be tested as follows: given the child-parent table below, mine the parent-child relations in it and produce the grandchild-grandparent table.

The input file contains:

child parent
Steven Lucy
Steven Jack
Jone Lucy
Jone Jack
Lucy Mary
Lucy Frank
Jack Alice
Jack Jesse
David Alice
David Jesse
Philip David
Philip Alma
Mark David
Mark Alma

The output file should contain:

grandchild grandparent
Steven Alice
Steven Jesse
Jone Alice
Jone Jesse
Steven Mary
Steven Frank
Jone Mary
Jone Frank
Philip Alice
Philip Jesse
Mark Alice
Mark Jesse
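The trick is a reduce-side self-join: emit every record under both of its people as keys, so that one person's children and parents meet in the same reduce group, then take their cross product. Here is a plain-Java sketch of that idea over the sample table; the class name GrandparentJoinSketch and the inlined data are ours, for illustration only.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class GrandparentJoinSketch {
    public static void main(String[] args) {
        // The child-parent pairs from the sample table above.
        String[][] pairs = {
            {"Steven", "Lucy"}, {"Steven", "Jack"}, {"Jone", "Lucy"}, {"Jone", "Jack"},
            {"Lucy", "Mary"}, {"Lucy", "Frank"}, {"Jack", "Alice"}, {"Jack", "Jesse"},
            {"David", "Alice"}, {"David", "Jesse"}, {"Philip", "David"}, {"Philip", "Alma"},
            {"Mark", "David"}, {"Mark", "Alma"}
        };
        // Index each person both ways, mimicking the mapper's two tagged emissions:
        // childrenOf.get(X) holds rows where X is the parent;
        // parentsOf.get(X) holds rows where X is the child.
        Map<String, List<String>> childrenOf = new HashMap<>();
        Map<String, List<String>> parentsOf = new HashMap<>();
        for (String[] p : pairs) {
            childrenOf.computeIfAbsent(p[1], k -> new ArrayList<>()).add(p[0]);
            parentsOf.computeIfAbsent(p[0], k -> new ArrayList<>()).add(p[1]);
        }
        // The reducer's cross product: every child of X paired with every parent of X.
        for (Map.Entry<String, List<String>> e : childrenOf.entrySet()) {
            List<String> grandparents = parentsOf.get(e.getKey());
            if (grandparents == null) continue;
            for (String grandchild : e.getValue()) {
                for (String grandparent : grandparents) {
                    System.out.println(grandchild + " " + grandparent);
                }
            }
        }
    }
}

The MapReduce solution below implements the same join, with a "1"/"2" tag in each value to tell the two roles apart inside a reduce group.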

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class simple_data_mining {
	public static int time = 0;

	/**
	 * @param args
	 * Input: a child-parent table.
	 * Output: a table of grandchild-grandparent relationships.
	 */
	// The mapper splits each input line into child and parent on whitespace, then
	// emits the pair twice: once keyed by parent (right table) and once keyed by
	// child (left table). A tag in the value distinguishes the two tables.
	public static class Map extends Mapper<Object, Text, Text, Text> {
		@Override
		public void map(Object key, Text value, Context context)
				throws IOException, InterruptedException {
			/********** Begin **********/
			String line = value.toString();
			String[] childAndParent = line.split(" ");
			List<String> list = new ArrayList<>(2);
			for (String childOrParent : childAndParent) {
				if (!"".equals(childOrParent)) {
					list.add(childOrParent);
				}
			}
			// Skip the "child parent" header line.
			if (!"child".equals(list.get(0))) {
				String childName = list.get(0);
				String parentName = list.get(1);
				// Tag "1": keyed by parent; the child column may become a grandchild.
				String relationType = "1";
				context.write(new Text(parentName), new Text(relationType + "+"
						+ childName + "+" + parentName));
				// Tag "2": keyed by child; the parent column may become a grandparent.
				relationType = "2";
				context.write(new Text(childName), new Text(relationType + "+"
						+ childName + "+" + parentName));
			}
			/********** End **********/
		}
	}

	public static class Reduce extends Reducer<Text, Text, Text, Text> {
		@Override
		public void reduce(Text key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			/********** Begin **********/
			// Write the header once; with the default single reducer this runs
			// exactly once, before the first key.
			if (time == 0) {
				context.write(new Text("grandchild"), new Text("grandparent"));
				time++;
			}
			// Children of the key person, from tag-1 (left table) records.
			List<String> grandChild = new ArrayList<>();
			// Parents of the key person, from tag-2 (right table) records.
			List<String> grandParent = new ArrayList<>();
			for (Text text : values) {
				String[] relation = text.toString().split("\\+");
				String relationType = relation[0];
				String childName = relation[1];
				String parentName = relation[2];
				if ("1".equals(relationType)) {
					grandChild.add(childName);
				} else {
					grandParent.add(parentName);
				}
			}
			// Every (child of key, parent of key) pair is a grandchild-grandparent pair.
			for (String child : grandChild) {
				for (String parent : grandParent) {
					context.write(new Text(child), new Text(parent));
				}
			}
			/********** End **********/
		}
	}

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf, "Single table join");
		job.setJarByClass(simple_data_mining.class);
		job.setMapperClass(Map.class);
		job.setReducerClass(Reduce.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		String inputPath = "/user/reduce/input";    // set the input path
		String outputPath = "/user/reduce/output";  // set the output path
		FileInputFormat.addInputPath(job, new Path(inputPath));
		FileOutputFormat.setOutputPath(job, new Path(outputPath));
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}

}
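One detail worth flagging: the rules ask for grandparents of the same grandchild in A-Z order, yet the reducer above emits them in arrival order, and the sample expected output (Mary before Frank) shows the grader accepts exactly that. If strict alphabetical order were ever enforced, sorting both lists before the cross product would suffice. A hypothetical drop-in for the final loop of reduce above, not part of the graded solution:

			// Hypothetical variant: enforce A-Z order before emitting pairs.
			java.util.Collections.sort(grandChild);
			java.util.Collections.sort(grandParent);
			for (String child : grandChild) {
				for (String parent : grandParent) {
					context.write(new Text(child), new Text(parent));
				}
			}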

 
