Use MapReduce to clean and preprocess the given data, store the result in an HBase database with create/read/update/delete (CRUD) interfaces, and use Hive to compute basic statistics over the MapReduce-processed data.
Design requirements:

  1. Based on the characteristics of the data, design a task scenario and use MapReduce programming to clean and preprocess the data. (10 points)
  2. Use the HDFS Java API to write a program that uploads both the raw data and the preprocessed data to the distributed file system.

Dataset:
Link: https://pan.baidu.com/s/1rnUJn5ld45HpLhzbwYIM1A
Extraction code: 7bsd

package com.company.HDFS;

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Cleaning/preprocessing job: deduplicates records by emitting each input
 * line as the map output key, so the shuffle groups identical lines and the
 * reducer writes each distinct line exactly once.
 */
public class step0 {
	final static String INPUT_PATH = "hdfs://192.168.88.100/data";
	final static String OUTPUT_PATH = "hdfs://192.168.88.100/output";

	public static void main(String[] args) throws Exception {
		Configuration configuration = new Configuration();
		// Delete the output directory if it already exists, otherwise the job fails.
		FileSystem fileSystem = FileSystem.get(new URI(INPUT_PATH), configuration);
		if (fileSystem.exists(new Path(OUTPUT_PATH))) {
			fileSystem.delete(new Path(OUTPUT_PATH), true);
		}
		Job job = Job.getInstance(configuration, "step0");
		FileInputFormat.setInputPaths(job, INPUT_PATH);
		FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH));
		job.setJarByClass(step0.class);
		job.setMapperClass(ReMapper.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(NullWritable.class);
		job.setReducerClass(ReReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(NullWritable.class);
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}

	public static class ReMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
		@Override
		public void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			// Emit the whole line as the key; duplicates collapse in the shuffle.
			context.write(value, NullWritable.get());
		}
	}

	public static class ReReducer extends Reducer<Text, NullWritable, Text, NullWritable> {
		@Override
		protected void reduce(Text key, Iterable<NullWritable> values, Context context)
				throws IOException, InterruptedException {
			// Each distinct line is written exactly once, however many copies arrived.
			context.write(key, NullWritable.get());
		}
	}
}
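Deduplication alone does not reject malformed records. Since the dataset is the UCI iris file (four comma-separated numeric attributes plus a class label per line), a stricter cleaning mapper could also drop blank or malformed lines before deduplicating. Below is a minimal sketch that could stand in for ReMapper in step0; the field count and numeric checks are assumptions about the data format, not part of the original code.

    public static class CleanMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString().trim();
            if (line.isEmpty()) {
                return; // drop blank lines
            }
            String[] fields = line.split(",");
            if (fields.length != 5) {
                return; // drop records without 4 attributes + 1 label
            }
            try {
                for (int i = 0; i < 4; i++) {
                    Double.parseDouble(fields[i]); // attributes must be numeric
                }
            } catch (NumberFormatException e) {
                return; // drop records with non-numeric attributes
            }
            context.write(new Text(line), NullWritable.get());
        }
    }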
 
 
package com.company.HDFS;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.junit.jupiter.api.Test;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

public class step1 {

    
    @Test
    public void demo_03() {

        try {
            // 1. Get the file system
            Configuration configuration = new Configuration();
            FileSystem fs = FileSystem.get(new URI("hdfs://192.168.88.100:8020"), configuration, "root");

            // 2. List file details recursively from the root
            RemoteIterator<LocatedFileStatus> listFiles = fs.listFiles(new Path("/"), true);



            while (listFiles.hasNext()) {
                LocatedFileStatus status = listFiles.next();

                // Print file details
                // File name
                System.out.println(status.getPath().getName());
                // Length in bytes
                System.out.println(status.getLen());
                // Permissions
                System.out.println(status.getPermission());
                // Group
                System.out.println(status.getGroup());

                // Block locations of the file
                BlockLocation[] blockLocations = status.getBlockLocations();

                for (BlockLocation blockLocation : blockLocations) {

                    // Hosts storing this block
                    String[] hosts = blockLocation.getHosts();

                    for (String host : hosts) {
                        System.out.println(host);
                    }
                }

                System.out.println("-----------separator----------");
            }

            // 3. Release resources
            fs.close();
        } catch (Exception ex) {
            // Do not swallow exceptions silently.
            ex.printStackTrace();
        }
    }


    
    @Test
    public void testCopyFromLocalFile() throws IOException, InterruptedException, URISyntaxException {

        // 1. Get the file system (set replication factor 2 for the upload)
        Configuration configuration = new Configuration();
        configuration.set("dfs.replication", "2");
        FileSystem fs = FileSystem.get(new URI("hdfs://192.168.88.100:8020"), configuration, "root");

        // 2. Upload the local file (backslashes in the Windows path must be escaped)
        fs.copyFromLocalFile(new Path("J:\\the_efforts_paid_offf\\HDFS_Hbase_HiveApi\\src\\main\\java\\com\\company\\datas\\iris.data"), new Path("hdfs://192.168.88.100/input"));

        // 3. Release resources
        fs.close();

        System.out.println("over");
    }
}
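Requirement 2 asks for both the raw data and the preprocessed data to end up on HDFS. The raw file is uploaded above, and step0 already writes its cleaned output to /output on the cluster, so a quick existence check confirms both are in place. A minimal sketch of such a check, which could be added to step1 as another test (the /input and /output paths mirror the constants used earlier and are assumptions about the cluster layout):

    @Test
    public void verifyUploads() throws Exception {
        Configuration configuration = new Configuration();
        FileSystem fs = FileSystem.get(new URI("hdfs://192.168.88.100:8020"), configuration, "root");

        // Assumed layout: /input holds the raw upload, /output the cleaned data from step0.
        for (String p : new String[]{"/input", "/output"}) {
            System.out.println(p + " exists: " + fs.exists(new Path(p)));
        }
        fs.close();
    }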

package com.company.HDFS;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;
import org.junit.Test;

import java.io.IOException;


public class step2 {
    

    @Test
    public void createTable() throws IOException {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "node1:2181");
        // Open a connection
        Connection conn = ConnectionFactory.createConnection(conf);
        // Get the table admin
        Admin admin = conn.getAdmin();
        // Define the table
        HTableDescriptor hTableDescriptor = new HTableDescriptor(TableName.valueOf("demo"));
        // Define the column family
        HColumnDescriptor hColumnDescriptor = new HColumnDescriptor("info");
        // Attach the column family to the table
        hTableDescriptor.addFamily(hColumnDescriptor);
        // Create the table
        admin.createTable(hTableDescriptor);
        admin.close();
        conn.close();


    }

    

    @Test
    public void put(){
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "node1:2181");

        try {
            // Open a connection
            Connection conn = ConnectionFactory.createConnection(conf);
            // Get the table
            Table table = conn.getTable(TableName.valueOf("demo"));
            // Instantiate Put with the row key
            Put put = new Put("rk001".getBytes());
            // Specify column family, column qualifier, and value
            put.addColumn("info".getBytes(), "name".getBytes(), "zhangsan".getBytes());
            table.put(put);
            table.close();
            conn.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
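    // The assignment also requires an update ("改") operation. HBase has no separate
    // update API: writing another Put to the same row key and column creates a new
    // cell version that shadows the old value on read. A minimal sketch; the new
    // value "lisi" is illustrative, not from the original post.
    @Test
    public void update() throws IOException {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "node1:2181");

        Connection conn = ConnectionFactory.createConnection(conf);
        Table table = conn.getTable(TableName.valueOf("demo"));

        // Re-putting the same row key and column overwrites the visible value.
        Put put = new Put("rk001".getBytes());
        put.addColumn("info".getBytes(), "name".getBytes(), "lisi".getBytes());
        table.put(put);

        table.close();
        conn.close();
    }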
    

    @Test
    public  void scan() throws IOException {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "node1:2181");

        // Open a connection
        Connection conn = ConnectionFactory.createConnection(conf);

        // Get the table
        Table table = conn.getTable(TableName.valueOf("demo"));

        // Initialize a Scan instance
        Scan scan = new Scan();

        // Restrict the scan to one column
        scan.addColumn("info".getBytes(), "name".getBytes());
        // Run the scan
        ResultScanner rss = table.getScanner(scan);
        // Iterate over the results
        for(Result rs:rss){
            String valStr=Bytes.toString(rs.getValue("info".getBytes(), "name".getBytes()));
            System.out.println(valStr);
        }


        // Close resources
        table.close();
        conn.close();

    }
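    // Scan covers range queries; to read ("查") a single row by key, HBase's Get API
    // is the direct route. A minimal sketch along the same lines as the methods above.
    @Test
    public void get() throws IOException {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "node1:2181");

        Connection conn = ConnectionFactory.createConnection(conf);
        Table table = conn.getTable(TableName.valueOf("demo"));

        // Fetch one row by its row key.
        Get get = new Get("rk001".getBytes());
        Result result = table.get(get);
        String valStr = Bytes.toString(result.getValue("info".getBytes(), "name".getBytes()));
        System.out.println(valStr);

        table.close();
        conn.close();
    }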

    

    @Test
    public  void delete() throws IOException {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "node1:2181");

        // Open a connection
        Connection conn = ConnectionFactory.createConnection(conf);

        // Get the table
        Table table = conn.getTable(TableName.valueOf("demo"));

        // Instantiate Delete with the row key ("rk001" matches the row created by put above;
        // the original post had "rk0001" here, which would delete nothing)
        Delete del = new Delete("rk001".getBytes());
        // Execute the delete
        table.delete(del);


        // Close resources
        table.close();
        conn.close();
    }
}
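The task statement also calls for basic statistics over the cleaned data using Hive, which the original post does not implement. The following is only a sketch: it assumes HiveServer2 is reachable at jdbc:hive2://node1:10000, that an external table can be laid over the cleaned /output directory written by step0, and that the table and column names (iris_clean, sepal_length, species, and so on) are placeholder schema choices, not anything defined in the original code.

package com.company.HDFS;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

public class step3 {
    public static void main(String[] args) throws Exception {
        // HiveServer2 JDBC endpoint; host, port, and credentials are assumptions.
        Class.forName("org.apache.hive.jdbc.HiveDriver");
        Connection conn = DriverManager.getConnection("jdbc:hive2://node1:10000/default", "root", "");
        Statement stmt = conn.createStatement();

        // External table over the cleaned MapReduce output (schema is an assumption).
        stmt.execute("CREATE EXTERNAL TABLE IF NOT EXISTS iris_clean ("
                + "sepal_length DOUBLE, sepal_width DOUBLE, "
                + "petal_length DOUBLE, petal_width DOUBLE, species STRING) "
                + "ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' "
                + "LOCATION '/output'");

        // Basic statistics: record count and average sepal length per species.
        ResultSet rs = stmt.executeQuery(
                "SELECT species, COUNT(*), AVG(sepal_length) FROM iris_clean GROUP BY species");
        while (rs.next()) {
            System.out.println(rs.getString(1) + "\t" + rs.getLong(2) + "\t" + rs.getDouble(3));
        }

        rs.close();
        stmt.close();
        conn.close();
    }
}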
