Hive自定义函数UDF、UDTF

Hive自定义函数UDF、UDTF,第1张

Hive自定义函数UDF、UDTF Hive自定义函数UDF、UDTF
  • hive中已经自带一些函数,但数量有限,有时候需要自己定义函数,自定义函数分为以下三种:
    • 1、UDF(User-Defined-Function)
      一进一出
      类似于:lower/upper/reverse
    • 2、UDAF(User-Defined Aggregation Function)
      聚集函数,多进一出
      类似于:count/max/min
      3、UDTF(User-Defined Table-Generating Functions)
      一进多出
      如lateral view explode()
1.自定义UDF
  • 需求:字符串大小写转换
1.1依赖

    
        org.apache.hive
        hive-exec
        2.1.0
    
    
        org.apache.hadoop
        hadoop-common
        2.7.5
    

1.2代码实现

public class Uppercase extends UDF {

    /**
     * UDF entry point: upper-cases the input text.
     *
     * @param s input text; may be null
     * @return the upper-cased text, or null when the input is null
     */
    public Text evaluate(final Text s) {
        if (null == s) {
            return null;
        }
        // Use Locale.ROOT for locale-independent case mapping: with the JVM's
        // default locale (e.g. Turkish) "i" would map to "İ" and corrupt data.
        return new Text(s.toString().toUpperCase(java.util.Locale.ROOT));
    }

}
1.3 函数使用 1.3.1 临时函数
  • 向hive的客户端添加jar包
# Go to Hive's lib directory, where the built jar was copied
cd /export/server/hive-2.1.0/lib
# Rename the build artifact to a clearer jar name
mv user-defined-function-1.0-SNAPSHOT.jar my_uppercase.jar
  • 连接hive,添加自定义临时函数
-- Register the jar with the current Hive session
add jar /export/server/hive-2.1.0/lib/my_uppercase.jar;
-- Create a session-scoped (temporary) function bound to the UDF class
create temporary function my_upercase as 'com.dk.udf.Uppercase';
-- Use the function
select my_upercase("abcDe");
1.3.2 永久函数
  • 上传jar到HDFS中
# Create a directory on HDFS to hold UDF jars
hadoop fs -mkdir /hive_func
# Upload the jar
hadoop fs -put /export/server/hive-2.1.0/lib/my_uppercase.jar /hive_func
  • 连接hive,添加自定义函数
-- Create a permanent function (visible across sessions); the jar is loaded
-- from HDFS so every HiveServer2 instance can reach it.
-- FIX: '#' is not a valid comment marker in Hive SQL; use '--'.
create function my_upercase2 as 'com.dk.udf.Uppercase'
    using jar 'hdfs://node1:8020/hive_func/my_uppercase.jar';
-- Use the function
select my_upercase2("abcDe");
2. 自定义UDTF
  • 需求:字符串一进多出
2.1 单列一进多出转换 2.1.1 代码实现
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

import java.util.ArrayList;
import java.util.List;


public class SplitString extends GenericUDTF {

    // Single-slot output row, reused across forward() calls to avoid
    // per-row allocation.
    private final transient Object[] forwardList = new Object[1];

    /**
     * Declares the UDTF's output schema: a single string column named
     * "column_1".
     *
     * @param argOIs inspectors for the call-site arguments (not validated here)
     * @return a struct inspector describing the one output column
     */
    @Override
    public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
        // Output column names
        List<String> fieldNames = new ArrayList<>();
        fieldNames.add("column_1");
        // Matching object inspectors: column_1 is a Java String
        List<ObjectInspector> inspectors = new ArrayList<>();
        inspectors.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, inspectors);
    }

    /**
     * Splits args[0] by the separator args[1] and forwards one row per token.
     *
     * @param args args[0] = string to split, args[1] = separator (regex)
     */
    @Override
    public void process(Object[] args) throws HiveException {
        // BUG FIX: the original checked args.length < 1 but then read args[1],
        // throwing ArrayIndexOutOfBoundsException on a single-argument call.
        // Also forward an explicit null rather than a stale previous row.
        if (args == null || args.length < 2 || args[0] == null || args[1] == null) {
            forwardList[0] = null;
            super.forward(forwardList);
            return;
        }
        // Data to split
        String argsStr = args[0].toString();
        // Separator; NOTE: String.split treats it as a regex, so callers must
        // escape metacharacters such as "." or "|".
        String splitStr = args[1].toString();
        String[] fields = argsStr.split(splitStr);
        for (String field : fields) {
            // Place the token in the reusable output row and emit it
            forwardList[0] = field;
            super.forward(forwardList);
        }
    }

    @Override
    public void close() throws HiveException {
        // No resources to release.
    }
}
2.1.2 函数使用
-- Register the UDTF jar with the current Hive session
add jar /export/server/hive-2.1.0/lib/my_split_string.jar;
-- Create a session-scoped function bound to the UDTF class
create temporary function split_string_udtf as 'com.dk.udtf.SplitString';

-- One input row produces one output row per comma-separated token
select split_string_udtf("索隆,路飞,山治,乔巴", ",");
2.2 多列一进多出转换 2.2.1 代码实现
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;


public class SplitMapList
extends GenericUDTF { private final transient Object[] fieldlist = new Object[2]; @Override public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException { //初始化列名 List fieldList = new ArrayList<>(); fieldList.add("column_1"); fieldList.add("column_2"); //初始化字段检查器,分别对应上面两个输出列的类型 List inspectors = new ArrayList<>(); inspectors.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector); inspectors.add(PrimitiveObjectInspectorFactory.javaLongObjectInspector); //返回列名和字段检查器集合 return ObjectInspectorFactory.getStandardStructObjectInspector(fieldList, inspectors); } @Override public void process(Object[] args) throws HiveException { if (args == null || args.length < 1){ super.forward(fieldlist); return; } //arg0需要分析的数据 String arg0 = args[0].toString(); //arg1第一个分隔符 String arg1 = args[1].toString(); //arg2第二个分隔符 String arg2 = args[2].toString(); String[] items = arg0.split(arg1); for (String item : items) { String[] beans = item.split(arg2); fieldlist[0] = beans[0]; fieldlist[1] = Long.parseLong(beans[1]); // fieldlist[1] = beans[1]; super.forward(fieldlist); } } @Override public void close() throws HiveException { } public static void main(String[] args) { String[] strings = new String[3]; strings[0] = "路飞:12000000000,索隆:8000000000,乔巴:3000000"; strings[1] = ","; strings[2] = ":"; //arg0需要分析的数据 String arg0 = strings[0].toString(); //arg1第一个分隔符 String arg1 = strings[1].toString(); //arg2第二个分隔符 String arg2 = strings[2].toString(); String[] items = arg0.split(arg1); for (String item : items) { String[] beans = item.split(arg2); System.out.println(Arrays.toString(beans)); } } }
2.2.2 函数使用
-- Rename the build artifact first, e.g.:
--   mv user-defined-function-1.0-SNAPSHOT.jar my_split_map.jar
-- Drop any previous definition; IF EXISTS keeps the script idempotent
-- (a bare DROP FUNCTION fails on the first run when nothing exists yet).
drop function if exists my_split_map;
create function my_split_map as 'com.dk.udtf.SplitMapList'
    using jar 'hdfs://node1:8020/hive_func/my_split_map.jar';

-- Each "name:bounty" entry becomes one (string, bigint) output row
select my_split_map("路飞:12000000000,索隆:8000000000,乔巴:3000000", ",", ":");

2.3 删除函数命令
-- Drop a temporary (session-scoped) function; uses the name created above
-- (the original referenced functions never defined in this article)
drop temporary function if exists my_upercase;
-- Drop a permanent function; this does NOT delete the jar on HDFS
drop function if exists my_upercase2;

欢迎分享,转载请注明来源:内存溢出

原文地址: http://outofmemory.cn/zaji/5676401.html

(0)
打赏 微信扫一扫 微信扫一扫 支付宝扫一扫 支付宝扫一扫
上一篇 2022-12-17
下一篇 2022-12-16

发表评论

登录后才能评论

评论列表(0条)

保存