Hive ships with a number of built-in functions, but they are limited, so sometimes you need to define your own. Custom functions fall into the following three categories (a short sketch of each kind using Hive built-ins follows the list):

- 1. UDF (User-Defined Function)
  One row in, one row out.
  For example: lower/upper/reverse
- 2. UDAF (User-Defined Aggregation Function)
  Aggregate function: many rows in, one row out.
  For example: count/max/min
- 3. UDTF (User-Defined Table-Generating Function)
  One row in, many rows out.
  For example: lateral view explode()
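The distinction is easiest to see with built-in functions. The sketch below is illustrative only; the `users` table and its `name` and `hobbies` (array&lt;string&gt;) columns are assumptions, not part of the original example:

```sql
-- UDF: one row in, one row out
SELECT upper(name) FROM users;

-- UDAF: many rows in, one row out
SELECT count(*) FROM users;

-- UDTF: one row in, many rows out (typically used with LATERAL VIEW)
SELECT name, hobby
FROM users
LATERAL VIEW explode(hobbies) h AS hobby;
```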
1. Custom UDF (User-Defined Function)

- Requirement: convert a string's case (e.g. lower case to upper case).
1.2 Code implementation

Add the Hive and Hadoop dependencies to the project's pom.xml:

```xml
<dependencies>
    <dependency>
        <groupId>org.apache.hive</groupId>
        <artifactId>hive-exec</artifactId>
        <version>2.1.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>2.7.5</version>
    </dependency>
</dependencies>
```
```java
package com.dk.udf;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

public class Uppercase extends UDF {
    public Text evaluate(final Text s) {
        if (null == s) {
            return null;
        }
        // Return the upper-case string
        return new Text(s.toString().toUpperCase());
    }
}
```

1.3 Using the function

1.3.1 Temporary function
- Copy the packaged jar into the Hive client's lib directory and rename it:

```shell
cd /export/server/hive-2.1.0/lib
mv user-defined-function-1.0-SNAPSHOT.jar my_uppercase.jar
```
- Connect to Hive and register the custom temporary function:

```sql
add jar /export/server/hive-2.1.0/lib/my_uppercase.jar;
-- Create the temporary function
create temporary function my_upercase as 'com.dk.udf.Uppercase';
-- Use the function
select my_upercase("abcDe");
```

1.3.2 Permanent function
- Upload the jar to HDFS:

```shell
# Create a directory on HDFS to hold the function jars
hadoop fs -mkdir /hive_func
# Upload the jar
hadoop fs -put /export/server/hive-2.1.0/lib/my_uppercase.jar /hive_func
```
- Connect to Hive and create the permanent function:

```sql
-- Create the permanent function
create function my_upercase2 as 'com.dk.udf.Uppercase'
    using jar 'hdfs://node1:8020/hive_func/my_uppercase.jar';
-- Use the function
select my_upercase2("abcDe");
```

2. Custom UDTF
2.1 Single-column transformation: one string in, multiple rows out

- Requirement: split a single string into multiple rows.

2.1.1 Code implementation
```java
package com.dk.udtf;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

import java.util.ArrayList;
import java.util.List;

public class SplitString extends GenericUDTF {

    // Reusable one-column output row
    private final transient Object[] forwardList = new Object[1];

    @Override
    public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
        // Output column names
        List<String> fieldNames = new ArrayList<>();
        fieldNames.add("column_1");
        // Object inspectors describing the output column types
        List<ObjectInspector> inspectors = new ArrayList<>();
        // The single output column is a string
        inspectors.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, inspectors);
    }

    @Override
    public void process(Object[] args) throws HiveException {
        // Expect two arguments: the string to split and the separator
        if (args == null || args.length < 2) {
            super.forward(forwardList);
            return;
        }
        // The string to split
        String argsStr = args[0].toString();
        // The separator
        String splitStr = args[1].toString();
        // Split the input into fields
        String[] fields = argsStr.split(splitStr);
        for (String field : fields) {
            // Put the field into the output row
            forwardList[0] = field;
            // Emit one row
            super.forward(forwardList);
        }
    }

    @Override
    public void close() throws HiveException {
    }
}
```

2.1.2 Using the function
```sql
add jar /export/server/hive-2.1.0/lib/my_split_string.jar;
create temporary function split_string_udtf as 'com.dk.udtf.SplitString';
select split_string_udtf("索隆,路飞,山治,乔巴", ",");
```
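In a real query, the UDTF is usually applied to a table column through LATERAL VIEW rather than called on a literal. A minimal, hypothetical sketch; the `crew` table and its `members` column are assumptions, not part of the original example:

```sql
-- Each value produced by the UDTF becomes its own row, joined back to the source row
SELECT c.ship, m.member
FROM crew c
LATERAL VIEW split_string_udtf(c.members, ",") m AS member;
```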
2.2 Multi-column transformation: one row in, multiple rows out

2.2.1 Code implementation

```java
package com.dk.udtf;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class SplitMapList extends GenericUDTF {

    // Reusable two-column output row
    private final transient Object[] fieldlist = new Object[2];

    @Override
    public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
        // Output column names
        List<String> fieldList = new ArrayList<>();
        fieldList.add("column_1");
        fieldList.add("column_2");
        // Object inspectors for the two output columns: string and bigint
        List<ObjectInspector> inspectors = new ArrayList<>();
        inspectors.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        inspectors.add(PrimitiveObjectInspectorFactory.javaLongObjectInspector);
        // Return the column names together with their inspectors
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldList, inspectors);
    }

    @Override
    public void process(Object[] args) throws HiveException {
        // Expect three arguments: the data, the record separator, and the key/value separator
        if (args == null || args.length < 3) {
            super.forward(fieldlist);
            return;
        }
        // arg0: the data to parse
        String arg0 = args[0].toString();
        // arg1: the first separator (between records)
        String arg1 = args[1].toString();
        // arg2: the second separator (between key and value)
        String arg2 = args[2].toString();
        String[] items = arg0.split(arg1);
        for (String item : items) {
            String[] beans = item.split(arg2);
            fieldlist[0] = beans[0];
            fieldlist[1] = Long.parseLong(beans[1]);
            // fieldlist[1] = beans[1]; // would be used if column_2 were declared as a string instead
            super.forward(fieldlist);
        }
    }

    @Override
    public void close() throws HiveException {
    }

    public static void main(String[] args) {
        // Quick local test of the split logic
        String[] strings = new String[3];
        strings[0] = "路飞:12000000000,索隆:8000000000,乔巴:3000000";
        strings[1] = ",";
        strings[2] = ":";
        // arg0: the data to parse
        String arg0 = strings[0];
        // arg1: the first separator
        String arg1 = strings[1];
        // arg2: the second separator
        String arg2 = strings[2];
        String[] items = arg0.split(arg1);
        for (String item : items) {
            String[] beans = item.split(arg2);
            System.out.println(Arrays.toString(beans));
        }
    }
}
```

2.2.2 Using the function
```sql
-- mv user-defined-function-1.0-SNAPSHOT.jar my_split_map.jar
drop function my_split_map;
create function my_split_map as 'com.dk.udtf.SplitMapList'
    using jar 'hdfs://node1:8020/hive_func/my_split_map.jar';
select my_split_map("路飞:12000000000,索隆:8000000000,乔巴:3000000", ",", ":");
```
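When a multi-column UDTF is applied to a table, LATERAL VIEW can alias both output columns. A hypothetical sketch; the `pirates` table and its `bounty_str` column are assumptions, not part of the original example:

```sql
-- The two UDTF output columns are exposed as t.name and t.bounty
SELECT p.crew, t.name, t.bounty
FROM pirates p
LATERAL VIEW my_split_map(p.bounty_str, ",", ":") t AS name, bounty;
```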
2.3 Dropping functions

```sql
-- Drop a temporary function
drop temporary function if exists split_string_udtf;
-- Drop a permanent function (the jar on HDFS is not deleted)
drop function if exists my_upercase2;
```