DS:1.3.4
Ambari:2.6.3.0-235
二、python测试代码,dspythontest.pyimport os import pandas as pd from pyspark.sql import SparkSession from argparse import ArgumentParser spark = SparkSession .builder .appName("ds python test") .enableHiveSupport() .getOrCreate() def main(args): coc = ['red', 'rellow'] c_df = pd.Dataframe(coc,columns=['color']) c_df['len']=c_df['color'].apply(len) c_df = spark.createDataframe(c_df) c_df.show() print(args.arch) if __name__ == '__main__': parser = ArgumentParser() parser.add_argument("--arch", default='bert', type=str) args = parser.parse_args() main(args)三、 *** 作步骤 1、依赖环境上传到yarn
(1)将python依赖环境包pythonenv放到/home下;
(2)进入/home/pythonenv文件夹下将环境打包成zip待用:
zip -r pythonenv.zip ./*

2、将用到的python程序和配置文件传到DS资源中心

3、在DS项目管理中新建spark工作流
说明:
程序类型要选择python类型;
其他参数中前两行是将python依赖环境同步上传到yarn,#pythonenv不能省略,指的是将zip包解压到的文件夹;
其他参数的--files上传的是hive配置文件,由于ambari环境的此配置文件与程序有冲突,故修改配置重新上传此配置文件;
其他参数的最后一行末尾一定不能加反斜杠 \ ,否则DS会报错不存在dspythontest.py文件;
pyspark程序提交yarn集群,不需要在ds的dolphinscheduler_env.sh中配置python环境也可以运行,因为python所依赖的环境已经由DS上传到yarn上了;

三、遇到的问题

1、资源文件不存在问题
Exception in thread "main" java.io.FileNotFoundException: File file:/tmp/dolphinscheduler/exec/process/8/67/268/276/ dspythontest.py does not exist at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:624) at org.apache.hadoop.fs.RawLocalFileSystem.getFilelinkStatusInternal(RawLocalFileSystem.java:850) at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:614) at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:422) at org.apache.hadoop.fs.FileUtil.copy(FileUtil.java:340) at org.apache.hadoop.fs.FileUtil.copy(FileUtil.java:292) at org.apache.spark.deploy.yarn.Client.copyFileToRemote(Client.scala:357) at org.apache.spark.deploy.yarn.Client.org$apache$spark$deploy$yarn$Client$$distribute(Client.scala:476) at org.apache.spark.deploy.yarn.Client.prepareLocalResources(Client.scala:630) at org.apache.spark.deploy.yarn.Client.createContainerLaunchContext(Client.scala:845) at org.apache.spark.deploy.yarn.Client.submitApplication(Client.scala:170) at org.apache.spark.deploy.yarn.Client.run(Client.scala:1174) at org.apache.spark.deploy.yarn.Client$.main(Client.scala:1233) at org.apache.spark.deploy.yarn.Client.main(Client.scala) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:498) at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:782) at org.apache.spark.deploy.SparkSubmit$.doRunMain(SparkSubmit.scala:180) at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205) at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:119) at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
出现此问题的原因是在其他参数的最后一行末尾多加了反斜杠 \ ,导致运行命令为:
${SPARK_HOME2}/bin/spark-submit --master yarn --deploy-mode cluster --driver-cores 1 --driver-memory 512M --num-executors 2 --executor-cores 2 --executor-memory 2G --queue default --archives=hdfs:///pyenv/pythonenv.zip#pythonenv --conf spark.pyspark.python=./pythonenv/env37/bin/python3 dspythontest.py --arch 123

2、yarn日志报hive错误
Please make sure that jars for your version of hive and hadoop are included in the paths passed to spark.sql.hive.metastore.jars. at org.apache.spark.sql.hive.client.IsolatedClientLoader.createClient(IsolatedClientLoader.scala:274) at org.apache.spark.sql.hive.HiveUtils$.newClientFormetadata(HiveUtils.scala:362) at org.apache.spark.sql.hive.HiveUtils$.newClientFormetadata(HiveUtils.scala:266) at org.apache.spark.sql.hive.HiveExternalCatalog.client$lzycompute(HiveExternalCatalog.scala:66) at org.apache.spark.sql.hive.HiveExternalCatalog.client(HiveExternalCatalog.scala:65) at org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$databaseExists$1.apply$mcZ$sp(HiveExternalCatalog.scala:194) at org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$databaseExists$1.apply(HiveExternalCatalog.scala:194) at org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$databaseExists$1.apply(HiveExternalCatalog.scala:194) at org.apache.spark.sql.hive.HiveExternalCatalog.withClient(HiveExternalCatalog.scala:97) at org.apache.spark.sql.hive.HiveExternalCatalog.databaseExists(HiveExternalCatalog.scala:193) at org.apache.spark.sql.internal.SharedState.externalCatalog$lzycompute(SharedState.scala:105) at org.apache.spark.sql.internal.SharedState.externalCatalog(SharedState.scala:93) at org.apache.spark.sql.hive.HiveSessionStateBuilder.externalCatalog(HiveSessionStateBuilder.scala:39) at org.apache.spark.sql.hive.HiveSessionStateBuilder.catalog$lzycompute(HiveSessionStateBuilder.scala:54) at org.apache.spark.sql.hive.HiveSessionStateBuilder.catalog(HiveSessionStateBuilder.scala:52) at org.apache.spark.sql.hive.HiveSessionStateBuilder.catalog(HiveSessionStateBuilder.scala:35) at org.apache.spark.sql.internal.baseSessionStateBuilder.build(baseSessionStateBuilder.scala:289) at org.apache.spark.sql.SparkSession$.org$apache$spark$sql$SparkSession$$instantiateSessionState(SparkSession.scala:1071) ... 
16 more Caused by: java.lang.reflect.InvocationTargetException at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method) at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62) at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45) at java.lang.reflect.Constructor.newInstance(Constructor.java:423) at org.apache.spark.sql.hive.client.IsolatedClientLoader.createClient(IsolatedClientLoader.scala:268) ... 33 more Caused by: java.lang.NoClassDefFoundError: org/apache/tez/dag/api/SessionNotRunning at org.apache.hadoop.hive.ql.session.SessionState.start(SessionState.java:529) at org.apache.spark.sql.hive.client.HiveClientImpl.(HiveClientImpl.scala:192) ... 38 more Caused by: java.lang.ClassNotFoundException: org.apache.tez.dag.api.SessionNotRunning at java.net.URLClassLoader.findClass(URLClassLoader.java:382) at java.lang.ClassLoader.loadClass(ClassLoader.java:424) at org.apache.spark.sql.hive.client.IsolatedClientLoader$$anon$1.doLoadClass(IsolatedClientLoader.scala:225) at org.apache.spark.sql.hive.client.IsolatedClientLoader$$anon$1.loadClass(IsolatedClientLoader.scala:214) at java.lang.ClassLoader.loadClass(ClassLoader.java:357) ... 
40 more During handling of the above exception, another exception occurred: Traceback (most recent call last): File "dspythontest.py", line 9, in .appName("ds python test") File "/home/hadoop/yarn/local/usercache/hdfs/appcache/application_1642127047517_0110/container_e09_1642127047517_0110_01_000001/pyspark.zip/pyspark/sql/session.py", line 179, in getOrCreate File "/home/hadoop/yarn/local/usercache/hdfs/appcache/application_1642127047517_0110/container_e09_1642127047517_0110_01_000001/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__ File "/home/hadoop/yarn/local/usercache/hdfs/appcache/application_1642127047517_0110/container_e09_1642127047517_0110_01_000001/pyspark.zip/pyspark/sql/utils.py", line 79, in deco pyspark.sql.utils.IllegalArgumentException: "Error while instantiating 'org.apache.spark.sql.hive.HiveSessionStateBuilder':"
此错误出现在ambari环境的hive-site.xml上,需要将hive-site.xml中的如下配置删除,并跟随程序重新上传即可:

<property>
    <name>hive.execution.engine</name>
    <value>tez</value>
</property>

四、运行成功标志
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)