记录一些pyspark的简单程序......
WordCount 示例:读取 HDFS 上的文本文件并统计词频(word count):
"""Simple PySpark word-count demo."""
from pyspark import SparkConf, SparkContext


def sorted_all_result(wc_rdd):
    """Collect the full word-count result and sort it on the driver.

    Collects every (word, count) pair with ``collect()``, sorts by count in
    descending order, prints the sorted list and returns it.

    :param wc_rdd: RDD of (word, count) tuples
    :return: list of (word, count) tuples sorted by count, descending
    """
    word_list = wc_rdd.collect()
    sorted_list = sorted(word_list, key=lambda pair: pair[1], reverse=True)
    print(sorted_list)
    return sorted_list


def sorted_top_n_result(wc_rdd, n):
    """Take the top-N word counts with ``takeOrdered()``, highest first.

    :param wc_rdd: RDD of (word, count) tuples
    :param n: number of entries to take
    :return: list of the n (word, count) tuples with the highest counts
    """
    # takeOrdered() returns the *smallest* n elements under the key, so the
    # count is negated to get the highest counts first.  (The original key
    # `pair[1]` produced the bottom-N, contradicting the docstring's
    # promise of a descending top-N.)
    top_n_list = wc_rdd.takeOrdered(n, key=lambda pair: -pair[1])
    print(top_n_list)
    return top_n_list


def main():
    """Run a word count over an HDFS text file on a standalone Spark cluster."""
    conf = SparkConf().setAppName("PysparkDemo01").setMaster("spark://192.168.61.128:7077")
    sc = SparkContext(conf=conf)
    rdd = sc.textFile("hdfs://192.168.61.128:9000/data/wc.txt")
    # Strip simple punctuation, lower-case, split on spaces, then the
    # classic (word, 1) -> reduceByKey word count.
    wc_rdd = (
        rdd.flatMap(lambda line: str(line).replace(".", "").replace(",", "").lower().split(" "))
        .map(lambda word: (word, 1))
        .reduceByKey(lambda x, y: x + y)
    )
    # sorted_all_result(wc_rdd)
    # print(type(wc_rdd))
    sorted_top_n_result(wc_rdd, 3)


if __name__ == "__main__":
    main()
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)