object SparkDataSkew {

  /** Demonstrates locating data-skew "hot keys" in an RDD: take a ~50% sample,
    * count occurrences per word, and print the two most frequent keys. The keys
    * that dominate the sample are the likely sources of skew in the full data.
    *
    * @param args unused command-line arguments
    */
  def main(args: Array[String]): Unit = {
    val sparkConf: SparkConf = new SparkConf()
      .setMaster("local")
      .setAppName("data_skew_key")
    val sparkContext = new SparkContext(sparkConf)
    try {
      val rdd: RDD[String] = sparkContext.makeRDD(
        List("spark", "flink", "hbase", "mysql", "spark", "spark", "spark", "flink", "flink"),
        2 // two partitions
      )
      rdd
        .sample(withReplacement = false, 0.5)        // sample without replacement, ~50% of elements
        .map((_, 1))
        .reduceByKey(_ + _)                          // word count over the sample
        .map { case (word, count) => (count, word) } // swap so the count becomes the key
        .sortByKey(ascending = false)                // sort by count, descending
        .take(2)                                     // keep the top-2 candidate hot keys
        .foreach(println)
    } finally {
      // Always release the SparkContext; the original leaked it by never calling stop().
      sparkContext.stop()
    }
  }
}
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)