- Create a Maven project in IntelliJ IDEA
In Project Structure, add Scala (the Scala environment must be set up correctly, otherwise the project will not run)
- Add the dependencies in pom.xml (you can press Alt+Insert and choose Dependency to pick the ones you need)
```xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>org.example</groupId>
    <artifactId>untitled</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.4.8</version>
        </dependency>
    </dependencies>
</project>
```
- Scala code completion setup: https://blog.csdn.net/qq_44065303/article/details/108345728
- Word count (case-insensitive)
```scala
import org.apache.spark.{SparkConf, SparkContext}

object word {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("word")
    val sc = new SparkContext(conf)
    // path of the input file on HDFS
    val t = sc.textFile("hdfs://localhost:9000/wordcount/word.txt")
    // drop empty lines and convert everything to upper case so the count is case-insensitive
    val t1 = t.filter(_.trim.length > 0).map(i => i.toUpperCase())
    // split into words, count each word (same key, values added up), sort by count descending and print
    t1.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _, 1).sortBy(_._2, false).collect().foreach(println)
    sc.stop()
  }
}
```
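If you want to keep the result instead of only printing it on the driver, the same pipeline can end with `saveAsTextFile`. A minimal sketch, assuming the same input file; the output directory `hdfs://localhost:9000/wordcount/output` is a hypothetical path and must not exist before the job runs:

```scala
import org.apache.spark.{SparkConf, SparkContext}

object wordSave {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("wordSave"))
    val counts = sc.textFile("hdfs://localhost:9000/wordcount/word.txt")
      .filter(_.trim.nonEmpty)
      .flatMap(_.toUpperCase.split(" "))
      .map((_, 1))
      .reduceByKey(_ + _)
      .sortBy(_._2, ascending = false)
    // hypothetical output directory on HDFS; Spark refuses to overwrite an existing one
    counts.saveAsTextFile("hdfs://localhost:9000/wordcount/output")
    sc.stop()
  }
}
```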
Case 2 (reference for the map operator: https://blog.csdn.net/xianpanjia4616/article/details/80947616)
- Compute the average age of a population of 1,000,000 people
First write a Scala program that generates a file of age data for 1,000,000 people
```scala
import java.io.FileWriter
import java.io.File
import scala.util.Random

object rry {
  def main(args: Array[String]): Unit = {
    // output path of the generated file
    val writer = new FileWriter(new File("/home/hadoop/data.txt"), false)
    val rand = new Random()
    for (i <- 1 to 1000000) {
      // one line per person: id and a random age in [0, 100)
      writer.write(i + " " + rand.nextInt(100))
      writer.write(System.getProperty("line.separator"))
    }
    writer.flush()
    writer.close()
  }
}
```
Upload the data to HDFS:

```bash
# open a terminal and start Hadoop
cd /usr/local/hadoop        # Hadoop installation path
./sbin/start-all.sh
# upload the file (/wordcount is the target directory on HDFS)
./bin/hdfs dfs -put /home/hadoop/data.txt /wordcount
```

- Implementation
```scala
import org.apache.spark.{SparkConf, SparkContext}

object scala1000 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("10000r").setMaster("local")
    val sc = new SparkContext(conf)
    // the file can also be read from HDFS: "hdfs://localhost:9000/wordcount/data.txt"
    val t = sc.textFile("file:///home/hadoop/data.txt")
    val count = t.count() // number of people
    // take the second field of each line, trim surrounding spaces, convert it to Int, then sum all ages
    val age = t.map(i => i.split(" ")(1)).map(j => j.trim.toInt).collect().reduce((a, b) => a + b)
    val ct = age.toDouble / count.toDouble // average age
    println(age + " " + count + " " + ct)
    sc.stop()
  }
}
```
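Note that `collect()` pulls all one million ages onto the driver before summing them. A minimal sketch of an alternative that keeps the aggregation in Spark, assuming the same data.txt format (the built-in `mean()` action on an RDD of doubles computes the average directly):

```scala
import org.apache.spark.{SparkConf, SparkContext}

object avgAge {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("avgAge"))
    // parse the age column as Double so the DoubleRDDFunctions (mean, sum, stats) are available
    val ages = sc.textFile("file:///home/hadoop/data.txt").map(_.split(" ")(1).trim.toDouble)
    println("average age: " + ages.mean())
    sc.stop()
  }
}
```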
Case 3:
- Statistics on gender and height for a population of 10,000
Generate the data file with the following Scala program
```scala
import java.io.FileWriter
import java.io.File
import scala.util.Random

object sheng {
  def main(args: Array[String]): Unit = {
    val writer = new FileWriter(new File("/home/hadoop/sheng.txt"), false)
    val rand = new Random()
    for (i <- 1 to 10000) {
      var height = rand.nextInt(220)
      if (height < 50) {
        height = height + 50
      }
      val gender = getRandomGender()
      // keep the heights plausible for each gender
      if (height < 100 && gender == "M") height = height + 100
      if (height < 100 && gender == "F") height = height + 50
      // write the gender that was actually used for the height adjustment
      writer.write(i + " " + gender + " " + height)
      writer.write(System.getProperty("line.separator"))
    }
    writer.flush()
    writer.close()
    println("People Information File generated successfully.")
  }

  def getRandomGender(): String = {
    val rand = new Random()
    val randNum = rand.nextInt(2) + 1
    if (randNum % 2 == 0) "M" else "F"
  }
}
```
The generated file looks like this:

```
1 M 177
2 F 210
3 M 193
4 M 220
......
```
- Implementation
Split the records by gender first, then compute the minimum and maximum height for each group
```scala
import org.apache.spark.{SparkConf, SparkContext}

object sheng {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("shengg")
    val sc = new SparkContext(conf)
    val t = sc.textFile("file:///home/hadoop/sheng.txt")
    // keep only "gender height" for the male records
    val t1 = t.filter(i => i.contains("M")).map(i => i.split(" ")(1) + " " + i.split(" ")(2))
    val t3 = t1.map(i => i.split(" ")(1).toInt).sortBy(i => i)        // male heights, ascending
    println("M:min " + t3.first())
    val t4 = t1.map(i => i.split(" ")(1).toInt).sortBy(i => i, false) // male heights, descending
    println("M:max " + t4.first())
    // keep only "gender height" for the female records
    val s1 = t.filter(i => i.contains("F")).map(i => i.split(" ")(1) + " " + i.split(" ")(2))
    val s3 = s1.map(i => i.split(" ")(1).toInt).sortBy(i => i)        // female heights, ascending
    println("F:min " + s3.first())
    val s4 = s1.map(i => i.split(" ")(1).toInt).sortBy(i => i, false) // female heights, descending
    println("F:max " + s4.first())
    sc.stop()
  }
}
```
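Sorting the whole RDD twice per gender only to take the first element is more work than necessary; the `min()` and `max()` actions return the same values directly. A minimal sketch under the same assumptions about the sheng.txt format:

```scala
import org.apache.spark.{SparkConf, SparkContext}

object shengMinMax {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("shengMinMax"))
    // parse each line "id gender height" into (gender, height)
    val records = sc.textFile("file:///home/hadoop/sheng.txt")
      .map(_.split(" "))
      .map(f => (f(1), f(2).toInt))
    for (g <- Seq("M", "F")) {
      val heights = records.filter(_._1 == g).map(_._2)
      println(g + ":min " + heights.min() + "  " + g + ":max " + heights.max())
    }
    sc.stop()
  }
}
```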