欢迎您访问 最编程 本站为您分享编程语言代码,编程技术文章!
您现在的位置是: 首页

九十二:Spark-SparkSQL(统计电影平均得分前十名)

最编程 2024-04-04 07:00:15
...
package org.example.SQL

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

/**
 * Movie rating analysis with Spark SQL.
 *
 * Reads a tab-separated ratings file, keeps the second field as `movieid` and
 * the third field (parsed as an Int) as `score`, and prints the ten movies
 * with the highest average score among those rated more than 200 times.
 */
object sql_Movie {

  /**
   * Program entry point.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // Silence Spark's verbose INFO/WARN logging so the result tables stay readable.
    Logger.getLogger("org").setLevel(Level.ERROR)

    val spark: SparkSession = SparkSession.builder()
      .appName("sparksql")
      .master("local[*]")
      .config("spark.sql.shuffle.partitions", 4)
      .getOrCreate()
    import spark.implicits._

    try {
      // Each input line is one rating record with tab-separated fields.
      val ds: Dataset[String] = spark.read.textFile("data/text/rating_100k.data")

      // Keep only the movie id (field 1) and the score (field 2).
      val movies: DataFrame = ds.map { line =>
        val arr: Array[String] = line.split("\t")
        (arr(1), arr(2).toInt)
      }.toDF("movieid", "score")
      movies.printSchema()
      movies.show()

      // Top 10 movies by average score among movies rated more than 200 times.
      movies.createOrReplaceTempView("movie")
      // `desc` is required: ORDER BY sorts ascending by default, which would
      // return the ten LOWEST averages instead of the top ten.
      val sql: String =
        """
          |select movieid, count(*) as counts, avg(score) as avgs
          |from movie
          |group by movieid
          |having counts > 200
          |order by avgs desc
          |limit 10
          |""".stripMargin
      spark.sql(sql).show()
    } finally {
      // Release the local Spark context even if the job fails.
      spark.stop()
    }
  }
}