// 92: Spark-SparkSQL — top-10 movies by average rating
// Source: zuibiancheng (最编程) blog, 2024-04-04 07:00:15
package org.example.SQL
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
//电影数据分析
// Movie-ratings analysis: print the top-10 movies by average score among
// movies that received more than 200 ratings.
object sql_Movie {
  def main(args: Array[String]): Unit = {
    // Silence Spark's verbose INFO logging so the result tables are readable.
    Logger.getLogger("org").setLevel(Level.ERROR)
    val spark: SparkSession = SparkSession.builder().appName("sparksql").master("local[*]")
      // Small local dataset — 4 shuffle partitions instead of the default 200.
      .config("spark.sql.shuffle.partitions", 4)
      .getOrCreate()
    import spark.implicits._
    // Each input line is tab-separated; fields 1 and 2 are movie id and score
    // (MovieLens 100k layout: userid \t movieid \t score \t timestamp —
    // NOTE(review): layout assumed from the indices used below, confirm against the file).
    val ds: Dataset[String] = spark.read.textFile("data/text/rating_100k.data")
    val movies: DataFrame = ds.map(line => {
      val arr: Array[String] = line.split("\t")
      (arr(1), arr(2).toInt) // (movieid, score)
    }).toDF("movieid", "score")
    movies.printSchema()
    movies.show()
    // Top 10 by average score among movies rated more than 200 times.
    movies.createOrReplaceTempView("movie")
    // val (not var): the query string is never reassigned.
    // `|` margins + stripMargin keep the SQL free of source indentation.
    // `desc` is required — without it the query returns the BOTTOM 10.
    val sql =
      """
        |select movieid, count(*) as counts, avg(score) as avgs
        |from movie
        |group by movieid
        |having counts > 200
        |order by avgs desc
        |limit 10
        |""".stripMargin
    spark.sql(sql).show()
    spark.stop() // release the local SparkSession's resources
  }
}