为电影处理蜂巢数据 蜂巢电影评论数据分析
数据和需求的概览
现有如此三份数据:
1、users.dat 数据格式为: 2::M::56::16::70072
对应字段为:UserID BigInt, Gender String, Age Int, Occupation String, Zipcode String
对应字段中文解释:用户id,性别,年龄,职业,邮政编码
2、movies.dat 数据格式为: 2::Jumanji (1995)::Adventure|Children’s|Fantasy
对应字段为:MovieID BigInt, Title String, Genres String
对应字段中文解释:电影ID,电影名字,电影类型
3、ratings.dat 数据格式为: 1::1193::5::978300760
对应字段为:UserID BigInt, MovieID BigInt, Rating Double, Timestamped String
对应字段中文解释:用户ID,电影ID,评分,评分时间戳
数据要求:
(1)写shell脚本清洗数据。(hive不支持解析多字节的分隔符,也就是说hive只能解析’:’, 不支持解析’::’,所以用普通方式建表来使用是行不通的,要求对数据做一次简单清洗)
(2)使用Hive能解析的方式进行
Hive要求:
(1)正确建表,导入数据(三张表,三份数据)
(2)求被评分次数最多的10部电影,并给出评分次数(电影名,评分次数)
(3)分别求男性,女性当中评分最高的10部电影(性别,电影名,影评分)
(4)求movieid = 2116这部电影各年龄段(因为年龄就只有7个,就按这个7个分就好了)的平均影评(年龄段,影评分)
(5)求最喜欢看电影(影评次数最多)的那位女性评最高分的10部电影的平均影评分(观影者,电影名,影评分)
(6)求好片(评分>=4.0)最多的那个年份的最好看的10部电影
(7)求1997年上映的电影中,评分最高的10部Comedy类电影
(8)该影评库中各种类型电影中评价最高的5部电影(类型,电影名,平均影评分)
(9)各年评分最高的电影类型(年份,类型,影评分)
(10)每个地区最高评分的电影名,把结果存入HDFS(地区,电影名,影评分)
1. 数据清洗
1.1 users.dat
sed -i "s/::/:/g" users.dat
1.2 movies.dat
sed -i "s/::/:/g" movies.dat
1.3 ratings.dat
sed -i "s/::/:/g" ratings.dat
2. 正确建表,导入数据(三张表,三份数据)
2.1 users
create table users(
userid bigint,
gender String,
age int,
occupation String,
zipcode String
)
row format delimited fields terminated by ':';
load data local inpath '/opt/data/users.dat' overwrite into table moves_test.users
2.2 movies
create table moves_test.movies
(
movieid bigint,
title string,
genres string
)
row format delimited fields terminated by ':';
load data local inpath '/opt/data/movies.dat' overwrite into table moves_test.movies;
2.3 ratings
create table moves_test.ratings
(
userid bigint,
movieid bigint,
rating double,
timestamped string
)
row format delimited fields terminated by ':';
load data local inpath '/opt/data/ratings.dat' overwrite into table moves_test.ratings;
3. 求被评分次数最多的10部电影,并给出评分次数(电影名,评分次数)
3.1 先找出来评分次数最多的10部电影和他评分次数
select movieid ,count(userid) rat_count from moves_test.ratings
group by movieid
limit 10;
3.2 再和movies表关联拿到电影名称
select title,t1.movieid,rat_count from (select movieid ,count(userid) rat_count from moves_test.ratings
group by movieid
limit 10 ) t1
join (select movieid,title from moves_test.movies ) t2
on t1.movieid = t2.movieid;
4. 分别求男性,女性当中评分最高的10部电影(性别,电影名,影评分)
4.1 先找出来每个评分的性别
select t1.gender,t2.movieid,t2.rating from moves_test.users t1
join (select userid ,movieid,rating from moves_test.ratings ) t2
on t1.userid = t2.userid
4.2 使用row_number开窗函数求出来每个电影的排名
select gender,movieid,ct from (select *,row_number() over(partition by gender order by ct desc ) rk from (
select gender,movieid,sum(rating) ct from (
select t1.gender,t2.movieid,t2.rating from moves_test.users t1
join (select userid ,movieid,rating from moves_test.ratings ) t2
on t1.userid = t2.userid) t
group by gender, movieid ) t ) t
where rk <=10
4.3 根据movies拿到电影名称
select t1.*,t2.title from (select gender,movieid,ct from (
select *,row_number() over(partition by gender order by ct desc ) rk from (
select gender,movieid,sum(rating) ct from (
select t1.gender,t2.movieid,t2.rating from moves_test.users t1
join (select userid ,movieid,rating from moves_test.ratings ) t2
on t1.userid = t2.userid) t
group by gender, movieid ) t ) t
where rk <=10 ) t1 join moves_test.movies t2
on t1.movieid = t2.movieid
5. 求movieid = 2116这部电影各年龄段(因为年龄就只有7个,就按这个7个分就好了)的平均影评(年龄段,影评分)
5.1 找出来对2116 进行评分的人
select userid,rating from moves_test.ratings where movieid = 2116
5.2 和user表进行关联得到最终结果
select age,avg(rating ) avg_rat from (select userid,rating
from moves_test.ratings where movieid = 2116 ) r
join (select userid ,age from users ) u
on r.userid = u.userid
group by age
6. 求最喜欢看电影(影评次数最多)的那位女性评最高分的10部电影的平均影评分(观影者,电影名,影评分)
6.1 找出来最喜欢看电影的那位女性
select userid from (select r.userid,count(*) ct from ratings r
join users u
on r.userid = u.userid
where u.gender = 'F'
group by r.userid
order by ct desc limit 1 ) t1
6.2 找出来这位女性评分最高的十部电影
select userid,movieid from (select userid,movieid ,rating from ratings
where userid in (select userid from (select r.userid,count(*) ct from ratings r
join users u
on r.userid = u.userid
where u.gender = 'F'
group by r.userid
order by ct desc limit 1 ) t1 )
order by rating desc
limit 10
) t2
6.3 找出来这几部电影的平均得分
select t2.userid,t2.movieid,avg(r.rating ) avg_rating from (select userid,movieid ,rating from ratings
where userid in (select userid from (select r.userid,count(*) ct from ratings r
join users u
on r.userid = u.userid
where u.gender = 'F'
group by r.userid
order by ct desc limit 1 ) t1 )
order by rating desc
limit 10
) t2
join ratings r
on r.movieid = t2.movieid
group by t2.userid,t2.movieid
7. 求好片(评分>=4.0)最多的那个年份的最好看的10部电影
7.1 找出来那个年份
select y from(select from_unixtime(cast(timestamped as bigint) ,'yyyy') y,count(*) ct
from ratings
where rating >=4.0
group by from_unixtime(cast(timestamped as bigint) ,'yyyy')
order by ct desc
limit 1) t2
7.2 找出来哪一年的电影,并且根据电影求平均
select movieid ,avg(rating ) avg_rat from ratings where from_unixtime(cast(timestamped as bigint) ,'yyyy') in (select y from(select from_unixtime(cast(timestamped as bigint) ,'yyyy') y,count(*) ct
from ratings
where rating >=4.0
group by from_unixtime(cast(timestamped as bigint) ,'yyyy')
order by ct desc
limit 1) t2 )
group by movieid
order by avg_rat desc
limit 10
8. 求1997年上映的电影中,评分最高的10部Comedy类电影
8.1 找到1997年上映的电影和Comedy类的
select movieid from movies where title like '%1997%' and genres like '%Comedy%'
group by movieid
8.2 最终结果
select movieid ,avg(rating ) avg_rating from (select r.movieid,r.rating from ratings r
left semi join (select movieid from movies where title like '%1997%' and genres like '%Comedy%'
group by movieid )t1
on t1.movieid = r.movieid ) t2
group by movieid
order by avg_rating desc
limit 10
9. 该影评库中各种类型电影中评价最高的5部电影(类型,电影名,平均影评分)
9.1 先把类型爆炸开
select movieid ,genre from movies
lateral view explode(split(genres ,'\\|')) ad as genre
9.2 找出来各个电影的评分
select movieid,avg(rating ) a_rating from ratings
group by movieid
9.3 使用开窗求得结果
select movieid,genre,a_rating from (select r.movieid ,genre ,r.a_rating,row_number() over(partition by genre order by a_rating desc ) rk from (select movieid ,genre from movies
lateral view explode(split(genres ,'\\|')) ad as genre ) t1
join (select movieid,avg(rating ) a_rating from ratings
group by movieid
) r
on r.movieid = t1.movieid
) t1
where t1.rk <=5
10. 各年评分最高的电影类型(年份,类型,影评分)
10.1 先把电影类型炸开
select movieid ,genre from movies
lateral view explode(split(genres ,'\\|')) ad as genre
10.2 把年份获取到然后进行开窗汇总获取第一个
select y,genre,avgs from (select *,row_number() over(partition by y order by avgs desc ) rk from (select from_unixtime(cast(timestamped as bigint) ,'yyyy') y,genre ,avg(rating) avgs from ratings r
join ( select movieid ,genre from movies
lateral view explode(split(genres ,'\\|')) ad as genre
) m
on m.movieid = r.movieid
group by from_unixtime(cast(timestamped as bigint) ,'yyyy'),genre ) t1) t2
where rk = 1
11. 每个地区最高评分的电影名,把结果存入HDFS(地区,电影名,影评分)
create table moves_test.move11 as
select zipcode,movieid,a_rating from (select *,row_number() over(partition by zipcode,movieid order by a_rating desc ) rk from (select zipcode,r.movieid ,avg(r.rating ) a_rating from users u
join ratings r
on r.userid = u.userid
group by zipcode,r.movieid ) d) d1
where rk =1