ggplot2-数据变换和映射：深入探索(2)

最编程 2024-01-09 18:56:26

...

关于 ggplot2，我总结了一下，主要包括：数据变形、映射、几何对象、统计变换，以及后期的图片调整和美化。

1、数据导入和变形

（1）数据导入及格式

# Set the working directory so the relative file names below resolve.
# NOTE(review): this absolute path is machine-specific; adjust it locally.
setwd("/media/han/b/rosalind/ggplot2")
# Import the tab-separated tables as data frames (run from the RStudio
# script pane). TRUE/FALSE are spelled out because T and F are ordinary
# variables that can be reassigned, whereas TRUE and FALSE are reserved.
gene_exp <- read.table(
  file = "gene_exp.txt",
  sep = "\t",
  header = TRUE,
  stringsAsFactors = FALSE
)
gene_len <- read.table(
  file = "gene_len.txt",
  sep = "\t",
  header = TRUE,
  stringsAsFactors = FALSE
)
group <- read.table(
  file = "group.txt",
  sep = "\t",
  header = TRUE,
  stringsAsFactors = FALSE
)
# Data format
#数据格式
> head(gene_exp)
  Gene        S1        S2        S3        S4
1   G1  844.9510 1301.7828 1207.7967 1153.3719
2   G2 1246.8492  785.4974 1182.4283 1193.2796
3   G3 1496.4822 1206.2611 1060.4760 1480.8871
4   G4 1392.3307 1100.6337  687.7282  781.1865
5   G5 1170.3425  857.2048  916.0348 1092.2339
6   G6  721.2031 1477.6733 1543.1986  824.7960
         S5        S6        S7        S8
1  899.1235 1150.6556  957.9256 1728.3804
2 1922.2217  731.0976  631.0565 1318.9178
3 1276.8723 1174.2532 1037.7988  842.4523
4 1586.0368 1176.4862 1082.6887  896.8659
5  353.4801  808.1170 1246.9023 1066.0504
6  655.0933  442.7497 1089.1102  859.6822
         S9
1  356.0858
2 1399.4487
3 1419.1017
4 1295.0609
5  671.8151
6 1223.9446
> head(gene_len)
  Gene Length
1   G1   1712
2   G2   1884
3   G3   2514
4   G4   1559
5   G5   1952
6   G6   2295
> head(group)
  Sample  Group
1     S1 group1
2     S2 group1
3     S3 group1
4     S4 group2
5     S5 group2
6     S6 group2

（2）数据变形

library(tidyr)
library(dplyr)
# Reshape the wide expression table into long format with gather():
# every column except Gene is folded into a (Sample, Expression) pair.
# NOTE(review): gather() is superseded in tidyr; it is kept here so the
# result matches the printed output below.
dexp <- gene_exp %>%
  gather(key = "Sample", value = "Expression", -Gene)
#查看数据变形后的格式
##tidy数据格式：每一列是一个变量，每一行是一个观测值，因此可以直接按列名调用变量
>head(dexp)
  Gene Sample Expression
1   G1     S1   844.9510
2   G2     S1  1246.8492
3   G3     S1  1496.4822
4   G4     S1  1392.3307
5   G5     S1  1170.3425
6   G6     S1   721.2031

(3)合并表格

# The pipe %>% passes the value on its left as the first argument of the
# call on its right, so the long table takes the `a` position in
# left_join(a, b, by = ...).
# left_join() keeps every row of `a` and appends the matching columns of
# `b`; the columns of `a` come first in the result, those of `b` after.
# Swapping the two tables (left_join(b, a, ...)) puts b's columns first.
dexp <- gene_exp %>%
  gather(key = "Sample", value = "Expression", -Gene) %>%
  # attach the length of each gene
  left_join(gene_len, by = "Gene") %>%
  # attach the group of each sample
  left_join(group, by = "Sample") %>%
  # select() decides which columns are kept and in what order
  select(Gene, Sample, Group, Expression, Length) %>%
  # sort the rows by gene
  arrange(Gene)
> head(dexp)
  Gene Sample  Group Expression Length
1   G1     S1 group1   844.9510   1712
2   G1     S2 group1  1301.7828   1712
3   G1     S3 group1  1207.7967   1712
4   G1     S4 group2  1153.3719   1712
5   G1     S5 group2   899.1235   1712
6   G1     S6 group2  1150.6556   1712

（4）映射

#导入包
library(ggplot2)
library(dplyr) 
# Reduce the data: keep only the group1 samples of genes G1..G10, then
# drop the Group and Length columns.
dexp_small <- dexp %>%
  filter(Group == "group1", Gene %in% paste0("G", 1:10)) %>%
  select(-Group, -Length)
>head(dexp_small)
  Gene Sample Expression
1   G1     S1    844.951
2   G1     S2   1301.783
3   G1     S3   1207.797
4  G10     S1   1407.990
5  G10     S2    473.370
6  G10     S3   1134.640
# ggplot building blocks: mappings, geometric objects, layers.
# ggplot() creates the base layer; aes() maps Sample to the x axis,
# Expression to the y axis and Gene to the point colour.
# geom_point() adds the scatter-plot layer.
ggplot(dexp_small, aes(Sample, Expression, colour = Gene)) +
  geom_point()

##映射类型

#颜色类：color(颜色或边框颜色)、fill(填充颜色)和alpha(透明度)
#形状类：linetype(线型)、size（点的大小或线的宽度）和shape(形状)
#位置类：x, y, xmin, xmax, ymin, ymax, xend, yend
#特殊类：一类是group和order，另一类是字符串映射
# Base layer: only x and y are mapped here.
p <- ggplot(dexp, aes(Sample, Expression)) +
  # Individual geom: colour by gene, point size by gene length, shape by
  # group. alpha is set outside aes(), so it is a fixed transparency
  # rather than a mapping.
  geom_point(
    aes(colour = Gene, size = Length, shape = Group),
    alpha = 0.8
  )

##分组

# Collective geoms: the group aesthetic decides which rows form one object.
# One box per sample.
p + geom_boxplot(mapping = aes(group = Sample))
# One box per experimental group.
p + geom_boxplot(mapping = aes(group = Group))
# One expression line per gene; group = 1 puts all observations into a
# single group, so geom_smooth() fits one curve over the whole data set.
p +
  geom_line(mapping = aes(group = Gene, colour = Gene)) +
  geom_smooth(mapping = aes(group = 1))

##分面   
 #将一个图形分配成多个小图形
#facet_wrap()只能按照一个变量进行分面       
#facet_wrap(facets, nrow = NULL, ncol = NULL, scales = "fixed",
#           shrink = TRUE, labeller = "label_value", as.table = TRUE,
#           switch = NULL, drop = TRUE, dir = "h", strip.position = "top")
#重要参数：
#facets:  分面参数如 ~Group，表示用 Group 变量进行数据分类
#nrow:    绘制图形的行数
#ncol:    绘制图形的列数，一般nrow/ncol只设定一个即可
#scales： fixed,小图均使用统一坐标；
#         free每个小图按照各自数据范围自由调整坐标；
#         free_x为自由调整x轴刻度范围；
#         free_y为自由调整y轴刻度范围。
# Rebuild the base layer without any geoms.
p <- ggplot(dexp, aes(Sample, Expression))
# Facet by Group: scales = "free_x" lets each panel use its own x-axis
# range, and nrow = 1 lays all panels out on a single row.
p +
  geom_point(aes(colour = Gene, size = Length)) +
  facet_wrap(~Group, nrow = 1, scales = "free_x")

#facet_grid()：可以按照两个变量进行分面
#facet_grid(facets, margins = FALSE, scales = "fixed", space = "fixed",
#           shrink = TRUE, labeller = "label_value", as.table = TRUE,
#           switch = NULL, drop = TRUE)
#与facet_wrap不同的重要参数：
#facets:  应用两个标准分面，如Gene ~ Group
#margins: TRUE，额外增加包含所有数据的汇总分面
#space:   每张小图的坐标宽度,值同scales(具有free, fixed等参数),类似于WORD按照内容进行调整
# Keep the first nine genes (G1..G9) for display.
dexp_small <- dexp %>%
  filter(Gene %in% paste0("G", 1:9))
# Base layer.
ps <- ggplot(dexp_small, aes(Sample, Expression))
# Points coloured by gene length, faceted by Gene (rows) and Group
# (columns); margins = TRUE adds extra panels that contain all the data.
ps +
  geom_point(aes(colour = Length)) +
  facet_grid(Gene ~ Group, margins = TRUE, scales = "free", space = "free")

上一篇： disql

下一篇：使用pagoda1(scde包)和pagoda2改善