2  文献复现

Author

max

Published

June 19, 2025

3 分组数据分析实战

在这一部分,将以一篇论文中的数据分析为例,展示分组数据分析和可视化的可重复性研究。首先,简单介绍论文的研究背景、方法和主要结果;然后,使用论文的原始数据和代码,通过复现论文中的图片来完成这一可重复性研究。

3.1 论文研究概述

3.2 数据准备

论文的原始数据及分析代码都在 GitHub 上。首先,使用 Git 命令将代码克隆到本地:

git clone https://github.com/daniosro/Si_biomineralization_ANME_SRB.git --depth 1

3.3 加载所需的R包

加载所需的R包,用于数据处理、可视化和统计分析。

library(readxl)
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Set the default ggplot2 theme for all plots that follow
theme_set(theme_bw())

3.4 Composition ((Mg+Al+Fe)/Si) of sed-free ANME-SRB consortia

# Load the (Mg+Al+Fe)/Si composition table ("Dataset S3.xlsx") covering
# sed-free ANME-SRB consortia, ANME-SRB consortia in sediments, and sediments
# without ANME-SRB consortia. xfun::magic_path() resolves the file's location.
file <- xfun::magic_path("Dataset S3.xlsx")
octtet_data <- file |>
  read_excel()

# Print the table to inspect it
octtet_data
# A tibble: 425 × 5
   mg_al_fe_to_si Source                          Basin source_order basin_order
            <dbl> <chr>                           <chr>        <dbl>       <dbl>
 1           0.07 Aggregate-attached, Sediment-f… Sant…            3           3
 2           0.35 Aggregate-attached, Sediment-f… Sant…            3           3
 3           0.29 Aggregate-attached, Sediment-f… Sant…            3           3
 4           0.29 Aggregate-attached, Sediment-f… Sant…            3           3
 5           0.04 Aggregate-attached, Sediment-f… Sant…            3           3
 6           0.41 Aggregate-attached, Sediment-f… Sant…            3           3
 7           0.4  Aggregate-attached, Sediment-f… Sant…            3           3
 8           0.12 Aggregate-attached, Sediment-f… Sant…            3           3
 9           0.34 Aggregate-attached, Sediment-f… Sant…            3           3
10           0.19 Aggregate-attached, Sediment-f… Sant…            3           3
# ℹ 415 more rows
# Split the (Mg+Al+Fe)/Si measurements into the per-basin and per-source
# subsets that the one-way ANOVAs below compare.

# Sediments and ANME-SRB consortia-attached silicates from Jaco Scar.
# Source is stored as a factor here (levels in order of first appearance).
octtet_data_Jaco <- octtet_data |>
  filter(Basin == "Jaco Scar") |>
  select(mg_al_fe_to_si, Source, Basin, source_order, basin_order) |>
  mutate(Source = as_factor(Source))

# Sediments and ANME-SRB consortia-attached silicates from the Santa Monica
# Basin. The three subsets below are filtered from this table, so they already
# carry the same five columns and need no further select().
octtet_data_SMB <- octtet_data |>
  filter(Basin == "Santa Monica") |>
  select(mg_al_fe_to_si, Source, Basin, source_order, basin_order)

# Si-rich phase attached to sed-free ANME-SRB consortia (incubations) vs.
# sediment, Santa Monica Basin
octtet_data_SMB_sedfree <- octtet_data_SMB |>
  filter(Source %in% c("Aggregate-attached, Sediment-free", "Sediment"))

# Silicates attached to ANME-SRB consortia in sediments vs. sediment,
# Santa Monica Basin
octtet_data_SMB_fromsed <- octtet_data_SMB |>
  filter(Source %in% c("Aggregate-attached", "Sediment"))

# Silicates attached to ANME-SRB consortia in sediments vs. Si-rich phase
# attached to sed-free ANME-SRB consortia in incubations, Santa Monica Basin
octtet_data_SMB_freevsfromsed <- octtet_data_SMB |>
  filter(Source %in% c("Aggregate-attached", "Aggregate-attached, Sediment-free"))

# Sediments and ANME-SRB consortia-attached silicates from Eel River Basin
octtet_data_ERB <- octtet_data |>
  filter(Basin == "Eel River") |>
  select(mg_al_fe_to_si, Source, Basin, source_order, basin_order)
# Inspect the Jaco Scar subset (value range and number of rows per Source)
# before running the one-way ANOVA on it

summary(octtet_data_Jaco)
 mg_al_fe_to_si                  Source      Basin            source_order  
 Min.   :0.1900   Sediment          :63   Length:103         Min.   :1.000  
 1st Qu.:0.4450   Aggregate-attached:40   Class :character   1st Qu.:1.000  
 Median :0.6100                           Mode  :character   Median :1.000  
 Mean   :0.6404                                              Mean   :1.388  
 3rd Qu.:0.7400                                              3rd Qu.:2.000  
 Max.   :2.0600                                              Max.   :2.000  
  basin_order
 Min.   :2   
 1st Qu.:2   
 Median :2   
 Mean   :2   
 3rd Qu.:2   
 Max.   :2   
# One-way ANOVA of (Mg+Al+Fe)/Si by Source for the Jaco Scar sediments and
# ANME-SRB consortia-attached silicates
resot1.aov <- aov(
  formula = mg_al_fe_to_si ~ Source,
  data = octtet_data_Jaco
)
summary(resot1.aov)
             Df Sum Sq Mean Sq F value   Pr(>F)    
Source        1  2.242  2.2416   34.29 5.95e-08 ***
Residuals   101  6.602  0.0654                     
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# One-way ANOVA of (Mg+Al+Fe)/Si by Source: Si-rich phase attached to sed-free
# ANME-SRB consortia (incubations) vs. sediment, Santa Monica Basin
resot2.aov <- aov(
  formula = mg_al_fe_to_si ~ Source,
  data = octtet_data_SMB_sedfree
)
summary(resot2.aov)
             Df Sum Sq Mean Sq F value   Pr(>F)    
Source        1   9.34   9.337   41.29 1.23e-09 ***
Residuals   173  39.12   0.226                     
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# One-way ANOVA of (Mg+Al+Fe)/Si by Source: silicates attached to ANME-SRB
# consortia in sediments vs. sediment, Santa Monica Basin
resot3.aov <- aov(
  formula = mg_al_fe_to_si ~ Source,
  data = octtet_data_SMB_fromsed
)
summary(resot3.aov)
             Df Sum Sq Mean Sq F value   Pr(>F)    
Source        1   5.22   5.218   18.62 2.67e-05 ***
Residuals   174  48.77   0.280                     
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# One-way ANOVA of (Mg+Al+Fe)/Si by Source: silicates attached to ANME-SRB
# consortia in sediments vs. Si-rich phase attached to sed-free ANME-SRB
# consortia in incubations, Santa Monica Basin
resot4.aov <- aov(
  formula = mg_al_fe_to_si ~ Source,
  data = octtet_data_SMB_freevsfromsed
)
summary(resot4.aov)
            Df Sum Sq Mean Sq F value Pr(>F)  
Source       1  0.415  0.4148       3 0.0871 .
Residuals   81 11.200  0.1383                 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# One-way ANOVA of (Mg+Al+Fe)/Si by Source for the Eel River Basin sediments
# and ANME-SRB consortia-attached silicates
resot5.aov <- aov(
  formula = mg_al_fe_to_si ~ Source,
  data = octtet_data_ERB
)
summary(resot5.aov)
             Df Sum Sq Mean Sq F value  Pr(>F)   
Source        1  1.126  1.1259   10.95 0.00129 **
Residuals   103 10.590  0.1028                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(ggpubr)

# Turn Source and Basin into factors whose level order follows the numeric
# source_order / basin_order columns, so axis categories and facets are drawn
# in that order.
octtet_data <- octtet_data |>
  mutate(
    new_source_order = reorder(Source, source_order),
    new_basin_order = reorder(Basin, basin_order)
  )

# Base plot: (Mg+Al+Fe)/Si by source
ot <- ggplot(octtet_data, aes(x = new_source_order, y = mg_al_fe_to_si))

# Violin plots with a narrow boxplot overlaid (outlier points hidden), one
# facet per basin. Two ggpubr annotation layers are added: an ANOVA p-value
# and t-test significance marks against the "Sediment" reference group.
#
# Disabled alternatives kept for reference:
#   stat_compare_means(method = "t.test",
#                      comparisons = list(
#                        c("Sediment", "Aggregate-attached"),
#                        c("Aggregate-attached, Sediment-free", "Aggregate-attached"),
#                        c("Sediment", "Aggregate-attached, Sediment-free")
#                      ))
#   geom_jitter(width = 0.2)
#   scale_color_brewer(palette = "Blues")
#   geom_violin(aes(new_source_order, octtet)) + geom_point(aes(octtet_data$ID)) +
#     geom_jitter(width = 0.3, aes(color = octtet_data$ID), size = 2.5)
ot +
  geom_violin() +
  geom_boxplot(width = 0.2, outliers = FALSE) +
  stat_compare_means(method = "aov", label = "p", size = 3) +
  stat_compare_means(
    method = "t.test",
    ref.group = "Sediment",
    label = "p.signif",
    label.y = 2
  ) +
  facet_grid(~new_basin_order, scales = "free", space = "free") +
  theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1)) +
  labs(x = "", y = "(Mg+Al+Fe)/Si")

3.5 Composition (Al/Si) of sed-free ANME-SRB consortia

# Load the Al/Si composition table ("Dataset S2.xlsx") covering sed-free
# ANME-SRB consortia, ANME-SRB consortia in sediments, and sediments without
# ANME-SRB consortia, then print it for inspection.
AlSi_data <- xfun::magic_path("Dataset S2.xlsx") |>
  read_excel()
AlSi_data
# A tibble: 425 × 5
   Al.per.Si Source                           Basin `Source order` `Basin order`
       <dbl> <chr>                            <chr>          <dbl>         <dbl>
 1    0      Aggregate-attached, Sediment-fr… Sant…              3             3
 2    0.196  Aggregate-attached, Sediment-fr… Sant…              3             3
 3    0.181  Aggregate-attached, Sediment-fr… Sant…              3             3
 4    0.182  Aggregate-attached, Sediment-fr… Sant…              3             3
 5    0.0285 Aggregate-attached, Sediment-fr… Sant…              3             3
 6    0.315  Aggregate-attached, Sediment-fr… Sant…              3             3
 7    0.238  Aggregate-attached, Sediment-fr… Sant…              3             3
 8    0.0657 Aggregate-attached, Sediment-fr… Sant…              3             3
 9    0.194  Aggregate-attached, Sediment-fr… Sant…              3             3
10    0.146  Aggregate-attached, Sediment-fr… Sant…              3             3
# ℹ 415 more rows
# Split the Al/Si measurements into the per-basin and per-source subsets used
# by the one-way ANOVAs. The subsets are taken before Source/Basin are
# converted to factors further down.

# Sediments and ANME-SRB consortia-attached silicates from Jaco Scar
AlSi_data_Jaco <- AlSi_data |>
  filter(Basin == "Jaco Scar") |>
  select(Al.per.Si, Source, Basin, `Source order`, `Basin order`)

# Sediments and ANME-SRB consortia-attached silicates from the Santa Monica
# Basin; the three subsets below inherit its columns
AlSi_data_SMB <- AlSi_data |>
  filter(Basin == "Santa Monica") |>
  select(Al.per.Si, Source, Basin, `Source order`, `Basin order`)

# Si-rich phase attached to sed-free ANME-SRB consortia (incubations) vs.
# sediment, Santa Monica Basin
AlSi_data_SMB_sedfree <- AlSi_data_SMB |>
  filter(Source %in% c("Aggregate-attached, Sediment-free", "Sediment"))

# Silicates attached to ANME-SRB consortia in sediments vs. sediment,
# Santa Monica Basin
AlSi_data_SMB_fromsed <- AlSi_data_SMB |>
  filter(Source %in% c("Aggregate-attached", "Sediment"))

# Silicates attached to ANME-SRB consortia in sediments vs. Si-rich phase
# attached to sed-free ANME-SRB consortia in incubations, Santa Monica Basin
AlSi_data_SMB_freevsfromsed <- AlSi_data_SMB |>
  filter(Source %in% c("Aggregate-attached", "Aggregate-attached, Sediment-free"))

# Sediments and ANME-SRB consortia-attached silicates from Eel River Basin
AlSi_data_ERB <- AlSi_data |>
  filter(Basin == "Eel River") |>
  select(Al.per.Si, Source, Basin, `Source order`, `Basin order`)

# Fix the display order of sources and basins by giving the factors explicit
# levels
source_order <- c("Sediment", "Aggregate-attached", "Aggregate-attached, Sediment-free")
basin_order <- c("Eel River", "Jaco Scar", "Santa Monica")
AlSi_data <- AlSi_data |>
  mutate(
    Source = factor(Source, levels = source_order),
    Basin = factor(Basin, levels = basin_order)
  )
# Violin plots of Al/Si by source, one facet per basin.
# A narrow boxplot is overlaid on each violin (outlier points hidden). Two
# ggpubr annotation layers are added: an ANOVA p-value and t-test significance
# marks against the "Sediment" reference group.

ot <- ggplot(AlSi_data, aes(x = Source, y = Al.per.Si))

# Disabled alternatives kept for reference:
#   stat_compare_means(method = "t.test",
#                      comparisons = list(
#                        c("Sediment", "Aggregate-attached"),
#                        c("Aggregate-attached, Sediment-free", "Aggregate-attached"),
#                        c("Sediment", "Aggregate-attached, Sediment-free")
#                      ))
#   geom_jitter(width = 0.1)
ot +
  geom_violin() +
  geom_boxplot(width = 0.2, outliers = FALSE) +
  stat_compare_means(method = "aov", label = "p", size = 3) +
  stat_compare_means(
    method = "t.test",
    ref.group = "Sediment",
    label = "p.signif",
    label.y = 1
  ) +
  facet_grid(~Basin, scales = "free", space = "free") +
  theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1)) +
  # Axis label fixed: the original "(Al per Si" had an unbalanced parenthesis
  labs(x = "", y = "Al/Si")

# One-way ANOVA of Al/Si by Source for the Jaco Scar sediments and ANME-SRB
# consortia-attached silicates
resot6.aov <- aov(
  formula = Al.per.Si ~ Source,
  data = AlSi_data_Jaco
)
summary(resot6.aov)
             Df Sum Sq Mean Sq F value   Pr(>F)    
Source        1 0.3026 0.30257   29.56 3.78e-07 ***
Residuals   101 1.0339 0.01024                     
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# One-way ANOVA of Al/Si by Source: Si-rich phase attached to sed-free ANME-SRB
# consortia (incubations) vs. sediment, Santa Monica Basin
resot7.aov <- aov(
  formula = Al.per.Si ~ Source,
  data = AlSi_data_SMB_sedfree
)
summary(resot7.aov)
             Df Sum Sq Mean Sq F value Pr(>F)    
Source        1  2.130  2.1299   86.96 <2e-16 ***
Residuals   173  4.237  0.0245                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

对来自Santa Monica Basin的沉积物中附着于ANME-SRB联合体的硅酸盐,进行单因素方差分析(One-way ANOVA)测试。

# One-way ANOVA of Al/Si by Source: silicates attached to ANME-SRB consortia
# in sediments vs. sediment, Santa Monica Basin
resot8.aov <- aov(
  formula = Al.per.Si ~ Source,
  data = AlSi_data_SMB_fromsed
)
summary(resot8.aov)
             Df Sum Sq Mean Sq F value   Pr(>F)    
Source        1  1.058  1.0584   36.61 8.61e-09 ***
Residuals   174  5.030  0.0289                     
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

对来自Santa Monica Basin的沉积物中附着于ANME-SRB联合体的硅酸盐,以及在培养实验中无沉积物的 ANME-SRB联合体上附着的富含硅(Si-rich)的相,进行单因素方差分析(One-way ANOVA)测试。

# One-way ANOVA of Al/Si by Source: silicates attached to ANME-SRB consortia
# in sediments vs. Si-rich phase attached to sed-free ANME-SRB consortia in
# incubations, Santa Monica Basin
resot9.aov <- aov(
  formula = Al.per.Si ~ Source,
  data = AlSi_data_SMB_freevsfromsed
)
summary(resot9.aov)
            Df Sum Sq Mean Sq F value Pr(>F)  
Source       1  0.128 0.12798   6.676 0.0116 *
Residuals   81  1.553 0.01917                 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

对来自Eel River Basin的沉积物以及附着于ANME-SRB联合体的硅酸盐,进行单因素方差分析(One-way ANOVA)测试。

# One-way ANOVA of Al/Si by Source for the Eel River Basin sediments and
# ANME-SRB consortia-attached silicates
resot10.aov <- aov(
  formula = Al.per.Si ~ Source,
  data = AlSi_data_ERB
)
summary(resot10.aov)
             Df Sum Sq Mean Sq F value  Pr(>F)    
Source        1 0.4429  0.4429   17.88 5.1e-05 ***
Residuals   103 2.5510  0.0248                    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1