本人学了一段时间的孟德尔随机化,在尝试将芬兰数据库导入进行分析的时候,总感觉自己输出的结果怪怪的。文件中的exposure.id是自动生成呢,还是需要自己输入?而且我重命名列名的时候总共感觉不对劲,虽然后面能输出结果,但觉得就是有点问题,请各位帮忙指正,在下万分感激!
normalizePath('~/.Renviron')
file.edit(normalizePath('~/.Renviron'))
ieugwasr::get_opengwas_jwt()
ieugwasr::api_status()
ieugwasr::user()
# 1. 安装和加载必要包
if (!requireNamespace("devtools", quietly = TRUE)) {
install.packages("devtools")
}
devtools::install_github("MRCIEU/TwoSampleMR",force = TRUE)
library(TwoSampleMR)
library(ieugwasr)
library(data.table)
library(dplyr)
# 2. 设定工作目录
setwd("D:/R Project/MR P2")
# 3. 读取和预处理数据
# 暴露组
exposure_data <- fread(“文件名", sep = "\t", header = FALSE)
colnames(exposure_data) <- c("chr", "pos", "ref", "alt", "rsid",
"exposure", "pval", "mlogp", "beta",
"se", "eaf", "af_cases", "af_controls")
# 过滤(例如,P值小于 5e-08)
exposure_data <- exposure_data[pval < 5e-08]
exposure_data <- exposure_data %>%
rename(
chr = chr,
pos = pos,
ref=ref,
alt=alt,
rsid = rsid,
exposure=exposure,
pval = pval,
beta = beta,
se = se,
eaf = eaf
) %>%
select(chr, pos,ref,alt, rsid,exposure, pval, beta, se, eaf)
# 结局组(子痫前期)
outcome_data <- fread("文件名", sep = "\t", header = FALSE)
colnames(outcome_data) <- c("chr", "pos", "ref", "alt", "rsid",
"outcome", "pval", "mlogp", "beta",
"se", "eaf", "af_cases", "af_controls")
# 过滤(例如,P值小于 5e-08)
outcome_data <- outcome_data[pval < 5e-08]
outcome_data <- outcome_data %>%
rename(
chr = chr,
pos = pos,
ref=ref,
alt=alt,
rsid = rsid,
outcome=outcome,
pval = pval,
beta = beta,
se = se,
eaf = eaf
) %>%
select(chr, pos,ref,alt, rsid,outcome, pval, beta, se, eaf)
# 5. SNP clumping
# 加载必要包
library(TwoSampleMR)
library(ieugwasr)
# 设置 PLINK 路径(根据实际安装路径调整)
options(plink = "C:/Users/。tp/plink_win64_20241022/plink.exe")
# 设置本地参考面板路径
options(ieugwasr_ld = "D:/R Project/MR progress/1kg.v3/EUR/EUR")
# 设置临时文件夹
output_dir <- "C:/Clump_Results"
dir.create(output_dir, showWarnings = FALSE)
# 初始化列表以存储每个染色体的clump结果
clumped_list <- list()
# 对每个染色体进行Clumping
chromosomes <- unique(exposure_data$chr)
for (chr in chromosomes) {
subset_data <- exposure_data[exposure_data$chr == chr, ]
# 确保子集数据不为空
if (nrow(subset_data) == 0) next
clumped_subset <- ieugwasr::ld_clump(
dat = subset_data,
clump_r2 = 0.001,
clump_kb = 1000,
clump_p = 5e-08,
bfile = "D:/R Project/MR progress/1kg.v3/EUR/EUR",
plink = "C:/Users/。tp/plink_win64_20241022/plink.exe"
)
# 保存每个染色体的结果
output_path <- file.path(output_dir, paste0("chr", chr, "_clumped.txt"))
write.table(clumped_subset, output_path, sep = "\t", row.names = FALSE)
clumped_list[[as.character(chr)]] <- clumped_subset
}
# 合并所有染色体的Clump结果
exposure_data_clumped <- do.call(rbind, clumped_list)
# 检查是否成功获取Clump结果
if (is.null(exposure_data_clumped)) {
stop("No SNPs remained after clumping. Check input data and clumping parameters.")
}
# 提取结局数据
# 确保 outcome_data 包含所需的列
# 1. 将data.table转换为data.frame
outcome_data <- data.frame(outcome_data)
# 2. 清理无效SNP(移除rsid列中包含非rs开头的行)
outcome_data <- outcome_data[grepl("^rs", outcome_data$rsid), ]
# 3. 调整列名映射(根据实际数据列名修改)
colnames(outcome_data) <- c("chr", "pos", "ref", "alt", "rsid",
"nearest_genes", "pval", "beta", "sebeta",
"af_alt")
# 转换数据类型为数值
outcome_data$beta <- as.numeric(outcome_data$beta)
outcome_data$pval <- as.numeric(outcome_data$pval)
outcome_data$sebeta <- as.numeric(outcome_data$sebeta)
outcome_data$af_alt <- as.numeric(outcome_data$af_alt)
# 移除包含NA的行
outcome_data <- outcome_data[complete.cases(outcome_data[, c("rsid", "beta", "sebeta", "af_alt", "pval")]), ]
# --------------------------
# 2. 正确调用format_data()
# --------------------------
outcome_dat <- TwoSampleMR::format_data(
outcome_data,
type = "outcome",
snps = exposure_data_clumped$rsid, # 确保已正确生成exposure_data_clumped
phenotype_col = "nearest_genes", # 根据实际表型列调整
snp_col = "rsid",
beta_col = "beta",
se_col = "sebeta", # 使用实际列名
eaf_col = "af_alt", # 使用效应等位基因频率列
effect_allele_col = "alt",
other_allele_col = "ref",
pval_col = "pval"
)
# --------------------------
# 1. 格式化暴露数据
# --------------------------
# 确保暴露数据是data.frame格式
exposure_data_clumped <- data.frame(exposure_data_clumped)
# 使用format_data转换列名和结构
# --------------------------
# 1. 处理暴露数据
# --------------------------
# 转换数据类型
exposure_data_clumped$beta <- as.numeric(exposure_data_clumped$beta)
exposure_data_clumped$se <- as.numeric(exposure_data_clumped$se)
exposure_data_clumped$eaf <- as.numeric(exposure_data_clumped$eaf)
exposure_data_clumped$pval <- as.numeric(exposure_data_clumped$pval)
exposure_data_clumped <- exposure_data_clumped[complete.cases(exposure_data_clumped), ]
# 格式化暴露数据
library(dplyr)
exposure_data_clumped <- exposure_data_clumped %>%
group_by(id, rsid) %>%
slice(1) %>%
ungroup()
# 强制转换数据类型
exposure_data_clumped$beta <- as.numeric(exposure_data_clumped$beta)
exposure_data_clumped$se <- as.numeric(exposure_data_clumped$se)
exposure_data_clumped$eaf <- as.numeric(exposure_data_clumped$eaf)
exposure_data_clumped$pval <- as.numeric(exposure_data_clumped$pval)
exposure_data_clumped <- exposure_data_clumped[complete.cases(exposure_data_clumped), ]
# 添加暴露ID
exposure_data_clumped$id <- "CARBO"
# 格式化暴露数据
exposure_dat <- TwoSampleMR::format_data(
exposure_data_clumped,
type = "exposure",
snp_col = "rsid",
beta_col = "beta",
se_col = "se",
effect_allele_col = "alt",
other_allele_col = "ref",
eaf_col = "eaf",
pval_col = "pval",
id_col = "id", # 使用新增的id列
phenotype_col = "exposure" # 使用实际表型列
)
# 协调数据并运行MR
harmonised_data <- TwoSampleMR::harmonise_data(exposure_dat, outcome_dat)
result <- mr(harmonised_data)
print(result)
# 异质性检验(p<0.05)
mr_heterogeneity(harmonised_data)
# 水平多效性检验(p<0.05)
mr_pleiotropy_test(harmonised_data)
# 散点图
p1 <- mr_scatter_plot(result, harmonised_data)
p1
# 要是只想展示某几种方法,在进行 MR 分析 的步骤时,指定方法即可
#result_add_method <- mr(data, method_list = c("mr_ivw", "mr_egger_regression", "mr_weighted_median"))
#p1_1 <- mr_scatter_plot(result_add_method, data)
#p1_1
# 森林图
result_single <- mr_singlesnp(harmonised_data)
p2 <- mr_forest_plot(result_single)
p2
# 和上面一样,想展示指定方法的结果,在函数中指定即可
#result_single_add_method <- mr_singlesnp(data, all_method = c("mr_ivw", "mr_two_sample_ml"))
#p2_2 <- mr_forest_plot(result_single_add_method)
#p2_2
# 留一图
result_loo <- mr_leaveoneout(harmonised_data)
p3 <- mr_leaveoneout_plot(result_loo)
p3
# 漏斗图
result_single <- mr_singlesnp(harmonised_data)
p4 <- mr_funnel_plot(result_single)
p4