读数据,合并excel
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(writexl)
# 读取baseline.xlsx文件中的两个sheet
baseline <- read_excel("Data/baseline.xlsx", sheet = "baseline")
decode <- read_excel("Data/baseline.xlsx", sheet = "decode")
# 提取decode中的var1和var3列
decode_mapping <- decode[, c("var1", "var3")]
# 替换baseline的列名
name_mapping <- setNames(decode_mapping$var3, decode_mapping$var1)
names(baseline) <- name_mapping[names(baseline)]
#读取data2_0330.xlsx文件
data2_0330 <- read_excel("Data/data2_0330.xlsx", sheet = "data2_0330")
decode_data2 <- read_excel("Data/data2_0330.xlsx", sheet = "decode")
# 提取decode中的var1和var3列
decode_mapping_data2 <- decode_data2[, c("var1", "var3")]
# 替换data2_0330的列名,
name_mapping_data2 <- setNames(decode_mapping_data2$var3, decode_mapping_data2$var1)
names(data2_0330) <- name_mapping_data2[names(data2_0330)]
# 读取smoking and drinking.xlsx文件
smk_drink <- read_excel("Data/smoking and drinking.xlsx")
name_mapping <- setNames(decode_mapping$var3, decode_mapping$var1)
full_name <- c("id", "smoke", "drink")
mapping <- setNames(full_name, names(smk_drink))
names(smk_drink) <- mapping
# 读取C136(1).xlsx文件
c136 <- read_excel("Data/C136(1).xlsx", sheet = "C136") # warning does not matter --- see column p20441
decode_c136 <- read_excel("Data/C136(1).xlsx", sheet = "decode")
decode_mapping_2 <- decode_c136[, c("var1", "var3")]
name_mapping1 <- setNames(decode_mapping_2$var3, decode_mapping_2$var1)
names(c136) <- name_mapping1[names(c136)]
# 读取com_disease文件
com_disease <- read_excel("Data/com_disease.xlsx")
data0 <- baseline %>%
left_join(smk_drink, by = c("id" = "id")) %>%
left_join(c136, by = c("id" = "id")) %>%
left_join(com_disease, by=c("id"="id")) %>%
left_join(data2_0330, by = c("id" = "id"))
#删除 sex 为 "male" 的行
data1 <- data0 %>%
filter(sex != "Male")
# 保存为 Excel (女,患癌和不患癌)
write_xlsx(data1, path = "Data/merged_data_with&without.xlsx")
变量处理
# 加载必要的包
library(dplyr)
#创建分类变量
data1$Breast_Cancer <- ifelse(is.na(data1$date1), "Without_Cancer", "With_Cancer")
# with_Cancer = 1 ; without_cancer =0
data1$Breast_Cancer <- ifelse(data1$Breast_Cancer=="With_Cancer",1,0)
# edu_level
data1$edu_first <- sub("\\|.*$", "", data1$edu)
data1 <- data1 %>%
mutate(
edu_level = case_when(
# 捕获所有A Level/AS Level变体(不区分大小写)
grepl("A levels?|AS levels?|Advanced Level",edu_first, ignore.case = TRUE) ~ "High school",
grepl("O levels?|GCSEs?|CSEs?", edu_first, ignore.case = TRUE) ~ "Junior high school and below",
grepl("College or University degree?", edu_first, ignore.case = TRUE) ~ "College",
grepl("NVQ?|HNC|Certificate|Diploma|Professional Qualification", edu_first, ignore.case = TRUE) ~ "Vocational",
grepl("Prefer not to answer", edu_first, ignore.case = TRUE) ~ "No answer given",
is.na(edu_first) ~ "No answer given",
# 默认保留原值(可根据需要修改)
TRUE ~ as.character(edu_first)
)
)
# Neuro_score, smoke, drink,bmi,imd, employ,income,region
data1_clean <- data1 %>%
mutate(
# 处理Neuro_score的非标准缺失值
Neuro_score = ifelse(Neuro_score %in% c("NA"), NA, Neuro_score),
Neuro_score = as.numeric(Neuro_score),
) %>%
# 删除关键变量的缺失值
filter(
!is.na(Neuro_score),
!is.na(drink),
!is.na(smoke),
!is.na(bmi),
!is.na(employ),
!is.na(income)
) %>%
# 创建imd变量
mutate(
imd = coalesce(imde, imds, imdw),
region = case_when(
!is.na(imde) ~ "England",
!is.na(imds) ~ "Scotland",
!is.na(imdw) ~ "Wales",
TRUE ~ NA_character_
)
) %>%
select(-imde, -imds, -imdw) %>%
mutate(
# 将分类变量转换为因子
edu_level = as.factor(edu_level),
region=as.factor(region)
) %>%
mutate(
smoke_group = case_when(
smoke == "Daily or almost daily" ~ "Smoking everyday",
smoke %in% c("Three or four times a week",
"Once or twice a week",
"One to three times a month") ~ "Regular smoking (not daily)",
smoke == "Special occasions only" ~ "Smoking occasionally",
smoke == "Never" ~ "Never smoking",
smoke == "Prefer not to answer"~ "Refuse to answer" ,
TRUE ~ "Other"
) %>% factor(levels = c("Smoking everyday", "Regular smoking (not daily)",
"Smoking occasionally", "Never smoking", "Refuse to answer"))
)
#diag_prof
data1_clean$diag_prof<- ifelse(!is.na(data1_clean$diag_prof),1,0)
#employ
data1_clean <- data1_clean %>%
mutate(
employ_status = case_when(
grepl("In paid employment?", employ, ignore.case = TRUE) ~ "Employed",
grepl("Retired?", employ, ignore.case = TRUE) ~ "Retired",
grepl("Full or part-time student|Doing unpaid or voluntary work|Looking after home?|Unemployed|Unable to work?", employ, ignore.case = TRUE) ~ "Unemployed",
grepl("Prefer not to answer", employ, ignore.case = TRUE) ~ "No answer given",
# 默认保留原值(可根据需要修改)
TRUE ~ "Other"
),
# 转换为因子并指定水平顺序
employ_status = factor(
employ_status,
levels = c(
"Employed",
"Retired",
"Unemployed",
"Other",
"No answer given"
)
)
)
#income
library(stringr)
data1_clean <- data1_clean %>%
mutate(
# 1. 首先清理收入数据(确保字符类型且无空格)
income = str_trim(as.character(income)),
# 2. 分类逻辑
income_status = case_when(
# 合并低收入范围(使用正则表达式捕获不同表述)
grepl("18,000|30,999|31,000|51,999|Less than", income, ignore.case = TRUE) ~ "Lower than 52,000",
# 中等收入范围(精确匹配)
grepl("52,000 to 100,000", income, fixed = TRUE) ~ "52,000 to 100,000",
# 高收入范围(兼容大小写和空格变体)
grepl("Greater than 100,000|>100,000|100k\\+", income, ignore.case = TRUE) ~ "Greater than 100,000",
# 处理拒绝回答和缺失值
grepl("Do not know|Prefer not to answer|Unknown", income, ignore.case = TRUE) ~ "No answer given"
),
# 3. 转换为有序因子
income_status = factor(
income_status,
levels = c(
"Lower than 52,000",
"52,000 to 100,000",
"Greater than 100,000",
"No answer given"
)
)
)
#Dur_mod_activity
data1_clean <- data1_clean %>%
mutate(
# 将Dur_mod_activity转换为数值型
Dur_mod_activity = as.numeric(Dur_mod_activity),
# 创建新的分类变量
activity_level = case_when(
is.na(Dur_mod_activity) ~ "No answer given",
Dur_mod_activity %in% c("Prefer not to answer", "Do not know") ~ "No answer given",
Dur_mod_activity < 30 ~ "Low activity",
Dur_mod_activity >= 30 & Dur_mod_activity < 150 ~ "Moderate activity",
Dur_mod_activity >= 150 ~ "High activity"
) %>% factor(levels = c("Low activity", "Moderate activity", "High activity", "No answer given"))
)
Model
library(broom)
model3 <- glm(
Breast_Cancer ~ Neuro_score +diag_prof+ age + edu_level + imd + region + income_status + smoke + drink + bmi + employ_status+Dur_mod_activity,
data = data1_clean,
family = binomial(link = "logit") # 指定logistic回归
)
coefficients_df <- tidy(model3)
coefficients_df$OR <- exp(coefficients_df$estimate)
write_xlsx(coefficients_df, "logistic_model_coefficients.xlsx")
summary(model3)
##
## Call:
## glm(formula = Breast_Cancer ~ Neuro_score + diag_prof + age +
## edu_level + imd + region + income_status + smoke + drink +
## bmi + employ_status + Dur_mod_activity, family = binomial(link = "logit"),
## data = data1_clean)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.4370957 0.1431031 -31.006 < 2e-16
## Neuro_score -0.0122011 0.0039328 -3.102 0.001920
## diag_prof 0.0479412 0.0365380 1.312 0.189489
## age 0.0268472 0.0021892 12.263 < 2e-16
## edu_levelHigh school 0.0021942 0.0394504 0.056 0.955645
## edu_levelJunior high school and below -0.0797405 0.0319783 -2.494 0.012646
## edu_levelNo answer given -0.1821705 0.1798178 -1.013 0.311020
## edu_levelNone of the above -0.1263644 0.0420283 -3.007 0.002641
## edu_levelVocational -0.1575996 0.0452590 -3.482 0.000497
## imd -0.0022363 0.0010105 -2.213 0.026888
## regionScotland 0.0799810 0.0442617 1.807 0.070762
## regionWales -0.2456696 0.0674590 -3.642 0.000271
## income_status52,000 to 100,000 -0.0122563 0.0354190 -0.346 0.729316
## income_statusGreater than 100,000 -0.2008480 0.0644323 -3.117 0.001826
## income_statusNo answer given -0.1311429 0.0382968 -3.424 0.000616
## smokeNever -0.1602901 0.0546973 -2.930 0.003384
## smokeOnce or twice a week -0.0223869 0.0376331 -0.595 0.551928
## smokeOne to three times a month -0.0490271 0.0455418 -1.077 0.281690
## smokePrefer not to answer -9.4881582 49.2398785 -0.193 0.847200
## smokeSpecial occasions only -0.1255616 0.0458928 -2.736 0.006220
## smokeThree or four times a week 0.0424903 0.0379874 1.119 0.263338
## drinkYes 0.0382450 0.0249421 1.533 0.125189
## bmi 0.0038886 0.0025556 1.522 0.128112
## employ_statusRetired -0.0123733 0.0342539 -0.361 0.717933
## employ_statusUnemployed 0.1039021 0.0469024 2.215 0.026741
## employ_statusOther -0.0571323 0.1878818 -0.304 0.761062
## employ_statusNo answer given 0.0607573 0.2883618 0.211 0.833123
## Dur_mod_activity -0.0006766 0.0001894 -3.572 0.000354
##
## (Intercept) ***
## Neuro_score **
## diag_prof
## age ***
## edu_levelHigh school
## edu_levelJunior high school and below *
## edu_levelNo answer given
## edu_levelNone of the above **
## edu_levelVocational ***
## imd *
## regionScotland .
## regionWales ***
## income_status52,000 to 100,000
## income_statusGreater than 100,000 **
## income_statusNo answer given ***
## smokeNever **
## smokeOnce or twice a week
## smokeOne to three times a month
## smokePrefer not to answer
## smokeSpecial occasions only **
## smokeThree or four times a week
## drinkYes
## bmi
## employ_statusRetired
## employ_statusUnemployed *
## employ_statusOther
## employ_statusNo answer given
## Dur_mod_activity ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 58147 on 155228 degrees of freedom
## Residual deviance: 57765 on 155201 degrees of freedom
## (56705 observations deleted due to missingness)
## AIC: 57821
##
## Number of Fisher Scoring iterations: 11
# 查看模型的系数
coefficients_df <- tidy(model3)
print(coefficients_df)
## # A tibble: 28 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -4.44 0.143 -31.0 4.44e-211
## 2 Neuro_score -0.0122 0.00393 -3.10 1.92e- 3
## 3 diag_prof 0.0479 0.0365 1.31 1.89e- 1
## 4 age 0.0268 0.00219 12.3 1.42e- 34
## 5 edu_levelHigh school 0.00219 0.0395 0.0556 9.56e- 1
## 6 edu_levelJunior high school and below -0.0797 0.0320 -2.49 1.26e- 2
## 7 edu_levelNo answer given -0.182 0.180 -1.01 3.11e- 1
## 8 edu_levelNone of the above -0.126 0.0420 -3.01 2.64e- 3
## 9 edu_levelVocational -0.158 0.0453 -3.48 4.97e- 4
## 10 imd -0.00224 0.00101 -2.21 2.69e- 2
## # ℹ 18 more rows
CI forest plot
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)
# 提取置信区间
ci <- confint(model3, level = 0.95)
## Waiting for profiling to be done...
# 数据处理
plot_data <- tidy(model3) %>%
mutate(OR = exp(estimate),
term = fct_inorder(term)) %>% # 保持原始变量顺序
bind_cols(
exp(confint(model3) %>%
as_tibble() %>%
rename(conf.low = 1, conf.high = 2)
)
) %>%
filter(!is.na(conf.low) & !is.na(conf.high)) %>% # 移除NA的置信区间
filter(term != "(Intercept)") %>% # 移除截距项
mutate(p.value = p.value) # 确保p.value列已经存在
## Waiting for profiling to be done...
# 创建森林图
ggplot(plot_data, aes(x = OR, y = term)) +
geom_vline(xintercept = 1, linetype = "dashed", color = "gray50") +
geom_point(size = 3, shape = 18, color = "darkorange") +
geom_errorbarh(aes(xmin = conf.low, xmax = conf.high),
height = 0.2, color = "steelblue", linewidth = 1) +
scale_x_log10(breaks = c(0.5, 1, 2), # 对数刻度
limits = c(0.5, 2)) + # 限制x轴范围为0.5到2
labs(x = "Odds Ratio (95% CI)", y = "",
title = "Logistic Regression: Odds Ratios") +
theme_bw(base_size = 14) +
theme(panel.grid.minor = element_blank(),
plot.title = element_text(hjust = 0.5, face = "bold"),
axis.title = element_text(face = "bold")) +
geom_text(aes(label = sprintf("p=%.2f", p.value)), # 保留两位小数
hjust = 1.5, vjust = 0.5, size = 3, color = "black")+
theme_minimal()

# 保存图形
ggsave("CI_forest_plot.png", width = 8, height = 6, dpi = 300)
circle plot
library(tidyverse)
library(readxl)
library(grid)
df <- read_excel('/Users/qiujiahui/Desktop/2025 Summer/circle_barplot_model_coefficients.xlsx')
# 添加原始顺序索引
df$original_order <- seq_len(nrow(df))
# 2. 数据处理:计算 exp(Estimate) 并保留正负信息
df <- df %>%
mutate(
sign = ifelse(exp(estimate) >= 1, "positive", "negative"),
coef = exp(estimate)
)
# 4. 添加 ID - 按原始顺序
df$id <- seq(1, nrow(df))
# 5. 外圈标签数据 - 保持原始顺序
label_data <- df %>% filter(!is.na(name))
number_of_bar <- nrow(df)
# 调整标签角度计算
angle <- 90 - 360 * (label_data$id - 0.5) / number_of_bar
label_data$angle <- ifelse(angle < -90, angle + 180, angle)
label_data$hjust <- ifelse(angle < -90, 1, 0)
# 6. 内圈组别标签位置 - 基于原始顺序
group_labels <- df %>%
filter(!is.na(group)) %>%
group_by(group) %>%
summarise(
min_id = min(id),
max_id = max(id),
title = mean(c(min_id, max_id))
)
# 7. 同心圆刻度线数据
max_coef <- max(df$coef, na.rm = TRUE)
generate_circle_data <- function(radius, n_points = 360) {
theta <- seq(0, 2*pi, length.out = n_points)
x <- (theta / (2 * pi)) * number_of_bar + 1
y <- rep(radius, n_points)
data.frame(x = x, y = y)
}
circle_data <- map_dfr(c(0.3, 0.6, 0.9,1, 1.2, 1.5, 1.8), generate_circle_data)
tick_labels <- data.frame(
x = 1,
y = unique(circle_data$y),
label = sprintf("%.1f", unique(circle_data$y))
)
# 8. 创建图表
library(ggplot2)
# 假设你的数据框 df 已经定义好了
# df <- data.frame(id = ..., coef = ..., sign = ...)
# 假设其他数据框 circle_data, tick_labels, label_data, group_labels 也已经定义好了
# circle_data <- ...
# tick_labels <- ...
# label_data <- ...
# group_labels <- ...
# 计算最大系数值
library(ggplot2)
# 假设你的数据框 df 已经定义好了
# df <- data.frame(id = ..., coef = ..., sign = ...)
# 假设其他数据框 circle_data, tick_labels, label_data, group_labels 也已经定义好了
# circle_data <- ...
# tick_labels <- ...
# label_data <- ...
# group_labels <- ...
# 计算最大系数值
max_coef <- max(abs(df$coef), na.rm = TRUE)
# 绘图代码
q <- ggplot(df, aes(x = as.factor(id), y = coef, fill = sign)) +
geom_bar(stat = "identity", alpha = 0.7, size = 0.3) +
geom_path(data = circle_data, aes(x = x, y = y, group = as.factor(y)),
inherit.aes = FALSE, color = "gray70", linetype = "solid", alpha = 0.5) +
geom_text(data = tick_labels,
aes(x = x, y = y, label = label),
inherit.aes = FALSE,
color = "gray30",
size = 3,
hjust = -0.1) +
coord_polar(start = 0) +
ylim(-max_coef * 0.5, max_coef * 1.1) +
scale_fill_manual(values = c("positive" = "#F1B8F1", "negative" = "#80A9E6"),
labels = c("positive" = "Odds Ratio > 1", "negative" = "Odds Ratio < 1"),
drop = TRUE) + # 确保移除未使用的颜色级别
geom_text(data = label_data,
aes(x = id, y = coef + max_coef * 0.08,
label = name, angle = angle, hjust = hjust),
color = "black", alpha = 0.8, size = 3, inherit.aes = FALSE) +
geom_text(data = group_labels,
aes(x = title, y = -max_coef * 0.1, label = group),
inherit.aes = FALSE, fontface = "bold", size = 3) +
theme_minimal() +
theme(
legend.position = "right",
axis.text = element_blank(),
axis.title = element_blank(),
panel.grid = element_blank(),
plot.margin = unit(rep(-1, 4), "cm")
) +
labs(title = "Circle Bar Plot of Odds Ratio to Breast Cancer",
fill = "Odds Ratio")
# 显示图形
print(q)

# 显示图形
print(q)

# 保存图表
ggsave(
filename = "circle_barplot.png",
plot = q,
width = 28,
height =25,
units = "in"
)