读数据,合并excel

library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(writexl) 

# 读取baseline.xlsx文件中的两个sheet
baseline <- read_excel("Data/baseline.xlsx", sheet = "baseline")
decode <- read_excel("Data/baseline.xlsx", sheet = "decode")

# 提取decode中的var1和var3列
decode_mapping <- decode[, c("var1", "var3")]

# 替换baseline的列名 
name_mapping <- setNames(decode_mapping$var3, decode_mapping$var1)
names(baseline) <- name_mapping[names(baseline)]

#读取data2_0330.xlsx文件
data2_0330 <- read_excel("Data/data2_0330.xlsx", sheet = "data2_0330")
decode_data2 <- read_excel("Data/data2_0330.xlsx", sheet = "decode")

# 提取decode中的var1和var3列
decode_mapping_data2 <- decode_data2[, c("var1", "var3")]

# 替换data2_0330的列名,
name_mapping_data2 <- setNames(decode_mapping_data2$var3, decode_mapping_data2$var1)
names(data2_0330) <- name_mapping_data2[names(data2_0330)]


# 读取smoking and drinking.xlsx文件
smk_drink <- read_excel("Data/smoking and drinking.xlsx")
name_mapping <- setNames(decode_mapping$var3, decode_mapping$var1)
full_name <- c("id", "smoke", "drink")
mapping <- setNames(full_name, names(smk_drink))
names(smk_drink) <- mapping


# 读取C136(1).xlsx文件
c136 <- read_excel("Data/C136(1).xlsx", sheet = "C136") # warning does not matter --- see column p20441 
decode_c136 <- read_excel("Data/C136(1).xlsx", sheet = "decode")
decode_mapping_2 <- decode_c136[, c("var1", "var3")]

name_mapping1 <- setNames(decode_mapping_2$var3, decode_mapping_2$var1)
names(c136) <- name_mapping1[names(c136)]

# 读取com_disease文件
com_disease <- read_excel("Data/com_disease.xlsx")
data0 <- baseline %>%
  left_join(smk_drink, by = c("id" = "id")) %>%
  left_join(c136, by = c("id" = "id")) %>%
  left_join(com_disease, by=c("id"="id")) %>%
  left_join(data2_0330, by = c("id" = "id"))

#删除 sex 为 "male" 的行
data1 <- data0 %>%
  filter(sex != "Male")

# 保存为 Excel (女,患癌和不患癌)
write_xlsx(data1, path = "Data/merged_data_with&without.xlsx")

变量处理

# 加载必要的包
library(dplyr)

#创建分类变量
data1$Breast_Cancer <- ifelse(is.na(data1$date1), "Without_Cancer", "With_Cancer")
# with_Cancer = 1 ; without_cancer =0 

data1$Breast_Cancer <- ifelse(data1$Breast_Cancer=="With_Cancer",1,0)

# edu_level
data1$edu_first <- sub("\\|.*$", "", data1$edu)


data1 <- data1 %>%
  mutate(
    edu_level = case_when(
      # 捕获所有A Level/AS Level变体(不区分大小写)
      grepl("A levels?|AS levels?|Advanced Level",edu_first, ignore.case = TRUE) ~ "High school",
      grepl("O levels?|GCSEs?|CSEs?", edu_first, ignore.case = TRUE) ~ "Junior high school and below",
      grepl("College or University degree?", edu_first, ignore.case = TRUE) ~ "College",
      grepl("NVQ?|HNC|Certificate|Diploma|Professional Qualification", edu_first, ignore.case = TRUE) ~ "Vocational",
      grepl("Prefer not to answer", edu_first, ignore.case = TRUE) ~ "No answer given",
      is.na(edu_first) ~ "No answer given",
      # 默认保留原值(可根据需要修改)
      TRUE ~ as.character(edu_first)
    )
  )


# Neuro_score, smoke, drink,bmi,imd, employ,income,region
data1_clean <- data1 %>%
  mutate(
    # 处理Neuro_score的非标准缺失值
    Neuro_score = ifelse(Neuro_score %in% c("NA"), NA, Neuro_score),
    Neuro_score = as.numeric(Neuro_score),
  ) %>%
  # 删除关键变量的缺失值
  filter(
    !is.na(Neuro_score),
    !is.na(drink),
    !is.na(smoke),
    !is.na(bmi),
    !is.na(employ),
    !is.na(income) 
  ) %>%
  # 创建imd变量
  mutate(
    imd = coalesce(imde, imds, imdw),
    region = case_when(
      !is.na(imde) ~ "England",
      !is.na(imds) ~ "Scotland",
      !is.na(imdw) ~ "Wales",
      TRUE ~ NA_character_
    )
  ) %>%
  select(-imde, -imds, -imdw) %>%
  mutate(
    # 将分类变量转换为因子
    edu_level = as.factor(edu_level),
    region=as.factor(region)
  ) %>%
  mutate(
    smoke_group = case_when(
      smoke == "Daily or almost daily" ~ "Smoking everyday",
      smoke %in% c("Three or four times a week", 
                   "Once or twice a week",
                   "One to three times a month") ~ "Regular smoking (not daily)",
      smoke == "Special occasions only" ~ "Smoking occasionally",
      smoke == "Never" ~ "Never smoking",
      smoke == "Prefer not to answer"~ "Refuse to answer" ,
      TRUE ~ "Other"
    ) %>% factor(levels = c("Smoking everyday", "Regular smoking (not daily)", 
                            "Smoking occasionally", "Never smoking", "Refuse to answer"))
  )

#diag_prof
data1_clean$diag_prof<- ifelse(!is.na(data1_clean$diag_prof),1,0)

#employ
data1_clean <- data1_clean %>%
  mutate(
    employ_status = case_when(
      grepl("In paid employment?", employ, ignore.case = TRUE) ~ "Employed",
      grepl("Retired?", employ, ignore.case = TRUE) ~ "Retired",
      grepl("Full or part-time student|Doing unpaid or voluntary work|Looking after home?|Unemployed|Unable to work?", employ, ignore.case = TRUE) ~ "Unemployed",
      grepl("Prefer not to answer", employ, ignore.case = TRUE) ~ "No answer given",
      # 默认保留原值(可根据需要修改)
      TRUE ~ "Other"
    ),
    
    # 转换为因子并指定水平顺序
    employ_status = factor(
      employ_status,
      levels = c(
        "Employed",
        "Retired",
        "Unemployed",
        "Other",
        "No answer given"
      )
    )
  )

#income
library(stringr) 
data1_clean <- data1_clean %>%
  mutate(
    # 1. 首先清理收入数据(确保字符类型且无空格)
    income = str_trim(as.character(income)),
    # 2. 分类逻辑
    income_status = case_when(
      # 合并低收入范围(使用正则表达式捕获不同表述)
      grepl("18,000|30,999|31,000|51,999|Less than", income, ignore.case = TRUE) ~ "Lower than 52,000",
      # 中等收入范围(精确匹配)
      grepl("52,000 to 100,000", income, fixed = TRUE) ~ "52,000 to 100,000",
      # 高收入范围(兼容大小写和空格变体)
      grepl("Greater than 100,000|>100,000|100k\\+", income, ignore.case = TRUE) ~ "Greater than 100,000",
      # 处理拒绝回答和缺失值
      grepl("Do not know|Prefer not to answer|Unknown", income, ignore.case = TRUE) ~ "No answer given"
    ),
    # 3. 转换为有序因子
    income_status = factor(
      income_status,
      levels = c(
        "Lower than 52,000",
        "52,000 to 100,000",
        "Greater than 100,000",
        "No answer given"
      )
    )
  )

#Dur_mod_activity
data1_clean <- data1_clean %>%
  mutate(
    # 将Dur_mod_activity转换为数值型
    Dur_mod_activity = as.numeric(Dur_mod_activity),
    # 创建新的分类变量
    activity_level = case_when(
      is.na(Dur_mod_activity)   ~ "No answer given", 
      Dur_mod_activity %in% c("Prefer not to answer", "Do not know") ~ "No answer given",
      Dur_mod_activity < 30 ~ "Low activity",
      Dur_mod_activity >= 30 & Dur_mod_activity < 150 ~ "Moderate activity",
      Dur_mod_activity >= 150 ~ "High activity"
    ) %>% factor(levels = c("Low activity", "Moderate activity", "High activity", "No answer given"))
  )

Model

library(broom)
model3 <- glm(
  Breast_Cancer ~ Neuro_score +diag_prof+ age + edu_level + imd + region +  income_status + smoke + drink + bmi  + employ_status+Dur_mod_activity,
  data = data1_clean,
  family = binomial(link = "logit")  # 指定logistic回归
)

coefficients_df <- tidy(model3)
coefficients_df$OR <- exp(coefficients_df$estimate)
write_xlsx(coefficients_df, "logistic_model_coefficients.xlsx")
summary(model3)
## 
## Call:
## glm(formula = Breast_Cancer ~ Neuro_score + diag_prof + age + 
##     edu_level + imd + region + income_status + smoke + drink + 
##     bmi + employ_status + Dur_mod_activity, family = binomial(link = "logit"), 
##     data = data1_clean)
## 
## Coefficients:
##                                         Estimate Std. Error z value Pr(>|z|)
## (Intercept)                           -4.4370957  0.1431031 -31.006  < 2e-16
## Neuro_score                           -0.0122011  0.0039328  -3.102 0.001920
## diag_prof                              0.0479412  0.0365380   1.312 0.189489
## age                                    0.0268472  0.0021892  12.263  < 2e-16
## edu_levelHigh school                   0.0021942  0.0394504   0.056 0.955645
## edu_levelJunior high school and below -0.0797405  0.0319783  -2.494 0.012646
## edu_levelNo answer given              -0.1821705  0.1798178  -1.013 0.311020
## edu_levelNone of the above            -0.1263644  0.0420283  -3.007 0.002641
## edu_levelVocational                   -0.1575996  0.0452590  -3.482 0.000497
## imd                                   -0.0022363  0.0010105  -2.213 0.026888
## regionScotland                         0.0799810  0.0442617   1.807 0.070762
## regionWales                           -0.2456696  0.0674590  -3.642 0.000271
## income_status52,000 to 100,000        -0.0122563  0.0354190  -0.346 0.729316
## income_statusGreater than 100,000     -0.2008480  0.0644323  -3.117 0.001826
## income_statusNo answer given          -0.1311429  0.0382968  -3.424 0.000616
## smokeNever                            -0.1602901  0.0546973  -2.930 0.003384
## smokeOnce or twice a week             -0.0223869  0.0376331  -0.595 0.551928
## smokeOne to three times a month       -0.0490271  0.0455418  -1.077 0.281690
## smokePrefer not to answer             -9.4881582 49.2398785  -0.193 0.847200
## smokeSpecial occasions only           -0.1255616  0.0458928  -2.736 0.006220
## smokeThree or four times a week        0.0424903  0.0379874   1.119 0.263338
## drinkYes                               0.0382450  0.0249421   1.533 0.125189
## bmi                                    0.0038886  0.0025556   1.522 0.128112
## employ_statusRetired                  -0.0123733  0.0342539  -0.361 0.717933
## employ_statusUnemployed                0.1039021  0.0469024   2.215 0.026741
## employ_statusOther                    -0.0571323  0.1878818  -0.304 0.761062
## employ_statusNo answer given           0.0607573  0.2883618   0.211 0.833123
## Dur_mod_activity                      -0.0006766  0.0001894  -3.572 0.000354
##                                          
## (Intercept)                           ***
## Neuro_score                           ** 
## diag_prof                                
## age                                   ***
## edu_levelHigh school                     
## edu_levelJunior high school and below *  
## edu_levelNo answer given                 
## edu_levelNone of the above            ** 
## edu_levelVocational                   ***
## imd                                   *  
## regionScotland                        .  
## regionWales                           ***
## income_status52,000 to 100,000           
## income_statusGreater than 100,000     ** 
## income_statusNo answer given          ***
## smokeNever                            ** 
## smokeOnce or twice a week                
## smokeOne to three times a month          
## smokePrefer not to answer                
## smokeSpecial occasions only           ** 
## smokeThree or four times a week          
## drinkYes                                 
## bmi                                      
## employ_statusRetired                     
## employ_statusUnemployed               *  
## employ_statusOther                       
## employ_statusNo answer given             
## Dur_mod_activity                      ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 58147  on 155228  degrees of freedom
## Residual deviance: 57765  on 155201  degrees of freedom
##   (56705 observations deleted due to missingness)
## AIC: 57821
## 
## Number of Fisher Scoring iterations: 11
# 查看模型的系数
coefficients_df <- tidy(model3)
print(coefficients_df)
## # A tibble: 28 × 5
##    term                                  estimate std.error statistic   p.value
##    <chr>                                    <dbl>     <dbl>     <dbl>     <dbl>
##  1 (Intercept)                           -4.44      0.143    -31.0    4.44e-211
##  2 Neuro_score                           -0.0122    0.00393   -3.10   1.92e-  3
##  3 diag_prof                              0.0479    0.0365     1.31   1.89e-  1
##  4 age                                    0.0268    0.00219   12.3    1.42e- 34
##  5 edu_levelHigh school                   0.00219   0.0395     0.0556 9.56e-  1
##  6 edu_levelJunior high school and below -0.0797    0.0320    -2.49   1.26e-  2
##  7 edu_levelNo answer given              -0.182     0.180     -1.01   3.11e-  1
##  8 edu_levelNone of the above            -0.126     0.0420    -3.01   2.64e-  3
##  9 edu_levelVocational                   -0.158     0.0453    -3.48   4.97e-  4
## 10 imd                                   -0.00224   0.00101   -2.21   2.69e-  2
## # ℹ 18 more rows

CI forest plot

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)

# 提取置信区间
ci <- confint(model3, level = 0.95)
## Waiting for profiling to be done...
# 数据处理
plot_data <- tidy(model3) %>%
  mutate(OR = exp(estimate),
         term = fct_inorder(term)) %>%  # 保持原始变量顺序
  bind_cols(
    exp(confint(model3) %>%
        as_tibble() %>%
        rename(conf.low = 1, conf.high = 2)
    )
  ) %>%
  filter(!is.na(conf.low) & !is.na(conf.high)) %>%  # 移除NA的置信区间
  filter(term != "(Intercept)") %>%  # 移除截距项
  mutate(p.value = p.value)  # 确保p.value列已经存在
## Waiting for profiling to be done...
# 创建森林图
ggplot(plot_data, aes(x = OR, y = term)) +
  geom_vline(xintercept = 1, linetype = "dashed", color = "gray50") +
  geom_point(size = 3, shape = 18, color = "darkorange") +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high), 
                 height = 0.2, color = "steelblue", linewidth = 1) +
  scale_x_log10(breaks = c(0.5, 1, 2),  # 对数刻度
                limits = c(0.5, 2)) +  # 限制x轴范围为0.5到2
  labs(x = "Odds Ratio (95% CI)", y = "", 
       title = "Logistic Regression: Odds Ratios") +
  theme_bw(base_size = 14) +
  theme(panel.grid.minor = element_blank(),
        plot.title = element_text(hjust = 0.5, face = "bold"),
        axis.title = element_text(face = "bold")) +
  geom_text(aes(label = sprintf("p=%.2f", p.value)),  # 保留两位小数
            hjust = 1.5, vjust = 0.5, size = 3, color = "black")+
  theme_minimal()

# 保存图形
ggsave("CI_forest_plot.png", width = 8, height = 6, dpi = 300)

circle plot

library(tidyverse)
library(readxl)
library(grid)

df <- read_excel('/Users/qiujiahui/Desktop/2025 Summer/circle_barplot_model_coefficients.xlsx')

# 添加原始顺序索引
df$original_order <- seq_len(nrow(df))

# 2. 数据处理:计算 exp(Estimate) 并保留正负信息
df <- df %>%
  mutate(
    sign = ifelse(exp(estimate) >= 1, "positive", "negative"),
    coef = exp(estimate)
  )



# 4. 添加 ID - 按原始顺序
df$id <- seq(1, nrow(df))

# 5. 外圈标签数据 - 保持原始顺序
label_data <- df %>% filter(!is.na(name))
number_of_bar <- nrow(df)

# 调整标签角度计算
angle <- 90 - 360 * (label_data$id - 0.5) / number_of_bar
label_data$angle <- ifelse(angle < -90, angle + 180, angle)
label_data$hjust <- ifelse(angle < -90, 1, 0)

# 6. 内圈组别标签位置 - 基于原始顺序
group_labels <- df %>%
  filter(!is.na(group)) %>%
  group_by(group) %>%
  summarise(
    min_id = min(id),
    max_id = max(id),
    title = mean(c(min_id, max_id))
  )

# 7. 同心圆刻度线数据
max_coef <- max(df$coef, na.rm = TRUE)
generate_circle_data <- function(radius, n_points = 360) {
  theta <- seq(0, 2*pi, length.out = n_points)
  x <- (theta / (2 * pi)) * number_of_bar + 1
  y <- rep(radius, n_points)
  data.frame(x = x, y = y)
}

circle_data <- map_dfr(c(0.3, 0.6, 0.9,1, 1.2, 1.5, 1.8), generate_circle_data)

tick_labels <- data.frame(
  x = 1,
  y = unique(circle_data$y),
  label = sprintf("%.1f", unique(circle_data$y))
)

# 8. 创建图表
library(ggplot2)

# 假设你的数据框 df 已经定义好了
# df <- data.frame(id = ..., coef = ..., sign = ...)

# 假设其他数据框 circle_data, tick_labels, label_data, group_labels 也已经定义好了
# circle_data <- ...
# tick_labels <- ...
# label_data <- ...
# group_labels <- ...

# 计算最大系数值
library(ggplot2)

# 假设你的数据框 df 已经定义好了
# df <- data.frame(id = ..., coef = ..., sign = ...)

# 假设其他数据框 circle_data, tick_labels, label_data, group_labels 也已经定义好了
# circle_data <- ...
# tick_labels <- ...
# label_data <- ...
# group_labels <- ...

# 计算最大系数值
max_coef <- max(abs(df$coef), na.rm = TRUE)

# 绘图代码
q <- ggplot(df, aes(x = as.factor(id), y = coef, fill = sign)) +
  geom_bar(stat = "identity", alpha = 0.7, size = 0.3) +
  geom_path(data = circle_data, aes(x = x, y = y, group = as.factor(y)),
            inherit.aes = FALSE, color = "gray70", linetype = "solid", alpha = 0.5) +
  geom_text(data = tick_labels,
            aes(x = x, y = y, label = label),
            inherit.aes = FALSE,
            color = "gray30",
            size = 3,
            hjust = -0.1) +
  coord_polar(start = 0) +
  ylim(-max_coef * 0.5, max_coef * 1.1) +
  scale_fill_manual(values = c("positive" = "#F1B8F1", "negative" = "#80A9E6"),
                    labels = c("positive" = "Odds Ratio > 1", "negative" = "Odds Ratio < 1"),
                    drop = TRUE) +  # 确保移除未使用的颜色级别
  geom_text(data = label_data,
            aes(x = id, y = coef + max_coef * 0.08, 
                label = name, angle = angle, hjust = hjust),
            color = "black", alpha = 0.8, size = 3, inherit.aes = FALSE) +
  geom_text(data = group_labels, 
            aes(x = title, y = -max_coef * 0.1, label = group), 
            inherit.aes = FALSE, fontface = "bold", size = 3) +
  theme_minimal() +
  theme(
    legend.position = "right",
    axis.text = element_blank(),
    axis.title = element_blank(),
    panel.grid = element_blank(),
    plot.margin = unit(rep(-1, 4), "cm")
  ) +
  labs(title = "Circle Bar Plot of Odds Ratio to Breast Cancer",
       fill = "Odds Ratio")

# 显示图形
print(q)

# 显示图形
print(q)

# 保存图表
ggsave(
  filename = "circle_barplot.png", 
  plot = q,
  width = 28,
  height =25,
  units = "in"
)