read excel

library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# 读取baseline.xlsx文件中的两个sheet
baseline <- read_excel("Data/baseline.xlsx", sheet = "baseline")
decode <- read_excel("Data/baseline.xlsx", sheet = "decode")

# 提取decode中的var1和var3列
decode_mapping <- decode[, c("var1", "var3")]

# 替换baseline的列名 
name_mapping <- setNames(decode_mapping$var3, decode_mapping$var1)
name_mapping
##         eid      p21022         p31      p26410      p26427      p26426 
##        "id"       "age"       "sex"      "imde"      "imds"      "imdw" 
##    p6138_i0     p845_i0   p21000_i0    p6142_i0     p680_i0     p728_i0 
##       "edu"   "edu_age"    "ethnic"    "employ"     "house"       "car" 
##     p738_i0 p4079_i0_a0 p4079_i0_a1   p94_i0_a0   p94_i0_a1 p4080_i0_a0 
##    "income"      "dbp1"      "dbp2"      "dbp3"      "dbp4"      "sbp1" 
## p4080_i0_a1   p93_i0_a0   p93_i0_a1   p21001_i0      p48_i0      p49_i0 
##      "sbp2"      "sbp3"      "sbp4"       "bmi"     "waist"       "hip" 
##      p54_i0 
##    "center"
names(baseline) <- name_mapping[names(baseline)]

# 读取smoking and drinking
smk_drink <- read_excel("Data/smoking and drinking.xlsx")
name_mapping <- setNames(decode_mapping$var3, decode_mapping$var1)
full_name <- c("id", "smoke", "drink")

mapping <- setNames(full_name, names(smk_drink))

names(smk_drink) <- mapping

# 读取C136
c136 <- read_excel("Data/C136(1).xlsx", sheet = "C136") # warning does not matter --- see column p20441 
decode_c136 <- read_excel("Data/C136(1).xlsx", sheet = "decode")
decode_mapping_2 <- decode_c136[, c("var1", "var3")]

name_mapping1 <- setNames(decode_mapping_2$var3, decode_mapping_2$var1)
names(c136) <- name_mapping1[names(c136)]

# 读取com_disease文件
com_disease <- read_excel("Data/com_disease.xlsx")

library(dplyr)
data0 <- baseline %>%
  left_join(smk_drink, by = c("id" = "id")) %>%
  left_join(c136, by = c("id" = "id")) %>%
  left_join(com_disease, by=c("id"="id"))

#删除 sex 为 "male" 的行
data1 <- data0 %>%
  filter(sex != "Male")

变量处理

# 加载必要的包
library(dplyr)

#创建分类变量
data1$Breast_Cancer <- ifelse(is.na(data1$date1), "Without_Cancer", "With_Cancer")
# with_Cancer = 1 ; without_cancer =0 

data1$Breast_Cancer <- ifelse(data1$Breast_Cancer=="With_Cancer",1,0)

# edu_level
data1$edu_first <- sub("\\|.*$", "", data1$edu)


data1 <- data1 %>%
  mutate(
    edu_level = case_when(
      # 捕获所有A Level/AS Level变体(不区分大小写)
      grepl("A levels?|AS levels?|Advanced Level",edu_first, ignore.case = TRUE) ~ "High school",
      grepl("O levels?|GCSEs?|CSEs?", edu_first, ignore.case = TRUE) ~ "Junior high school and below",
      grepl("College or University degree?", edu_first, ignore.case = TRUE) ~ "College",
      grepl("NVQ?|HNC|Certificate|Diploma|Professional Qualification", edu_first, ignore.case = TRUE) ~ "Vocational",
      grepl("Prefer not to answer", edu_first, ignore.case = TRUE) ~ "No answer given",
      is.na(edu_first) ~ "No answer given",
      # 默认保留原值(可根据需要修改)
      TRUE ~ as.character(edu_first)
    )
  )


# Neuro_score, smoke, drink,bmi,imd, employ,income,region
data1_clean <- data1 %>%
  mutate(
    # 处理Neuro_score的非标准缺失值
    Neuro_score = ifelse(Neuro_score %in% c("NA"), NA, Neuro_score),
    Neuro_score = as.numeric(Neuro_score),
  ) %>%
  # 删除关键变量的缺失值
  filter(
    !is.na(Neuro_score),
    !is.na(drink),
    !is.na(smoke),
    !is.na(bmi),
    !is.na(employ),
    !is.na(income) 
  ) %>%
  # 创建imd变量
  mutate(
    imd = coalesce(imde, imds, imdw),
    region = case_when(
      !is.na(imde) ~ "England",
      !is.na(imds) ~ "Scotland",
      !is.na(imdw) ~ "Wales",
      TRUE ~ NA_character_
    )
  ) %>%
  select(-imde, -imds, -imdw) %>%
  mutate(
    # 将分类变量转换为因子
    edu_level = as.factor(edu_level),
    region=as.factor(region)
  ) %>%
  mutate(
    smoke_group = case_when(
      smoke == "Daily or almost daily" ~ "Smoking everyday",
      smoke %in% c("Three or four times a week", 
                   "Once or twice a week",
                   "One to three times a month") ~ "Regular smoking (not daily)",
      smoke == "Special occasions only" ~ "Smoking occasionally",
      smoke == "Never" ~ "Never smoking",
      smoke == "Prefer not to answer"~ "Refuse to answer" ,
      TRUE ~ "Other"
    ) %>% factor(levels = c("Smoking everyday", "Regular smoking (not daily)", 
                            "Smoking occasionally", "Never smoking", "Refuse to answer"))
  )

#diag_prof
data1_clean$diag_prof<- ifelse(!is.na(data1_clean$diag_prof),1,0)

#employ
data1_clean <- data1_clean %>%
  mutate(
    employ_status = case_when(
      grepl("In paid employment?", employ, ignore.case = TRUE) ~ "Employed",
      grepl("Retired?", employ, ignore.case = TRUE) ~ "Retired",
      grepl("Full or part-time student|Doing unpaid or voluntary work|Looking after home?|Unemployed|Unable to work?", employ, ignore.case = TRUE) ~ "Unemployed",
      grepl("Prefer not to answer", employ, ignore.case = TRUE) ~ "No answer given",
      # 默认保留原值(可根据需要修改)
      TRUE ~ "Other"
    ),
    
    # 转换为因子并指定水平顺序
    employ_status = factor(
      employ_status,
      levels = c(
        "Employed",
        "Retired",
        "Unemployed",
        "Other",
        "No answer given"
      )
    )
  )

#income
library(stringr)  # 加载 stringr 包
data1_clean <- data1_clean %>%
  mutate(
    # 1. 首先清理收入数据(确保字符类型且无空格)
    income = str_trim(as.character(income)),
    # 2. 分类逻辑
    income_status = case_when(
      # 合并低收入范围(使用正则表达式捕获不同表述)
      grepl("18,000|30,999|31,000|51,999|Less than", income, ignore.case = TRUE) ~ "Lower than 52,000",
      # 中等收入范围(精确匹配)
      grepl("52,000 to 100,000", income, fixed = TRUE) ~ "52,000 to 100,000",
      # 高收入范围(兼容大小写和空格变体)
      grepl("Greater than 100,000|>100,000|100k\\+", income, ignore.case = TRUE) ~ "Greater than 100,000",
      # 处理拒绝回答和缺失值
      grepl("Do not know|Prefer not to answer|Unknown", income, ignore.case = TRUE) ~ "No answer given"
    ),
    # 3. 转换为有序因子
    income_status = factor(
      income_status,
      levels = c(
        "Lower than 52,000",
        "52,000 to 100,000",
        "Greater than 100,000",
        "No answer given"
      ),
      ordered = TRUE  # 设为有序因子(适用于收入等级)
    )
  )

描述性统计的表

# 加载必要的包
library(dplyr)
library(gtsummary)
library(gt)

# 创建分类变量并选择所需列
analysis_data <- data1_clean %>%
  select(
    Breast_Cancer, 
    edu_level, 
    employ_status, 
    income_status, 
    smoke_group, 
    drink, 
    diag_prof, 
    age, 
    bmi,
    Neuro_score,
    imd,
    region  # 添加地区变量
  ) %>%
  mutate(
    Breast_Cancer = factor(Breast_Cancer, 
                          levels = c(1, 0),
                          labels = c("With Cancer", "Without Cancer")),
    diag_prof = factor(diag_prof,
                      levels = c(1, 0),
                      labels = c("Yes", "No")),
    region = factor(region,  # 确保region是因子类型
                   levels = c("England", "Scotland", "Wales"))
  )

# 分类变量表格
categorical_table <- analysis_data %>%
  select(
    Breast_Cancer,
    edu_level,
    employ_status,
    income_status,
    smoke_group,
    drink,
    diag_prof,
    region
  ) %>%
  tbl_summary(
    by = Breast_Cancer,
    missing = "no",
    label = list(
      edu_level ~ "Education level",
      employ_status ~ "Employment status",
      income_status ~ "Monthly household income",
      smoke_group ~ "Smoking frequency",
      drink ~ "Drinking",
      diag_prof ~ "Mental disorder diagnosis",
      region ~ "Region"
    ),
    type = all_categorical() ~ "categorical",
    statistic = all_categorical() ~ "{n} ({p}%)"
  ) %>%
  add_overall() %>%
  add_p(
    test = all_categorical() ~ "chisq.test",
    test.args = all_categorical() ~ list(simulate.p.value = TRUE)
  ) %>%
  modify_header(
    label ~ "**Characteristic**",
    stat_0 ~ "**Overall** (N = {N})",
    stat_1 ~ "**With Cancer** (N = {n})",
    stat_2 ~ "**Without Cancer** (N = {n})",
    p.value ~ "**p-value**"
  ) %>%
  modify_spanning_header(c("stat_1", "stat_2") ~ "**Breast Cancer Status**") %>%
  modify_caption("Table 1. Categorical Variables by Breast Cancer Status") %>%
  bold_labels() %>%
  italicize_levels()

# 连续变量表格
continuous_table <- analysis_data %>%
  select(
    Breast_Cancer,
    age,
    bmi,
    Neuro_score,
    imd
  ) %>%
  tbl_summary(
    by = Breast_Cancer,
    missing = "no",
    label = list(
      age ~ "Age (years)",
      bmi ~ "Body Mass Index (kg/m²)",
      Neuro_score ~ "Neuroticism score",
      imd ~ "Index of Multiple Deprivation"
    ),
    type = all_continuous() ~ "continuous2",
    statistic = all_continuous() ~ "{mean} ({sd})"
  ) %>%
  add_overall() %>%
  add_p(test = all_continuous() ~ "t.test") %>%
  modify_header(
    label ~ "**Characteristic**",
    stat_0 ~ "**Overall** (N = {N})",
    stat_1 ~ "**With Cancer** (N = {n})",
    stat_2 ~ "**Without Cancer** (N = {n})",
    p.value ~ "**p-value**"
  ) %>%
  modify_spanning_header(c("stat_1", "stat_2") ~ "**Breast Cancer Status**") %>%
  modify_caption("Table 2. Continuous Variables by Breast Cancer Status") %>%
  bold_labels()

# 转换为gt表格并显示
categorical_table_gt <- as_gt(categorical_table)
continuous_table_gt <- as_gt(continuous_table)

# 显示表格
categorical_table_gt
Table 1. Categorical Variables by Breast Cancer Status
Characteristic Overall (N = 211934)1
Breast Cancer Status
p-value2
With Cancer (N = 9795)1 Without Cancer (N = 202139)1
Education level


0.027
    College 68,488 (32%) 3,286 (34%) 65,202 (32%)
    High school 25,655 (12%) 1,225 (13%) 24,430 (12%)
    Junior high school and below 61,163 (29%) 2,753 (28%) 58,410 (29%)
    No answer given 1,505 (0.7%) 60 (0.6%) 1,445 (0.7%)
    None of the above 33,546 (16%) 1,524 (16%) 32,022 (16%)
    Vocational 21,577 (10%) 947 (9.7%) 20,630 (10%)
Employment status


<0.001
    Employed 119,516 (56%) 5,099 (52%) 114,417 (57%)
    Retired 72,329 (34%) 3,846 (39%) 68,483 (34%)
    Unemployed 18,559 (8.8%) 781 (8.0%) 17,778 (8.8%)
    Other 1,003 (0.5%) 40 (0.4%) 963 (0.5%)
    No answer given 527 (0.2%) 29 (0.3%) 498 (0.2%)
Monthly household income


<0.001
    Lower than 52,000 135,522 (64%) 6,457 (66%) 129,065 (64%)
    52,000 to 100,000 35,190 (17%) 1,554 (16%) 33,636 (17%)
    Greater than 100,000 9,385 (4.4%) 376 (3.8%) 9,009 (4.5%)
    No answer given 31,837 (15%) 1,408 (14%) 30,429 (15%)
Smoking frequency


<0.001
    Smoking everyday 35,030 (17%) 1,778 (18%) 33,252 (16%)
    Regular smoking (not daily) 127,098 (60%) 5,932 (61%) 121,166 (60%)
    Smoking occasionally 30,632 (14%) 1,308 (13%) 29,324 (15%)
    Never smoking 19,087 (9.0%) 776 (7.9%) 18,311 (9.1%)
    Refuse to answer 87 (<0.1%) 1 (<0.1%) 86 (<0.1%)
Drinking


0.006
    No 94,319 (45%) 4,227 (43%) 90,092 (45%)
    Yes 117,615 (55%) 5,568 (57%) 112,047 (55%)
Mental disorder diagnosis


0.7
    Yes 26,624 (13%) 1,243 (13%) 25,381 (13%)
    No 185,310 (87%) 8,552 (87%) 176,758 (87%)
Region


<0.001
    England 181,783 (88%) 8,425 (88%) 173,358 (88%)
    Scotland 15,859 (7.7%) 805 (8.4%) 15,054 (7.6%)
    Wales 9,097 (4.4%) 337 (3.5%) 8,760 (4.4%)
1 n (%)
2 Pearson’s Chi-squared test with simulated p-value (based on 2000 replicates)
continuous_table_gt
Table 2. Continuous Variables by Breast Cancer Status
Characteristic Overall (N = 211934)
Breast Cancer Status
p-value1
With Cancer (N = 9795) Without Cancer (N = 202139)
Age (years)


<0.001
    Mean (SD) 56 (8) 57 (7) 56 (8)
Body Mass Index (kg/m²)


0.6
    Mean (SD) 27.1 (5.2) 27.1 (4.9) 27.1 (5.2)
Neuroticism score


<0.001
    Mean (SD) 4.6 (3.2) 4.4 (3.2) 4.6 (3.3)
Index of Multiple Deprivation


<0.001
    Mean (SD) 17 (13) 16 (13) 17 (14)
1 Welch Two Sample t-test
#导出表格
library(gt)
gtsave(categorical_table_gt, "descriptive_table_categorical.html")
gtsave(continuous_table_gt, "descriptive_table_continue.html")