library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# 读取baseline.xlsx文件中的两个sheet
baseline <- read_excel("Data/baseline.xlsx", sheet = "baseline")
decode <- read_excel("Data/baseline.xlsx", sheet = "decode")
# 提取decode中的var1和var3列
decode_mapping <- decode[, c("var1", "var3")]
# 替换baseline的列名
name_mapping <- setNames(decode_mapping$var3, decode_mapping$var1)
name_mapping
## eid p21022 p31 p26410 p26427 p26426
## "id" "age" "sex" "imde" "imds" "imdw"
## p6138_i0 p845_i0 p21000_i0 p6142_i0 p680_i0 p728_i0
## "edu" "edu_age" "ethnic" "employ" "house" "car"
## p738_i0 p4079_i0_a0 p4079_i0_a1 p94_i0_a0 p94_i0_a1 p4080_i0_a0
## "income" "dbp1" "dbp2" "dbp3" "dbp4" "sbp1"
## p4080_i0_a1 p93_i0_a0 p93_i0_a1 p21001_i0 p48_i0 p49_i0
## "sbp2" "sbp3" "sbp4" "bmi" "waist" "hip"
## p54_i0
## "center"
names(baseline) <- name_mapping[names(baseline)]
# 读取smoking and drinking
smk_drink <- read_excel("Data/smoking and drinking.xlsx")
name_mapping <- setNames(decode_mapping$var3, decode_mapping$var1)
full_name <- c("id", "smoke", "drink")
mapping <- setNames(full_name, names(smk_drink))
names(smk_drink) <- mapping
# 读取C136
c136 <- read_excel("Data/C136(1).xlsx", sheet = "C136") # warning does not matter --- see column p20441
decode_c136 <- read_excel("Data/C136(1).xlsx", sheet = "decode")
decode_mapping_2 <- decode_c136[, c("var1", "var3")]
name_mapping1 <- setNames(decode_mapping_2$var3, decode_mapping_2$var1)
names(c136) <- name_mapping1[names(c136)]
# 读取com_disease文件
com_disease <- read_excel("Data/com_disease.xlsx")
library(dplyr)
data0 <- baseline %>%
left_join(smk_drink, by = c("id" = "id")) %>%
left_join(c136, by = c("id" = "id")) %>%
left_join(com_disease, by=c("id"="id"))
#删除 sex 为 "male" 的行
data1 <- data0 %>%
filter(sex != "Male")
# 加载必要的包
library(dplyr)
#创建分类变量
data1$Breast_Cancer <- ifelse(is.na(data1$date1), "Without_Cancer", "With_Cancer")
# with_Cancer = 1 ; without_cancer =0
data1$Breast_Cancer <- ifelse(data1$Breast_Cancer=="With_Cancer",1,0)
# edu_level
data1$edu_first <- sub("\\|.*$", "", data1$edu)
data1 <- data1 %>%
mutate(
edu_level = case_when(
# 捕获所有A Level/AS Level变体(不区分大小写)
grepl("A levels?|AS levels?|Advanced Level",edu_first, ignore.case = TRUE) ~ "High school",
grepl("O levels?|GCSEs?|CSEs?", edu_first, ignore.case = TRUE) ~ "Junior high school and below",
grepl("College or University degree?", edu_first, ignore.case = TRUE) ~ "College",
grepl("NVQ?|HNC|Certificate|Diploma|Professional Qualification", edu_first, ignore.case = TRUE) ~ "Vocational",
grepl("Prefer not to answer", edu_first, ignore.case = TRUE) ~ "No answer given",
is.na(edu_first) ~ "No answer given",
# 默认保留原值(可根据需要修改)
TRUE ~ as.character(edu_first)
)
)
# Neuro_score, smoke, drink,bmi,imd, employ,income,region
data1_clean <- data1 %>%
mutate(
# 处理Neuro_score的非标准缺失值
Neuro_score = ifelse(Neuro_score %in% c("NA"), NA, Neuro_score),
Neuro_score = as.numeric(Neuro_score),
) %>%
# 删除关键变量的缺失值
filter(
!is.na(Neuro_score),
!is.na(drink),
!is.na(smoke),
!is.na(bmi),
!is.na(employ),
!is.na(income)
) %>%
# 创建imd变量
mutate(
imd = coalesce(imde, imds, imdw),
region = case_when(
!is.na(imde) ~ "England",
!is.na(imds) ~ "Scotland",
!is.na(imdw) ~ "Wales",
TRUE ~ NA_character_
)
) %>%
select(-imde, -imds, -imdw) %>%
mutate(
# 将分类变量转换为因子
edu_level = as.factor(edu_level),
region=as.factor(region)
) %>%
mutate(
smoke_group = case_when(
smoke == "Daily or almost daily" ~ "Smoking everyday",
smoke %in% c("Three or four times a week",
"Once or twice a week",
"One to three times a month") ~ "Regular smoking (not daily)",
smoke == "Special occasions only" ~ "Smoking occasionally",
smoke == "Never" ~ "Never smoking",
smoke == "Prefer not to answer"~ "Refuse to answer" ,
TRUE ~ "Other"
) %>% factor(levels = c("Smoking everyday", "Regular smoking (not daily)",
"Smoking occasionally", "Never smoking", "Refuse to answer"))
)
#diag_prof
data1_clean$diag_prof<- ifelse(!is.na(data1_clean$diag_prof),1,0)
#employ
data1_clean <- data1_clean %>%
mutate(
employ_status = case_when(
grepl("In paid employment?", employ, ignore.case = TRUE) ~ "Employed",
grepl("Retired?", employ, ignore.case = TRUE) ~ "Retired",
grepl("Full or part-time student|Doing unpaid or voluntary work|Looking after home?|Unemployed|Unable to work?", employ, ignore.case = TRUE) ~ "Unemployed",
grepl("Prefer not to answer", employ, ignore.case = TRUE) ~ "No answer given",
# 默认保留原值(可根据需要修改)
TRUE ~ "Other"
),
# 转换为因子并指定水平顺序
employ_status = factor(
employ_status,
levels = c(
"Employed",
"Retired",
"Unemployed",
"Other",
"No answer given"
)
)
)
#income
library(stringr) # 加载 stringr 包
data1_clean <- data1_clean %>%
mutate(
# 1. 首先清理收入数据(确保字符类型且无空格)
income = str_trim(as.character(income)),
# 2. 分类逻辑
income_status = case_when(
# 合并低收入范围(使用正则表达式捕获不同表述)
grepl("18,000|30,999|31,000|51,999|Less than", income, ignore.case = TRUE) ~ "Lower than 52,000",
# 中等收入范围(精确匹配)
grepl("52,000 to 100,000", income, fixed = TRUE) ~ "52,000 to 100,000",
# 高收入范围(兼容大小写和空格变体)
grepl("Greater than 100,000|>100,000|100k\\+", income, ignore.case = TRUE) ~ "Greater than 100,000",
# 处理拒绝回答和缺失值
grepl("Do not know|Prefer not to answer|Unknown", income, ignore.case = TRUE) ~ "No answer given"
),
# 3. 转换为有序因子
income_status = factor(
income_status,
levels = c(
"Lower than 52,000",
"52,000 to 100,000",
"Greater than 100,000",
"No answer given"
),
ordered = TRUE # 设为有序因子(适用于收入等级)
)
)
# 加载必要的包
library(dplyr)
library(gtsummary)
library(gt)
# 创建分类变量并选择所需列
analysis_data <- data1_clean %>%
select(
Breast_Cancer,
edu_level,
employ_status,
income_status,
smoke_group,
drink,
diag_prof,
age,
bmi,
Neuro_score,
imd,
region # 添加地区变量
) %>%
mutate(
Breast_Cancer = factor(Breast_Cancer,
levels = c(1, 0),
labels = c("With Cancer", "Without Cancer")),
diag_prof = factor(diag_prof,
levels = c(1, 0),
labels = c("Yes", "No")),
region = factor(region, # 确保region是因子类型
levels = c("England", "Scotland", "Wales"))
)
# 分类变量表格
categorical_table <- analysis_data %>%
select(
Breast_Cancer,
edu_level,
employ_status,
income_status,
smoke_group,
drink,
diag_prof,
region
) %>%
tbl_summary(
by = Breast_Cancer,
missing = "no",
label = list(
edu_level ~ "Education level",
employ_status ~ "Employment status",
income_status ~ "Monthly household income",
smoke_group ~ "Smoking frequency",
drink ~ "Drinking",
diag_prof ~ "Mental disorder diagnosis",
region ~ "Region"
),
type = all_categorical() ~ "categorical",
statistic = all_categorical() ~ "{n} ({p}%)"
) %>%
add_overall() %>%
add_p(
test = all_categorical() ~ "chisq.test",
test.args = all_categorical() ~ list(simulate.p.value = TRUE)
) %>%
modify_header(
label ~ "**Characteristic**",
stat_0 ~ "**Overall** (N = {N})",
stat_1 ~ "**With Cancer** (N = {n})",
stat_2 ~ "**Without Cancer** (N = {n})",
p.value ~ "**p-value**"
) %>%
modify_spanning_header(c("stat_1", "stat_2") ~ "**Breast Cancer Status**") %>%
modify_caption("Table 1. Categorical Variables by Breast Cancer Status") %>%
bold_labels() %>%
italicize_levels()
# 连续变量表格
continuous_table <- analysis_data %>%
select(
Breast_Cancer,
age,
bmi,
Neuro_score,
imd
) %>%
tbl_summary(
by = Breast_Cancer,
missing = "no",
label = list(
age ~ "Age (years)",
bmi ~ "Body Mass Index (kg/m²)",
Neuro_score ~ "Neuroticism score",
imd ~ "Index of Multiple Deprivation"
),
type = all_continuous() ~ "continuous2",
statistic = all_continuous() ~ "{mean} ({sd})"
) %>%
add_overall() %>%
add_p(test = all_continuous() ~ "t.test") %>%
modify_header(
label ~ "**Characteristic**",
stat_0 ~ "**Overall** (N = {N})",
stat_1 ~ "**With Cancer** (N = {n})",
stat_2 ~ "**Without Cancer** (N = {n})",
p.value ~ "**p-value**"
) %>%
modify_spanning_header(c("stat_1", "stat_2") ~ "**Breast Cancer Status**") %>%
modify_caption("Table 2. Continuous Variables by Breast Cancer Status") %>%
bold_labels()
# 转换为gt表格并显示
categorical_table_gt <- as_gt(categorical_table)
continuous_table_gt <- as_gt(continuous_table)
# 显示表格
categorical_table_gt
| Characteristic | Overall (N = 211934)1 |
Breast Cancer Status
|
p-value2 | |
|---|---|---|---|---|
| With Cancer (N = 9795)1 | Without Cancer (N = 202139)1 | |||
| Education level | 0.027 | |||
| College | 68,488 (32%) | 3,286 (34%) | 65,202 (32%) | |
| High school | 25,655 (12%) | 1,225 (13%) | 24,430 (12%) | |
| Junior high school and below | 61,163 (29%) | 2,753 (28%) | 58,410 (29%) | |
| No answer given | 1,505 (0.7%) | 60 (0.6%) | 1,445 (0.7%) | |
| None of the above | 33,546 (16%) | 1,524 (16%) | 32,022 (16%) | |
| Vocational | 21,577 (10%) | 947 (9.7%) | 20,630 (10%) | |
| Employment status | <0.001 | |||
| Employed | 119,516 (56%) | 5,099 (52%) | 114,417 (57%) | |
| Retired | 72,329 (34%) | 3,846 (39%) | 68,483 (34%) | |
| Unemployed | 18,559 (8.8%) | 781 (8.0%) | 17,778 (8.8%) | |
| Other | 1,003 (0.5%) | 40 (0.4%) | 963 (0.5%) | |
| No answer given | 527 (0.2%) | 29 (0.3%) | 498 (0.2%) | |
| Monthly household income | <0.001 | |||
| Lower than 52,000 | 135,522 (64%) | 6,457 (66%) | 129,065 (64%) | |
| 52,000 to 100,000 | 35,190 (17%) | 1,554 (16%) | 33,636 (17%) | |
| Greater than 100,000 | 9,385 (4.4%) | 376 (3.8%) | 9,009 (4.5%) | |
| No answer given | 31,837 (15%) | 1,408 (14%) | 30,429 (15%) | |
| Smoking frequency | <0.001 | |||
| Smoking everyday | 35,030 (17%) | 1,778 (18%) | 33,252 (16%) | |
| Regular smoking (not daily) | 127,098 (60%) | 5,932 (61%) | 121,166 (60%) | |
| Smoking occasionally | 30,632 (14%) | 1,308 (13%) | 29,324 (15%) | |
| Never smoking | 19,087 (9.0%) | 776 (7.9%) | 18,311 (9.1%) | |
| Refuse to answer | 87 (<0.1%) | 1 (<0.1%) | 86 (<0.1%) | |
| Drinking | 0.006 | |||
| No | 94,319 (45%) | 4,227 (43%) | 90,092 (45%) | |
| Yes | 117,615 (55%) | 5,568 (57%) | 112,047 (55%) | |
| Mental disorder diagnosis | 0.7 | |||
| Yes | 26,624 (13%) | 1,243 (13%) | 25,381 (13%) | |
| No | 185,310 (87%) | 8,552 (87%) | 176,758 (87%) | |
| Region | <0.001 | |||
| England | 181,783 (88%) | 8,425 (88%) | 173,358 (88%) | |
| Scotland | 15,859 (7.7%) | 805 (8.4%) | 15,054 (7.6%) | |
| Wales | 9,097 (4.4%) | 337 (3.5%) | 8,760 (4.4%) | |
| 1 n (%) | ||||
| 2 Pearson’s Chi-squared test with simulated p-value (based on 2000 replicates) | ||||
continuous_table_gt
| Characteristic | Overall (N = 211934) |
Breast Cancer Status
|
p-value1 | |
|---|---|---|---|---|
| With Cancer (N = 9795) | Without Cancer (N = 202139) | |||
| Age (years) | <0.001 | |||
| Mean (SD) | 56 (8) | 57 (7) | 56 (8) | |
| Body Mass Index (kg/m²) | 0.6 | |||
| Mean (SD) | 27.1 (5.2) | 27.1 (4.9) | 27.1 (5.2) | |
| Neuroticism score | <0.001 | |||
| Mean (SD) | 4.6 (3.2) | 4.4 (3.2) | 4.6 (3.3) | |
| Index of Multiple Deprivation | <0.001 | |||
| Mean (SD) | 17 (13) | 16 (13) | 17 (14) | |
| 1 Welch Two Sample t-test | ||||
#导出表格
library(gt)
gtsave(categorical_table_gt, "descriptive_table_categorical.html")
gtsave(continuous_table_gt, "descriptive_table_continue.html")