descriptive

read excel

library(readxl)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# 读取baseline.xlsx文件中的两个sheet
baseline <- read_excel("Data/baseline.xlsx", sheet = "baseline")
decode <- read_excel("Data/baseline.xlsx", sheet = "decode")

# 提取decode中的var1和var3列
decode_mapping <- decode[, c("var1", "var3")]

# 替换baseline的列名 
name_mapping <- setNames(decode_mapping$var3, decode_mapping$var1)
name_mapping

##         eid      p21022         p31      p26410      p26427      p26426 
##        "id"       "age"       "sex"      "imde"      "imds"      "imdw" 
##    p6138_i0     p845_i0   p21000_i0    p6142_i0     p680_i0     p728_i0 
##       "edu"   "edu_age"    "ethnic"    "employ"     "house"       "car" 
##     p738_i0 p4079_i0_a0 p4079_i0_a1   p94_i0_a0   p94_i0_a1 p4080_i0_a0 
##    "income"      "dbp1"      "dbp2"      "dbp3"      "dbp4"      "sbp1" 
## p4080_i0_a1   p93_i0_a0   p93_i0_a1   p21001_i0      p48_i0      p49_i0 
##      "sbp2"      "sbp3"      "sbp4"       "bmi"     "waist"       "hip" 
##      p54_i0 
##    "center"

names(baseline) <- name_mapping[names(baseline)]

# 读取smoking and drinking
smk_drink <- read_excel("Data/smoking and drinking.xlsx")
name_mapping <- setNames(decode_mapping$var3, decode_mapping$var1)
full_name <- c("id", "smoke", "drink")

mapping <- setNames(full_name, names(smk_drink))

names(smk_drink) <- mapping

# 读取C136
c136 <- read_excel("Data/C136(1).xlsx", sheet = "C136") # warning does not matter --- see column p20441 
decode_c136 <- read_excel("Data/C136(1).xlsx", sheet = "decode")
decode_mapping_2 <- decode_c136[, c("var1", "var3")]

name_mapping1 <- setNames(decode_mapping_2$var3, decode_mapping_2$var1)
names(c136) <- name_mapping1[names(c136)]

# 读取com_disease文件
com_disease <- read_excel("Data/com_disease.xlsx")

library(dplyr)
data0 <- baseline %>%
  left_join(smk_drink, by = c("id" = "id")) %>%
  left_join(c136, by = c("id" = "id")) %>%
  left_join(com_disease, by=c("id"="id"))

#删除 sex 为 "male" 的行
data1 <- data0 %>%
  filter(sex != "Male")

变量处理

# 加载必要的包
library(dplyr)

#创建分类变量
data1$Breast_Cancer <- ifelse(is.na(data1$date1), "Without_Cancer", "With_Cancer")
# with_Cancer = 1 ; without_cancer =0 

data1$Breast_Cancer <- ifelse(data1$Breast_Cancer=="With_Cancer",1,0)

# edu_level
data1$edu_first <- sub("\\|.*$", "", data1$edu)


data1 <- data1 %>%
  mutate(
    edu_level = case_when(
      # 捕获所有A Level/AS Level变体（不区分大小写）
      grepl("A levels?|AS levels?|Advanced Level",edu_first, ignore.case = TRUE) ~ "High school",
      grepl("O levels?|GCSEs?|CSEs?", edu_first, ignore.case = TRUE) ~ "Junior high school and below",
      grepl("College or University degree?", edu_first, ignore.case = TRUE) ~ "College",
      grepl("NVQ？|HNC|Certificate|Diploma|Professional Qualification", edu_first, ignore.case = TRUE) ~ "Vocational",
      grepl("Prefer not to answer", edu_first, ignore.case = TRUE) ~ "No answer given",
      is.na(edu_first) ~ "No answer given",
      # 默认保留原值（可根据需要修改）
      TRUE ~ as.character(edu_first)
    )
  )


# Neuro_score, smoke, drink,bmi,imd, employ,income,region
data1_clean <- data1 %>%
  mutate(
    # 处理Neuro_score的非标准缺失值
    Neuro_score = ifelse(Neuro_score %in% c("NA"), NA, Neuro_score),
    Neuro_score = as.numeric(Neuro_score),
  ) %>%
  # 删除关键变量的缺失值
  filter(
    !is.na(Neuro_score),
    !is.na(drink),
    !is.na(smoke),
    !is.na(bmi),
    !is.na(employ),
    !is.na(income) 
  ) %>%
  # 创建imd变量
  mutate(
    imd = coalesce(imde, imds, imdw),
    region = case_when(
      !is.na(imde) ~ "England",
      !is.na(imds) ~ "Scotland",
      !is.na(imdw) ~ "Wales",
      TRUE ~ NA_character_
    )
  ) %>%
  select(-imde, -imds, -imdw) %>%
  mutate(
    # 将分类变量转换为因子
    edu_level = as.factor(edu_level),
    region=as.factor(region)
  ) %>%
  mutate(
    smoke_group = case_when(
      smoke == "Daily or almost daily" ~ "Smoking everyday",
      smoke %in% c("Three or four times a week", 
                   "Once or twice a week",
                   "One to three times a month") ~ "Regular smoking (not daily)",
      smoke == "Special occasions only" ~ "Smoking occasionally",
      smoke == "Never" ~ "Never smoking",
      smoke == "Prefer not to answer"~ "Refuse to answer" ,
      TRUE ~ "Other"
    ) %>% factor(levels = c("Smoking everyday", "Regular smoking (not daily)", 
                            "Smoking occasionally", "Never smoking", "Refuse to answer"))
  )

#diag_prof
data1_clean$diag_prof<- ifelse(!is.na(data1_clean$diag_prof),1,0)

#employ
data1_clean <- data1_clean %>%
  mutate(
    employ_status = case_when(
      grepl("In paid employment?", employ, ignore.case = TRUE) ~ "Employed",
      grepl("Retired?", employ, ignore.case = TRUE) ~ "Retired",
      grepl("Full or part-time student|Doing unpaid or voluntary work|Looking after home?|Unemployed|Unable to work?", employ, ignore.case = TRUE) ~ "Unemployed",
      grepl("Prefer not to answer", employ, ignore.case = TRUE) ~ "No answer given",
      # 默认保留原值（可根据需要修改）
      TRUE ~ "Other"
    ),
    
    # 转换为因子并指定水平顺序
    employ_status = factor(
      employ_status,
      levels = c(
        "Employed",
        "Retired",
        "Unemployed",
        "Other",
        "No answer given"
      )
    )
  )

#income
library(stringr)  # 加载 stringr 包
data1_clean <- data1_clean %>%
  mutate(
    # 1. 首先清理收入数据（确保字符类型且无空格）
    income = str_trim(as.character(income)),
    # 2. 分类逻辑
    income_status = case_when(
      # 合并低收入范围（使用正则表达式捕获不同表述）
      grepl("18,000|30,999|31,000|51,999|Less than", income, ignore.case = TRUE) ~ "Lower than 52,000",
      # 中等收入范围（精确匹配）
      grepl("52,000 to 100,000", income, fixed = TRUE) ~ "52,000 to 100,000",
      # 高收入范围（兼容大小写和空格变体）
      grepl("Greater than 100,000|>100,000|100k\\+", income, ignore.case = TRUE) ~ "Greater than 100,000",
      # 处理拒绝回答和缺失值
      grepl("Do not know|Prefer not to answer|Unknown", income, ignore.case = TRUE) ~ "No answer given"
    ),
    # 3. 转换为有序因子
    income_status = factor(
      income_status,
      levels = c(
        "Lower than 52,000",
        "52,000 to 100,000",
        "Greater than 100,000",
        "No answer given"
      ),
      ordered = TRUE  # 设为有序因子（适用于收入等级）
    )
  )

描述性统计的表

# 加载必要的包
library(dplyr)
library(gtsummary)
library(gt)

# 创建分类变量并选择所需列
analysis_data <- data1_clean %>%
  select(
    Breast_Cancer, 
    edu_level, 
    employ_status, 
    income_status, 
    smoke_group, 
    drink, 
    diag_prof, 
    age, 
    bmi,
    Neuro_score,
    imd,
    region  # 添加地区变量
  ) %>%
  mutate(
    Breast_Cancer = factor(Breast_Cancer, 
                          levels = c(1, 0),
                          labels = c("With Cancer", "Without Cancer")),
    diag_prof = factor(diag_prof,
                      levels = c(1, 0),
                      labels = c("Yes", "No")),
    region = factor(region,  # 确保region是因子类型
                   levels = c("England", "Scotland", "Wales"))
  )

# 分类变量表格
categorical_table <- analysis_data %>%
  select(
    Breast_Cancer,
    edu_level,
    employ_status,
    income_status,
    smoke_group,
    drink,
    diag_prof,
    region
  ) %>%
  tbl_summary(
    by = Breast_Cancer,
    missing = "no",
    label = list(
      edu_level ~ "Education level",
      employ_status ~ "Employment status",
      income_status ~ "Monthly household income",
      smoke_group ~ "Smoking frequency",
      drink ~ "Drinking",
      diag_prof ~ "Mental disorder diagnosis",
      region ~ "Region"
    ),
    type = all_categorical() ~ "categorical",
    statistic = all_categorical() ~ "{n} ({p}%)"
  ) %>%
  add_overall() %>%
  add_p(
    test = all_categorical() ~ "chisq.test",
    test.args = all_categorical() ~ list(simulate.p.value = TRUE)
  ) %>%
  modify_header(
    label ~ "**Characteristic**",
    stat_0 ~ "**Overall** (N = {N})",
    stat_1 ~ "**With Cancer** (N = {n})",
    stat_2 ~ "**Without Cancer** (N = {n})",
    p.value ~ "**p-value**"
  ) %>%
  modify_spanning_header(c("stat_1", "stat_2") ~ "**Breast Cancer Status**") %>%
  modify_caption("Table 1. Categorical Variables by Breast Cancer Status") %>%
  bold_labels() %>%
  italicize_levels()

# 连续变量表格
continuous_table <- analysis_data %>%
  select(
    Breast_Cancer,
    age,
    bmi,
    Neuro_score,
    imd
  ) %>%
  tbl_summary(
    by = Breast_Cancer,
    missing = "no",
    label = list(
      age ~ "Age (years)",
      bmi ~ "Body Mass Index (kg/m²)",
      Neuro_score ~ "Neuroticism score",
      imd ~ "Index of Multiple Deprivation"
    ),
    type = all_continuous() ~ "continuous2",
    statistic = all_continuous() ~ "{mean} ({sd})"
  ) %>%
  add_overall() %>%
  add_p(test = all_continuous() ~ "t.test") %>%
  modify_header(
    label ~ "**Characteristic**",
    stat_0 ~ "**Overall** (N = {N})",
    stat_1 ~ "**With Cancer** (N = {n})",
    stat_2 ~ "**Without Cancer** (N = {n})",
    p.value ~ "**p-value**"
  ) %>%
  modify_spanning_header(c("stat_1", "stat_2") ~ "**Breast Cancer Status**") %>%
  modify_caption("Table 2. Continuous Variables by Breast Cancer Status") %>%
  bold_labels()

# 转换为gt表格并显示
categorical_table_gt <- as_gt(categorical_table)
continuous_table_gt <- as_gt(continuous_table)

# 显示表格
categorical_table_gt

Table 1. Categorical Variables by Breast Cancer Status
Characteristic	Overall (N = 211934)¹	Breast Cancer Status		p-value²
Characteristic	Overall (N = 211934)¹	With Cancer (N = 9795)¹	Without Cancer (N = 202139)¹	p-value²
Education level				0.027
College	68,488 (32%)	3,286 (34%)	65,202 (32%)
High school	25,655 (12%)	1,225 (13%)	24,430 (12%)
Junior high school and below	61,163 (29%)	2,753 (28%)	58,410 (29%)
No answer given	1,505 (0.7%)	60 (0.6%)	1,445 (0.7%)
None of the above	33,546 (16%)	1,524 (16%)	32,022 (16%)
Vocational	21,577 (10%)	947 (9.7%)	20,630 (10%)
Employment status				<0.001
Employed	119,516 (56%)	5,099 (52%)	114,417 (57%)
Retired	72,329 (34%)	3,846 (39%)	68,483 (34%)
Unemployed	18,559 (8.8%)	781 (8.0%)	17,778 (8.8%)
Other	1,003 (0.5%)	40 (0.4%)	963 (0.5%)
No answer given	527 (0.2%)	29 (0.3%)	498 (0.2%)
Monthly household income				<0.001
Lower than 52,000	135,522 (64%)	6,457 (66%)	129,065 (64%)
52,000 to 100,000	35,190 (17%)	1,554 (16%)	33,636 (17%)
Greater than 100,000	9,385 (4.4%)	376 (3.8%)	9,009 (4.5%)
No answer given	31,837 (15%)	1,408 (14%)	30,429 (15%)
Smoking frequency				<0.001
Smoking everyday	35,030 (17%)	1,778 (18%)	33,252 (16%)
Regular smoking (not daily)	127,098 (60%)	5,932 (61%)	121,166 (60%)
Smoking occasionally	30,632 (14%)	1,308 (13%)	29,324 (15%)
Never smoking	19,087 (9.0%)	776 (7.9%)	18,311 (9.1%)
Refuse to answer	87 (<0.1%)	1 (<0.1%)	86 (<0.1%)
Drinking				0.006
No	94,319 (45%)	4,227 (43%)	90,092 (45%)
Yes	117,615 (55%)	5,568 (57%)	112,047 (55%)
Mental disorder diagnosis				0.7
Yes	26,624 (13%)	1,243 (13%)	25,381 (13%)
No	185,310 (87%)	8,552 (87%)	176,758 (87%)
Region				<0.001
England	181,783 (88%)	8,425 (88%)	173,358 (88%)
Scotland	15,859 (7.7%)	805 (8.4%)	15,054 (7.6%)
Wales	9,097 (4.4%)	337 (3.5%)	8,760 (4.4%)
¹ n (%)
² Pearson’s Chi-squared test with simulated p-value (based on 2000 replicates)

continuous_table_gt

Table 2. Continuous Variables by Breast Cancer Status
Characteristic	Overall (N = 211934)	Breast Cancer Status		p-value¹
Characteristic	Overall (N = 211934)	With Cancer (N = 9795)	Without Cancer (N = 202139)	p-value¹
Age (years)				<0.001
Mean (SD)	56 (8)	57 (7)	56 (8)
Body Mass Index (kg/m²)				0.6
Mean (SD)	27.1 (5.2)	27.1 (4.9)	27.1 (5.2)
Neuroticism score				<0.001
Mean (SD)	4.6 (3.2)	4.4 (3.2)	4.6 (3.3)
Index of Multiple Deprivation				<0.001
Mean (SD)	17 (13)	16 (13)	17 (14)
¹ Welch Two Sample t-test

#导出表格
library(gt)
gtsave(categorical_table_gt, "descriptive_table_categorical.html")
gtsave(continuous_table_gt, "descriptive_table_continue.html")

descriptive_table

2025-08-12

read excel

变量处理

描述性统计的表