Methods

Data extraction

We extracted data from NHANES-III, estimating HOMA2-IR and HOMA2-B using fasting glucose concentrations and C-peptide with the HOMA2-IR calculator from Oxford. We evaluated cases with diabetes defined as previous diagnosis of diabetes HbA1c >6.5% or fasting glucose >126mg/dL, as recommended by ADA.

library(SAScii); library(tidyverse); library(survival); library(ggpubr)

#### Download NHANES-III ####  
download.file("https://wwwn.cdc.gov/nchs/data/nhanes3/24a/ssgad.xpt", tf0 <- tempfile(), mode="wb")
GAD65 <- foreign::read.xport(tf0)
#### Household data ###
download.file("https://wwwn.cdc.gov/nchs/data/nhanes3/1a/adult.dat", tf <- tempfile(), mode="wb")
nhanes3<-read.SAScii(fn = tf,sas_ri = "https://wwwn.cdc.gov/nchs/data/nhanes3/1a/adult.sas")
write.csv(nhanes3, "nhanes3.csv")

#### Laboratory data ###
download.file("https://wwwn.cdc.gov/nchs/data/nhanes3/1a/lab.dat", tf1 <- tempfile(), mode="wb")
nhanes3_lab<-read.SAScii(fn = tf1,sas_ri = "https://wwwn.cdc.gov/nchs/data/nhanes3/1a/lab.sas")
write.csv(nhanes3_lab, "nhanes3_lab.csv")
#### Examination data ###
download.file("https://wwwn.cdc.gov/nchs/data/nhanes3/1a/exam.dat", tf1 <- tempfile(), mode="wb")
nhanes3_exam<-read.SAScii(fn = tf1,sas_ri = "https://wwwn.cdc.gov/nchs/data/nhanes3/1a/exam.sas")

#### Merge datasets ###
library(tidyverse)
nhanes3_data<- nhanes3_lab %>% left_join(GAD65, by="SEQN")
nhanes3_data<-nhanes3_data %>% left_join(nhanes3, by="SEQN")
nhanes3_data<-nhanes3 %>% left_join(nhanes3_exam, by="SEQN")

write.csv(nhanes3_data, "nhanes3.csv")

#### Dataset clusters ####
nhanes3<-read.csv("nhanes3.csv")
nhanes3<- nhanes3%>% filter(!is.na(GAPGAD65) & !is.na(C1PSI) & HSAGEIR.x>=20)
nhanes3$diabetes<-as.numeric((nhanes3$GHP>=6.5 |nhanes3$G1P>=126 | nhanes3$HAD1==1))
nhanes3fin<- nhanes3 %>% filter(diabetes==1 & (C1PSI!=88888 &  G1PSI!=888888 & GHP!=8888 & BMPBMI!=8888))
clusters<-nhanes3fin %>% dplyr::select(SEQN,GHP, HAD5R, G1PSI, C1PSI,BMPBMI, HSAGEIR.x,GAPGAD65)
names(clusters)<-c("SEQN", "HbA1c", "ADD", "GLU", "CPEP", "BMI", "AGE", "GAD65")
clusters$ADD[is.na(clusters$ADD) | clusters$ADD==999 | clusters$ADD==888]<-clusters$AGE
cox.imp<-function(x){
  var<-summary(pool(x))[1]
  HR<-exp(summary(pool(x))[2])
  lwr<-exp(summary(pool(x))[2]-1.96*summary(pool(x))[3])
  upr<-exp(summary(pool(x))[2]+1.96*summary(pool(x))[3])
  pval<-summary(pool(x))[6]
  print(cbind(var,HR, lwr, upr, pval))
}

Diabetes cluster classification

We used HbA1c, body-mass index (BMI), age at diabetes diagnosis, HOMA2-IR and HOMA2-B to clasify diabetes clusters using the SNNN algorithm previously developed by our team (https://uiem.shinyapps.io/diabetes_clusters_app/). Once we classified these subgroups, we classified severe autoimmune diabetes (SAID) using anti-GAD65 antibodies with values >0.069 indicating positivity.

clust<-read.csv("clusters.csv")
clust0<-clust%>%select(SEQN, class, HOMA2_B, HOMA2_IR)
nhanes<-nhanes3fin %>% left_join(clust0, by="SEQN")
nhanes$Cluster<-as.character(nhanes$class)
nhanes$Cluster[nhanes$GAPGAD65>=0.069]<-"said"
table(nhanes$Cluster, nhanes$class)
##       
##        mard mord siid sird
##   mard  235    0    0    0
##   mord    0  208    0    0
##   said   14    5   18    2
##   siid    0    0  293    0
##   sird    0    0    0  161
nhanes$HAD5R[is.na(nhanes$HAD5R) | nhanes$HAD5R==999 | nhanes$HAD5R==888]<-nhanes$HSAGEIR.x
## Warning in nhanes$HAD5R[is.na(nhanes$HAD5R) | nhanes$HAD5R == 999 | nhanes$HAD5R
## == : número de items para para sustituir no es un múltiplo de la longitud del
## reemplazo
nhanes$years<-nhanes$HSAGEIR.x-nhanes$HAD5R
nhanes$recent<-ifelse(nhanes$years<5,1,0)
nhanes$recent<-factor(nhanes$recent, labels = c("<5 years since diagnosis", ">=5 years since diagnosis"))
mort<-read.csv("nhanes3_mort.csv")
mort$SEQN<-mort$seqn
nhanes0<-nhanes %>% left_join(mort, by="SEQN")
nhanes0$cluster2<-factor(nhanes0$Cluster, labels = c("MARD", "MOD", "SAID", "SIID", "SIRD"))
nhanes0$cluster2<-relevel(nhanes0$cluster2, ref="MOD")

## Evaluating diabetes frequencies based on years since diabetes diagnosis
t1<-table(nhanes0$recent, nhanes0$cluster2)
knitr::kable(t1, align = "lccrr")
MOD MARD SAID SIID SIRD
<5 years since diagnosis 142 133 24 192 68
>=5 years since diagnosis 66 102 18 101 93
### Plotting cluster frequencies
df <- nhanes0 %>%
  group_by(cluster2, recent) %>%
  summarise(Counts = n()) %>%
  mutate(freq = Counts / sum(Counts)) %>% drop_na()
## `summarise()` regrouping output by 'cluster2' (override with `.groups` argument)
df<-as.data.frame(df)
df$Counts<-as.numeric(df$Counts);df$freq<-as.numeric(df$freq)


ggplot(data=df, aes(y=freq, x=cluster2,fill=cluster2)) +
  geom_bar(stat="identity", position='dodge')+scale_fill_nejm()+
  scale_y_continuous(labels = scales::percent_format())+labs(fill="Cluster")+
  theme_pubr()+ylab("Frequency (%)")+xlab("")+facet_wrap(~recent)+
  theme(axis.text.x = element_blank(),axis.ticks.x = element_blank())+
  geom_text(aes(label=round(freq*100,1)), vjust=1.6, color="white", size=3.5)

We identified that the proportion of diabetes subgroups was modified by defining recent vs. non-recent diabetes diagnosis. The profiles of each biomarker are consistent with what is expected for each cluster.

g1<-ggplot(nhanes0, aes(x=cluster2, y=HOMA2_IR, fill=cluster2))+geom_boxplot()+scale_y_log10()+
  ylab("HOMA2-IR")+xlab("Cluster")+theme_classic()+labs(fill="Cluster")+
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())
g2<-ggplot(nhanes0, aes(x=cluster2, y=HOMA2_B, fill=cluster2))+geom_boxplot()+scale_y_log10()+
  ylab("HOMA2-B")+xlab("Cluster")+theme_classic()+labs(fill="Cluster")+
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())
g3<-ggplot(nhanes0, aes(x=cluster2, y=GHP, fill=cluster2))+geom_boxplot()+ylab("HbA1c (%)")+
  xlab("Cluster")+theme_classic()+labs(fill="Cluster")+
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())
g4<-ggplot(nhanes0, aes(x=cluster2, y=G1P, fill=cluster2))+geom_boxplot()+scale_y_log10()+
  ylab("Glucose (mg/dL)")+xlab("Cluster")+theme_classic()+labs(fill="Cluster")+
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())
g5<-ggplot(nhanes0, aes(x=cluster2, y=BMPBMI, fill=cluster2))+geom_boxplot()+scale_y_log10()+
  ylab("BMI (Kg/m2)")+xlab("Cluster")+theme_classic()+labs(fill="Cluster")+
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())
g6<-ggplot(nhanes0, aes(x=cluster2, y=HAD5R, fill=cluster2))+geom_boxplot()+ylab("Age at diagnosis (years)")+
  xlab("Cluster")+theme_classic()+labs(fill="Cluster")+
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())

ggarrange(g1, g2, g3, g4, g5, g6, labels=LETTERS[1:6])

All-cause mortality

Next, we fitted a Cox proportional risk regression model to predict all-cause mortality. Time of follow-up was estimated from examination until last record. We performed multiple imputation by chained equations using the mice R packag with 5 iterations and five multiply imputed datasets.

nhanes0$mort_cvd_rec<-NULL;
nhanes0$mort_cvd_rec[nhanes0$ucod_leading==1]<-1;
nhanes0$mort_cvd_rec[nhanes0$ucod_leading>1]<-2;
nhanes0$mort_cvd_rec[nhanes0$ucod_leading==0]<-0
nhanes0$mort_cvd_rec<-na.tools::na.replace(nhanes0$mort_cvd_rec,0)

nhanes0$mort_db_rec<-NULL;
nhanes0$mort_db_rec[nhanes0$ucod_leading==2]<-1;
nhanes0$mort_db_rec[nhanes0$ucod_leading!=2]<-2;
nhanes0$mort_db_rec[nhanes0$ucod_leading==0]<-0
nhanes0$mort_db_rec<-na.tools::na.replace(nhanes0$mort_db_rec,0)



nhanes2<-nhanes0 %>% select(cluster2, mortstat, permth_exm, HSSEX.x,DMARACER.x,
                            HAR3,HAD6,HAD10,HAE5A,HSAGEIR.x,HAE8D,mort_cvd_rec,mort_db_rec, years)
nhanes2<-nhanes2[!is.na(nhanes2$mortstat) & !is.na(nhanes2$cluster2),]
nhanes3<-nhanes2 %>%filter(years<=5)

set.seed(123);imp<-mice::mice(nhanes2, maxit=5, m=5)
set.seed(123);imp2<-mice::mice(nhanes3, maxit=5, m=5)

To account for multiple imputation, we used R pooling function to obtain estimates of all-cause mortality for each cluster, using mild-obesity related diabetes (MOD) as the reference category.

## 
## Attaching package: 'mice'
## The following objects are masked _by_ '.GlobalEnv':
## 
##     nhanes, nhanes2
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
## Warning in pool.fitlist(getfit(object), dfcom = dfcom): Large sample assumed.
##           term  estimate std.error statistic       df      p.value
## 1 cluster2MARD 0.6606550 0.1104646  5.980694 997403.7 2.222638e-09
## 2 cluster2SAID 0.4742804 0.1871304  2.534491 997403.7 1.126123e-02
## 3 cluster2SIID 0.4736432 0.1064895  4.447791 997403.7 8.676704e-06
## 4 cluster2SIRD 0.6867577 0.1204675  5.700772 997403.7 1.192995e-08
## Warning in pool.fitlist(getfit(object), dfcom = dfcom): Large sample assumed.

## Warning in pool.fitlist(getfit(object), dfcom = dfcom): Large sample assumed.

## Warning in pool.fitlist(getfit(object), dfcom = dfcom): Large sample assumed.

## Warning in pool.fitlist(getfit(object), dfcom = dfcom): Large sample assumed.

## Warning in pool.fitlist(getfit(object), dfcom = dfcom): Large sample assumed.

## Warning in pool.fitlist(getfit(object), dfcom = dfcom): Large sample assumed.

## Warning in pool.fitlist(getfit(object), dfcom = dfcom): Large sample assumed.
##           term estimate estimate estimate      p.value
## 1 cluster2MARD 1.936060 1.559156 2.404076 2.222638e-09
## 2 cluster2SAID 1.606858 1.113496 2.318815 1.126123e-02
## 3 cluster2SIID 1.605834 1.303332 1.978547 8.676704e-06
## 4 cluster2SIRD 1.987262 1.569318 2.516512 1.192995e-08
## Warning in pool.fitlist(getfit(object), dfcom = dfcom): Large sample assumed.
##            term    estimate  std.error  statistic          df      p.value
## 1  cluster2MARD  0.67810930 0.11491767  5.9008273  1602.76170 4.404544e-09
## 2  cluster2SAID  0.32886438 0.19144023  1.7178436 12904.37637 8.584913e-02
## 3  cluster2SIID  0.44273521 0.10879162  4.0695708 34167.94737 4.720563e-05
## 4  cluster2SIRD  0.72445532 0.12401376  5.8417332 14984.03752 5.273178e-09
## 5       HSSEX.x -0.08034425 0.07557929 -1.0630459 17382.20308 2.877759e-01
## 6    DMARACER.x -0.11570909 0.07873727 -1.4695594  5084.94197 1.417430e-01
## 7          HAR3  0.04102256 0.09286604  0.4417391   107.63473 6.595643e-01
## 8          HAD6 -0.37676363 0.11107810 -3.3918804    26.13394 2.219442e-03
## 9         HAD10  0.08431858 0.07097304  1.1880368   148.06415 2.367211e-01
## 10        HAE5A -0.05255745 0.06855499 -0.7666465    29.70661 4.493416e-01
## 11        HAE8D -0.10529243 0.09643160 -1.0918872    11.46606 2.972945e-01
## Warning in pool.fitlist(getfit(object), dfcom = dfcom): Large sample assumed.

## Warning in pool.fitlist(getfit(object), dfcom = dfcom): Large sample assumed.

## Warning in pool.fitlist(getfit(object), dfcom = dfcom): Large sample assumed.

## Warning in pool.fitlist(getfit(object), dfcom = dfcom): Large sample assumed.

## Warning in pool.fitlist(getfit(object), dfcom = dfcom): Large sample assumed.

## Warning in pool.fitlist(getfit(object), dfcom = dfcom): Large sample assumed.

## Warning in pool.fitlist(getfit(object), dfcom = dfcom): Large sample assumed.
##            term  estimate  estimate  estimate      p.value
## 1  cluster2MARD 1.9701492 1.5728208 2.4678515 4.404544e-09
## 2  cluster2SAID 1.3893894 0.9546997 2.0220002 8.584913e-02
## 3  cluster2SIID 1.5569600 1.2579754 1.9270046 4.720563e-05
## 4  cluster2SIRD 2.0636068 1.6183196 2.6314165 5.273178e-09
## 5       HSSEX.x 0.9227986 0.7957425 1.0701418 2.877759e-01
## 6    DMARACER.x 0.8907343 0.7633534 1.0393712 1.417430e-01
## 7          HAR3 1.0418756 0.8684938 1.2498706 6.595643e-01
## 8          HAD6 0.6860782 0.5518513 0.8529532 2.219442e-03
## 9         HAD10 1.0879754 0.9466853 1.2503528 2.367211e-01
## 10        HAE5A 0.9487998 0.8295058 1.0852500 4.493416e-01
## 11        HAE8D 0.9000613 0.7450542 1.0873173 2.972945e-01
## Warning in pool.fitlist(getfit(object), dfcom = dfcom): Large sample assumed.
##            term    estimate   std.error  statistic          df      p.value
## 1  cluster2MARD -0.10409422 0.131822418 -0.7896549  141.941730 4.310465e-01
## 2  cluster2SAID  0.14378697 0.196794085  0.7306468 1008.879609 4.651646e-01
## 3  cluster2SIID  0.31581996 0.110445623  2.8595064 6175.424470 4.257232e-03
## 4  cluster2SIRD  0.22413706 0.135380173  1.6556121  227.422601 9.917958e-02
## 5       HSSEX.x -0.21873142 0.075874369 -2.8828104 4418.383641 3.960470e-03
## 6    DMARACER.x -0.01914291 0.080836408 -0.2368105  608.878800 8.128835e-01
## 7          HAR3 -0.33746987 0.094791330 -3.5601344  107.959207 5.525989e-04
## 8          HAD6 -0.28393250 0.095824220 -2.9630557  103.211179 3.781755e-03
## 9         HAD10  0.09688734 0.060727055  1.5954559 2478.138935 1.107378e-01
## 10        HAE5A -0.04076450 0.085269990 -0.4780639    9.851429 6.430358e-01
## 11        HAE8D  0.09259158 0.157094840  0.5893993    5.454091 5.791666e-01
## 12    HSAGEIR.x  0.07616702 0.005507496 13.8297003   27.443917 6.861178e-14
## Warning in pool.fitlist(getfit(object), dfcom = dfcom): Large sample assumed.

## Warning in pool.fitlist(getfit(object), dfcom = dfcom): Large sample assumed.

## Warning in pool.fitlist(getfit(object), dfcom = dfcom): Large sample assumed.

## Warning in pool.fitlist(getfit(object), dfcom = dfcom): Large sample assumed.

## Warning in pool.fitlist(getfit(object), dfcom = dfcom): Large sample assumed.

## Warning in pool.fitlist(getfit(object), dfcom = dfcom): Large sample assumed.

## Warning in pool.fitlist(getfit(object), dfcom = dfcom): Large sample assumed.
##            term  estimate  estimate  estimate      p.value
## 1  cluster2MARD 0.9011404 0.6959579 1.1668149 4.310465e-01
## 2  cluster2SAID 1.1546381 0.7851116 1.6980888 4.651646e-01
## 3  cluster2SIID 1.3713833 1.1044490 1.7028331 4.257232e-03
## 4  cluster2SIRD 1.2512425 0.9596294 1.6314712 9.917958e-02
## 5       HSSEX.x 0.8035375 0.6925013 0.9323774 3.960470e-03
## 6    DMARACER.x 0.9810392 0.8372921 1.1494648 8.128835e-01
## 7          HAR3 0.7135735 0.5925850 0.8592642 5.525989e-04
## 8          HAD6 0.7528175 0.6239107 0.9083577 3.781755e-03
## 9         HAD10 1.1017362 0.9781055 1.2409936 1.107378e-01
## 10        HAE5A 0.9600552 0.8122934 1.1346959 6.430358e-01
## 11        HAE8D 1.0970136 0.8062880 1.4925670 5.791666e-01
## 12    HSAGEIR.x 1.0791428 1.0675564 1.0908549 6.861178e-14