### Code to extract diagnosis and prescription GP data for depression, psychotic, bipolar and substance use disorders and antidepressant prescriptions ### the code may need to be adapted (e.g. file names, file formats) ################################################ ########## DIAGNOSIS DATA EXTRACTION ########### ################################################ ## R version 3.6.0 # load R packages library(tidyverse) library(lubridate) library(readxl) library(dplyr) library(knitr) library(xtable) library(data.table) library(ggplot2) library(readr) # open clinical data clinical <- fread("gp_diagnoses",header=T, data.table=F, na.strings="") # this is the table with diagnostic records that can be downloaded from UK biobank website by investigators having access to the data (check the name of the file) # open Read2 and Read3 code files (these include all codes for depression, psychosis, bipolar disorder and substance abuse) readv2 <- fread("readv2_mood_psych_sub",header=T,data.table=F) readv3 <- fread("readv3_mood_psych_sub",header=T,data.table=F) # readv2 and readv3 files are reported in Supplementary table 1, tabs readv2_mood_psych_sub and readv3_mood_psych_sub, respectively # if you didn't convert them to text files, you can read them directly from the excel table using: # readv2 <- read_excel("Supplementary table 1", sheet=1) # readv3 <- read_excel("Supplementary table 1", sheet=2) # separately extract read 2 and then map these to read3 code before extracting read3 and binding rows # extract read 2 codes and add mapped read3 codes and terms clinical %>% filter(clinical$read_2 %in% readv2$readv2_code) %>% left_join(readv2, by=c("read_2"="readv2_code")) %>% select(-read_3,-value1,-value2,-value3) %>% rename(read_3 = readv3_code) -> v2_psych_codes # extract read 3 code and add read2 codes clinical %>% filter(clinical$read_3 %in% readv3$readv3_code) %>% left_join(readv3, by=c("read_3"="readv3_code")) %>% select(-value1,-value2,-value3) %>% bind_rows(v2_psych_codes) %>% filter(!is.na(event_dt)) -> psych_codes psych_codes$event_dt <- as.Date(psych_codes$event_dt, "%d/%m/%Y") # remove dates that are incorrect: # where clinical event or prescription date precedes participant date of birth it has been altered to 01/01/1901. # Where the date matches participant date of birth it has been altered to 02/02/1902. # Where the date follows participant date of birth but is in the year of their birth it has been altered to 03/03/1903. # Where the date was in the future this has been changed to 07/07/2037 as these are likely to have been entered as a place-holder or other system default dates_rm<-which(psych_codes$event_dt=="1901-01-01"|psych_codes$event_dt=="1902-02-02"|psych_codes$event_dt=="1903-03-03"|psych_codes$event_dt=="2037-07-07") psych_codes<-psych_codes[-dates_rm,] # mark date but keep record for all rows where the date is 01/01 (this is likely some issue with the system defaulting to 01/01, but the year may be correct) psych_codes$diagn_month_day <- format(as.Date(psych_codes$event_dt), "%m-%d") table(psych_codes$diagn_month_day=='01-01') psych_codes$diagn_year <- format(as.Date(psych_codes$event_dt),"%Y") psych_codes$diagn_year <- as.numeric(psych_codes$diagn_year) # n=41785 non-duplicated individuals (note that the number of individuals may change in the future as new primary care data are linked and/or due to dropouts) # write output fwrite(psych_codes,"mood_BP_SU_codes.txt",col.names=T,row.names=F,quote=F,sep="\t") ################################################ ######### PRESCRIPTION DATA EXTRACTION ######### ################################################ ## This part of the code will extract extract medication codes ## Read 2, BNF, dm+d # open full medication data (table with prescription records that can be downloaded from UK biobank website by investigators having access to the data, check file name) meds <- fread("gp_prescriptions", header = T, data.table = F, colClasses = c( "read_2" = "character", "bnf_code" = "character", "dmd_code" = "character"), na.strings="") ##################### #### BNF codes ###### ##################### ## extract all BNF codes (this will also include part of read2 codes, as some individuals will have both codes) bnf_codes <- c( "0403","04.03") bnf_data <- data.frame() for (i in bnf_codes){ print(paste("started BNF code",i, "at", Sys.time())) temp <- grep(paste0("^",i,"."),meds$bnf_code,ignore.case=T) temp2 <- meds[temp,] bnf_data <- rbind(bnf_data,temp2) print(dim(bnf_data)) rm(temp2) } # remove rows that have 040300 as this is an anti-epileptic not antidepressant (pregabalin) pregabalin_04030000=which(bnf_data$bnf_code=="04030000") bnf_data<-bnf_data[-pregabalin_04030000,] # identify number of duplicates bnf_data$unique <- !(duplicated(bnf_data) | duplicated(bnf_data, fromLast = TRUE)) table(bnf_data$unique) # remove duplicates that have been added due to the way we extract the data bnf_data %>% select(-unique) %>% distinct() -> bnf_nodup ####################### #### Read2 codes ###### ####################### # load clinical read2 codes read_2_codes <- read.table("ADs_read_2_codes",header=T) # this corresponds to tab ADs_read_2_codes of Supplementary Table 1 # if you didn't convert this table to a text file, you can read directly the excel table: # read_2_codes <- read_excel("Supplementary Table 1", sheet=3) # filter medication file to include read2 codes but no people with read2 and bnf codes as these have already been extracted meds %>% filter(!is.na(read_2) & is.na(bnf_code)) -> meds_no_bnf # extract read 2 codes from medication data read_2_data <- meds_no_bnf[which(meds_no_bnf$read_2 %in% read_2_codes$read2_code),] # this identifies all duplicates, not just one of duplicate pair read_2_data$unique <- !(duplicated(read_2_data) | duplicated(read_2_data, fromLast = TRUE)) table(read_2_data$unique) # remove duplicates read_2_data %>% select(-unique) %>% distinct() -> read_2_nodup ####################### ##### dm+d codes ###### ####################### # medication names : dmd_names <-c("agomelatine","allegron","alventa","amfebutamone","amitriptyline","amoxapine","amphero","anafranil","apclaven","asendis","bolvidon","bonilux","brintellix","bupropion","butriptyline","cipralex","cipramil","citalopram","clomipramine","cymbalta","dapoxetine","defanyl","depalta","depefex","desipramine","domical","dosulepin","dothapax","dothiepin","doxepin","duciltia","duloxetine","dutonin","dutor","edronax","efexor","escitalopram","faverin","felicium","feprapax","fetzima","fluanxol","fluoxetine","flupenthixol","flupentixol","fluvoxamine","gamanil","imipramine","iprindole","iproniazid","isocarboxazid","lentizol","levomilnacipran","lofepramine","loferpramine","lomont","ludiomil","lustral","majoven","manerix","maprotiline","marplan","mianserin","milnacipran","mirtazapine","mirtazepine","mirtazipine","moclobemide","molipaxin","motipress","motival","nardil","nefazodone","nortriptyline","norval","olena","optimax","oxactin","parnate","paroxetine","parstelin","perphenazine","phenelzine","politid","prepadine","priligy","prothiaden","protriptyline","prozac","prozep","ranfaxine","ranflutin","reboxetine","rodomel","seroxat","sertraline","sinepin","sinequan","sunveniz","surmontil","tardcaps","thaden","tifaxin","tofranil","tonpular","tranylcypromine","trazadone","trazodone","trimipramine","triptafen","triptafen-m","trixat","tryptizol","tryptophan","valdoxan","vaxalin","venaxx","vencarm","venlablue","venladex","venlafaxin","venlafaxine","venlalic","venlaneo","venlasov","vensir","venzip","vexarin","viepax","viloxazine","vivactil","vivalan","vortioxetine","winfex","yentreve","zispin","zyban") # subset to only have dmd codes otherwise I'll extract a lot of read 2 people as well (this is purely to have smaller file for now) meds %>% filter(!is.na(dmd_code) & is.na(read_2)) -> meds_dmd_only dmd_data <- data.frame() for (i in dmd_names){ print(paste("started dm+d code",i, "at", Sys.time())) temp <- grep(paste0("^",i,"."),meds_dmd_only$drug_name,ignore.case=T) temp2 <- meds_dmd_only[temp,] dmd_data <- rbind(dmd_data,temp2) print(dim(dmd_data)) } dmd_data$unique <- !(duplicated(dmd_data) | duplicated(dmd_data, fromLast = TRUE)) table(dmd_data$unique) dmd_data %>% select(-unique) %>% distinct() -> dmd_nodup ## combine files together # make new read2 variable that removes the .00 addition from all codes # use ifelse statement to find all rows that end with 00 using grepl ($ indicates that 00 is at the end, # this returns logical vector instead of matched items) # then use substr to remove last characters from those with 00, for others use normal read2 code bnf_nodup %>% bind_rows(read_2_nodup,dmd_nodup) %>% distinct() %>% mutate(read_2_no00 = ifelse(grepl("00$",read_2), substr(read_2,1,nchar(read_2)-2), read_2)) -> bnf_read2_dmd # 2719373 lines, 82667 unique individuals (note that number of individuals may vary) bnf_read2_dmd$unique <- !(duplicated(bnf_read2_dmd) | duplicated(bnf_read2_dmd, fromLast = TRUE)) table(bnf_read2_dmd$unique) ## map chemical name and functional class to codes ## annotated files with all unique codes: bnf_unique_drugnames_mapped <- read.table("bnf_drug_names") # see tab bnf_drug_names in Supplementary Table 1 # if you didn't convert this table to a text file, you can read it directly from the excel file using: # bnf_unique_drugnames_mapped <- read_excel("Supplementary Table 1", sheet=4) dmd_unique_drugnames_mapped <- read.table("dmd_drug_names", col_types = cols(dmd_code = col_character())) # see tab dmd_drug_names in Supplementary Table 1 # if you didn't convert this table to a text file, you can read it directly from the excel file using: # dmd_unique_drugnames_mapped <- read_excel("Supplementary Table 1", sheet=5) read2_unique_drugnames_mapped <- read.table("read2_drug_names") # see tab read2_drug_names in Supplementary Table 1 # if you didn't convert this table to a text file, you can read it directly from the excel file using: # read2_unique_drugnames_mapped <- read_excel("Supplementary Table 1", sheet=6) names(read2_unique_drugnames_mapped)[1]<-"code" read2_unique_drugnames_mapped$code_system <- "read_2" names(bnf_unique_drugnames_mapped)[1]<-"code" bnf_unique_drugnames_mapped$code_system <- "bnf" names(dmd_unique_drugnames_mapped)[1]<-"code" dmd_unique_drugnames_mapped$code_system <- "dmd" ## map antidep_codes_mapping to full medication file to get chemical name and functional class for all rows ## for read2 not all drugs have a drug name so map on code only, ## therefore I make a file which does not have duplicated read2 codes read2_unique_drugnames_mapped %>% distinct(code, .keep_all=T) %>% select(-drug_name,-n) -> read2_distinct_codes_mapped # map read2 codes by code only # map bnf and dmd code by both code and drug name to get unique combinations as some codes have more than 1 medication name bnf_read2_dmd %>% left_join(read2_distinct_codes_mapped, by=c("read_2_no00"="code")) %>% left_join(bnf_unique_drugnames_mapped, by=c("bnf_code"="code","drug_name"="drug_name")) %>% left_join(dmd_unique_drugnames_mapped, by=c("dmd_code"="code","drug_name"="drug_name")) %>% mutate( chem_name = coalesce(chem_name.x, chem_name.y,chem_name), func_class = coalesce(functional_class.x, functional_class.y,functional_class), code_system = coalesce(code_system.x,code_system.y,code_system)) %>% select(eid,data_provider,issue_date,read_2_no00,bnf_code,dmd_code,drug_name,quantity,chem_name,func_class,code_system) %>% filter(!is.na(issue_date)) -> bnf_read2_dmd_mapped # 82662 unique individuals (note that number of individuals may vary) # write output fwrite(bnf_read2_dmd_mapped, "ADs_extracted.txt",col.names=T,row.names=F,quote=F,sep="\t") ########################################### ### MERGE DIAGNOSIS AND MEDICATION DATA ### ########################################### # adding prescription data to diagnosis data psych_codes %>% bind_rows(bnf_read2_dmd_mapped) -> dep_med # create new phenotypes: # xx_readcode: specific read code for depression/psychosis/bipolar/substance abuse # xx_ncode: count the total number of codes per individual per disorder # xx_code_distinct: count number of distinct code per individual per disorder # dep_diagnosis_noqc: depression diagnosis (0|1) without removing individuals with psychosis/bipolar disorder/substance abuse # dep_diagnosis_qc: depression diagnosis (0|1) excluding individuals with psychosis/bipolar disorder/substance abuse dep_med %>% group_by(eid) %>% mutate(depression_readcode=ifelse(type=="depression", read_3, NA), psychosis_readcode=ifelse(type=="psychosis", read_3, NA), bipolar_readcode=ifelse(type=="bipolar_disorder", read_3, NA), sub_abuse_readcode=ifelse(type=="substance_abuse", read_3, NA), depression_ncode=sum(!is.na(depression_readcode)), psychosis_ncode=sum(!is.na(psychosis_readcode)), bipolar_ncode=sum(!is.na(bipolar_readcode)), sub_abuse_ncode=sum(!is.na(sub_abuse_readcode)), chem_name_ncode = sum(!is.na(chem_name)), depression_code_distinct=n_distinct(depression_readcode,na.rm=TRUE), psychosis_code_distinct=n_distinct(psychosis_readcode,na.rm=TRUE), bipolar_code_distinct=n_distinct(bipolar_readcode,na.rm=TRUE), sub_abuse_code_distinct=n_distinct(sub_abuse_readcode,na.rm=TRUE), chem_name_code_distinct = n_distinct(chem_name, na.rm=TRUE), dep_vs_med= ifelse(!is.na(type), "depression", ifelse(!is.na(code_system), "medication", NA)), dep_diagnosis_noqc = ifelse((depression_ncode >=2 | depression_code_distinct >=2),1,0), dep_diagnosis_qc = ifelse(((depression_ncode >=2 | depression_code_distinct >=2) & (sub_abuse_code_distinct==0 & bipolar_code_distinct==0 & psychosis_code_distinct==0)),1,0)) %>% ungroup() -> dep_med_2 # filter down to people who have depression read code dep_med_2 %>% group_by(eid) %>% filter(any(!is.na(depression_readcode))) %>% ungroup() -> gp_depression gp_depression$issue_date <- as.Date(gp_depression$issue_date, "%d/%m/%Y") # dep_vs_med : describes lines reporting a diagnosis (dep, bp, psych) vs. antidepressant prescr. # write the output: fwrite(gp_depression, "dep_ADs_data.txt", col.names=T,row.names=F,quote=F,sep="\t") ######################################## ####### CREATE TRD PHENOTYPE ########### ######################################## dep <- read.delim("dep_ADs_data.txt", h=T,stringsAsFactors=F) ## this file includes diagnosis of depression and antidepressant medications prescriptions, see output above # selections of subjects with depression diagnosis and rows containing antidepressant prescription data: dep<-dep[dep$dep_diagnosis_qc == 1,] dep<-dep[!dep$chem_name=='',] ## see also column dep_vs_med to select rows with antidepressant prescription data # delete non needed columns: dep <- dep[,-c(2:10,20:36)] # in one case prescription date is 1902-02-02, changed to missing: dep$issue_date[dep$issue_date == '1902-02-02' ] <- NA dep <- dep[!is.na(dep$issue_date),] # remove duplicated rows: dep <- distinct(dep) # thryptophan was not considered as an antidepressant prescription valid for the definition of TRD dep <- dep[!dep$func_class == 'amino_acid_supplement',] # recode mirtazapine class to NaSSA: dep$func_class <- ifelse(dep$chem_nome == 'mirtazapine', 'NaSSA', dep$func_class) # create drug prescription episodes (sometimes the same antidepressant is prescribed in periods of different years) # 6 months between prescriptions of the same antidepressant to distinguish different prescription episodes dep %>% group_by(eid, chem_name) %>% arrange(issue_date) %>% mutate(prev_drug_date = dplyr::lag(issue_date, n = 1, default = NA)) %>% mutate(diff_weeks_drug = as.numeric(difftime(issue_date, prev_drug_date, units = "weeks"))) %>% mutate(prescription_episode = ifelse (diff_weeks_drug > 26, seq_along(diff_weeks_drug), 1)) %>% mutate(prescription_episode = ifelse (is.na(prescription_episode), 999, prescription_episode)) %>% mutate(prescription_episode = ifelse ( prescription_episode == 1, NA, prescription_episode)) %>% fill(prescription_episode) %>% mutate(prescription_episode = ifelse (prescription_episode == 999, 1, prescription_episode)) %>% ungroup() -> dep2 dep2 %>% group_by(eid,chem_name,prescription_episode) %>% arrange(issue_date) %>% mutate(duration_prescr_ep = as.numeric(difftime(lead(issue_date,1), issue_date, units = "weeks"))) %>% mutate(prescription_episode_drug = sum(as.numeric(duration_prescr_ep),na.rm=T)) %>% mutate(adequate_prescr_period = ifelse((diff_weeks_drug <= 14 & !is.na(diff_weeks_drug)), 1, ifelse(is.na(diff_weeks_drug), NA, 0))) %>% # if there are > 14 weeks between two consecutive prescriptions the adherence to the drug could be low mutate(n_adequate_prescr = sum(adequate_prescr_period,na.rm=T)) %>% mutate(n_prescr = length(!is.na(adequate_prescr_period))) %>% mutate(adequate_prescr_prop = ifelse((!is.na(adequate_prescr_period) & !is.na(n_adequate_prescr) & n_adequate_prescr >0), n_adequate_prescr / n_prescr, NA)) %>% mutate(last_drug_prescr = ifelse(issue_date == max(issue_date), 'yes', 'no')) %>% # this says when each prescription episode for a drug ends ungroup() -> dep2 dep2 %>% group_by(eid) %>% mutate(adequate_prescr_mean = mean(adequate_prescr_prop, na.rm=T)) %>% mutate(adequate_prescr_sd = sd(adequate_prescr_prop, na.rm=T)) %>% ungroup() -> dep2 dep2 %>% group_by(eid) %>% arrange(issue_date) %>% mutate(drug_distinct = n_distinct(chem_name,na.rm=T)) %>% mutate(drug_switch = ifelse((chem_name != lead(chem_name,1) & last_drug_prescr == 'yes'), 'yes', 'no')) %>% mutate(prescription_episode_drug = ifelse((prescription_episode_drug == 0 & as.numeric(difftime(lead(issue_date,1), issue_date, units = "weeks")) <=14), as.numeric(difftime(lead(issue_date,1), issue_date, units = "weeks")), prescription_episode_drug)) %>% mutate(between_drugs_weeks = ifelse(drug_switch == 'yes', abs(as.numeric(difftime(issue_date, lead(issue_date,1), units = "weeks"))), 999)) %>% mutate(treatment_resistance_switch = ifelse((prescription_episode_drug >= 6 & between_drugs_weeks <= 14 & between_drugs_weeks > 0 & drug_switch == 'yes' & drug_distinct > 2), 1, 0)) %>% mutate(treatment_resistance_drug = ifelse(sum(as.numeric(treatment_resistance_switch),na.rm=T) >= 2, 1, 0)) %>% ungroup() -> dep2 # check if some subjects switched starting from the same drug more than once: dep3 <- dep2[dep2$treatment_resistance_switch == 1,] dep3 <- dep3[!is.na(dep3$chem_name),] ids <- subset(dep3$eid, !duplicated(dep3$eid)) library(foreach) count <- foreach(i=1:length(ids),.combine='rbind') %dopar% data.frame(eid=ids[i], n_rep=length(subset(dep3[dep3$eid == ids[i],]$chem_name, duplicated(dep3[dep3$eid == ids[i],]$chem_name))), tot=nrow(dep3[dep3$eid == ids[i],])) count$diff <- ifelse(count$n_rep > 0, count$tot - count$n_rep, NA) sub <- count[count$diff <= 2 & !is.na(count$diff),] # if count$diff = 2, it means two valid (non-repeated drugs) switches were done # if count$diff = 1, only one non-repeated drug switch was done, these should not be considered as TRD del_drug <- sub[sub$diff == 1,] # 111 individuals (note that number may vary) colnames(del_drug) <- c('eid', 'n_rep_switch_drug', 'tot_drug_swtiches', 'diff_drug_sw') dep2 <- merge(dep2, del_drug, by='eid', all.x=T) dep2$treatment_resistance_drug <- ifelse((dep2$diff_drug_sw==1 & !is.na(dep2$diff_drug_sw)), 0, dep2$treatment_resistance_drug) # write the output: fwrite(dep2, "TRD_phenotype.txt", col.names=T,row.names=F,quote=F,sep="\t")