# *-----------------------------------------------------------------
# | PROGRAM NAME: R Code Palette
# | DATE: 3/11/20
# | CREATED BY: MATT BOGARD
# | PROJECT FILE:
# *----------------------------------------------------------------
# | PURPOSE: grab bag of commonly used R snippets
# *----------------------------------------------------------------

#----------------------------------
# documentation templates
#----------------------------------

# *-----------------------------------------------------------------
# | PROGRAM NAME:
# | DATE:
# | CREATED BY:
# | PROJECT FILE:
# *----------------------------------------------------------------
# | PURPOSE:
# *----------------------------------------------------------------

###############################################################################
###############################################################################
#################          construction zone              #####################
###############################################################################
###############################################################################

##
## new section
##

# *-----------------------------------------------------------------*
# |
# | reading data
# |
# *------------------------------------------------------------------*

# identify location of data used for project by specifying a working directory
# alternatively, if data is stored in numerous places, you can reference
# the file location directly

rm(list = ls())  # get rid of any existing data (interactive scratch use only)

# override R's tendency to use scientific notation
options("scipen" = 100, "digits" = 4)

cat("\f")  # clear console

# helper to 'clear' the console by printing 50 blank lines
clc <- function() cat(rep("\n", 50))

rm("temp")  # remove specific data frames

ls()  # view open data sets (should be empty if you just ran the code above)

# unload packages that we don't need or that mask other functions
detach("package:vegan", unload = TRUE)

# sometimes we may want to unload different versions of packages - the function
# below handles this + provides a specific example of how MatchIt loads MASS,
# which masks dplyr.
#
# MASS masks the 'select' operation within dplyr used for merging data.
# We use dplyr extensively to process data, but later in the analysis the MASS
# package (used to estimate negative binomial models for utilization counts
# like ER visits) 'masks' or prevents dplyr's 'select' from working.
# This is a problem if we were previously using MASS (or MatchIt, which
# leverages MASS) in an analysis and then wanted to run this program.
# So we want to 'unload' or 'detach' any versions of MASS which may be loaded
# before trying to build the data for the analysis below leveraging dplyr
# functions like 'select'. This is unnecessary if MASS has not been loaded.
# see also: https://stackoverflow.com/questions/6979917/how-to-unload-a-package-without-restarting-r

# Detach a package (and any duplicate entries on the search path).
#   pkg            - package name, unquoted by default
#   character.only - set TRUE when passing the package name as a string
detach_package <- function(pkg, character.only = FALSE) {
  if (!character.only) {
    pkg <- deparse(substitute(pkg))
  }
  search_item <- paste("package", pkg, sep = ":")
  # loop in case the same package appears on the search path more than once
  while (search_item %in% search()) {
    detach(search_item, unload = TRUE, character.only = TRUE)
  }
}

detach_package("MatchIt", TRUE)  # first unload 'MatchIt' because it imports 'MASS'
detach_package("MASS", TRUE)     # unload/detach MASS

# view specific columns
View(df1[, 100:103])

str(df1)                        # variable names and formats
str(df1, list.len = ncol(df1))  # in case R truncates the output above

# set working directory
setwd('/Users/wkuuser/Desktop/R Data Sets')  # mac
setwd("P:\\R Code References\\R Training")   # windows

# *------------------------------------------------------------------*
# | make R talk on a macbook
# *------------------------------------------------------------------*

b <- sprintf("say Pumpkin pie is the best")
system(b, intern = FALSE,
       ignore.stdout = FALSE, ignore.stderr = FALSE,
       wait = TRUE, input = NULL)
# *------------------------------------------------------------------------*
# | direct reading of a single variable vector from the command line -
# | similar to the cards statement in SAS
# *------------------------------------------------------------------------*

GARST <- c(150, 140, 145, 137, 141, 145, 149, 153, 157, 161)  # read in specified values for the variable GARST
print(GARST)

GARST <- c(150, 140, 145, 137, 141, 145, 149, 153, 157, 161)
PIO   <- c(160, 150, 146, 138, 142, 146, 150, 154, 158, 162)
MYC   <- c(137, 148, 151, 139, 143, 120, 115, 136, 130, 129)
DEK   <- c(150, 149, 145, 140, 144, 148, 152, 156, 160, 164)
PLOT  <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
BT    <- c('Y', 'Y', 'N', 'N', 'N', 'N', 'Y', 'N', 'Y', 'Y')
RR    <- c('Y', 'N', 'Y', 'N', 'N', 'N', 'N', 'Y', 'Y', 'N')

yield_data <- data.frame(GARST, PIO, MYC, DEK, PLOT, BT, RR)
rm(GARST, PIO, MYC, DEK, PLOT, BT, RR)  # cleanup

# scatter plot
plot(yield_data$GARST, yield_data$PIO)

# this is even more like the cards statement in SAS:
# embed the raw rows in a string and read via a text connection
# (the embedded newlines are required for read.table to see one row per line)
toread <- "id sex age inc r1 r2 r3
1 F 35 17 7 2 2
17 M 50 14 5 5 3
33 F 45 6 7 2 7
49 M 24 14 7 5 7
65 F 52 9 4 7 7
81 M 44 11 7 7 7
2 F 34 17 6 5 3
18 M 40 14 7 5 2
34 F 47 6 6 5 6
50 M 35 17 5 7 5"
survey <- read.table(textConnection(toread), header = TRUE)
closeAllConnections()

#--------------------------------------------------
# reading a file from the web
#--------------------------------------------------

library(data.table)
mydat <- fread('https://raw.githubusercontent.com/dincerti/dincerti.github.io/master/data/mepsdiab.csv')
head(mydat)

# *------------------------------------------------------------------*
# | reading data from a txt file
# *------------------------------------------------------------------*

# data looks like this:
# GARST PIO MYC DEK PLOT BT RR
# 150 160 137 150 1 Y Y
# 140 150 148 149 2 Y N
# 145 146 151 145 3 N Y
# 137 138 139 140 4 N N
# 141 142 143 144 5 N N
# 145 146 120 148 6 N N
# 149 150 115 152 7 Y N
# 153 154 136 156 8 N ?
# 157 158 130 160 9 Y Y
# 161 162 129 164 10 Y N

yield_data <- read.table("Yield_plots.txt", header = TRUE)  # read text file
names(yield_data)  # var list
print(yield_data)  # print data set
yield_data         # note: print() is not necessary to print

# get all factor variables
is.fact <- sapply(yield_data, is.factor)
tmp <- yield_data[, is.fact]

# *------------------------------------------------------------------*
# | reading data from a csv file
# *------------------------------------------------------------------*

# data looks like this
# HYBRID TRAIT P1 P2 P3
# GARST RR 150 140 145
# MYC BT 160 150 146
# PIO RR 137 148 151
# DEK BT 150 149 145

# treat '.', 'NA', empty strings and '?' as missing on the way in
plots <- read.csv("plots.csv", na.strings = c(".", "NA", "", "?"), encoding = "UTF-8")
plots

# create function to reformat column names from csv files (from:
# https://stackoverflow.com/questions/17152483/how-to-replace-the-in-column-names-generated-by-read-csv-with-a-single-spa )
makeColNamesUserFriendly <- function(ds) {
  # FIXME: Repetitive.
  # Convert any number of consecutive dots to a single underscore.
  names(ds) <- gsub(x = names(ds), pattern = "(\\.)+", replacement = "_")
  # Drop the trailing spaces.
  names(ds) <- gsub(x = names(ds), pattern = "( )+$", replacement = "")
  ds
}

# create unique ID for each line in data frame
tmp_claims$RowID <- 1:nrow(tmp_claims)

# convert index (row names) to a column
names <- rownames(yield_data)
yield_data <- cbind(names, yield_data)

# repair read error for first ID (sometimes a csv file read causes the first
# element in the file to be misread)
df$ID <- as.character(df$ID)
df[1, 1] <- '123456'

# *------------------------------------------------------------------*
# | reading data from an excel xlsx
# *------------------------------------------------------------------*

xlsxFile <- "C:\\Users\\mtb2901\\Documents\\Data Science Projects\\Projects 2018\\201801 Hospital Risk Profile\\Raw Data\\TX-Hospital-Tricare-Network.xlsx"

# reference: https://www.rdocumentation.org/packages/openxlsx/versions/4.0.17/topics/read.xlsx
# read.xlsx(xlsxFile, sheet = 1, startRow = 1, colNames = TRUE,
#           rowNames = FALSE, detectDates = FALSE, skipEmptyRows = TRUE,
#           skipEmptyCols = TRUE, rows = NULL, cols = NULL, check.names = FALSE,
#           namedRegion = NULL, na.strings = "NA", fillMergedCells = FALSE)

# read header file with variable names
# fixed: openxlsx::read.xlsx takes 'sheet', not 'sheetIndex'
# ('sheetIndex' is an argument of the older 'xlsx' package)
vars <- read.xlsx("FraudT3CB14jun18HEADER.xlsx", sheet = 1)

#-------------------------------------
# writing files
#-------------------------------------

# ex without rownames
write.csv(opioid_dat, file = "//Documents//Briefcase///Data//opioid_dat.csv",
          row.names = FALSE)

# export as R data file
save(MBR_FACT_ANLY, file = "//projects//Data//MBR_FACT_ANLY.RData")

# export as text with tab delimiter
# fixed: the opening quote on the file path was missing
write.table(df1, "//Projects//Data//df1.txt", row.names = FALSE, sep = "\t")

# *-----------------------------------------------------------------*
# |
# | printing and subsetting data
# |
# *------------------------------------------------------------------*

# print selected variables
yield_data[c("GARST")]

# subset data via variable selection
my_hybrids <- yield_data[c("GARST", "PIO")]
print(my_hybrids)
# subset based on a single variable
GARST <- yield_data[c("GARST")]
print(GARST)

df <- df[, !names(df) %in% c("xvar")]  # drop field

# print a subset of data based on observed variable values
print(yield_data[yield_data$GARST == 150 & yield_data$PIO == 160, ])

# subset data based on observed values
high_yields <- yield_data[yield_data$GARST == 150 & yield_data$PIO == 160, ]
print(high_yields)

stacked_traits <- yield_data[yield_data$BT == "Y" & yield_data$RR == "Y", ]
stacked_traits

# subset based on just one variable value
# NOTE(review): these rely on the free-standing GARST object created above;
# yield_data$GARST would be the safer, self-contained form
LOW_GARST <- yield_data[GARST == 140, ]
print(LOW_GARST)

LOW_GARST <- yield_data[GARST < 150, ]
print(LOW_GARST)

RR <- yield_data[yield_data$RR == "Y", ]
RR

Bt <- yield_data[yield_data$BT == "Y", ]
Bt

# subset based on %in%
vc <- c(150, 145, 195)
tmp <- yield_data[yield_data$GARST %in% vc, ]

# or
tmp <- yield_data[yield_data$GARST %in% c(150, 145, 195, 161), ]

# this also works using dplyr (which allows for additional manipulation if you
# want to 'pipe' in additional aggregations or calculations)
library(dplyr)
tmp <- yield_data %>% filter(GARST %in% c(150, 145, 195))

# subset based on 'not' %in% using dplyr
tmp <- filter(yield_data, !BT %in% c('Y'))

library(sqldf)
tmp <- sqldf('select * from yield_data where GARST in(150,145,195)')

#------------------------------------------------
# connect to SQL server
#------------------------------------------------

# read initial analysis file
library(RODBC)

# connect to the SQL Server LOUSQLWTS853
DB_Connection <- odbcDriverConnect('driver={SQL Server};server=arlsql123;database=ANLY_SBOX;trusted_connection=true')

# connect to specific table (see documentation to limit rows etc.)
MBR_FACT_ANLY <- sqlFetch(DB_Connection, "mbr_fact_anly")

# export as R data file for future use (so we don't have to hit SQL every time)
save(MBR_FACT_ANLY, file = "//projects//mtt94674//Data//MBR_FACT_ANLY.RData")

# make sure to close your database connection
close(DB_Connection)

# *-----------------------------------------------------------------*
# |
# | summary statistics
# |
# *------------------------------------------------------------------*

summary(yield_data)                     # whole data set
summary(yield_data[c("GARST")])         # single variable
summary(yield_data$GARST)               # single variable (more efficient)
summary(yield_data[c("GARST", "PIO")])  # selected variables
summary(sqrt(yield_data$GARST))         # transformation

# by processing
by(yield_data, yield_data$BT, summary)

# using dplyr
yield_data %>%
  group_by(BT) %>%                 # group by
  summarize(mean = mean(GARST))    # mean

### more examples with dplyr

# how many members per risk group (members are duplicated by transactions)
# as such the same member can have more than one transaction scored by the model
# that are in the same risk group, and a member can belong to multiple
# risk groups if they have multiple transactions scored accordingly

### brute force example
tmp <- df4 %>% filter(riskcat == '1:LOW') %>% distinct(mbrid)
tmp <- df4 %>% filter(riskcat == '2:MOD') %>% distinct(mbrid)
tmp <- df4 %>% filter(riskcat == '3:HI') %>% distinct(mbrid)
tmp <- df4 %>% filter(riskcat == '4:VHI') %>% distinct(mbrid)

### a little less brute force
tmp1 <- df4 %>% select(riskcat, mbrid) %>% arrange(riskcat, mbrid)  # sort by risk and ID
tmp2 <- tmp1 %>% group_by(riskcat) %>% distinct(mbrid)              # unique IDs per risk strata

# how many members per strata
tmp2 %>%
  group_by(riskcat) %>%
  summarize(total = n())

### can we do the things above in one step
df4 %>%
  select(riskcat, mbrid) %>%
  arrange(riskcat, mbrid) %>%
  group_by(riskcat) %>%
  distinct(mbrid) %>%
  summarize(total = n())

# frequencies
library(Hmisc)
describe(yield_data)  # contents of data set - (Hmisc)
contents(yield_data)

# aggregations and crosstabulations
summary(ACTIV_PARTICIP ~ AGECAT, data = hc_mbr_anly, fun = mean)

# tables
tmp <- data.frame(table(complications$Measure_ID))  # saved as an easily readable data frame

aggregate(ACTIV_PARTICIP ~ RUCC_2013_DESC, hc_mbr_anly, mean)

CROP <- c('Cotton', 'Cotton', 'Corn', 'Corn', 'Corn', 'SB', 'SB', 'SB', 'SB', 'SB')
RR   <- c('Y', 'N', 'Y', 'N', 'N', 'N', 'N', 'Y', 'Y', 'N')
yield_data <- data.frame(CROP, RR)
rm(CROP, RR)  # cleanup

mytable <- table(yield_data$RR, yield_data$CROP)

tmp <- data.frame(mytable)
names(tmp) <- c("RR", "Crop", "Total")
print(tmp)

tmp <- data.frame(prop.table(mytable))     # cell percentages
names(tmp) <- c("RR", "Crop", "Pct")
print(tmp)

tmp <- data.frame(prop.table(mytable, 1))  # row percentages (% Crop within RR status)
names(tmp) <- c("RR", "Crop", "RowPct")
print(tmp)

tmp <- data.frame(prop.table(mytable, 2))  # column percentages (% RR within Crop)
names(tmp) <- c("RR", "Crop", "ColPct")
print(tmp)

# cross tabulations across a range of variables
# average GARST across BT and RR
# NOTE(review): since R 4.0, data.frame() no longer converts strings to factors
# by default, so is.fact may be all FALSE unless the columns were factors already
is.fact <- sapply(yield_data, is.factor)
tmp <- yield_data[, is.fact]
vars <- names(tmp)

for (i in 1:2) {
  print("+-+-+-+-+-+-+-+-+-+-+-+-")
  print(vars[i])
  print("+-+-+-+-+-+-+-+-+-+-+-+-")
  print(by(yield_data$GARST, yield_data[c(vars[i])], mean))
}

# cross tabulations across a range of columns in a data frame
names(sb)                   # list columns for reference
xvar <- as.list(names(sb))  # save names as a list to reference in loop

# loop across fields and produce frequencies
for (i in 1:36) {
  print(xvar[i])
  print(data.frame(table(sb[c(paste(xvar[i]))])))
}

aggregate(totmthcov ~ pre_term, df1, mean)

# using dplyr
df1 %>%
  summarize(mean = mean(prob, na.rm = TRUE),
            min  = min(prob, na.rm = TRUE),
            Q1   = quantile(prob, probs = 0.25, na.rm = TRUE),
            med  = median(prob, na.rm = TRUE),
            Q3   = quantile(prob, probs = 0.75, na.rm = TRUE),
            P85  = quantile(prob, probs = 0.85, na.rm = TRUE),
            P90  = quantile(prob, probs = 0.90, na.rm = TRUE),
            P95  = quantile(prob, probs = 0.95, na.rm = TRUE),
            P99  = quantile(prob, probs = 0.99, na.rm = TRUE))

# aggregate totals per person
df5 <- df4 %>%
  group_by(ID) %>%
  summarize(TOTAL_ALLOW_PRE = sum(TOT_ALLOWED_AMT))

xtabs(~ BT, data = yield_data)        # counts of types of BT
xtabs(~ BT + RR, data = yield_data)   # BT by RR
xtabs(GARST ~ RR, data = yield_data)  # total GARST by RR

# categorical frequencies by treatment status
mytable <- table(df1$PrimaryDiagDesc, df1$treat)
prop.table(mytable, 2)  # column percentage

# *-----------------------------------------------------------------*
# |
# | creating and adding new variables
# |
# *------------------------------------------------------------------*

# create a new data set that includes a new variable that is
# the sum of "garst" and "pio"
yield_data2 <- transform(yield_data, sum = GARST + PIO)
print(yield_data2)

# add a new variable to existing data set that is the sum of all hybrids
# first create data set hybrids
hybrids <- yield_data
print(hybrids)

# add new variable to hybrids
hybrids <- transform(hybrids, SUM = GARST + PIO + MYC + DEK)
print(hybrids)

# or add 2 new variables
hybrids <- transform(hybrids, SUM = GARST + PIO + MYC + DEK, MEAN = SUM / 4)
print(hybrids)

# combine or concatenate two columns
yield_data$x <- paste(yield_data$RR, yield_data$BT)
yield_data$x <- paste(yield_data$RR, "-", yield_data$BT)
# fixed: the trailing note "for inserting a separator" was not commented out
df$x <- paste(df$n, "-", df$s)  # for inserting a separator
# *-----------------------------------------------------------------*
# |
# | conditional processing
# |
# *------------------------------------------------------------------*

# recall from the data set there are 10 plots.
# first assume that in plot 3 we accidentally planted a Bt
# garst hybrid that was supposed to be a non-Bt hybrid,
# so we don't want to include it in the non-Bt average for hybrids -
# we can use 'ifelse' conditions to exclude this from the mean calculation

# first create data set trials
trials <- yield_data
print(trials)
print(trials$PLOT)

# creates new variable AVG in data set trials
trials$AVG <- ifelse(trials$PLOT == 3,
                     (trials$AVG <- (trials$PIO + trials$MYC + trials$DEK) / 3),                  # excludes GARST if PLOT = 3
                     (trials$AVG <- (trials$GARST + trials$PIO + trials$MYC + trials$DEK) / 4))   # all other plots
trials

# another example - notice the nesting of the ifelse conditions:
grades$letter <- ifelse(grades$score >= .90, "A",
                 ifelse(grades$score >= .80 & grades$score < .90, "B",
                 ifelse(grades$score >= .70 & grades$score < .80, "C",
                 ifelse(grades$score >= .60 & grades$score < .70, "D", "F"))))
table(grades$letter)

# create binary target
banking$target <- ifelse(banking$y == "no", 0, 1)

# model calibration / risk stratification
df1$risk <- ifelse(df1$prob < .12, "1:LOW",
            ifelse(df1$prob >= .12 & df1$prob < .30, "2:MOD",
            ifelse(df1$prob >= .3 & df1$prob < .60, "3:HI", "4:VHI")))
table(df1$risk)

# age categorization
df1$AgeCat <- ifelse(df1$age < 18, 'Under 18',
              ifelse(df1$age >= 18 & df1$age <= 25, '18-25',
              ifelse(df1$age > 25 & df1$age <= 29, '26-29',
              ifelse(df1$age > 29 & df1$age <= 39, '30-39',
              ifelse(df1$age > 39 & df1$age <= 49, '40-49',
              ifelse(df1$age > 49 & df1$age <= 59, '50-59',
              ifelse(df1$age > 59 & df1$age <= 64, '60-64', '65-65+')))))))

# *-----------------------------------------------------------------*
# |
# | stacking / concatenating / adding data sets
# |
# *------------------------------------------------------------------*

# get only bt hybrids - same as subsetting
BT_HYBRIDS <- yield_data[yield_data$BT == "Y", ]
print(BT_HYBRIDS)

# get only non bt hybrids
NON_BT_HYBRIDS <- yield_data[yield_data$BT == "N", ]
print(NON_BT_HYBRIDS)

# *------------------------------------------------------------------*
# | combine with the rbind function
# *------------------------------------------------------------------*

both <- rbind(BT_HYBRIDS, NON_BT_HYBRIDS)
print(both)

# *------------------------------------------------------------------*
# | stack with SQL using sqldf
# *------------------------------------------------------------------*

library(sqldf)
library(ggplot2)

# note: 'union all' keeps all rows; if you omit 'all' you get only unique rows
both_sql = sqldf('
select * from BT_HYBRIDS
union all
select * from NON_BT_HYBRIDS
order by PLOT
')
both_sql

# *-----------------------------------------------------------------*
# |
# | merging based on common variables
# |
# *------------------------------------------------------------------*

# first split the data: one set contains plot, bt, rr, garst, pio;
# the other set has plot, rr, bt, myc, dek.
# we'll work under the assumption that the first set is 'early'
# hybrids and the second set is 'late' hybrids,
# as if we collected data on these separately and want to combine the
# data sets to get one set, like we started with with yield_data

# create early / late planting data sets
early <- yield_data[c("PLOT", "GARST", "PIO", "BT", "RR")]
early
late <- yield_data[c("PLOT", "MYC", "DEK", "BT", "RR")]
late

# *------------------------------------------------------------------*
# | merge by common variables - PLOT, BT, RR
# *------------------------------------------------------------------*

hybrids <- merge(early, late, by = c("PLOT", "BT", "RR"))
hybrids
# *------------------------------------------------------------------*
# | merging by a single common variable
# *------------------------------------------------------------------*

# the above merges back to the original data set, but
# we had to be careful to merge by ALL of the common variables.
# suppose we had the following data set strip_trial.txt
# HYBRID YIELD TRAIT
# GARST 150 BT
# MYC 140 RR
# PIO 160 RR
# DEK 145 BT
# lets read it in and split it up to form 2 data sets 'yields'
# and 'traits' with a SINGLE common variable 'HYBRID' to merge by
# 'yields'
# HYBRID YIELD
# GARST 150
# MYC 140
# PIO 160
# DEK 145
# 'traits'
# HYBRID TRAIT
# GARST BT
# MYC RR
# PIO RR
# DEK BT

# read data 'yields'
yields <- read.table("yields.txt", header = TRUE)
yields

# read data 'traits'
traits <- read.table("traits.txt", header = TRUE)
traits

# now to demonstrate merging data sets by a common variable
field_data <- merge(yields, traits, by = c("HYBRID"))
field_data

# *------------------------------------------------------------------*
# | merging with SQL using sqldf
# *------------------------------------------------------------------*

# library(sqldf)
# library(ggplot2)

# left join yields and traits on HYBRID
field_data_sql = sqldf('
select a.HYBRID,a.YIELD,b.TRAIT
from yields a left join traits b
on a.HYBRID =b.HYBRID
order by a.YIELD')
field_data_sql

# left join early and late (from above) on PLOT - note for each plot RR and BT
# are the same regardless of the hybrid, so specifying BT and RR for only one
# data set will suffice, but we need to join on PLOT. this seems to give a much
# more straightforward solution than the merge above using the R merge statement
hybrids_sql = sqldf('
select a.PLOT,a.GARST,a.PIO, b.MYC, b.DEK, a.BT, a.RR
from early a left join late b
on a.PLOT =b.PLOT
order by a.PLOT')
hybrids_sql

#-----------------------------------------------------
# merging using dplyr
#-----------------------------------------------------

tmp3 <- tmp2 %>% left_join(tmp1, by = "State")

# get all months between EFFMTH and ENDMTH in cohort history file
monthlist <- df1 %>%
  left_join(select(tmp5, MBRID, Month, covmth), by = "MBRID") %>%
  filter(Month >= EFFMTH & Month <= ENDMTH)
monthlist <- monthlist %>% arrange(MBRID, Month)

# more complicated join with dplyr:
# merge claims data with cohort file keeping only claims between the coverage periods
# (fixed: the join key was misspelled "DMBRID"; the selected/renamed key is MBRID)
df3 <- df2 %>%
  select(MBRID, EFFMTH, ENDMTH, INDEX_MTH_DT, pre12, post12, claimflag) %>%
  left_join(select(raw_claims, MBRID = PID, Claim_Number, SVC_FROM_DT, TOT_ALLOWED_AMT),
            by = "MBRID") %>%
  filter(SVC_FROM_DT >= EFFMTH & SVC_FROM_DT <= ENDMTH)

# *-----------------------------------------------------------------*
# |
# | sorting data
# |
# *------------------------------------------------------------------*

# use file plots2 weed pressure
# LOCATION TRAITS P1 P2 P3 P4
# 1 RR 1 1 5 1
# 2 RR 2 1 4 1
# 1 RR 2 2 4 3
# 2 RR 3 1 NA 3
# 1 BT 4 5 2 4
# 2 BT 5 4 5 5
# 1 BT 5 3 4 4
# 2 BT 4 5 5 5
plots2 <- read.table("plots2.txt", header = TRUE)
plots2

# sort data by location
plotsSorted <- plots2[order(plots2$LOCATION), ]
plots2
plotsSorted

# sort by trait then location
plots2Sorted <- plots2[order(plots2$TRAITS, plots2$LOCATION), ]
plots2Sorted

# for descending order, prefix any variable with a minus sign
plots2Sorted <- plots2[order(-plots2$LOCATION, plots2$TRAITS), ]
plots2Sorted

# dplyr
tmp <- arrange(tmp, desc(Freq))

# *-----------------------------------------------------------------*
# |
# | graphics
# |
# *------------------------------------------------------------------*

# *------------------------------------------------------------------*
# | examples of r graphics capabilities
# *------------------------------------------------------------------*

demo(graphics)
demo(persp)
library(lattice)
demo(lattice)

# *------------------------------------------------------------------*
# | basic plots
# *------------------------------------------------------------------*

# histogram
hist(yield_data$GARST)

# bar plot
# NOTE(review): rownames() of a plain vector is NULL, so this legend is empty -
# probably intended rownames(yield_data) or yield_data$PLOT; confirm
barplot(yield_data$GARST, main = "Garst Yields", xlab = "Plots",
        col = c("darkblue", "red"), legend = rownames(yield_data$PLOT))

# bar plot (example)
# (fixed: 'False' is not an R literal - use FALSE)
counts <- mytable$Freq[mytable$Freq > 8]
barplot(counts, main = "Conditions of Members", xlab = "Conditions",
        col = c("darkblue", "red"),
        legend = mytable$Var1[mytable$Freq > 8], beside = FALSE)

# using xtabs to create plots
plot(~xtabs(GARST ~ RR, data = yield_data))  # barplot
plot(xtabs(~ BT, data = yield_data))         # barplot
plot(xtabs(~ BT + RR, data = yield_data))    # mosaic plot

# scatter plot
plot(yield_data$GARST, yield_data$PIO)

# scatterplot matrix of whole data set
plot(yield_data)

# scatterplot matrix - data used in regression in analysis section
land_prices <- read.table("land_prices.txt", header = TRUE)
land_prices
plot(land_prices)

# brushed scatterplot - as in model visualization
ggplot(df1, aes(x = SUMRX_NONLTOT_QB_GE120MEDD, y = SUMRX_LTOT_QB_GE120MEDD,
                shape = risk, color = risk)) +
  scale_color_manual(values = c("green", "yellow", "red", "red")) +
  geom_point()

# labels + jittering
set.seed(456)
ggplot(df1, aes(x = SUMRX_NONLTOT_QB_GE120MEDD, y = SUMRX_LTOT_QB_GE120MEDD,
                shape = risk, color = risk)) +
  scale_color_manual(values = c("green", "yellow", "red", "red")) +
  geom_point() +
  geom_jitter(width = 2, height = 2) +
  xlab("Total Non-Qualified >= 120") +
  ylab("Total Qualified >= 120") +
  ggtitle("Observed Rx Counts Overlaid by Predicted Risk/Stratification")

# *------------------------------------------------------------------*
# | stacked bar chart
# *------------------------------------------------------------------*

# get data
bytrait <- read.table("biotech_yield.txt", header = TRUE)
bytrait
names(bytrait)

# summarize data for stacking by trait
counts <- table(bytrait$RR, bytrait$YIELD)  # list stacked variable first
counts
barplot(counts, main = "Yield by Trait for Garst", xlab = "Yield Category",
        col = c("darkblue", "red"), legend = rownames(counts))

#--------------------------------------------
# pie and bar charts in ggplot
#--------------------------------------------

table(df$pre_term)  # get values by group

tmp <- data.frame(
  group = c("Full-Term", "Pre-Term"),
  value = c(7999, 6000)
)

# stacked bar
bp <- ggplot(tmp, aes(x = "", y = value, fill = group)) +
  geom_bar(width = 1, stat = "identity")
bp

# pie
pie <- bp + coord_polar("y", start = 0)
pie
pie + scale_fill_manual(values = c("red", "blue"))

# *------------------------------------------------------------------*
# | basic density plots
# *------------------------------------------------------------------*

# plot and save histogram data
garsths <- hist(yield_data$GARST, main = "Garst Yield Histogram")
print(garsths)

# compute density estimates using default gaussian & rule of thumb bandwidth
garstdens <- density(x = yield_data$GARST, bw = "nrd0", kernel = "gaussian", n = 20)
print(garstdens)

# rescale density so it will plot on the same graph as GARST histogram
rs <- max(garsths$counts / max(garstdens$y))   # scaling factor for plotting the density
lines(garstdens$x, garstdens$y * rs, col = 2)  # plot density over histogram

# compare this to just plotting the non-rescaled density over the histogram
lines(density(x = yield_data$GARST, bw = "nrd0", n = 20), col = 3)  # green line barely shows up

# turn off/close graphics window
dev.off()

# *------------------------------------------------------------------*
# | overlapping density plots
# *------------------------------------------------------------------*

library(colorspace)  # package for rainbow_hcl function

# Generate just the data for a histogram of GARST grouping by RR trait
ds <- rbind(data.frame(dat = yield_data[, ][, "GARST"], grp = "All"),
            data.frame(dat = yield_data[, ][yield_data$RR == "N", "GARST"], grp = "N"),
            data.frame(dat = yield_data[, ][yield_data$RR == "Y", "GARST"], grp = "Y"))

# histogram for all GARST data
hs <- hist(ds[ds$grp == "All", 1], main = "", xlab = "GARST", col = "grey90",
           ylim = c(0, 9.46395633979357), breaks = "fd", border = TRUE)

# compute density, rescale, and plot for all GARST data
dens <- density(ds[ds$grp == "All", 1], na.rm = TRUE)
rs <- max(hs$counts) / max(dens$y)
lines(dens$x, dens$y * rs, type = "l", col = rainbow_hcl(3)[1])

# compute density, rescale, and plot where RR = 'N'
dens <- density(ds[ds$grp == "N", 1], na.rm = TRUE)
rs <- max(hs$counts) / max(dens$y)
lines(dens$x, dens$y * rs, type = "l", col = rainbow_hcl(3)[2])

# compute density, rescale, and plot where RR = 'Y'
dens <- density(ds[ds$grp == "Y", 1], na.rm = TRUE)
rs <- max(hs$counts) / max(dens$y)
lines(dens$x, dens$y * rs, type = "l", col = rainbow_hcl(3)[3])

# Add a rug to illustrate density
rug(ds[ds$grp == "N", 1], col = rainbow_hcl(3)[2])
rug(ds[ds$grp == "Y", 1], col = rainbow_hcl(3)[3])

# Add a legend to the plot.
legend("topright", c("All", "N", "Y"), bty = "n", fill = rainbow_hcl(3))

# Add a title to the plot.
title(main = "Distribution of GARST by RR",
      sub = paste("Created Using R Statistical Package",
                  format(Sys.time(), "%Y-%b-%d %H:%M:%S"),
                  Sys.info()["user"]))

### pie chart
table(tmp$Enrollee)  # get counts
slices <- c(3111, 14141)                              # define slices
lbls <- c("True", "False")                            # slice labels
pct <- round(slices / sum(slices) * 100, digits = 2)  # calc pct
lbls <- paste(lbls, pct)                              # add calculated percents to labels
lbls <- paste(lbls, "%", sep = "")                    # add % to labels
pie(slices, labels = lbls, col = rainbow(length(lbls)), main = "Members")

### aggregate time series and produce multiple line chart (dplyr & ggplot2)

# subset only top 5 departments for OT hrs
tmp2 <- filter(df2, Role == "Specialist" | Role == "Finder" |
                 Role == "Customer Care Specialist" | Role == "Clinical" |
                 Role == "Content Specialist")

# aggregate by department and date
tmp3 <- tmp2 %>%
  group_by(Role, End_Date) %>%
  summarize(TotalOTHrs = sum(Overtime_Hours))

ggplot(tmp3, aes(End_Date, TotalOTHrs, colour = Role)) +
  geom_line() +
  geom_point()

### aggregate and simple plot

# aggregate & visualize total OT by pay period
tmp <- df2 %>%
  group_by(End_Date) %>%
  summarize(TotalOTHrs = sum(Overtime_Hours))

plot(tmp$End_Date, tmp$TotalOTHrs, type = "b",
     xlab = "End Date", ylab = "Total OT Hrs")

#------------------------------------------------
# side by side plots
#------------------------------------------------
par(mfrow = c(1, 2))  # set the plotting area into a 1*2 array
hist(treat, breaks = 50)
hist(ctrl, breaks = 50)

# *-----------------------------------------------------------------*
# |
# | analysis
# |
# *------------------------------------------------------------------*

# *------------------------------------------------------------------*
# | t-tests
# *------------------------------------------------------------------*

# independent samples t-test
t.test(yield_data$GARST, yield_data$PIO)  # difference in GARST vs PIO yields from yield_data file
t.test(plots2$P4[plots2$TRAITS == 'RR'],
       plots2$P4[plots2$TRAITS == 'BT'])  # difference in weed pressure for bt and rr traits in plot 4

# paired t-test
# differences in overall weed pressure for plots 1 and 2 (more appropriately
# used on same subjects - before and after tests), e.g. if plot 1 was before
# herbicide application while plot 2 was the same plot measured after application
t.test(plots2$P1, plots2$P2, paired = TRUE)

# not actually appropriate, but notice the difference in the results from the
# previous t-test on GARST and PIO
t.test(yield_data$GARST, yield_data$PIO, paired = TRUE)

# *------------------------------------------------------------------*
# | measures of association
# *------------------------------------------------------------------*

# pearson correlations - no sig tests
rcorr(cbind(plots2$P1, plots2$P2, plots2$P3, plots2$P4))  # gives pearson correlations

# cor.test has p-value, ci, but only 2 vars at a time
cor.test(plots2$P1, plots2$P2, use = "pairwise")

# spearman correlations using the Hmisc rcorr function
rcorr(cbind(plots2$P1, plots2$P2, plots2$P3, plots2$P4), type = 'spearman')
rcorr(cbind(plots2$P1, plots2$P2, plots2$P3, plots2$P4), type = 'pearson')  # compare, also same as default

# *------------------------------------------------------------------*
# | simple linear regression
# *------------------------------------------------------------------*

# lets use another data set, land_prices:
# PRICE ACRES IMPROVEMENTS
# 36 9 8
# 80 15 7
# 44 10 9
# 55 11 10
# 35 10 6
land_prices <- read.table("land_prices.txt", header = TRUE)
land_prices
plot(land_prices)  # scatterplot of dependent and independent variables

myRegModel <- lm(PRICE ~ ACRES, data = land_prices)  # run regression
summary(myRegModel)  # view estimates
anova(myRegModel)

# simple plots - plot data and regression line
plot(land_prices$ACRES, land_prices$PRICE)
abline(myRegModel)

# automated plots
plot(myRegModel)
termplot(myRegModel)

# *------------------------------------------------------------------*
# | multiple linear regression
# *------------------------------------------------------------------*

myRegModel <- lm(PRICE ~ ACRES + IMPROVEMENTS, data = land_prices)  # run regression
summary(myRegModel)  # view estimates
anova(myRegModel)
plot(myRegModel)
termplot(myRegModel)

# *------------------------------------------------------------------*
# | analysis of variance
# *------------------------------------------------------------------*

# look at data set 'yield_data'
yield_data <- read.table("Yield_plots.txt", header = TRUE)  # read text file
print(yield_data)  # print data set

# data looks like this:
# GARST PIO MYC DEK PLOT BT RR
# 150 160 137 150 1 Y Y
# 140 150 148 149 2 Y N
# 145 146 151 145 3 N Y
# 137 138 139 140 4 N N
# 141 142 143 144 5 N N
# 145 146 120 148 6 N N
# 149 150 115 152 7 Y N
# 153 154 136 156 8 N ?
# 157 158 130 160 9 Y Y
# 161 162 129 164 10 Y N

# we would like to set up a design such as this:
# let treatments = hybrid, blocks = plot
#------------------------------------------------#
#                    HYBRID
#          GARST   PIO   MYC   DEK
# PLOT 1     150   137   160   150
# PLOT 2     140   148   150   149
# PLOT 3     145   151   146   145
#------------------------------------------------#
# actual data set will need to look like this 'yield_trials'
# HYBRID PLOT YIELD
# GARST P1 150
# GARST P2 140
# GARST P3 145
# PIO P1 137
# PIO P2 148
# PIO P3 151
# MYC P1 160
# MYC P2 150
# MYC P3 146
# DEK P1 150
# DEK P2 149
# DEK P3 145
yield_trials<-read.table("yield_trials.txt", header=TRUE) # read text file
print(yield_trials) # print data set

# *------------------------------------------------------------------*
# | 2 WAY AOV
# *------------------------------------------------------------------*
myModel<-aov(YIELD~HYBRID+PLOT,data=yield_trials) # treatments = HYBRID, blocks = PLOT
summary(myModel)
anova(myModel) # should give same results as above
plot(myModel) # diagnostics
termplot(myModel)

#--------------------------------------------------
# getting marginal/adjusted/least squares means from multivariable models
#--------------------------------------------------

### example linear regression
# NOTE: df_chrt is a project cohort data frame assumed to be loaded elsewhere
summary(m1 <- lm(Post_ER_Admit ~ Control_Study_Flag + CaseOpenReasonDesc + Diag_Grouping +
                   SourceFlag + Gender_Code + LowIncome + RuralZips + Pre_ER_Admit + AcutePre +
                   Length_Stay_Pre + PrimaryCareVisitCount + Cancer_abnormal_cervical + Asthma +
                   Pulmonary_Disease + HTN + Heart_Disease + Diabetes + Obesity + Depression +
                   HIV + Other_Mental_Disorders + RiskLvl + OtherProgramFlag + Total_Bene +
                   Total_TEPRV_ID + Total_Hospital_Beds + Total_Hospitals + BranchDesc +
                   SponsorRankDesc + AgeCat, data = df_chrt))
# b_trt = 1.03 more ER visits in the treatment group compared to .64 fewer in the matched analysis

# use emmeans package to calculate 'marginal means' or adjusted means
library(emmeans)
emmeans (m1, ~ Control_Study_Flag) # 4.81-3.77 = 1.04 which is ~ what we get directly from the regression above

### example negative binomial model
# (glm.nb comes from the MASS package — see the detach_package notes at the top of this file)
summary(m1 <- glm.nb(Post_ER_Admit ~ Control_Study_Flag + CaseOpenReasonDesc + Diag_Grouping +
                       SourceFlag + Gender_Code + LowIncome + RuralZips + Pre_ER_Admit + AcutePre +
                       Length_Stay_Pre + PrimaryCareVisitCount + Cancer_abnormal_cervical + Asthma +
                       Pulmonary_Disease + HTN + Heart_Disease + Diabetes + Obesity + Depression +
                       HIV + Other_Mental_Disorders + RiskLvl + OtherProgramFlag + Total_Bene +
                       Total_TEPRV_ID + Total_Hospital_Beds + Total_Hospitals + BranchDesc +
                       SponsorRankDesc + AgeCat,
                     control = glm.control(maxit = 500),data = df_chrt))
exp(0.184) # IRR = 1.202 treatment group has ~ 20% more ER visits than controls
m1.mmeans <- emmeans(m1, ~ Control_Study_Flag) # estimate marginal means
summary(m1.mmeans, infer = TRUE, type = 'response') # summary with backtransformation to reflect original scale (.0634/.0527)
# this gives a ratio of marginal means = 1.203 which is consistent with the exponentiated result directly from the model

# *-----------------------------------------------------------------*
# |
# |                    labels and formats
# |
# *------------------------------------------------------------------*

#-----------------------------------------------------------------#
# value labels or formats
#-----------------------------------------------------------------#

##
## example 1: recoding
##
# change LOCATION from 'numeric' into a 'factor' with character values
# this is not actually creating labels, but actually changing the numeric values
# of LOCATION to more meaningful character values such as "N" for north and "S" for south
plots2$LOCATION <- factor(plots2$LOCATION,levels=c(1,2,3,4),labels=c("N", "S", "E", "W"))
plots2

##
## example 2: creating new variables to act as labels for values
##
# lets create new variables to act as labels for insect pressure levels 1-5
# instead of changing the numeric values for P1-P4, we are creating a new
# set of character variables to describe P1-P4
myPlevels<-c(1,2,3,4,5) # set the # of levels
myPlabels <-c("Very Heavy","Heavy","Moderate","Light","Very Light") # labels

# now create a new set of variables that describe P1-P4
plots2$P1f <- factor(plots2$P1, myPlevels, myPlabels)
plots2$P2f <- factor(plots2$P2, myPlevels, myPlabels)
plots2$P3f <- factor(plots2$P3, myPlevels, myPlabels)
plots2$P4f <- factor(plots2$P4, myPlevels, myPlabels)
plots2

# Get summary and see that the P1f-P4f pressure categories are now counted.
summary( plots2[ c("P1f","P2f","P3f","P4f") ] )

##
## note we could have just renamed these values just as we did for
## location in the first example
##
plots2b <- read.table("plots2.txt", header =TRUE) # copy of plots2 for this example
plots2b

# note we are re-using the pre-defined myPlevels/myPlabels from above so those
# statements are not repeated here
plots2b$P1 <- factor(plots2b$P1, myPlevels, myPlabels)
plots2b$P2 <- factor(plots2b$P2, myPlevels, myPlabels)
plots2b$P3 <- factor(plots2b$P3, myPlevels, myPlabels)
plots2b$P4 <- factor(plots2b$P4, myPlevels, myPlabels)
plots2b

#-----------------------------------------------------------#
# variable labels
#-----------------------------------------------------------#
# note in the examples above we were concerned with formatting
# the values a particular variable could take, such as values N, S
# for the variable LOCATION or the value "Very Heavy" for the variables P1-P4
# here we are looking at changing the name or appearance of the variables themselves
# vs the values they may take
library(Hmisc)

# start fresh with the original unformatted data from plots2
plots2<- read.table("plots2.txt", header =TRUE)
plots2

# assign variable names to act as labels - lets assume we
# want variables P1-P4 to appear as PLOT 1 - PLOT 4
# this example 'pastes' new names over the old ones, i.e. it
# simply changes the variable names
# note the reference [3:6] simply identifies the 3rd-6th variable names
# in the data frame for processing.
names(plots2)[3:6] <- c( "PLOT 1", "PLOT 2", "PLOT 3", "PLOT 4")
names(plots2) # verify variable names have been changed

#-------------------------------------------------------#
# complete example for variable name and value labels
#-------------------------------------------------------#
# utilize all of the methods above to re-format a data set
# suppose again we start off with a data set such as plots2
# LOCATION TRAITS P1 P2 P3 P4
# 1 RR 1 1 5 1
# 2 RR 2 1 4 1
# 1 RR 2 2 4 3
# 2 RR 3 1 NA 3
# 1 BT 4 5 2 4
# 2 BT 5 4 5 5
# 1 BT 5 3 4 4
# 2 BT 4 5 5 5

# start fresh with the original unformatted data from plots2
plots2<- read.table("plots2.txt", header =TRUE)
plots2

# suppose we want the values for the variable LOCATION to be N for LOCATION = 1 and
# S for LOCATION = 2. We also want the values for the variables P1-P4 to be
# 1 = "Very Heavy" 2 = "Heavy" 3 = "Moderate" 4 = "Light" 5 = "Very Light"
# and we want the variable names P1-P4 to be "PLOT 1" - "PLOT 4"

# STEP 1: change values for variable LOCATION
plots2$LOCATION <- factor(plots2$LOCATION,levels=c(1,2,3,4),labels=c("N", "S", "E", "W"))

# STEP 2: change values for the variables P1-P4
myPlevels<-c(1,2,3,4,5) # set the # of levels
myPlabels <-c("Very Heavy","Heavy","Moderate","Light","Very Light") # labels
plots2$P1 <- factor(plots2$P1, myPlevels, myPlabels) # note we are actually changing the values for P1-P4
plots2$P2 <- factor(plots2$P2, myPlevels, myPlabels) # vs creating variables to describe the values
plots2$P3 <- factor(plots2$P3, myPlevels, myPlabels)
plots2$P4 <- factor(plots2$P4, myPlevels, myPlabels)

# STEP 3: change the VARIABLE labels for P1-P4
names(plots2)[3:6] <- c( "PLOT 1", "PLOT 2", "PLOT 3", "PLOT 4")

# print re-formatted data set
plots2

# functions

# create function myvar.port that iterates through the weights and calculates portfolio variance
# NOTE(review): the loop body never uses the index i and overwrites sigma.z
# (initialized as an nWeights x 1 matrix) with a single recomputed value on every
# pass, so the function returns one number rather than nWeights of them. It also
# reads Wa, Wb, std.a, std.b and p.ab from the global environment. Presumably the
# intent was per-iteration indexing, e.g. sigma.z[i] <- sqrt((Wa[i])^2*... —
# TODO confirm the intended weight vectors before using.
myvar.port <- function(nWeights) {
  sigma.z <- matrix(c(0), nrow = nWeights) # initialize - we will compute 11 standard deviations
  for (i in 1:nWeights){
    sigma.z <- sqrt((Wa)^2*(std.a)^2 + (Wb)^2*std.b^2 + 2*p.ab*Wa*Wb*std.a*std.b)
  }
  return (sigma.z)
}

# example: produce summary statistics for each category in a field
# NOTE: assumes tmp$Var1 holds the measure ids and the data frame
# 'complications' is loaded; the hard-coded 1:19 assumes exactly 19
# categories — TODO confirm against length(cats)
cats <- as.vector(tmp$Var1) # extract measure ids as a list
for(i in 1:19) {
  print(cats[i])
  print(summary(complications$Score[complications$Measure_ID ==cats[i]]))
}

#--------------------------------------------------
# dealing with scientific notation
#--------------------------------------------------
x <- 1.82*(10^-7)
y <- format(x, scientific = FALSE) # format a single value as a plain decimal string
# or even better: suppress scientific notation globally
options(scipen = 999)

#----------------------------------------------------
# PROPENSITY SCORE MATCHING
#---------------------------------------------------

######################################################
# example from MatchIt documentation
#####################################################
# example data set is a subset of the job training program analyzed in Lalonde (1986) and Dehejia and Wahba (1999).
# MatchIt includes a subsample of the original data consisting of the National Supported Work Demonstration (NSW)
# treated group and the comparison sample from the Population Survey of Income Dynamics (PSID). The variables in this data
# set include participation in the job training program (treat, which is equal to 1 if participated in the program, and 0 otherwise),
# age (age), years of education (educ), race (black which is equal to 1 if black, and 0 otherwise;
# hispan which is equal to 1 if hispanic, and 0 otherwise), marital status (married, which is equal to 1 if married, 0 otherwise),
# high school degree (nodegree, which is equal to 1 if no degree, 0 otherwise), 1974 real earnings (re74), 1975 real earnings
# (re75), and the main outcome variable, 1978 real earnings (re78)
head(lalonde)
dim(lalonde)
names(lalonde) # "treat" "age" "educ" "black" "hispan" "married" "nodegree" "re74" "re75" "re78"

####################################################
# nearest neighbor matching
####################################################
# Matching is done using a distance measure specified by the distance option (default=logit).
# Matches are chosen for each treated unit one at a time, with the order specified by the m.order command (default=largest to smallest).
# At each matching step we choose the control unit that is not yet matched but is closest to the treated unit on the distance measure.
m.out1 <- matchit(treat ~ re74 + re75 + age + educ, data = lalonde, method = "nearest", distance = "logit")
summary(m.out1) # check balance
m.data1 <- match.data(m.out1,distance ="pscore") # create ps matched data set
head(m.data1) # view

# perform paired t-tests (not in documentation)
t.test(m.data1$re78[m.data1$treat==1],m.data1$re78[m.data1$treat==0],paired=TRUE)
t.test(lalonde$re78[lalonde$treat==1],lalonde$re78[lalonde$treat==0],paired = FALSE)

# export data to compare in SAS or perform additional analysis
write.csv(lalonde, file = "//Documents//Briefcase///Data//lalonde.csv")
# ex: write.csv(w1, file = "r goals.csv") # assumes assigned directory or default directory
write.csv(m.data1, file = "//Documents//Briefcase///Data//lalonde_nearest.csv")
# ex without rownames
write.csv(opioid_dat, file = "//Documents//Briefcase///Data//opioid_dat.csv", row.names = FALSE)

#---------------------------------
# working with dates
#--------------------------------

# calculate difference in dates
# ex 1: difference of two Date conversions gives a 'difftime'
survey$date_diff <- as.Date(as.character(survey$date), format="%Y/%m/%d")-as.Date(as.character(survey$tx_start), format="%Y/%m/%d")
survey
# ex 2: difftime with explicit units
df$diff_in_days<- difftime(df$datevar1 ,df$datevar2 , units = c("days"))

# application: absolute day gap between measure start and end dates
temp3_dates$diff_days <- abs((as.Date(as.character(temp3_dates$Measure_Start_Date), format="%m/%d/%Y")-as.Date(as.character(temp3_dates$Measure_End_Date), format="%m/%d/%Y")))
# or the equivalent via difftime
temp3_dates$diff_days <- abs((difftime(as.Date(as.character(temp3_dates$Measure_Start_Date), format="%m/%d/%Y"),as.Date(as.character(temp3_dates$Measure_End_Date), format="%m/%d/%Y") , units = c("days"))))

# create date format
df1$CovEffDate <- as.Date(df1$Coverage_Effective_Date, "%Y-%m-%d")

# sometimes you need to supply the origin if your data is numeric
as_datetime(y, origin = lubridate::origin)
# I have also seen: as.Date(17536,origin="1970-01-01")

# get min and max dates
tmp <- df1%>% summarize(mindate = min(CovEffDate, na.rm=TRUE), maxdate = max(CovEffDate, na.rm=TRUE))

## first days of years
seq(as.Date("1910/1/1"), as.Date("1999/1/1"), "years")
## by month
seq(as.Date("2000/1/1"), by = "month", length.out = 12)
## quarters
seq(as.Date("2000/1/1"), as.Date("2003/1/1"), by = "quarter")

## find all 7th of the month between two dates, the last being a 7th.
st <- as.Date("1998-12-17")
en <- as.Date("2000-1-7")
ll <- seq(en, st, by = "-1 month") # step backwards a month at a time from the end date
rev(ll[ll > st & ll < en])

# application: calculate age at index date for an observational study
df1$BirthDate <- as.character(df1$DOB) # format as character
df1$BirthDate <- as.Date(df1$BirthDate, "%Y-%m-%d") # format as date
df1$age <- round(difftime(df1$index_date,df1$BirthDate, units = c("days"))/(365.25)) # calculate age
df1$age <- as.numeric(df1$age) # make sure this is numeric

# calculate age using lubridate function
df1$BirthDate <- as.character(df1$DOB) # format as character
df1$BirthDate <- mdy(df1$BirthDate) # apply date format (lubridate function)
df1$age <- round(difftime(df1$index_date,df1$BirthDate, units = c("days"))/(365.25)) # calculate age
df1$age <- as.numeric(df1$age) # make numeric

#----------------------------
# sample training and validation
#---------------------------

# store total number of observations in your data
N <- 400
print(N)
# Number of training observations
Ntrain <- N * 0.5
print(Ntrain)
# add an explicit row number variable for tracking
id <- seq(1,400)
apps2 <- cbind(apps,id)
# Randomly arrange the data and divide it into a training
# and test set.
# shuffle the rows, then split into the first Ntrain (train) and the rest (validate)
dat <- apps2[sample(1:N),]
train <- dat[1:Ntrain,]
validate <- dat[(Ntrain+1):N,]
dim(dat)
dim(train)
dim(validate)

# sort and look at data sets to see that they are different
sort_train <- train[order(train$id),]
print(sort_train)
sort_val <- validate[order(validate$id),]
print(sort_val)

# random sample: return n randomly chosen rows of df
randomSample = function(df,n) {
  return (df[sample(nrow(df), n),])
}
rs <- randomSample(tmp,5)

# random sample (with set.seed for repeatability - both calls below return the same rows)
randomSample = function(df,n) {
  set.seed(123)
  return (df[sample(nrow(df), n),])
}
randomSample(tmp,5) # first call
randomSample(tmp,5) # second call

#-----------------------------------
# bootstrap
#-----------------------------------

# regression
summary(fit <- lm(GARST ~ PIO, data = yield_data))

# bootstrap for regression (this may not be accurate with such small sample)
# resample 5 rows with replacement, refit, and keep the slope each time
bstrap <- c()
for (i in 1:10000) {
  newsample <- yield_data[sample(nrow(yield_data), 5, replace = T), ]
  bstrap <- c(bstrap, as.vector(coef(lm(GARST ~ PIO,newsample))[2]))
}
hist(bstrap)
summary(bstrap)
sd(bstrap, na.rm = TRUE) # SE
quantile(bstrap, c(.025,.975),na.rm = TRUE) # 95% bootstrap interval

#------------------------------------
# transpose or reshape data
#------------------------------------
# reference: https://stats.idre.ucla.edu/r/faq/how-can-i-reshape-my-data-in-r/

# create example data
id <- c(1,1,1,2,2,2,3,3,3)
measure <- c("depth","temp","width","depth","temp","width","depth","temp","width")
values <- c(2,50,18,1.5,53,18,2.5,60,18)
dat <- data.frame(id,measure,values)

# transpose panel data to wide format
datwide <- reshape(dat,
                   timevar = "measure", # the panel variable(s) we want to be new columns
                   idvar = c("id"), # vars we want to keep constant
                   direction = "wide")

# collapse wide data into panel
# NOTE(review): reshape(direction = "wide") produces columns named
# "values.depth" etc., so varying = c("depth","temp","width") only matches
# after the gsub name-cleanup below — the cleanup likely needs to run BEFORE
# this call. TODO confirm the intended order.
datnarrow <- reshape(datwide,
                     varying = c("depth","temp","width"), # things we want to collapse into a single column
                     v.names = "value", # name for new column that will hold panel of values
                     timevar = "measure", # name for new column that collapses old variable names into values
                     times = c("depth","temp","width"), # old variable names that become new values
                     new.row.names = 1:1000,
                     direction = "long")
names(datwide) <- gsub("values.", "", names(datwide )) # cleanup names

#----------------------------
# check duplicates
#---------------------------

# check duplication: frequency table of the key — Freq > 1 means duplicates
tmp <- data.frame(table(temp1_stars$Provider_ID))
summary(tmp)

# remove duplicates or get distinct
DEK <- c(150,150,145,140,144,148,152,156,160,164)
PLOT <- c(1,1,1,1,5,6,7,8,9,10)
BT <- c('Y','Y','Y','N','N','N','Y','N','Y','Y')
dat <- data.frame(DEK,PLOT,BT)

# how many unique values of BT?
# NOTE(review): relies on BT being a factor — with stringsAsFactors = FALSE
# (the default since R 4.0) levels() returns NULL; length(unique(dat$BT)) is safer
length(levels(dat$BT))
# how many unique plot values
length(unique(dat$PLOT))

# this data is duplicated in many ways
deduped.dat <- unique(dat[,1:3 ] ) # gets unique based on all 3 fields

# dplyr offers several options
deduped.dat <- dat%>%distinct(PLOT, .keep_all = TRUE) # based on PLOT
deduped.dat <- dat%>%distinct(PLOT,BT, .keep_all = TRUE) # based on PLOT and BT
deduped.dat <- dat%>%distinct(PLOT,BT,DEK, .keep_all = TRUE) # truly unique based on all listed fields
deduped.dat <- dat%>% distinct # same as above without listing fields related to duplication

# get distinct instance based on ID and pre term flag = 0
tmp1 <- tmp1_births%>%arrange(MBR_ID,pre_term)%>%distinct(MBR_ID, .keep_all = TRUE)
tmp2 <- data.frame(table(tmp1$MBR_ID))
tmp4 <- tmp1[tmp1$MBR_ID =='10060',]

# get distinct last instance based on ID and pre term flag = 1
tmp1 <- tmp1_births%>%arrange(MBR_ID,desc(pre_term))%>%distinct(MBR_ID, .keep_all = TRUE)
tmp2 <- data.frame(table(tmp1$MBR_ID))
tmp4 <- tmp1[tmp1$MBR_ID =='10060',]

# correcting plot boundaries
par(mar=c(1,1,1,1))

# example routine checking and correcting duplicates based on key and 1 additional field
# check duplicates
tmp1 <- data.frame(table(df2$MBR_ID)) # find duplicates
tmp <- df2[df2$MBR_ID =='10060',] # appears duplicated on diagnosis &/or claim #
# get distinct sorting to keep most recent diagnosis (judgement call)
tmp2 <- df2%>%arrange(MBR_ID,desc(SVC_FROM_DT))%>%distinct(MBR_ID, .keep_all = TRUE)
tmp3 <- tmp2[tmp2$MBR_ID =='10060',] # check
# update data frame
df2 <- tmp2
rm(tmp,tmp1,tmp2,tmp3) # cleanup

# example referencing against a list that is a random sample of numeric IDs
# note: change formatting in 'vals' if character IDs are required
# check duplicates
tmp1 <- data.frame(table(df5$ID)) # find duplicates
tmp <- df5[df5$ID =='80050',] # appears duplicated on flag #
# check against a random sample of duplicates (randomSample defined above)
rs <- randomSample(tmp1[tmp1$Freq > 1,],5) # look at random sample of duplicates
vals <- as.list((as.numeric(as.character(rs$Var1)))) # get ID as list from rs
tmp <- df5[df5$ID %in% vals,] # subset data with duplicated rows - check
# get distinct sorting to give priority to pre-term flag = 1 (judgement call)
tmp2 <- df5%>%arrange(ID,desc(pre_term))%>%distinct(ID, .keep_all = TRUE)
tmp3 <- tmp2[tmp2$ID %in% vals,] # check again - all pre_term = 1 obs should be kept
# update data frame
df5 <- tmp2
rm(tmp,tmp1,tmp2,tmp3,rs,vals) # cleanup

#-----------------------------
# missing values
#-----------------------------

# count total missing values per column in a data frame
colSums(is.na(df1_chrt))

### example recode missing category
df1$Lvl <- ifelse(is.na(df1$Level) == TRUE,'1-Low-Risk',df1$Lvl)

### example - categorizing with a missing value category
df1$riskcat <- ifelse(df1$Risk_Score <= 3,'1-Low',
                 ifelse(df1$Risk_Score > 3 & df1$Risk_Score <= 5, '2-Moderate',
                 '3-High'))
# account for NA group
df1$riskcat <- ifelse(is.na(df1$Risk_Score) == TRUE,'0-NA',df1$riskcat)

# remove missing based on specified field
# NOTE(review): comparing a logical to the string 'FALSE' works only via
# implicit coercion — !is.na(tmp3$RUCC_2013) is the idiomatic form
tmp3 <- tmp3[is.na(tmp3$RUCC_2013) == 'FALSE',]
tmp2 <- na.omit(tmp1) # listwise deletion of all missing

# basic imputation: replace NA with the column mean
impute.mean <- function(x) {
  z <- mean(x, na.rm = TRUE)
  x[is.na(x)] <- z
  return(x)
}
# NOTE(review): apply() with MARGIN = 2 expects a matrix/data frame;
# df$xvar is a single column (a vector) so this call errors — presumably
# impute.mean(df$xvar) or apply over df was intended. TODO confirm.
apply(df$xvar,2,impute.mean)

# replace NA with zero
impute.zero <- function(x) {
  x[is.na(x)] <- 0
  return(x)
}
df1 <- data.frame(apply(df,2,impute.zero)) # this imputes all missing column values to
# zero  (end of the comment on the imputation example above)

#--------------------------
# loop processing
#-------------------------

# loop across fields and produce frequencies
# preallocate the list and use [[ ]] so each element holds the scalar draw
# directly (growing a list with [i] = inside a loop is slower and less idiomatic)
results <- vector("list", 5) # create list for storing results
# run loop and simulate data
for (i in seq_len(5)) {
  results[[i]] <- sample(1:15, 1)
}
x <- as.numeric(as.character(unlist(results))) # convert results to a numeric vector
summary(x) # analysis

#--------------------------------------
# working with functions
#--------------------------------------

# calculate standard deviation for every column in a data set
# NOTE(review): apply() coerces a data frame to a matrix first, so this is only
# sensible when every column is numeric — sapply(df, sd) avoids the coercion
apply(df,2,sd)
df2 <- sapply(df1,as.numeric) # convert all columns to numeric

#-----------------------------------
# substring text and character operations
#-----------------------------------

# if ENDT is still the string "NULL" and there IS a coverage effective date then
# replace the null end date with a fixed date
# (detects "NULL" by its leading "N" — assumes no real dates start with "N"; TODO confirm)
df1$ENDT <- ifelse((substr(df1$ENDT,1,1) =="N" & (substr(df1$DateVar,1,1) != "N")),as.character.Date("2018-10-14"),df1$ENDT)

# remove first character
# (fixed: original read substring(df$ID, 2), referencing the unrelated data
# frame 'df' — every other line in this section operates on df1)
df1$ID <- substring(df1$ID, 2)
# remove last character
df1$ID <- substring(df1$ID,1,nchar(df1$ID)-1)

# last quote