# *-----------------------------------------------------------------
# | PROGRAM NAME: R Code Palette
# | DATE: 3/11/20
# | CREATED BY: MATT BOGARD
# | PROJECT FILE:
# *----------------------------------------------------------------
# | PURPOSE: grab bag of commonly used R snippets
# *----------------------------------------------------------------

#----------------------------------
# documentation templates
#----------------------------------

# *-----------------------------------------------------------------
# | PROGRAM NAME:
# | DATE:
# | CREATED BY:
# | PROJECT FILE:
# *----------------------------------------------------------------
# | PURPOSE:
# *----------------------------------------------------------------

###############################################################################
###############################################################################
#################          construction zone              #####################
###############################################################################
###############################################################################

##
## new section
##

# *-----------------------------------------------------------------*
# |
# | reading data
# |
# *------------------------------------------------------------------*

# identify location of data used for project by specifying a working directory
# alternatively, if data is stored in numerous places, you can reference
# the file location directly

rm(list = ls())  # get rid of any existing data (interactive scratch use only)

# override R's tendency to use scientific notation
options("scipen" = 100, "digits" = 4)

cat("\f")  # clear console

# helper to 'clear' the console by printing 50 blank lines
clc <- function() cat(rep("\n", 50))

rm("temp")  # remove specific data frames

ls()  # view open data sets (should be empty if you just ran the code above)

# unload packages that we don't need or that mask other functions
detach("package:vegan", unload = TRUE)

# sometimes we may want to unload different versions of packages - the function
# below handles this + provides a specific example of how MatchIt loads MASS,
# which masks dplyr.
#
# MASS masks the 'select' operation within dplyr used for merging data.
# We use dplyr extensively to process data, but later in the analysis the MASS
# package (used to estimate negative binomial models for utilization counts
# like ER visits) 'masks' or prevents dplyr's 'select' from working.
# This is a problem if we were previously using MASS (or MatchIt, which
# leverages MASS) in an analysis and then wanted to run this program.
# So we want to 'unload' or 'detach' any versions of MASS which may be loaded
# before trying to build the data for the analysis below leveraging dplyr
# functions like 'select'. This is unnecessary if MASS has not been loaded.
# see also: https://stackoverflow.com/questions/6979917/how-to-unload-a-package-without-restarting-r

# Detach a package (and any duplicate entries on the search path).
#   pkg            - package name, unquoted by default
#   character.only - set TRUE when passing the package name as a string
detach_package <- function(pkg, character.only = FALSE) {
  if (!character.only) {
    pkg <- deparse(substitute(pkg))
  }
  search_item <- paste("package", pkg, sep = ":")
  # loop in case the same package appears on the search path more than once
  while (search_item %in% search()) {
    detach(search_item, unload = TRUE, character.only = TRUE)
  }
}

detach_package("MatchIt", TRUE)  # first unload 'MatchIt' because it imports 'MASS'
detach_package("MASS", TRUE)     # unload/detach MASS

# view specific columns
View(df1[, 100:103])

str(df1)                        # variable names and formats
str(df1, list.len = ncol(df1))  # in case R truncates the output above

# set working directory
setwd('/Users/wkuuser/Desktop/R Data Sets')  # mac
setwd("P:\\R Code References\\R Training")   # windows

# *------------------------------------------------------------------*
# | make R talk on a macbook
# *------------------------------------------------------------------*

b <- sprintf("say Pumpkin pie is the best")
system(b, intern = FALSE,
       ignore.stdout = FALSE, ignore.stderr = FALSE,
       wait = TRUE, input = NULL)
# *------------------------------------------------------------------------*
# | direct reading of a single variable vector from the command line -
# | similar to the cards statement in SAS
# *------------------------------------------------------------------------*

GARST <- c(150, 140, 145, 137, 141, 145, 149, 153, 157, 161)  # read in specified values for the variable GARST
print(GARST)

GARST <- c(150, 140, 145, 137, 141, 145, 149, 153, 157, 161)
PIO   <- c(160, 150, 146, 138, 142, 146, 150, 154, 158, 162)
MYC   <- c(137, 148, 151, 139, 143, 120, 115, 136, 130, 129)
DEK   <- c(150, 149, 145, 140, 144, 148, 152, 156, 160, 164)
PLOT  <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
BT    <- c('Y', 'Y', 'N', 'N', 'N', 'N', 'Y', 'N', 'Y', 'Y')
RR    <- c('Y', 'N', 'Y', 'N', 'N', 'N', 'N', 'Y', 'Y', 'N')

yield_data <- data.frame(GARST, PIO, MYC, DEK, PLOT, BT, RR)
rm(GARST, PIO, MYC, DEK, PLOT, BT, RR)  # cleanup

# scatter plot
plot(yield_data$GARST, yield_data$PIO)

# this is even more like the cards statement in SAS:
# embed the raw rows in a string and read via a text connection
# (the embedded newlines are required for read.table to see one row per line)
toread <- "id sex age inc r1 r2 r3
1 F 35 17 7 2 2
17 M 50 14 5 5 3
33 F 45 6 7 2 7
49 M 24 14 7 5 7
65 F 52 9 4 7 7
81 M 44 11 7 7 7
2 F 34 17 6 5 3
18 M 40 14 7 5 2
34 F 47 6 6 5 6
50 M 35 17 5 7 5"
survey <- read.table(textConnection(toread), header = TRUE)
closeAllConnections()

#--------------------------------------------------
# reading a file from the web
#--------------------------------------------------

library(data.table)
mydat <- fread('https://raw.githubusercontent.com/dincerti/dincerti.github.io/master/data/mepsdiab.csv')
head(mydat)

# *------------------------------------------------------------------*
# | reading data from a txt file
# *------------------------------------------------------------------*

# data looks like this:
# GARST PIO MYC DEK PLOT BT RR
# 150 160 137 150 1 Y Y
# 140 150 148 149 2 Y N
# 145 146 151 145 3 N Y
# 137 138 139 140 4 N N
# 141 142 143 144 5 N N
# 145 146 120 148 6 N N
# 149 150 115 152 7 Y N
# 153 154 136 156 8 N ?
# 157 158 130 160 9 Y Y
# 161 162 129 164 10 Y N

yield_data <- read.table("Yield_plots.txt", header = TRUE)  # read text file
names(yield_data)  # var list
print(yield_data)  # print data set
yield_data         # note: print() is not necessary to print

# get all factor variables
is.fact <- sapply(yield_data, is.factor)
tmp <- yield_data[, is.fact]

# *------------------------------------------------------------------*
# | reading data from a csv file
# *------------------------------------------------------------------*

# data looks like this
# HYBRID TRAIT P1 P2 P3
# GARST RR 150 140 145
# MYC BT 160 150 146
# PIO RR 137 148 151
# DEK BT 150 149 145

# treat '.', 'NA', empty strings and '?' as missing on the way in
plots <- read.csv("plots.csv", na.strings = c(".", "NA", "", "?"), encoding = "UTF-8")
plots

# create function to reformat column names from csv files (from:
# https://stackoverflow.com/questions/17152483/how-to-replace-the-in-column-names-generated-by-read-csv-with-a-single-spa )
makeColNamesUserFriendly <- function(ds) {
  # FIXME: Repetitive.
  # Convert any number of consecutive dots to a single underscore.
  names(ds) <- gsub(x = names(ds), pattern = "(\\.)+", replacement = "_")
  # Drop the trailing spaces.
  names(ds) <- gsub(x = names(ds), pattern = "( )+$", replacement = "")
  ds
}

# create unique ID for each line in data frame
tmp_claims$RowID <- 1:nrow(tmp_claims)

# convert index (row names) to a column
names <- rownames(yield_data)
yield_data <- cbind(names, yield_data)

# repair read error for first ID (sometimes a csv file read causes the first
# element in the file to be misread)
df$ID <- as.character(df$ID)
df[1, 1] <- '123456'

# *------------------------------------------------------------------*
# | reading data from an excel xlsx
# *------------------------------------------------------------------*

xlsxFile <- "C:\\Users\\mtb2901\\Documents\\Data Science Projects\\Projects 2018\\201801 Hospital Risk Profile\\Raw Data\\TX-Hospital-Tricare-Network.xlsx"

# reference: https://www.rdocumentation.org/packages/openxlsx/versions/4.0.17/topics/read.xlsx
# read.xlsx(xlsxFile, sheet = 1, startRow = 1, colNames = TRUE,
#           rowNames = FALSE, detectDates = FALSE, skipEmptyRows = TRUE,
#           skipEmptyCols = TRUE, rows = NULL, cols = NULL, check.names = FALSE,
#           namedRegion = NULL, na.strings = "NA", fillMergedCells = FALSE)

# read header file with variable names
# fixed: openxlsx::read.xlsx takes 'sheet', not 'sheetIndex'
# ('sheetIndex' is an argument of the older 'xlsx' package)
vars <- read.xlsx("FraudT3CB14jun18HEADER.xlsx", sheet = 1)

#-------------------------------------
# writing files
#-------------------------------------

# ex without rownames
write.csv(opioid_dat, file = "//Documents//Briefcase///Data//opioid_dat.csv",
          row.names = FALSE)

# export as R data file
save(MBR_FACT_ANLY, file = "//projects//Data//MBR_FACT_ANLY.RData")

# export as text with tab delimiter
# fixed: the opening quote on the file path was missing
write.table(df1, "//Projects//Data//df1.txt", row.names = FALSE, sep = "\t")

# *-----------------------------------------------------------------*
# |
# | printing and subsetting data
# |
# *------------------------------------------------------------------*

# print selected variables
yield_data[c("GARST")]

# subset data via variable selection
my_hybrids <- yield_data[c("GARST", "PIO")]
print(my_hybrids)
# subset based on a single variable
GARST <- yield_data[c("GARST")]
print(GARST)

df <- df[, !names(df) %in% c("xvar")]  # drop field

# print a subset of data based on observed variable values
print(yield_data[yield_data$GARST == 150 & yield_data$PIO == 160, ])

# subset data based on observed values
high_yields <- yield_data[yield_data$GARST == 150 & yield_data$PIO == 160, ]
print(high_yields)

stacked_traits <- yield_data[yield_data$BT == "Y" & yield_data$RR == "Y", ]
stacked_traits

# subset based on just one variable value
# NOTE(review): these rely on the free-standing GARST object created above;
# yield_data$GARST would be the safer, self-contained form
LOW_GARST <- yield_data[GARST == 140, ]
print(LOW_GARST)

LOW_GARST <- yield_data[GARST < 150, ]
print(LOW_GARST)

RR <- yield_data[yield_data$RR == "Y", ]
RR

Bt <- yield_data[yield_data$BT == "Y", ]
Bt

# subset based on %in%
vc <- c(150, 145, 195)
tmp <- yield_data[yield_data$GARST %in% vc, ]

# or
tmp <- yield_data[yield_data$GARST %in% c(150, 145, 195, 161), ]

# this also works using dplyr (which allows for additional manipulation if you
# want to 'pipe' in additional aggregations or calculations)
library(dplyr)
tmp <- yield_data %>% filter(GARST %in% c(150, 145, 195))

# subset based on 'not' %in% using dplyr
tmp <- filter(yield_data, !BT %in% c('Y'))

library(sqldf)
tmp <- sqldf('select * from yield_data where GARST in(150,145,195)')

#------------------------------------------------
# connect to SQL server
#------------------------------------------------

# read initial analysis file
library(RODBC)

# connect to the SQL Server LOUSQLWTS853
DB_Connection <- odbcDriverConnect('driver={SQL Server};server=arlsql123;database=ANLY_SBOX;trusted_connection=true')

# connect to specific table (see documentation to limit rows etc.)
MBR_FACT_ANLY <- sqlFetch(DB_Connection, "mbr_fact_anly")

# export as R data file for future use (so we don't have to hit SQL every time)
save(MBR_FACT_ANLY, file = "//projects//mtt94674//Data//MBR_FACT_ANLY.RData")

# make sure to close your database connection
close(DB_Connection)

# *-----------------------------------------------------------------*
# |
# | summary statistics
# |
# *------------------------------------------------------------------*

summary(yield_data)                     # whole data set
summary(yield_data[c("GARST")])         # single variable
summary(yield_data$GARST)               # single variable (more efficient)
summary(yield_data[c("GARST", "PIO")])  # selected variables
summary(sqrt(yield_data$GARST))         # transformation

# by processing
by(yield_data, yield_data$BT, summary)

# using dplyr
yield_data %>%
  group_by(BT) %>%                 # group by
  summarize(mean = mean(GARST))    # mean

### more examples with dplyr

# how many members per risk group (members are duplicated by transactions)
# as such the same member can have more than one transaction scored by the model
# that are in the same risk group, and a member can belong to multiple
# risk groups if they have multiple transactions scored accordingly

### brute force example
tmp <- df4 %>% filter(riskcat == '1:LOW') %>% distinct(mbrid)
tmp <- df4 %>% filter(riskcat == '2:MOD') %>% distinct(mbrid)
tmp <- df4 %>% filter(riskcat == '3:HI') %>% distinct(mbrid)
tmp <- df4 %>% filter(riskcat == '4:VHI') %>% distinct(mbrid)

### a little less brute force
tmp1 <- df4 %>% select(riskcat, mbrid) %>% arrange(riskcat, mbrid)  # sort by risk and ID
tmp2 <- tmp1 %>% group_by(riskcat) %>% distinct(mbrid)              # unique IDs per risk strata

# how many members per strata
tmp2 %>%
  group_by(riskcat) %>%
  summarize(total = n())

### can we do the things above in one step
df4 %>%
  select(riskcat, mbrid) %>%
  arrange(riskcat, mbrid) %>%
  group_by(riskcat) %>%
  distinct(mbrid) %>%
  summarize(total = n())

# frequencies
library(Hmisc)
describe(yield_data)  # contents of data set - (Hmisc)
contents(yield_data)

# aggregations and crosstabulations
summary(ACTIV_PARTICIP ~ AGECAT, data = hc_mbr_anly, fun = mean)

# tables
tmp <- data.frame(table(complications$Measure_ID))  # saved as an easily readable data frame

aggregate(ACTIV_PARTICIP ~ RUCC_2013_DESC, hc_mbr_anly, mean)

CROP <- c('Cotton', 'Cotton', 'Corn', 'Corn', 'Corn', 'SB', 'SB', 'SB', 'SB', 'SB')
RR   <- c('Y', 'N', 'Y', 'N', 'N', 'N', 'N', 'Y', 'Y', 'N')
yield_data <- data.frame(CROP, RR)
rm(CROP, RR)  # cleanup

mytable <- table(yield_data$RR, yield_data$CROP)

tmp <- data.frame(mytable)
names(tmp) <- c("RR", "Crop", "Total")
print(tmp)

tmp <- data.frame(prop.table(mytable))     # cell percentages
names(tmp) <- c("RR", "Crop", "Pct")
print(tmp)

tmp <- data.frame(prop.table(mytable, 1))  # row percentages (% Crop within RR status)
names(tmp) <- c("RR", "Crop", "RowPct")
print(tmp)

tmp <- data.frame(prop.table(mytable, 2))  # column percentages (% RR within Crop)
names(tmp) <- c("RR", "Crop", "ColPct")
print(tmp)

# cross tabulations across a range of variables
# average GARST across BT and RR
# NOTE(review): since R 4.0, data.frame() no longer converts strings to factors
# by default, so is.fact may be all FALSE unless the columns were factors already
is.fact <- sapply(yield_data, is.factor)
tmp <- yield_data[, is.fact]
vars <- names(tmp)

for (i in 1:2) {
  print("+-+-+-+-+-+-+-+-+-+-+-+-")
  print(vars[i])
  print("+-+-+-+-+-+-+-+-+-+-+-+-")
  print(by(yield_data$GARST, yield_data[c(vars[i])], mean))
}

# cross tabulations across a range of columns in a data frame
names(sb)                   # list columns for reference
xvar <- as.list(names(sb))  # save names as a list to reference in loop

# loop across fields and produce frequencies
for (i in 1:36) {
  print(xvar[i])
  print(data.frame(table(sb[c(paste(xvar[i]))])))
}

aggregate(totmthcov ~ pre_term, df1, mean)

# using dplyr
df1 %>%
  summarize(mean = mean(prob, na.rm = TRUE),
            min  = min(prob, na.rm = TRUE),
            Q1   = quantile(prob, probs = 0.25, na.rm = TRUE),
            med  = median(prob, na.rm = TRUE),
            Q3   = quantile(prob, probs = 0.75, na.rm = TRUE),
            P85  = quantile(prob, probs = 0.85, na.rm = TRUE),
            P90  = quantile(prob, probs = 0.90, na.rm = TRUE),
            P95  = quantile(prob, probs = 0.95, na.rm = TRUE),
            P99  = quantile(prob, probs = 0.99, na.rm = TRUE))

# aggregate totals per person
df5 <- df4 %>%
  group_by(ID) %>%
  summarize(TOTAL_ALLOW_PRE = sum(TOT_ALLOWED_AMT))

xtabs(~ BT, data = yield_data)        # counts of types of BT
xtabs(~ BT + RR, data = yield_data)   # BT by RR
xtabs(GARST ~ RR, data = yield_data)  # total GARST by RR

# categorical frequencies by treatment status
mytable <- table(df1$PrimaryDiagDesc, df1$treat)
prop.table(mytable, 2)  # column percentage

# *-----------------------------------------------------------------*
# |
# | creating and adding new variables
# |
# *------------------------------------------------------------------*

# create a new data set that includes a new variable that is
# the sum of "garst" and "pio"
yield_data2 <- transform(yield_data, sum = GARST + PIO)
print(yield_data2)

# add a new variable to existing data set that is the sum of all hybrids
# first create data set hybrids
hybrids <- yield_data
print(hybrids)

# add new variable to hybrids
hybrids <- transform(hybrids, SUM = GARST + PIO + MYC + DEK)
print(hybrids)

# or add 2 new variables
hybrids <- transform(hybrids, SUM = GARST + PIO + MYC + DEK, MEAN = SUM / 4)
print(hybrids)

# combine or concatenate two columns
yield_data$x <- paste(yield_data$RR, yield_data$BT)
yield_data$x <- paste(yield_data$RR, "-", yield_data$BT)
# fixed: the trailing note "for inserting a separator" was not commented out
df$x <- paste(df$n, "-", df$s)  # for inserting a separator
# *-----------------------------------------------------------------*
# |
# | conditional processing
# |
# *------------------------------------------------------------------*

# recall from the data set there are 10 plots.
# first assume that in plot 3 we accidentally planted a Bt
# garst hybrid that was supposed to be a non-Bt hybrid,
# so we don't want to include it in the non-Bt average for hybrids -
# we can use 'ifelse' conditions to exclude this from the mean calculation

# first create data set trials
trials <- yield_data
print(trials)
print(trials$PLOT)

# creates new variable AVG in data set trials
trials$AVG <- ifelse(trials$PLOT == 3,
                     (trials$AVG <- (trials$PIO + trials$MYC + trials$DEK) / 3),                  # excludes GARST if PLOT = 3
                     (trials$AVG <- (trials$GARST + trials$PIO + trials$MYC + trials$DEK) / 4))   # all other plots
trials

# another example - notice the nesting of the ifelse conditions:
grades$letter <- ifelse(grades$score >= .90, "A",
                 ifelse(grades$score >= .80 & grades$score < .90, "B",
                 ifelse(grades$score >= .70 & grades$score < .80, "C",
                 ifelse(grades$score >= .60 & grades$score < .70, "D", "F"))))
table(grades$letter)

# create binary target
banking$target <- ifelse(banking$y == "no", 0, 1)

# model calibration / risk stratification
df1$risk <- ifelse(df1$prob < .12, "1:LOW",
            ifelse(df1$prob >= .12 & df1$prob < .30, "2:MOD",
            ifelse(df1$prob >= .3 & df1$prob < .60, "3:HI", "4:VHI")))
table(df1$risk)

# age categorization
df1$AgeCat <- ifelse(df1$age < 18, 'Under 18',
              ifelse(df1$age >= 18 & df1$age <= 25, '18-25',
              ifelse(df1$age > 25 & df1$age <= 29, '26-29',
              ifelse(df1$age > 29 & df1$age <= 39, '30-39',
              ifelse(df1$age > 39 & df1$age <= 49, '40-49',
              ifelse(df1$age > 49 & df1$age <= 59, '50-59',
              ifelse(df1$age > 59 & df1$age <= 64, '60-64', '65-65+')))))))

# *-----------------------------------------------------------------*
# |
# | stacking / concatenating / adding data sets
# |
# *------------------------------------------------------------------*

# get only bt hybrids - same as subsetting
BT_HYBRIDS <- yield_data[yield_data$BT == "Y", ]
print(BT_HYBRIDS)

# get only non bt hybrids
NON_BT_HYBRIDS <- yield_data[yield_data$BT == "N", ]
print(NON_BT_HYBRIDS)

# *------------------------------------------------------------------*
# | combine with the rbind function
# *------------------------------------------------------------------*

both <- rbind(BT_HYBRIDS, NON_BT_HYBRIDS)
print(both)

# *------------------------------------------------------------------*
# | stack with SQL using sqldf
# *------------------------------------------------------------------*

library(sqldf)
library(ggplot2)

# note: 'union all' keeps all rows; if you omit 'all' you get only unique rows
both_sql = sqldf('
select * from BT_HYBRIDS
union all
select * from NON_BT_HYBRIDS
order by PLOT
')
both_sql

# *-----------------------------------------------------------------*
# |
# | merging based on common variables
# |
# *------------------------------------------------------------------*

# first split the data: one set contains plot, bt, rr, garst, pio;
# the other set has plot, rr, bt, myc, dek.
# we'll work under the assumption that the first set is 'early'
# hybrids and the second set is 'late' hybrids,
# as if we collected data on these separately and want to combine the
# data sets to get one set, like we started with with yield_data

# create early / late planting data sets
early <- yield_data[c("PLOT", "GARST", "PIO", "BT", "RR")]
early
late <- yield_data[c("PLOT", "MYC", "DEK", "BT", "RR")]
late

# *------------------------------------------------------------------*
# | merge by common variables - PLOT, BT, RR
# *------------------------------------------------------------------*

hybrids <- merge(early, late, by = c("PLOT", "BT", "RR"))
hybrids
# *------------------------------------------------------------------*
# | merging by a single common variable
# *------------------------------------------------------------------*

# the above merges back to the original data set, but
# we had to be careful to merge by ALL of the common variables.
# suppose we had the following data set strip_trial.txt
# HYBRID YIELD TRAIT
# GARST 150 BT
# MYC 140 RR
# PIO 160 RR
# DEK 145 BT
# lets read it in and split it up to form 2 data sets 'yields'
# and 'traits' with a SINGLE common variable 'HYBRID' to merge by
# 'yields'
# HYBRID YIELD
# GARST 150
# MYC 140
# PIO 160
# DEK 145
# 'traits'
# HYBRID TRAIT
# GARST BT
# MYC RR
# PIO RR
# DEK BT

# read data 'yields'
yields <- read.table("yields.txt", header = TRUE)
yields

# read data 'traits'
traits <- read.table("traits.txt", header = TRUE)
traits

# now to demonstrate merging data sets by a common variable
field_data <- merge(yields, traits, by = c("HYBRID"))
field_data

# *------------------------------------------------------------------*
# | merging with SQL using sqldf
# *------------------------------------------------------------------*

# library(sqldf)
# library(ggplot2)

# left join yields and traits on HYBRID
field_data_sql = sqldf('
select a.HYBRID,a.YIELD,b.TRAIT
from yields a left join traits b
on a.HYBRID =b.HYBRID
order by a.YIELD')
field_data_sql

# left join early and late (from above) on PLOT - note for each plot RR and BT
# are the same regardless of the hybrid, so specifying BT and RR for only one
# data set will suffice, but we need to join on PLOT. this seems to give a much
# more straightforward solution than the merge above using the R merge statement
hybrids_sql = sqldf('
select a.PLOT,a.GARST,a.PIO, b.MYC, b.DEK, a.BT, a.RR
from early a left join late b
on a.PLOT =b.PLOT
order by a.PLOT')
hybrids_sql

#-----------------------------------------------------
# merging using dplyr
#-----------------------------------------------------

tmp3 <- tmp2 %>% left_join(tmp1, by = "State")

# get all months between EFFMTH and ENDMTH in cohort history file
monthlist <- df1 %>%
  left_join(select(tmp5, MBRID, Month, covmth), by = "MBRID") %>%
  filter(Month >= EFFMTH & Month <= ENDMTH)
monthlist <- monthlist %>% arrange(MBRID, Month)

# more complicated join with dplyr:
# merge claims data with cohort file keeping only claims between the coverage periods
# (fixed: the join key was misspelled "DMBRID"; the selected/renamed key is MBRID)
df3 <- df2 %>%
  select(MBRID, EFFMTH, ENDMTH, INDEX_MTH_DT, pre12, post12, claimflag) %>%
  left_join(select(raw_claims, MBRID = PID, Claim_Number, SVC_FROM_DT, TOT_ALLOWED_AMT),
            by = "MBRID") %>%
  filter(SVC_FROM_DT >= EFFMTH & SVC_FROM_DT <= ENDMTH)

# *-----------------------------------------------------------------*
# |
# | sorting data
# |
# *------------------------------------------------------------------*

# use file plots2 weed pressure
# LOCATION TRAITS P1 P2 P3 P4
# 1 RR 1 1 5 1
# 2 RR 2 1 4 1
# 1 RR 2 2 4 3
# 2 RR 3 1 NA 3
# 1 BT 4 5 2 4
# 2 BT 5 4 5 5
# 1 BT 5 3 4 4
# 2 BT 4 5 5 5
plots2 <- read.table("plots2.txt", header = TRUE)
plots2

# sort data by location
plotsSorted <- plots2[order(plots2$LOCATION), ]
plots2
plotsSorted

# sort by trait then location
plots2Sorted <- plots2[order(plots2$TRAITS, plots2$LOCATION), ]
plots2Sorted

# for descending order, prefix any variable with a minus sign
plots2Sorted <- plots2[order(-plots2$LOCATION, plots2$TRAITS), ]
plots2Sorted

# dplyr
tmp <- arrange(tmp, desc(Freq))

# *-----------------------------------------------------------------*
# |
# | graphics
# |
# *------------------------------------------------------------------*

# *------------------------------------------------------------------*
# | examples of r graphics capabilities
# *------------------------------------------------------------------*

demo(graphics)
demo(persp)
library(lattice)
demo(lattice)

# *------------------------------------------------------------------*
# | basic plots
# *------------------------------------------------------------------*

# histogram
hist(yield_data$GARST)

# bar plot
# NOTE(review): rownames() of a plain vector is NULL, so this legend is empty -
# probably intended rownames(yield_data) or yield_data$PLOT; confirm
barplot(yield_data$GARST, main = "Garst Yields", xlab = "Plots",
        col = c("darkblue", "red"), legend = rownames(yield_data$PLOT))

# bar plot (example)
# (fixed: 'False' is not an R literal - use FALSE)
counts <- mytable$Freq[mytable$Freq > 8]
barplot(counts, main = "Conditions of Members", xlab = "Conditions",
        col = c("darkblue", "red"),
        legend = mytable$Var1[mytable$Freq > 8], beside = FALSE)

# using xtabs to create plots
plot(~xtabs(GARST ~ RR, data = yield_data))  # barplot
plot(xtabs(~ BT, data = yield_data))         # barplot
plot(xtabs(~ BT + RR, data = yield_data))    # mosaic plot

# scatter plot
plot(yield_data$GARST, yield_data$PIO)

# scatterplot matrix of whole data set
plot(yield_data)

# scatterplot matrix - data used in regression in analysis section
land_prices <- read.table("land_prices.txt", header = TRUE)
land_prices
plot(land_prices)

# brushed scatterplot - as in model visualization
ggplot(df1, aes(x = SUMRX_NONLTOT_QB_GE120MEDD, y = SUMRX_LTOT_QB_GE120MEDD,
                shape = risk, color = risk)) +
  scale_color_manual(values = c("green", "yellow", "red", "red")) +
  geom_point()

# labels + jittering
set.seed(456)
ggplot(df1, aes(x = SUMRX_NONLTOT_QB_GE120MEDD, y = SUMRX_LTOT_QB_GE120MEDD,
                shape = risk, color = risk)) +
  scale_color_manual(values = c("green", "yellow", "red", "red")) +
  geom_point() +
  geom_jitter(width = 2, height = 2) +
  xlab("Total Non-Qualified >= 120") +
  ylab("Total Qualified >= 120") +
  ggtitle("Observed Rx Counts Overlaid by Predicted Risk/Stratification")

# *------------------------------------------------------------------*
# | stacked bar chart
# *------------------------------------------------------------------*

# get data
bytrait <- read.table("biotech_yield.txt", header = TRUE)
bytrait
names(bytrait)

# summarize data for stacking by trait
counts <- table(bytrait$RR, bytrait$YIELD)  # list stacked variable first
counts
barplot(counts, main = "Yield by Trait for Garst", xlab = "Yield Category",
        col = c("darkblue", "red"), legend = rownames(counts))

#--------------------------------------------
# pie and bar charts in ggplot
#--------------------------------------------

table(df$pre_term)  # get values by group

tmp <- data.frame(
  group = c("Full-Term", "Pre-Term"),
  value = c(7999, 6000)
)

# stacked bar
bp <- ggplot(tmp, aes(x = "", y = value, fill = group)) +
  geom_bar(width = 1, stat = "identity")
bp

# pie
pie <- bp + coord_polar("y", start = 0)
pie
pie + scale_fill_manual(values = c("red", "blue"))

# *------------------------------------------------------------------*
# | basic density plots
# *------------------------------------------------------------------*

# plot and save histogram data
garsths <- hist(yield_data$GARST, main = "Garst Yield Histogram")
print(garsths)

# compute density estimates using default gaussian & rule of thumb bandwidth
garstdens <- density(x = yield_data$GARST, bw = "nrd0", kernel = "gaussian", n = 20)
print(garstdens)

# rescale density so it will plot on the same graph as GARST histogram
rs <- max(garsths$counts / max(garstdens$y))   # scaling factor for plotting the density
lines(garstdens$x, garstdens$y * rs, col = 2)  # plot density over histogram

# compare this to just plotting the non-rescaled density over the histogram
lines(density(x = yield_data$GARST, bw = "nrd0", n = 20), col = 3)  # green line barely shows up

# turn off/close graphics window
dev.off()

# *------------------------------------------------------------------*
# | overlapping density plots
# *------------------------------------------------------------------*

library(colorspace)  # package for rainbow_hcl function

# Generate just the data for a histogram of GARST grouping by RR trait
ds <- rbind(data.frame(dat = yield_data[, ][, "GARST"], grp = "All"),
            data.frame(dat = yield_data[, ][yield_data$RR == "N", "GARST"], grp = "N"),
            data.frame(dat = yield_data[, ][yield_data$RR == "Y", "GARST"], grp = "Y"))

# histogram for all GARST data
hs <- hist(ds[ds$grp == "All", 1], main = "", xlab = "GARST", col = "grey90",
           ylim = c(0, 9.46395633979357), breaks = "fd", border = TRUE)

# compute density, rescale, and plot for all GARST data
dens <- density(ds[ds$grp == "All", 1], na.rm = TRUE)
rs <- max(hs$counts) / max(dens$y)
lines(dens$x, dens$y * rs, type = "l", col = rainbow_hcl(3)[1])

# compute density, rescale, and plot where RR = 'N'
dens <- density(ds[ds$grp == "N", 1], na.rm = TRUE)
rs <- max(hs$counts) / max(dens$y)
lines(dens$x, dens$y * rs, type = "l", col = rainbow_hcl(3)[2])

# compute density, rescale, and plot where RR = 'Y'
dens <- density(ds[ds$grp == "Y", 1], na.rm = TRUE)
rs <- max(hs$counts) / max(dens$y)
lines(dens$x, dens$y * rs, type = "l", col = rainbow_hcl(3)[3])

# Add a rug to illustrate density
rug(ds[ds$grp == "N", 1], col = rainbow_hcl(3)[2])
rug(ds[ds$grp == "Y", 1], col = rainbow_hcl(3)[3])

# Add a legend to the plot.
legend("topright", c("All", "N", "Y"), bty = "n", fill = rainbow_hcl(3))

# Add a title to the plot.
title(main = "Distribution of GARST by RR",
      sub = paste("Created Using R Statistical Package",
                  format(Sys.time(), "%Y-%b-%d %H:%M:%S"),
                  Sys.info()["user"]))

### pie chart
table(tmp$Enrollee)  # get counts
slices <- c(3111, 14141)                              # define slices
lbls <- c("True", "False")                            # slice labels
pct <- round(slices / sum(slices) * 100, digits = 2)  # calc pct
lbls <- paste(lbls, pct)                              # add calculated percents to labels
lbls <- paste(lbls, "%", sep = "")                    # add % to labels
pie(slices, labels = lbls, col = rainbow(length(lbls)), main = "Members")

### aggregate time series and produce multiple line chart (dplyr & ggplot2)

# subset only top 5 departments for OT hrs
tmp2 <- filter(df2, Role == "Specialist" | Role == "Finder" |
                 Role == "Customer Care Specialist" | Role == "Clinical" |
                 Role == "Content Specialist")

# aggregate by department and date
tmp3 <- tmp2 %>%
  group_by(Role, End_Date) %>%
  summarize(TotalOTHrs = sum(Overtime_Hours))

ggplot(tmp3, aes(End_Date, TotalOTHrs, colour = Role)) +
  geom_line() +
  geom_point()

### aggregate and simple plot

# aggregate & visualize total OT by pay period
tmp <- df2 %>%
  group_by(End_Date) %>%
  summarize(TotalOTHrs = sum(Overtime_Hours))

plot(tmp$End_Date, tmp$TotalOTHrs, type = "b",
     xlab = "End Date", ylab = "Total OT Hrs")

#------------------------------------------------
# side by side plots
#------------------------------------------------
par(mfrow = c(1, 2))  # set the plotting area into a 1*2 array
hist(treat, breaks = 50)
hist(ctrl, breaks = 50)

# *-----------------------------------------------------------------*
# |
# | analysis
# |
# *------------------------------------------------------------------*

# *------------------------------------------------------------------*
# | t-tests
# *------------------------------------------------------------------*

# independent samples t-test
t.test(yield_data$GARST, yield_data$PIO)  # difference in GARST vs PIO yields from yield_data file
t.test(plots2$P4[plots2$TRAITS == 'RR'],
       plots2$P4[plots2$TRAITS == 'BT'])  # difference in weed pressure for bt and rr traits in plot 4

# paired t-test
# differences in overall weed pressure for plots 1 and 2 (more appropriately
# used on same subjects - before and after tests), e.g. if plot 1 was before
# herbicide application while plot 2 was the same plot measured after application
t.test(plots2$P1, plots2$P2, paired = TRUE)

# not actually appropriate, but notice the difference in the results from the
# previous t-test on GARST and PIO
t.test(yield_data$GARST, yield_data$PIO, paired = TRUE)

# *------------------------------------------------------------------*
# | measures of association
# *------------------------------------------------------------------*

# pearson correlations - no sig tests
rcorr(cbind(plots2$P1, plots2$P2, plots2$P3, plots2$P4))  # gives pearson correlations

# cor.test has p-value, ci, but only 2 vars at a time
cor.test(plots2$P1, plots2$P2, use = "pairwise")

# spearman correlations using the Hmisc rcorr function
rcorr(cbind(plots2$P1, plots2$P2, plots2$P3, plots2$P4), type = 'spearman')
rcorr(cbind(plots2$P1, plots2$P2, plots2$P3, plots2$P4), type = 'pearson')  # compare, also same as default

# *------------------------------------------------------------------*
# | simple linear regression
# *------------------------------------------------------------------*

# lets use another data set, land_prices:
# PRICE ACRES IMPROVEMENTS
# 36 9 8
# 80 15 7
# 44 10 9
# 55 11 10
# 35 10 6
land_prices <- read.table("land_prices.txt", header = TRUE)
land_prices
plot(land_prices)  # scatterplot of dependent and independent variables

myRegModel <- lm(PRICE ~ ACRES, data = land_prices)  # run regression
summary(myRegModel)  # view estimates
anova(myRegModel)

# simple plots - plot data and regression line
plot(land_prices$ACRES, land_prices$PRICE)
abline(myRegModel)

# automated plots
plot(myRegModel)
termplot(myRegModel)

# *------------------------------------------------------------------*
# | multiple linear regression
# *------------------------------------------------------------------*

myRegModel <- lm(PRICE ~ ACRES + IMPROVEMENTS, data = land_prices)  # run regression
summary(myRegModel)  # view estimates
anova(myRegModel)
plot(myRegModel)
termplot(myRegModel)

# *------------------------------------------------------------------*
# | analysis of variance
# *------------------------------------------------------------------*

# look at data set 'yield_data'
yield_data <- read.table("Yield_plots.txt", header = TRUE)  # read text file
print(yield_data)  # print data set

# data looks like this:
# GARST PIO MYC DEK PLOT BT RR
# 150 160 137 150 1 Y Y
# 140 150 148 149 2 Y N
# 145 146 151 145 3 N Y
# 137 138 139 140 4 N N
# 141 142 143 144 5 N N
# 145 146 120 148 6 N N
# 149 150 115 152 7 Y N
# 153 154 136 156 8 N ?
# 157 158 130 160 9 Y Y
# 161 162 129 164 10 Y N

# we would like to set up a design such as this:
# let treatments = hybrid, blocks = plot
#------------------------------------------------#
#                    HYBRID
#          GARST   PIO   MYC   DEK
# PLOT 1     150   137   160   150
# PLOT 2     140   148   150   149
# PLOT 3     145   151   146   145
#------------------------------------------------#
# actual data set will need to look like this 'yield_trials'
# HYBRID PLOT YIELD
# GARST P1 150
# GARST P2 140
# GARST P3 145
# PIO P1 137
# PIO P2 148
# PIO P3 151
# MYC P1 160
# MYC P2 150
# MYC P3 146
# DEK P1 150
# DEK P2 149
# DEK P3 145
yield_trials<-read.table("yield_trials.txt", header=TRUE) # read text file
print(yield_trials) # print data set

# *------------------------------------------------------------------*
# | 2 WAY AOV
# *------------------------------------------------------------------*
myModel<-aov(YIELD~HYBRID+PLOT,data=yield_trials) # treatments = HYBRID, blocks = PLOT
summary(myModel)
anova(myModel) # should give same results as above
plot(myModel) # diagnostics
termplot(myModel)

#--------------------------------------------------
# getting marginal/adjusted/least squares means from multivariable models
#--------------------------------------------------

### example linear regression
# NOTE: df_chrt is a project cohort data frame assumed to be loaded elsewhere
summary(m1 <- lm(Post_ER_Admit ~ Control_Study_Flag + CaseOpenReasonDesc + Diag_Grouping +
                   SourceFlag + Gender_Code + LowIncome + RuralZips + Pre_ER_Admit + AcutePre +
                   Length_Stay_Pre + PrimaryCareVisitCount + Cancer_abnormal_cervical + Asthma +
                   Pulmonary_Disease + HTN + Heart_Disease + Diabetes + Obesity + Depression +
                   HIV + Other_Mental_Disorders + RiskLvl + OtherProgramFlag + Total_Bene +
                   Total_TEPRV_ID + Total_Hospital_Beds + Total_Hospitals + BranchDesc +
                   SponsorRankDesc + AgeCat, data = df_chrt))
# b_trt = 1.03 more ER visits in the treatment group compared to .64 fewer in the matched analysis

# use emmeans package to calculate 'marginal means' or adjusted means
library(emmeans)
emmeans (m1, ~ Control_Study_Flag) # 4.81-3.77 = 1.04 which is ~ what we get directly from the regression above

### example negative binomial model
# (glm.nb comes from the MASS package — see the detach_package notes at the top of this file)
summary(m1 <- glm.nb(Post_ER_Admit ~ Control_Study_Flag + CaseOpenReasonDesc + Diag_Grouping +
                       SourceFlag + Gender_Code + LowIncome + RuralZips + Pre_ER_Admit + AcutePre +
                       Length_Stay_Pre + PrimaryCareVisitCount + Cancer_abnormal_cervical + Asthma +
                       Pulmonary_Disease + HTN + Heart_Disease + Diabetes + Obesity + Depression +
                       HIV + Other_Mental_Disorders + RiskLvl + OtherProgramFlag + Total_Bene +
                       Total_TEPRV_ID + Total_Hospital_Beds + Total_Hospitals + BranchDesc +
                       SponsorRankDesc + AgeCat,
                     control = glm.control(maxit = 500),data = df_chrt))
exp(0.184) # IRR = 1.202 treatment group has ~ 20% more ER visits than controls
m1.mmeans <- emmeans(m1, ~ Control_Study_Flag) # estimate marginal means
summary(m1.mmeans, infer = TRUE, type = 'response') # summary with backtransformation to reflect original scale (.0634/.0527)
# this gives a ratio of marginal means = 1.203 which is consistent with the exponentiated result directly from the model

# *-----------------------------------------------------------------*
# |
# |                    labels and formats
# |
# *------------------------------------------------------------------*

#-----------------------------------------------------------------#
# value labels or formats
#-----------------------------------------------------------------#

##
## example 1: recoding
##
# change LOCATION from 'numeric' into a 'factor' with character values
# this is not actually creating labels, but actually changing the numeric values
# of LOCATION to more meaningful character values such as "N" for north and "S" for south
plots2$LOCATION <- factor(plots2$LOCATION,levels=c(1,2,3,4),labels=c("N", "S", "E", "W"))
plots2

##
## example 2: creating new variables to act as labels for values
##
# lets create new variables to act as labels for insect pressure levels 1-5
# instead of changing the numeric values for P1-P4, we are creating a new
# set of character variables to describe P1-P4
myPlevels<-c(1,2,3,4,5) # set the # of levels
myPlabels <-c("Very Heavy","Heavy","Moderate","Light","Very Light") # labels

# now create a new set of variables that describe P1-P4
plots2$P1f <- factor(plots2$P1, myPlevels, myPlabels)
plots2$P2f <- factor(plots2$P2, myPlevels, myPlabels)
plots2$P3f <- factor(plots2$P3, myPlevels, myPlabels)
plots2$P4f <- factor(plots2$P4, myPlevels, myPlabels)
plots2

# Get summary and see that the P1f-P4f pressure categories are now counted.
summary( plots2[ c("P1f","P2f","P3f","P4f") ] )

##
## note we could have just renamed these values just as we did for
## location in the first example
##
plots2b <- read.table("plots2.txt", header =TRUE) # copy of plots2 for this example
plots2b

# note we are re-using the pre-defined myPlevels/myPlabels from above so those
# statements are not repeated here
plots2b$P1 <- factor(plots2b$P1, myPlevels, myPlabels)
plots2b$P2 <- factor(plots2b$P2, myPlevels, myPlabels)
plots2b$P3 <- factor(plots2b$P3, myPlevels, myPlabels)
plots2b$P4 <- factor(plots2b$P4, myPlevels, myPlabels)
plots2b

#-----------------------------------------------------------#
# variable labels
#-----------------------------------------------------------#
# note in the examples above we were concerned with formatting
# the values a particular variable could take, such as values N, S
# for the variable LOCATION or the value "Very Heavy" for the variables P1-P4
# here we are looking at changing the name or appearance of the variables themselves
# vs the values they may take
library(Hmisc)

# start fresh with the original unformatted data from plots2
plots2<- read.table("plots2.txt", header =TRUE)
plots2

# assign variable names to act as labels - lets assume we
# want variables P1-P4 to appear as PLOT 1 - PLOT 4
# this example 'pastes' new names over the old ones, i.e. it
# simply changes the variable names
# note the reference [3:6] simply identifies the 3rd-6th variable names
# in the data frame for processing.
names(plots2)[3:6] <- c( "PLOT 1", "PLOT 2", "PLOT 3", "PLOT 4")
names(plots2) # verify variable names have been changed

#-------------------------------------------------------#
# complete example for variable name and value labels
#-------------------------------------------------------#
# utilize all of the methods above to re-format a data set
# suppose again we start off with a data set such as plots2
# LOCATION TRAITS P1 P2 P3 P4
# 1 RR 1 1 5 1
# 2 RR 2 1 4 1
# 1 RR 2 2 4 3
# 2 RR 3 1 NA 3
# 1 BT 4 5 2 4
# 2 BT 5 4 5 5
# 1 BT 5 3 4 4
# 2 BT 4 5 5 5

# start fresh with the original unformatted data from plots2
plots2<- read.table("plots2.txt", header =TRUE)
plots2

# suppose we want the values for the variable LOCATION to be N for LOCATION = 1 and
# S for LOCATION = 2. We also want the values for the variables P1-P4 to be
# 1 = "Very Heavy" 2 = "Heavy" 3 = "Moderate" 4 = "Light" 5 = "Very Light"
# and we want the variable names P1-P4 to be "PLOT 1" - "PLOT 4"

# STEP 1: change values for variable LOCATION
plots2$LOCATION <- factor(plots2$LOCATION,levels=c(1,2,3,4),labels=c("N", "S", "E", "W"))

# STEP 2: change values for the variables P1-P4
myPlevels<-c(1,2,3,4,5) # set the # of levels
myPlabels <-c("Very Heavy","Heavy","Moderate","Light","Very Light") # labels
plots2$P1 <- factor(plots2$P1, myPlevels, myPlabels) # note we are actually changing the values for P1-P4
plots2$P2 <- factor(plots2$P2, myPlevels, myPlabels) # vs creating variables to describe the values
plots2$P3 <- factor(plots2$P3, myPlevels, myPlabels)
plots2$P4 <- factor(plots2$P4, myPlevels, myPlabels)

# STEP 3: change the VARIABLE labels for P1-P4
names(plots2)[3:6] <- c( "PLOT 1", "PLOT 2", "PLOT 3", "PLOT 4")

# print re-formatted data set
plots2

# functions

# create function myvar.port that iterates through the weights and calculates portfolio variance
# NOTE(review): the loop body never uses the index i and overwrites sigma.z
# (initialized as an nWeights x 1 matrix) with a single recomputed value on every
# pass, so the function returns one number rather than nWeights of them. It also
# reads Wa, Wb, std.a, std.b and p.ab from the global environment. Presumably the
# intent was per-iteration indexing, e.g. sigma.z[i] <- sqrt((Wa[i])^2*... —
# TODO confirm the intended weight vectors before using.
myvar.port <- function(nWeights) {
  sigma.z <- matrix(c(0), nrow = nWeights) # initialize - we will compute 11 standard deviations
  for (i in 1:nWeights){
    sigma.z <- sqrt((Wa)^2*(std.a)^2 + (Wb)^2*std.b^2 + 2*p.ab*Wa*Wb*std.a*std.b)
  }
  return (sigma.z)
}

# example: produce summary statistics for each category in a field
# NOTE: assumes tmp$Var1 holds the measure ids and the data frame
# 'complications' is loaded; the hard-coded 1:19 assumes exactly 19
# categories — TODO confirm against length(cats)
cats <- as.vector(tmp$Var1) # extract measure ids as a list
for(i in 1:19) {
  print(cats[i])
  print(summary(complications$Score[complications$Measure_ID ==cats[i]]))
}

#--------------------------------------------------
# dealing with scientific notation
#--------------------------------------------------
x <- 1.82*(10^-7)
y <- format(x, scientific = FALSE) # format a single value as a plain decimal string
# or even better: suppress scientific notation globally
options(scipen = 999)

#----------------------------------------------------
# PROPENSITY SCORE MATCHING
#---------------------------------------------------

######################################################
# example from MatchIt documentation
#####################################################
# example data set is a subset of the job training program analyzed in Lalonde (1986) and Dehejia and Wahba (1999).
# MatchIt includes a subsample of the original data consisting of the National Supported Work Demonstration (NSW)
# treated group and the comparison sample from the Population Survey of Income Dynamics (PSID). The variables in this data
# set include participation in the job training program (treat, which is equal to 1 if participated in the program, and 0 otherwise),
# age (age), years of education (educ), race (black which is equal to 1 if black, and 0 otherwise;
# hispan which is equal to 1 if hispanic, and 0 otherwise), marital status (married, which is equal to 1 if married, 0 otherwise),
# high school degree (nodegree, which is equal to 1 if no degree, 0 otherwise), 1974 real earnings (re74), 1975 real earnings
# (re75), and the main outcome variable, 1978 real earnings (re78)
head(lalonde)
dim(lalonde)
names(lalonde) # "treat" "age" "educ" "black" "hispan" "married" "nodegree" "re74" "re75" "re78"

####################################################
# nearest neighbor matching
####################################################
# Matching is done using a distance measure specified by the distance option (default=logit).
# Matches are chosen for each treated unit one at a time, with the order specified by the m.order command (default=largest to smallest).
# At each matching step we choose the control unit that is not yet matched but is closest to the treated unit on the distance measure.
m.out1 <- matchit(treat ~ re74 + re75 + age + educ, data = lalonde, method = "nearest", distance = "logit")
summary(m.out1) # check balance
m.data1 <- match.data(m.out1,distance ="pscore") # create ps matched data set
head(m.data1) # view

# perform paired t-tests (not in documentation)
t.test(m.data1$re78[m.data1$treat==1],m.data1$re78[m.data1$treat==0],paired=TRUE)
t.test(lalonde$re78[lalonde$treat==1],lalonde$re78[lalonde$treat==0],paired = FALSE)

# export data to compare in SAS or perform additional analysis
write.csv(lalonde, file = "//Documents//Briefcase///Data//lalonde.csv")
# ex: write.csv(w1, file = "r goals.csv") # assumes assigned directory or default directory
write.csv(m.data1, file = "//Documents//Briefcase///Data//lalonde_nearest.csv")
# ex without rownames
write.csv(opioid_dat, file = "//Documents//Briefcase///Data//opioid_dat.csv", row.names = FALSE)

#---------------------------------
# working with dates
#--------------------------------

# calculate difference in dates
# ex 1: difference of two Date conversions gives a 'difftime'
survey$date_diff <- as.Date(as.character(survey$date), format="%Y/%m/%d")-as.Date(as.character(survey$tx_start), format="%Y/%m/%d")
survey
# ex 2: difftime with explicit units
df$diff_in_days<- difftime(df$datevar1 ,df$datevar2 , units = c("days"))

# application: absolute day gap between measure start and end dates
temp3_dates$diff_days <- abs((as.Date(as.character(temp3_dates$Measure_Start_Date), format="%m/%d/%Y")-as.Date(as.character(temp3_dates$Measure_End_Date), format="%m/%d/%Y")))
# or the equivalent via difftime
temp3_dates$diff_days <- abs((difftime(as.Date(as.character(temp3_dates$Measure_Start_Date), format="%m/%d/%Y"),as.Date(as.character(temp3_dates$Measure_End_Date), format="%m/%d/%Y") , units = c("days"))))

# create date format
df1$CovEffDate <- as.Date(df1$Coverage_Effective_Date, "%Y-%m-%d")

# sometimes you need to supply the origin if your data is numeric
as_datetime(y, origin = lubridate::origin)
# I have also seen: as.Date(17536,origin="1970-01-01")

# get min and max dates
tmp <- df1%>% summarize(mindate = min(CovEffDate, na.rm=TRUE), maxdate = max(CovEffDate, na.rm=TRUE))

## first days of years
seq(as.Date("1910/1/1"), as.Date("1999/1/1"), "years")
## by month
seq(as.Date("2000/1/1"), by = "month", length.out = 12)
## quarters
seq(as.Date("2000/1/1"), as.Date("2003/1/1"), by = "quarter")

## find all 7th of the month between two dates, the last being a 7th.
st <- as.Date("1998-12-17")
en <- as.Date("2000-1-7")
ll <- seq(en, st, by = "-1 month") # step backwards a month at a time from the end date
rev(ll[ll > st & ll < en])

# application: calculate age at index date for an observational study
df1$BirthDate <- as.character(df1$DOB) # format as character
df1$BirthDate <- as.Date(df1$BirthDate, "%Y-%m-%d") # format as date
df1$age <- round(difftime(df1$index_date,df1$BirthDate, units = c("days"))/(365.25)) # calculate age
df1$age <- as.numeric(df1$age) # make sure this is numeric

# calculate age using lubridate function
df1$BirthDate <- as.character(df1$DOB) # format as character
df1$BirthDate <- mdy(df1$BirthDate) # apply date format (lubridate function)
df1$age <- round(difftime(df1$index_date,df1$BirthDate, units = c("days"))/(365.25)) # calculate age
df1$age <- as.numeric(df1$age) # make numeric

#----------------------------
# sample training and validation
#---------------------------

# store total number of observations in your data
N <- 400
print(N)
# Number of training observations
Ntrain <- N * 0.5
print(Ntrain)
# add an explicit row number variable for tracking
id <- seq(1,400)
apps2 <- cbind(apps,id)
# Randomly arrange the data and divide it into a training
# and test set.
# shuffle the rows, then split into the first Ntrain (train) and the rest (validate)
dat <- apps2[sample(1:N),]
train <- dat[1:Ntrain,]
validate <- dat[(Ntrain+1):N,]
dim(dat)
dim(train)
dim(validate)

# sort and look at data sets to see that they are different
sort_train <- train[order(train$id),]
print(sort_train)
sort_val <- validate[order(validate$id),]
print(sort_val)

# random sample: return n randomly chosen rows of df
randomSample = function(df,n) {
  return (df[sample(nrow(df), n),])
}
rs <- randomSample(tmp,5)

# random sample (with set.seed for repeatability - both calls below return the same rows)
randomSample = function(df,n) {
  set.seed(123)
  return (df[sample(nrow(df), n),])
}
randomSample(tmp,5) # first call
randomSample(tmp,5) # second call

#-----------------------------------
# bootstrap
#-----------------------------------

# regression
summary(fit <- lm(GARST ~ PIO, data = yield_data))

# bootstrap for regression (this may not be accurate with such small sample)
# resample 5 rows with replacement, refit, and keep the slope each time
bstrap <- c()
for (i in 1:10000) {
  newsample <- yield_data[sample(nrow(yield_data), 5, replace = T), ]
  bstrap <- c(bstrap, as.vector(coef(lm(GARST ~ PIO,newsample))[2]))
}
hist(bstrap)
summary(bstrap)
sd(bstrap, na.rm = TRUE) # SE
quantile(bstrap, c(.025,.975),na.rm = TRUE) # 95% bootstrap interval

#------------------------------------
# transpose or reshape data
#------------------------------------
# reference: https://stats.idre.ucla.edu/r/faq/how-can-i-reshape-my-data-in-r/

# create example data
id <- c(1,1,1,2,2,2,3,3,3)
measure <- c("depth","temp","width","depth","temp","width","depth","temp","width")
values <- c(2,50,18,1.5,53,18,2.5,60,18)
dat <- data.frame(id,measure,values)

# transpose panel data to wide format
datwide <- reshape(dat,
                   timevar = "measure", # the panel variable(s) we want to be new columns
                   idvar = c("id"), # vars we want to keep constant
                   direction = "wide")

# collapse wide data into panel
# NOTE(review): reshape(direction = "wide") produces columns named
# "values.depth" etc., so varying = c("depth","temp","width") only matches
# after the gsub name-cleanup below — the cleanup likely needs to run BEFORE
# this call. TODO confirm the intended order.
datnarrow <- reshape(datwide,
                     varying = c("depth","temp","width"), # things we want to collapse into a single column
                     v.names = "value", # name for new column that will hold panel of values
                     timevar = "measure", # name for new column that collapses old variable names into values
                     times = c("depth","temp","width"), # old variable names that become new values
                     new.row.names = 1:1000,
                     direction = "long")
names(datwide) <- gsub("values.", "", names(datwide )) # cleanup names

#----------------------------
# check duplicates
#---------------------------

# check duplication: frequency table of the key — Freq > 1 means duplicates
tmp <- data.frame(table(temp1_stars$Provider_ID))
summary(tmp)

# remove duplicates or get distinct
DEK <- c(150,150,145,140,144,148,152,156,160,164)
PLOT <- c(1,1,1,1,5,6,7,8,9,10)
BT <- c('Y','Y','Y','N','N','N','Y','N','Y','Y')
dat <- data.frame(DEK,PLOT,BT)

# how many unique values of BT?
# NOTE(review): relies on BT being a factor — with stringsAsFactors = FALSE
# (the default since R 4.0) levels() returns NULL; length(unique(dat$BT)) is safer
length(levels(dat$BT))
# how many unique plot values
length(unique(dat$PLOT))

# this data is duplicated in many ways
deduped.dat <- unique(dat[,1:3 ] ) # gets unique based on all 3 fields

# dplyr offers several options
deduped.dat <- dat%>%distinct(PLOT, .keep_all = TRUE) # based on PLOT
deduped.dat <- dat%>%distinct(PLOT,BT, .keep_all = TRUE) # based on PLOT and BT
deduped.dat <- dat%>%distinct(PLOT,BT,DEK, .keep_all = TRUE) # truly unique based on all listed fields
deduped.dat <- dat%>% distinct # same as above without listing fields related to duplication

# get distinct instance based on ID and pre term flag = 0
tmp1 <- tmp1_births%>%arrange(MBR_ID,pre_term)%>%distinct(MBR_ID, .keep_all = TRUE)
tmp2 <- data.frame(table(tmp1$MBR_ID))
tmp4 <- tmp1[tmp1$MBR_ID =='10060',]

# get distinct last instance based on ID and pre term flag = 1
tmp1 <- tmp1_births%>%arrange(MBR_ID,desc(pre_term))%>%distinct(MBR_ID, .keep_all = TRUE)
tmp2 <- data.frame(table(tmp1$MBR_ID))
tmp4 <- tmp1[tmp1$MBR_ID =='10060',]

# correcting plot boundaries
par(mar=c(1,1,1,1))

# example routine checking and correcting duplicates based on key and 1 additional field
# check duplicates
tmp1 <- data.frame(table(df2$MBR_ID)) # find duplicates
tmp <- df2[df2$MBR_ID =='10060',] # appears duplicated on diagnosis &/or claim #
# get distinct sorting to keep most recent diagnosis (judgement call)
tmp2 <- df2%>%arrange(MBR_ID,desc(SVC_FROM_DT))%>%distinct(MBR_ID, .keep_all = TRUE)
tmp3 <- tmp2[tmp2$MBR_ID =='10060',] # check
# update data frame
df2 <- tmp2
rm(tmp,tmp1,tmp2,tmp3) # cleanup

# example referencing against a list that is a random sample of numeric IDs
# note: change formatting in 'vals' if character IDs are required
# check duplicates
tmp1 <- data.frame(table(df5$ID)) # find duplicates
tmp <- df5[df5$ID =='80050',] # appears duplicated on flag #
# check against a random sample of duplicates (randomSample defined above)
rs <- randomSample(tmp1[tmp1$Freq > 1,],5) # look at random sample of duplicates
vals <- as.list((as.numeric(as.character(rs$Var1)))) # get ID as list from rs
tmp <- df5[df5$ID %in% vals,] # subset data with duplicated rows - check
# get distinct sorting to give priority to pre-term flag = 1 (judgement call)
tmp2 <- df5%>%arrange(ID,desc(pre_term))%>%distinct(ID, .keep_all = TRUE)
tmp3 <- tmp2[tmp2$ID %in% vals,] # check again - all pre_term = 1 obs should be kept
# update data frame
df5 <- tmp2
rm(tmp,tmp1,tmp2,tmp3,rs,vals) # cleanup

#-----------------------------
# missing values
#-----------------------------

# count total missing values per column in a data frame
colSums(is.na(df1_chrt))

### example recode missing category
df1$Lvl <- ifelse(is.na(df1$Level) == TRUE,'1-Low-Risk',df1$Lvl)

### example - categorizing with a missing value category
df1$riskcat <- ifelse(df1$Risk_Score <= 3,'1-Low',
                 ifelse(df1$Risk_Score > 3 & df1$Risk_Score <= 5, '2-Moderate',
                 '3-High'))
# account for NA group
df1$riskcat <- ifelse(is.na(df1$Risk_Score) == TRUE,'0-NA',df1$riskcat)

# remove missing based on specified field
# NOTE(review): comparing a logical to the string 'FALSE' works only via
# implicit coercion — !is.na(tmp3$RUCC_2013) is the idiomatic form
tmp3 <- tmp3[is.na(tmp3$RUCC_2013) == 'FALSE',]
tmp2 <- na.omit(tmp1) # listwise deletion of all missing

# basic imputation: replace NA with the column mean
impute.mean <- function(x) {
  z <- mean(x, na.rm = TRUE)
  x[is.na(x)] <- z
  return(x)
}
# NOTE(review): apply() with MARGIN = 2 expects a matrix/data frame;
# df$xvar is a single column (a vector) so this call errors — presumably
# impute.mean(df$xvar) or apply over df was intended. TODO confirm.
apply(df$xvar,2,impute.mean)

# replace NA with zero
impute.zero <- function(x) {
  x[is.na(x)] <- 0
  return(x)
}
df1 <- data.frame(apply(df,2,impute.zero)) # this imputes all missing column values to
# zero  (end of the comment on the imputation example above)

#--------------------------
# loop processing
#-------------------------

# loop across fields and produce frequencies
# preallocate the list and use [[ ]] so each element holds the scalar draw
# directly (growing a list with [i] = inside a loop is slower and less idiomatic)
results <- vector("list", 5) # create list for storing results
# run loop and simulate data
for (i in seq_len(5)) {
  results[[i]] <- sample(1:15, 1)
}
x <- as.numeric(as.character(unlist(results))) # convert results to a numeric vector
summary(x) # analysis

#--------------------------------------
# working with functions
#--------------------------------------

# calculate standard deviation for every column in a data set
# NOTE(review): apply() coerces a data frame to a matrix first, so this is only
# sensible when every column is numeric — sapply(df, sd) avoids the coercion
apply(df,2,sd)
df2 <- sapply(df1,as.numeric) # convert all columns to numeric

#-----------------------------------
# substring text and character operations
#-----------------------------------

# if ENDT is still the string "NULL" and there IS a coverage effective date then
# replace the null end date with a fixed date
# (detects "NULL" by its leading "N" — assumes no real dates start with "N"; TODO confirm)
df1$ENDT <- ifelse((substr(df1$ENDT,1,1) =="N" & (substr(df1$DateVar,1,1) != "N")),as.character.Date("2018-10-14"),df1$ENDT)

# remove first character
# (fixed: original read substring(df$ID, 2), referencing the unrelated data
# frame 'df' — every other line in this section operates on df1)
df1$ID <- substring(df1$ID, 2)
# remove last character
df1$ID <- substring(df1$ID,1,nchar(df1$ID)-1)

# last quote