Skip to content

Instantly share code, notes, and snippets.

@glesserd
Created September 1, 2016 13:28
Show Gist options
  • Select an option

  • Save glesserd/406519a4a79a49efb2353cfe05bcc6ee to your computer and use it in GitHub Desktop.

Select an option

Save glesserd/406519a4a79a49efb2353cfe05bcc6ee to your computer and use it in GitHub Desktop.
summarySE, normDataWithin and summarySEwithin of cookbook-r in python
import pandas as pd
import scipy as sp
from scipy.stats import t
import numpy as np
#from: http://www.cookbook-r.com/Graphs/Plotting_means_and_error_bars_%28ggplot2%29/
## Gives count, mean, standard deviation, standard error of the mean, and confidence interval (default 95%).
## data: a data frame.
## measurevar: the name of a column that contains the variable to be summariezed
## groupvars: a vector containing names of columns that contain grouping variables
## conf_interval: the percent range of the confidence interval (default is 95%)
def summarySE(data, measurevar, groupvars, conf_interval=0.95):
def std(s):
return np.std(s, ddof=1)
def stde(s):
return std(s) / np.sqrt(len(s))
def ci(s):
# Confidence interval multiplier for standard error
# Calculate t-statistic for confidence interval:
# e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1
ciMult = t.ppf(conf_interval/2.0 + .5, len(s)-1)
return stde(s)*ciMult
def ciUp(s):
return np.mean(s)+ci(s)
def ciDown(s):
return np.mean(s)-ci(s)
data = data[groupvars+measurevar].groupby(groupvars).agg([len, np.mean, std, stde, ciUp, ciDown, ci])
data.reset_index(inplace=True)
data.columns = groupvars+ ['_'.join(col).strip() for col in data.columns.values[len(groupvars):]]
return data
#from: http://www.cookbook-r.com/Graphs/Plotting_means_and_error_bars_%28ggplot2%29/
## Norms the data within specified groups in a data frame; it normalizes each
## subject (identified by idvar) so that they have the same mean, within each group
## specified by betweenvars.
## data: a data frame.
## idvar: the name of a column that identifies each subject (or matched subjects)
## measurevar: the name of a column that contains the variable to be summariezed
## betweenvars: a vector containing names of columns that are between-subjects variables
def normDataWithin(data, idvar, measurevar, betweenvars=[]):
def std(s):
return np.std(s, ddof=1)
#temp = data[data.cond == "PC_IDLE"]
#temp = temp[idvar+betweenvars+measurevar]
#temp.columns = idvar+betweenvars + [x+"_PC_IDLE" for x in measurevar]
data_subjMean = data.groupby(idvar+betweenvars).agg([np.mean])
data_subjMean.reset_index(inplace=True)
data_subjMean.columns = idvar+betweenvars + ['_'.join(col).strip() for col in data_subjMean.columns.values[len(idvar+betweenvars):]]
data = pd.merge(data, data_subjMean, on=idvar+betweenvars)
#data = pd.merge(data, temp, on=idvar+betweenvars)
for obj in measurevar:
data[obj+"_norm"] = data[obj] - data[obj+"_mean"] + data[obj].mean()
#data[obj+"_norm"] = std(data[obj])/data[obj+"_std"]*(data[obj] - data[obj+"_mean"]) + data[obj].mean()
#data[obj+"_norm"] = data[obj] - data[obj+"_PC_IDLE"]
#del data[obj+"_mean"]
#del data[obj+"_std"]
return data
#from: http://www.cookbook-r.com/Graphs/Plotting_means_and_error_bars_%28ggplot2%29/
## Summarizes data, handling within-subjects variables by removing inter-subject variability.
## It will still work if there are no within-S variables.
## Gives count, un-normed mean, normed mean (with same between-group mean),
## standard deviation, standard error of the mean, and confidence interval.
## If there are within-subject variables, calculate adjusted values using method from Morey (2008).
## data: a data frame.
## measurevar: the name of a column that contains the variable to be summariezed
## betweenvars: a vector containing names of columns that are between-subjects variables
## withinvars: a vector containing names of columns that are within-subjects variables
## idvar: the name of a column that identifies each subject (or matched subjects)
## conf_interval: the percent range of the confidence interval (default is 95%)
def summarySEwithin(data, measurevar, betweenvars=[], withinvars=[], idvar=[], conf_interval=.95):
# Get the means from the un-normed data
datac = summarySE(data, measurevar, groupvars=betweenvars+withinvars, conf_interval=conf_interval)
for e in measurevar:
del datac[e+"_std"]
del datac[e+"_stde"]
del datac[e+"_ci"]
del datac[e+"_ciUp"]
del datac[e+"_ciDown"]
# Norm each subject's data
ndata = normDataWithin(data, idvar, measurevar, betweenvars)
# This is the name of the new columns
measurevar_n = [x+"_norm" for x in measurevar]+measurevar
# Collapse the normed data - now we can treat between and within vars the same
ndatac = summarySE(ndata, measurevar_n, groupvars=betweenvars+withinvars,
conf_interval=conf_interval)
# Apply correction from Morey (2008) to the standard error and confidence interval
# Get the product of the number of conditions of within-S variables
nWithinGroups = 1
for v in withinvars:
nWithinGroups = nWithinGroups*len(ndatac[v].unique())
correctionFactor = np.sqrt( nWithinGroups / (nWithinGroups-1) )
# Apply the correction factor
for m in measurevar_n:
ndatac[m+"_std"] = ndatac[m+"_std"] * correctionFactor
ndatac[m+"_stde"] = ndatac[m+"_stde"] * correctionFactor
ndatac[m+"_ci"] = ndatac[m+"_ci"] * correctionFactor
return ndatac
@timokvamme
Copy link

Awesome

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment