Я очень поздно опубликовал решение, которое помогло мне. Я уверен, что есть улучшения, которые можно сделать, поэтому, пожалуйста, не стесняйтесь комментировать!
Цель этого упражнения состояла в том, чтобы понять, в какой степени линейное преобразование рейтингов предложений повлияет на выбор предложений. Идея состояла в том, чтобы попытаться отделить «качество предложения» от «предвзятого отношения» и «систематического отклонения».
Одним из способов сделать это, по сути, является центрирование всех оценок по среднему значению панели, а затем выполнить преобразование среднее значение / sd для оценок, центрированных на панели, с использованием общего среднего значения и sd для всех оценок. Эта процедура в функции adj.scores
.
Это нетривиально, поскольку предложения оцениваются людьми, и может быть большое количество финансовых стимулов, основанных на успешной оценке предложений (гранты, контракты и т. Д.).
Любые мысли об улучшениях или конкурирующих стратегиях приветствуются.
####################
########## proposal ratings project
########## 17 June 2011
########## original code by: jjb with help from es
##########------ functions to be read in and called when desired
########## applying this function to a single matrix will give detailed output
########## calculating generalizability theory components
########## not a very robust formulation, but a good place to start
########## for future, put panel facet on this design
g.pxr.long = function(x)
{
m.raters <<- colMeans(x)
n.raters <<- length(m.raters)
m.props <<- rowMeans(x)
n.props <<- length(m.props)
m.total <<- mean(x)
n.total <<- nrow(x)*ncol(x)
m.raters.2 <<- m.raters^2
m.props.2 <<- m.props^2
sum.m.raters.2 <<- sum(m.raters.2)
sum.m.props.2 <<- sum(m.props.2)
ss.props <<- n.raters*(sum.m.props.2) - n.total*(m.total^2)
ss.raters <<- n.props*(sum.m.raters.2) - n.total*(m.total^2)
ss.pr <<- sum(x^2) - n.raters*(sum.m.props.2) - n.props*(sum.m.raters.2) + n.total*(m.total^2)
df.props <- n.props - 1
df.raters <- n.raters - 1
df.pr <- df.props*df.raters
ms.props <- ss.props / df.props
ms.raters <- ss.raters / df.raters
ms.pr <- ss.pr / df.pr
var.p <- (ms.props - ms.pr) / n.raters
var.r <- (ms.raters - ms.pr) / n.props
var.pr <- ms.pr
out.1 <- as.data.frame( matrix(c( df.props, df.raters, df.pr,
ss.props, ss.raters, ss.pr,
ms.props, ms.raters, ms.pr,
var.p, var.r, var.pr), ncol = 4))
out.2 <- as.data.frame(matrix(c("p", "r", "pr" ), ncol = 1))
g.out.1 <- as.data.frame(cbind(out.2, out.1))
colnames(g.out.1) <- c(" source", " df ", " ss ", " ms ", " variance")
var.abs <- (var.r / n.raters) + (var.pr / n.raters)
var.rel <- (var.pr / n.raters)
var.xbar <- (var.p / n.props) + (var.r / n.raters) + (var.pr / (n.raters*n.props) )
gen.coef <- var.p / (var.p + var.rel)
dep.coef <- var.p / (var.p + var.abs)
out.3 <- as.data.frame(matrix(c(var.rel, var.abs, var.xbar, gen.coef, dep.coef), ncol=1))
out.3 <- round(out.3, digits = 4)
out.4 <- as.data.frame(matrix(c("relative error variance", "absolute error variance", "xbar error variance", "E(rho^2)", "Phi"), ncol=1))
g.out.2 <- as.data.frame(cbind(out.4,out.3))
colnames(g.out.2) <- c(" estimate ", " value")
outs <- list(g.out.1, g.out.2)
names(outs) <- c("generalizability theory: G-study components", "G-study Indices")
return(outs)
}
##########----- calculating confidence intervals using Chebychev, Cramer, and Normal
##########----- use this to find alpha for desired k
factor.choose = function(k)
{
alpha <- 1 / k^2
return(alpha)
}
conf.intervals = function(dat, alpha)
{
k <- sqrt( 1 / alpha ) # specifying what the k factor is...
x.bar <- mean(dat)
x.sd <- sd(dat)
n.obs <- length(dat)
sem <- x.sd / sqrt(n.obs)
ub.cheb <- x.bar + k*sem
lb.cheb <- x.bar - k*sem
ub.crem <- x.bar + (4/9)*k*sem
lb.crem <- x.bar - (4/9)*k*sem
ub.norm <- x.bar + qnorm(1-alpha/2)*sem
lb.norm <- x.bar - qnorm(1-alpha/2)*sem
out.lb <- matrix(c(lb.cheb, lb.crem, lb.norm), ncol=1)
out.ub <- matrix(c(ub.cheb, ub.crem, ub.norm), ncol=1)
dat <- as.data.frame(dat)
mean.raters <- as.matrix(rowMeans(dat))
dat$z.values <- matrix((mean.raters - x.bar) / x.sd)
select.cheb <- dat[ which(abs(dat$z.values) >= k ) , ]
select.crem <- dat[ which(abs(dat$z.values) >= (4/9)*k ) , ]
select.norm <- dat[ which(abs(dat$z.values) >= qnorm(1-alpha/2)) , ]
count.cheb <- nrow(select.cheb)
count.crem <- nrow(select.crem)
count.norm <- nrow(select.norm)
out.dat <- list()
out.dat <- list(select.cheb, select.crem, select.norm)
names(out.dat) <- c("Selected cases: Chebychev", "Selected cases: Cramer's", "Selected cases: Normal")
out.sum <- matrix(c(x.bar, x.sd, n.obs), ncol=3)
colnames(out.sum) <- c("mean", "sd", "n")
out.crit <- matrix(c(alpha, k, (4/9)*k, qnorm(1-alpha/2)) ,ncol=4 )
colnames(out.crit) <- c("alpha", "k", "(4/9)*k", "z" )
out.count <- matrix(c(count.cheb, count.crem, count.norm) ,ncol=3 )
colnames(out.count) <- c("Chebychev", "Cramer" , "Normal")
rownames(out.count) <- c("count")
outs <- list(out.sum, out.crit, out.count, out.dat)
names(outs) <- c("Summary of data", "Critical values", "Count of Selected Cases", "Selected Cases")
return(outs)
}
rm(list = ls())
set.seed(271828)
means <- matrix(c( rep(50,19), rep(70,1) ), ncol = 1) # matrix of true proposal values
bias.u <- matrix(c(0,2,4), nrow=1) # unidirectional bias
bias.b <- matrix(c(0,5, -5), nrow=1) # bidirectional bias
ones.u <- matrix(rep(1,ncol(bias.u)), nrow = 1) # number of raters is the number of columns (r)
ones.b <- matrix(rep(1,ncol(bias.b)), nrow = 1)
ones.2 <- matrix(rep(1,nrow(means)), ncol = 1) # number of proposals is the number of rows (p)
true.ratings <- means%*%ones.u # gives matrix of true proposal value for each rater (p*r)
uni.bias <- ones.2%*%bias.u
bid.bias <- ones.2%*%bias.b # gives matrix of true rater bias for each proposal (p*r)
pan.bias.pos <- matrix(20,nrow=nrow(true.ratings), ncol=ncol(true.ratings)) # gives a matrix of bias for every member in a panel (p*r)
n.val <- nrow(true.ratings)*ncol(true.ratings)
# true.ratings
# uni.bias
# bid.bias
# pan.bias.pos
library(MASS)
#####
##### generating replicate data...
#####
n.panels <- 10 # put in the number of replicate panels that should be produced
reps <- 2 # put in the number of times each bias condition should be included in a panel
t.reps <- list()
n.bias <- list()
u.bias <- list()
b.bias <- list()
pan.bias <- list()
n.bias.out <- list()
u.bias.out <- list()
b.bias.out <- list()
pan.bias.out <- list()
for (i in 1:n.panels)
{
{
for(j in 1:reps)
n.bias[[j]] <- true.ratings + matrix(round(rnorm(n.val,4,2), digits=0), nrow = nrow(means))
for(j in 1:reps)
u.bias[[j]] <- true.ratings + uni.bias + matrix(round(rnorm(n.val,4,2), digits=0), nrow = nrow(means))
for(j in 1:reps)
b.bias[[j]] <- true.ratings + bid.bias + matrix(round(rnorm(n.val,4,2), digits=0), nrow = nrow(means))
}
pan.bias[[i]] <- true.ratings + pan.bias.pos + matrix(round(rnorm(n.val,4,2), digits=0), nrow = nrow(means))
n.bias.out[[i]] <- n.bias
u.bias.out[[i]] <- u.bias
b.bias.out[[i]] <- b.bias
t.reps[[i]] <- c(n.bias, u.bias, b.bias, pan.bias[i])
}
# t.reps
# rm(list = ls())
##########----- this is the standardization formula to be applied to proposal ratings **WITHIN** a panel
adj.scores <- function(x, tot.dat)
{
t.sd <- sd(array(tot.dat))
t.mn <- mean(array(tot.dat))
ones.t.mn <- diag(1,ncol(x))
p <- nrow(x)
r <- ncol(x)
ones.total <- matrix(1,p,r)
r.sd <- diag(apply(x,2, sd))
r.mn <- diag(apply(x,2, mean))
den.r.sd <- ginv(r.sd)
b.shift <- x%*%den.r.sd
a <- t.mn*ones.t.mn - den.r.sd%*%r.mn
a.shift <- ones.total%*%a
l.x <- b.shift + a.shift
return(l.x)
}
##########----- applying standardization formula and getting
##########----- proposal means for adjusted and unadjusted ratings
adj.rep <- list()
m.adj <- list()
m.reg <- list()
for (i in 1:n.panels)
{
panel.data <- array(unlist(t.reps[[i]]))
adj.rep[[i]] <- lapply(t.reps[[i]], adj.scores, tot.dat = panel.data)
m.adj[[i]] <- lapply(adj.rep[[i]], rowMeans)
m.reg[[i]] <- lapply(t.reps[[i]], rowMeans)
}
##########-----
##########----- This function extracts the replicate proposal means for each set of reviews
##########----- across panels. So, if there are 8 sets of reviewers that are simulated 10 times,
##########----- this extracts the first set of means across the 10 replications and puts them together,
##########----- and then extracts the second set of means across replications and puts them together, etc..
##########----- The result will be 8 matrices made up of 10 columns with rows .
##########-----
##########----- first for unadjusted means
means.reg <- matrix(unlist(m.reg), nrow=nrow(means))
sets <- length(m.reg[[1]])
counter <- n.panels*length(m.reg[[1]])
m.reg.panels.set <- list()
for (j in 1:sets)
{
m.reg.panels.set[[j]] <- means.reg[ , c(seq( j, counter, sets))]
}
##########----- now for adjusted means
means.adj <- matrix(unlist(m.adj), nrow=nrow(means))
sets <- length(m.adj[[1]])
counter <- n.panels*length(m.adj[[1]])
m.adj.panels.set <- list()
for (j in 1:sets)
{
m.adj.panels.set[[j]] <- means.adj[ , c(seq( j, counter, sets))]
}
########## calculating msd as bias^2 and error^2
msd.calc <- function(x)
{
true.means <- means
calc.means <- as.matrix(rowMeans(x))
t.means.mat <- matrix(rep(true.means, n.panels), ncol=ncol(x))
c.means.mat <- matrix(rep(calc.means, n.panels), ncol=ncol(x))
msd <- matrix( rowSums( (x - t.means.mat )^2) / ncol(c.means.mat) )
bias.2 <- (calc.means - true.means)^2
e.var <- matrix( rowSums( (x - c.means.mat )^2) / ncol(c.means.mat ) )
msd <- matrix(c(msd, bias.2, e.var), ncol = 3)
colnames(msd) <- c("msd", "bias^2", "e.var")
return(msd)
}
########## checking work...
# msd.calc <- bias.2 + e.var
# all.equal(msd, msd.calc)
##########----- applying function to each set across panels
msd.reg.panels <- lapply(m.reg.panels.set, msd.calc)
msd.adj.panels <- lapply(m.adj.panels.set, msd.calc)
msd.reg.panels
msd.adj.panels
########## for the unadjusted scores, the msd is very large (as is expected)
########## for the adjusted scores, the msd is in line with the other panels
##########----- trying to evaluate impact of adjusting scores on proposal awards
reg.panel.1 <- matrix(unlist(m.reg[[1]]), nrow=nrow(means))
adj.panel.1 <- matrix(unlist(m.adj[[1]]), nrow=nrow(means))
reg <- matrix(array(reg.panel.1), ncol=1)
adj <- matrix(array(adj.panel.1), ncol=1)
panels.1 <- cbind(reg,adj)
colnames(panels.1) <- c("unadjusted", "adjusted")
cor(panels.1, method="spearman")
plot(panels.1)
######## identify(panels.1)
plot(panels.1, xlim = c(25,95), ylim = c(25,95) )
abline(0,1, col="red")
########## the adjustment greatly reduces variances of ratings
########## compare the projection of the panel means onto the respective margins
##########----- examining the selection of the top 10% of the proposals
pro.names <- matrix(seq(1,nrow(reg),1))
df.reg <- as.data.frame(cbind(pro.names, reg))
colnames(df.reg) <- c("pro", "rating")
df.reg$q.pro <- ecdf(df.reg$rating)(df.reg$rating)
df.adj <- as.data.frame(cbind(pro.names, adj))
colnames(df.adj) <- c("pro", "rating")
df.adj$q.pro <- ecdf(df.adj$rating)(df.adj$rating)
awards.reg <- df.reg[ which(df.reg$q.pro >= .90) , ]
awards.adj <- df.adj[ which(df.adj$q.pro >= .90) , ]
awards.reg[order(-awards.reg$q.pro) , ]
awards.adj[order(-awards.adj$q.pro) , ]
awards.reg[order(-awards.reg$pro) , ]
awards.adj[order(-awards.adj$pro) , ]
##### using unadjusted scores, the top 10% of proposals come mostly from one (biased) panel, and the rest are made up of the
##### highest scoring proposal (from the biased rater) from the remaining panels.
##### using the adjusted scores, the top 10% of proposals are made up of the highest scoring proposal (the biased rater) from the
##### several panels, and a few proposals from other panels. Doesn't seem to be a systematic explanation as to why...
#########----- treating rating data in an ANOVA model
ratings <- do.call("rbind", t.reps[[1]] )
panels <- factor(gl(7,20))
fit <- manova(ratings ~ panels)
summary(fit, test="Wilks")
adj.ratings <- do.call("rbind", adj.rep[[1]] )
adj.fit <- manova(adj.ratings ~ panels)
summary(adj.fit, test="Wilks")
##########----- thinking... extra ideas for later
##########----- calculating Mahalanobis distance to identify biased panels
mah.dist = function(dat)
{md.dat <- as.matrix(mahalanobis(dat, center = colMeans(dat) , cov = cov(dat) ))
md.pv <- as.matrix(pchisq(md.dat, df = ncol(dat), lower.tail = FALSE))
out <- cbind(md.dat, md.pv)
out.2 <- as.data.frame(out)
colnames(out.2) <- c("MD", "pMD")
return(out.2)
}
mah.dist(ratings)
mah.dist(adj.ratings)