PCA Обнаружение аномалий и определение того, какая переменная в кадре данных действительно способствует ненормальному поведению в основном компоненте-1 на уровне наблюдения. Пример показан в ссылке для наблюдения 34 на последнем изображении.https://pubs.rsc.org/en/content/articlehtml/2014/ay/c3ay41907j#tab1.
Итак, я применил PCA к данным Wine, где данные состоят из типа вина и различных характеристик вина соответственно.Как только мы применяем pca к данным, у нас есть PC1, PC2 и множество компонентов.когда мы наносим на график PC1, мы видим для конкретного наблюдения (например, Наблюдение 34), мы получаем паттерн аномалии.поэтому нам нужно понять, почему, какая переменная является причиной, это точка данных аномалии.до сих пор я вычислял загрузку, которая дает общий вклад переменной, где я хочу для каждого переменного вклада каждого столбца соответственно
# Creating Wine data frame for PCA Anomaly Contribution plot
rm(list=ls())
Wine_Type <- c("ARG-BNS1", "ARG-DDA1", "ARG-FFL1", "ARG-FLM1", "ARG-ICR1", "ARG-SAL1", "AUS-CAV1", "AUS-EAG1", "AUS-HAR1", "AUS-IB41", "AUS-KIL1", "AUS-KIR1", "AUS-NUG1", "AUS-SOC1", "AUS-TGH1", "AUS-VAF1", "AUS-WBL1", "AUS-WES1", "CHI-CDD1", "CHI-CDM1", "CHI-CMO1", "CHI-CSU1", "CHI-GNE1", "CHI-IND1", "CHI-LJO1", "CHI-S151", "CHI-SCH1", "CHI-SHE1", "CHI-SUN1", "CHI-UND1", "CHI-UTA1", "CHI-VDA1", "CHI-VDS1", 'SOU-HHI1', "SOU-INS1", "SOU-KWV1", 'SOU-NED1', 'SOU-PDM1', "SOU-ROO1", "SOU-RW21", "SOU-SAV1", "SOU-SIM1", "SOU-SPI1", "SOU-SRE1")
Ethanol <- c(13.6199999, 14.0600004, 13.7399998, 13.9499998, 14.4700003, 14.6099997, 13.6499996, 14.1199999, 13.1300001, 13.49, 15.0900002, 14.6300001, 13.6300001, 13.6700001, 14.4300003, 13.4499998, 13.8299999, 13.8500004, 13.9700003, 12.8400002, 14.1899996, 14.1300001, 13.6599998, 14.2700005, 13.8400002, 13.5799999, 13.2299995, 13.6099997, 13.7700005, 13.6199999, 13.5100002, 12.79, 14.6300001, 13.96, 14.0500002, 14.0200005, 13.9799995, 14.5, 13.8999996, 14.2200003, 13.8000002, 14.4499998, 14.3599997, 14.25)
Total_acid <- c(3.53999996, 3.74000001, 3.26999998, 3.66000009, 3.66000009, 3.45000005, 4.30999994, 3.88000011, 3.82999992, 3.69, 3.98000002, 4.78000021, 4.63999987, 3.86999989, 4.51000023, 4.34000015, 4.21999979, 4.15999985, 3.53999996, 3.22000003, 3.4000001, 3.61999989, 3.07999992, 3.43000007, 3.04999995, 3.24000001, 3.03999996, 3.07999992, 3.36999989, 3.49000001, 3.42000008, 2.99000001, 3.1400001, 4.55000019, 3.5, 3.88000011, 4.36000013, 4.42000008, 4.0999999, 4.05999994, 3.58999991, 3.83999991, 3.81999993, 3.56999993)
Volatile_acid<- c(0.289999992, 0.589999974, 0.469999999, 0.469999999, 0.379999995, 0.519999981, 0.319999993, 0.370000005, 0.270000011, 0.32, 0.469999999, 0.430000007, 0.709999979, 0.540000021, 0.400000006, 0.460000008, 0.330000013, 0.360000014, 0.289999992, 0.340000004, 0.349999994, 0.330000013, 0.280000001, 0.439999998, 0.25999999, 0.239999995, 0.340000004, 0.280000001, 0.300000012, 0.209999993, 0.319999993, 0.370000005, 0.439999998, 1.02999997, 0.379999995, 0.409999996, 0.430000007, 0.360000014, 0.649999976, 0.419999987, 0.349999994, 0.340000004, 0.319999993, 0.479999989)
Malic_acid <- c(0.889999986, 0.239999995, -0.07, 0.090000004, 0.610000014, 0.159999996, 0.180000007, 0.360000014, 0.400000006, 0.5, 0.439999998, 0.519999981, 0.189999998, 0.159999996, 0.310000002, 0.469999999, 0.49000001, 0.170000002, 0.479999989, 0.419999987, 0.460000008, 0.310000002, 0.419999987, 0.449999988, 0.469999999, 0.529999971, 0.419999987, 0.50999999, 0.319999993, 0.449999988, 0.540000021, 0.289999992, 0.379999995, -0.620000005, 0.479999989, 0.449999988, 0.280000001, 0.529999971, 0.189999998, 0.280000001, 0.550000012, 0.709999979, 0.50999999, 0.389999986)
pH <- c(3.71000004, 3.73000002, 3.86999989, 3.78999996, 3.70000005, 3.92000008, 3.5999999, 3.70000005, 3.66000009, 3.7, 3.67000008, 3.52999997, 3.6400001, 3.6500001, 3.70000005, 3.48000002, 3.54999995, 3.53999996, 3.6400001, 3.71000004, 3.72000003, 3.6400001, 3.68000007, 3.75999999, 3.71000004, 3.61999989, 3.76999998, 3.6400001, 3.68000007, 3.57999992, 3.63000011, 3.70000005, 3.75, 3.88000011, 3.81999993, 3.68000007, 3.66000009, 3.61999989, 3.66000009, 3.63000011, 3.82999992, 3.75999999, 3.8499999, 3.70000005)
Latic_acid <- c(0.779999971, 1.25, 1.13, 1, 0.810000002, 1.75999999, 1.37, 1.01999998, 1.13, 1.02, 1.00999999, 0.839999974, 1.78999996, 1.22000003, 1.49000001, 1.16999996, 1.13, 0.959999979, 0.779999971, 1.19000006, 0.850000024, 0.819999993, 0.910000026, 0.790000021, 0.800000012, 0.829999983, 1.08000004, 0.939999998, 0.930000007, 0.800000012, 0.839999974, 1.26999998, 0.939999998, 2.95000005, 1.13999999, 1.00999999, 1.44000006, 0.980000019, 1.17999995, 1.25, 1.13999999, 1.00999999, 1.02999997, 0.949999988)
Rest_sugar <- c(1.46000004, 2.42000008, 1.51999998, 4.17000008, 1.25, 1.39999998, 3.79999995, 4.32000017, 3.99000001, 6.4, 1.05999994, 1.20000005, 1.45000005, 0.620000005, 5.61999989, 1.27999997, 1.19000006, 2.58999991, 1.14999998, 1.37, 1.75, 1.77999997, 4.23999977, 1.51999998, 2.07999992, 2.45000005, 1.05999994, 4.0999999, 2.74000001, 1.37, 1.14999998, 1.15999997, 2.19000006, 2.3599999, 1.60000002, 1.45000005, 1.01999998, 0.75999999, 1.35000002, 1.53999996, 2.27999997, 2.11999989, 2.68000007, 1.92999995)
Citric_acid <- c(0.310000002, 0.180000007, 0.389999986, 0.409999996, 0.140000001, 0.100000001, 0.239999995, 0.319999993, 0.340000004, 0.13, -0.039999999, -0.050000001, 0.159999996, 0.370000005, 0.430000007, -0.01, 0.090000004, 0.200000003, 0.119999997, 0.119999997, 0.170000002, 0.25999999, 0.100000001, 0.059999999, 0.209999993, 0.25999999, 0.209999993, 0.059999999, 0.300000012, 0.25999999, 0.090000004, 0.200000003, 0.200000003, 0.25, -0.039999999, 0.07, 0.059999999, 0.159999996, 0.079999998, 0.090000004, 0.079999998, 0.200000003, -0.02, 0.07)
Co2 <- c(85.6100006, 175.199997, 513.73999, 379.399994, 154.880005, 156.300003, 462.619995, 244.149994, 212, 419.38, 48.0200005, 154.820007, 243.960007, 563.400024, 347.880005, 263.459991, 288.970001, 272.190002, 210.529999, 338.869995, 245.070007, 183.699997, 353.529999, 247.289993, 399.070007, 475.890015, 603.320007, 400.470001, 180.139999, 495.329987, 388.079987, 390.149994, 228.389999, 282.73999, 510.079987, 243.580002, 452.959991, 184.940002, 183.059998, 246.339996, 297.540009, 104.269997, 510.089996, 260.079987)
Density <- c(0.99000001, 1, 0.99000001, 1, 0.99000001, 0.99000001, 1, 1, 1, 1, 0.99000001, 1, 1, 0.99000001, 1, 0.99000001, 0.99000001, 1, 0.99000001, 1, 0.99000001, 0.99000001, 1, 0.99000001, 0.99000001, 0.99000001, 0.99000001, 1, 0.99000001, 0.99000001, 0.99000001, 0.99000001, 0.99000001, 0.99000001, 1, 0.99000001, 0.99000001, 0.99000001, 0.99000001, 0.99000001, 1, 1, 1, 0.99000001)
Total.polyphenol.index <- c(60.9199982, 70.6399994, 63.5900002, 73.3000031, 71.6900024, 71.7900009, 59.5999985, 59.5, 59.4199982, 63.86, 70.0999985, 72.3700027, 55.0699997, 63.0400009, 63.5200005, 62.6899986, 59.0800018, 83.5100021, 64.3099976, 53.0999985, 66.8199997, 64.8300018, 52.1599998, 63.75, 56.5499992, 53.1300011, 54.8899994, 54.2599983, 64.8000031, 58.8300018, 58.8100014, 50.4399986, 61.9399986, 44.6800003, 62.5800018, 60.8699989, 62.3499985, 60.1300011, 58.4700012, 57.0800018, 60.3600006, 68.6800003, 67.7699966, 57.8600006)
Glycerol <- c(9.72000027, 10.0500002, 10.9200001, 9.68999958, 10.8100004, 10.1899996, 10.6599998, 11.0699997, 8.89000034, 10.35, 11.4300003, 11.6400003, 9.59000015, 11.2799997, 10.9300003, 9.46000004, 11.1000004, 10.4499998, 10.5799999, 8.80000019, 10.1099997, 9.85000038, 9.53999996, 9.93000031, 9.47999954, 9.32999992, 9.02000046, 9.38000011, 10.3299999, 9.86999989, 9.76000023, 8.11999989, 10.0500002, 8.22999954, 10.1000004, 10.6099997, 10.6199999, 12.5200005, 11.7200003, 10.2399998, 10.4700003, 11.1800003, 10.5799999, 9.93999958)
Methanol <- c(0.159999996, 0.200000003, 0.180000007, 0.230000004, 0.200000003, 0.189999998, 0.25, 0.25, 0.230000004, 0.26, 0.189999998, 0.280000001, 0.25, 0.140000001, 0.300000012, 0.180000007, 0.219999999, 0.239999995, 0.180000007, 0.170000002, 0.180000007, 0.219999999, 0.180000007, 0.209999993, 0.189999998, 0.150000006, 0.150000006, 0.170000002, 0.170000002, 0.189999998, 0.180000007, 0.129999995, 0.150000006, 0.170000002, 0.25, 0.200000003, 0.219999999, 0.219999999, 0.180000007, 0.219999999, 0.239999995, 0.209999993, 0.270000011, 0.200000003)
Tartaric.acid <- c(1.74000001, 1.58000004, 1.24000001, 2.25999999, 1.22000003, 0.899999976, 1.80999994, 1.64999998, 2.11999989, 1.81, 1.47000003, 2.11999989, 1.36000001, 1.00999999, 1.80999994, 2.13000011, 1.54999995, 2.47000003, 1.72000003, 1.85000002, 1.48000002, 1.83000004, 1.38, 1.48000002, 1.65999997, 1.63, 1.55999994, 1.41999996, 1.62, 1.77999997, 1.29999995, 1.46000004, 1.30999994, 0.910000026, 1.22000003, 1.30999994, 1.76999998, 1.55999994, 1.40999997, 1.5, 1.70000005, 1.52999997, 2.01999998, 1.42999995)
country <- c("Argentina","Argentina","Argentina","Argentina","Argentina","Argentina","Australia", "Australia","Australia","Australia","Australia","Australia","Australia","Australia","Australia", "Australia","Australia","Australia","Chile","Chile","Chile","Chile","Chile","Chile","Chile","Chile", "Chile","Chile","Chile","Chile","Chile","Chile","Chile","South_Africa","South_Africa","South_Africa", "South_Africa","South_Africa","South_Africa","South_Africa","South_Africa","South_Africa", "South_Africa","South_Africa")
wdata <- data.frame(Wine_Type,Ethanol,Total_acid,Volatile_acid,Malic_acid,pH,Latic_acid,Rest_sugar,Citric_acid,Co2,Density,Total.polyphenol.index,Glycerol,Methanol,Tartaric.acid,country)
wdata
wdata$country <- as.factor(wdata$country)
df_index <- wdata[,c("Wine_Type","country")] # removing column which are not for PCA
dx <- wdata[,!names(wdata) %in% names(df_index)]
rownames(dx) <- df_index$Wine_Type
# Normalising data for PCA Analysis
scaleContinuous = function(data) {
data <- data[,sapply(data, function(x) is.numeric(x))]
data <- as.data.frame(data)
binary = apply(data, 2, function(x) {all(x %in% 0:1)})
data[!binary] = scale(data[!binary])
return(data)
}
dx <- as.data.frame(dx)
dx <- scaleContinuous(dx)
# Performing PCA
pca_df <- princomp(dx, cor = T)
summary(pca_df)
# Getting Score & Loading from PCA Attributes
score_df <- as.data.frame(pca_df$scores)
loading_df <- as.matrix(pca_df$loadings)
Визуализация ПК1 и ПК2 и загрузка для ПК1 и ПК2
plot(as.ts(score_df$Comp.1),main="First Principal Component", type = "l", col = "brown", lwd = 3)
plot(as.ts(score_df$Comp.2),main="Second Principal Component", type = "l", col = "red", lwd = 3)
my_vector1=loading_df[,1]
my_vector2=loading_df[,2]
names(my_vector1)= rownames(loading_df)
names(my_vector2)= rownames(loading_df)
library(RColorBrewer)
coul = brewer.pal(9, "Paired")
Участок для ПК1 и ПК2.
# par(mfrow=c(2,1))
a1=barplot(my_vector1, col=coul , las=1, names.arg="",main = paste("Weight/loading For PC1"))
text(a1[,1], 0.2 ,srt = 90, adj= 1, xpd = TRUE, labels = names(dx) , cex=1.0)
a2 = barplot (my_vector2, col = coul, las =1, names.arg = "", main = paste ("Weight / Loading For PC2")) текст (a2 [, 1], 0.2, srt = 90, adj = 1, xpd = TRUE, label = names (dx), cex = 1,0)
Таким образом, ожидаемый результат - вычисление вклада каждой переменной для выявленной аномалии в PC1 при каждом наблюдении.Для справки прилагается ссылка, в которой Hotelling T2 используется для ее идентификации. https://pubs.rsc.org/en/content/articlehtml/2014/ay/c3ay41907j#tab1
Итак, ожидается: как вычислитьНоминальная оценка Т2 или график вклада для Наблюдения № 34, как показано на ПК1 Линейный график с использованием формул, выделенных красной меткой на последнем изображении, и для дополнительной ссылки также прилагается соответственно.
Basically how to compute the below Equation shown in image