Вы делаете это неправильно, потому что вы используете все переменные при разработке линейной модели командой model = lm(output ~ ., train)
. Но при построении графика вы изображаете зависимость затрат от результата (как в случаях b и c), а в случае d пытаетесь прогнозировать только по одной переменной — стоимости. График регрессии следует строить между наблюдаемым и прогнозируемым значениями выхода. Для этого можно использовать следующий код:
# lattice provides xyplot(); mosaic supplies panel.lmbands(), which draws the
# regression line with confidence/prediction bands in the plots below.
# NOTE(review): mosaic masks several base/stats functions on load (see the
# startup transcript below); load order is deliberate, keep it as-is.
library(lattice)
library(mosaic)
#> Loading required package: dplyr
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
#> Loading required package: ggformula
#> Loading required package: ggplot2
#> Loading required package: ggstance
#>
#> Attaching package: 'ggstance'
#> The following objects are masked from 'package:ggplot2':
#>
#> geom_errorbarh, GeomErrorbarh
#>
#> New to ggformula? Try the tutorials:
#> learnr::run_tutorial("introduction", package = "ggformula")
#> learnr::run_tutorial("refining", package = "ggformula")
#> Loading required package: mosaicData
#> Loading required package: Matrix
#> Registered S3 method overwritten by 'mosaic':
#> method from
#> fortify.SpatialPolygonsDataFrame ggplot2
#>
#> The 'mosaic' package masks several functions from core packages in order to add
#> additional features. The original behavior of these functions should not be affected by this.
#>
#> Note: If you use the Matrix package, be sure to load it BEFORE loading mosaic.
#>
#> Attaching package: 'mosaic'
#> The following object is masked from 'package:Matrix':
#>
#> mean
#> The following object is masked from 'package:ggplot2':
#>
#> stat
#> The following objects are masked from 'package:dplyr':
#>
#> count, do, tally
#> The following objects are masked from 'package:stats':
#>
#> binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
#> quantile, sd, t.test, var
#> The following objects are masked from 'package:base':
#>
#> max, mean, min, prod, range, sample, sum
# Read the Nerlove electricity cost data set and preview the first five rows.
DF1 <- read.csv(file = "Nerlove.csv", header = TRUE)
head(DF1, n = 5)
#> cost output pl sl pk sk pf sf
#> 1 0.082 2 2.09 0.3164 183 0.4521 17.9 0.2315
#> 2 0.661 3 2.05 0.2073 174 0.6676 35.1 0.1251
#> 3 0.990 4 2.05 0.2349 171 0.5799 35.1 0.1852
#> 4 0.315 4 1.83 0.1152 166 0.7857 32.2 0.0990
#> 5 0.197 5 2.12 0.2300 233 0.3841 28.6 0.3859
# Sequential 60/40 split: first 60% of rows for training, rest for validation.
n_train <- round(nrow(DF1) * 0.60)
train <- DF1[seq_len(n_train), ]
test <- DF1[-seq_len(n_train), ]
# Regress output on all remaining columns of the training set.
model <- lm(output ~ ., data = train)
summary(model)
#>
#> Call:
#> lm(formula = output ~ ., data = train)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -409.65 -112.20 -4.61 94.20 430.76
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 1182.296 1861.876 0.635 0.527
#> cost 146.231 6.127 23.866 < 2e-16 ***
#> pl 14.897 80.142 0.186 0.853
#> sl -2471.312 1930.034 -1.280 0.204
#> pk 1.477 1.011 1.460 0.148
#> sk -869.468 1874.585 -0.464 0.644
#> pf -13.701 2.365 -5.794 1.08e-07 ***
#> sf -947.958 1861.490 -0.509 0.612
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 169.3 on 87 degrees of freedom
#> Multiple R-squared: 0.9111, Adjusted R-squared: 0.904
#> F-statistic: 127.4 on 7 and 87 DF, p-value: < 2.2e-16
# Calibration plot: observed vs. predicted output on the training set,
# with confidence (dashed) and prediction (solid) bands.
fitted_train <- predict(model, newdata = train)
cal_frame <- data.frame(Observed = train$output, Predicted = fitted_train)
xyplot(
  Predicted ~ Observed,
  data = cal_frame,
  pch = 19,
  panel = panel.lmbands,
  band.lty = c(conf = 2, pred = 1)
)
![](https://i.imgur.com/LwHoGbH.png)
# Validation plot: observed vs. predicted output on the hold-out set,
# with confidence (dashed) and prediction (solid) bands.
predicted_test <- predict(model, newdata = test)
val_frame <- data.frame(Observed = test$output, Predicted = predicted_test)
xyplot(
  Predicted ~ Observed,
  data = val_frame,
  pch = 19,
  panel = panel.lmbands,
  band.lty = c(conf = 2, pred = 1)
)
![](https://i.imgur.com/kcadPE0.png)
Создано 2020-01-07 с помощью пакета reprex (v0.3.0)