Simple Linear Regression

1: In library(package, lib.loc = lib.loc, character.only = TRUE, logical.return = TRUE, :
there is no package called ‘fOptions’
2: In library(package, lib.loc = lib.loc, character.only = TRUE, logical.return = TRUE, :
there is no package called ‘fOptions’
> a = 3
> b = 4
> sqrt(a ^ 2 + b ^ 2)
[1] 5
>
> plot(sqrt(a ^ 2 + b ^ 2))
> library("ggplot2", lib.loc="~/R/win-library/3.6")
> head(mpg, n = 10)
# A tibble: 10 x 11
manufacturer model displ year cyl trans
<chr> <chr> <dbl> <int> <int> <chr>
1 audi a4 1.8 1999 4 auto~
2 audi a4 1.8 1999 4 manu~
3 audi a4 2 2008 4 manu~
4 audi a4 2 2008 4 auto~
5 audi a4 2.8 1999 6 auto~
6 audi a4 2.8 1999 6 manu~
7 audi a4 3.1 2008 6 auto~
8 audi a4 q~ 1.8 1999 4 manu~
9 audi a4 q~ 1.8 1999 4 auto~
10 audi a4 q~ 2 2008 4 manu~
# ... with 5 more variables: drv <chr>,
# cty <int>, hwy <int>, fl <chr>,
# class <chr>
> The function str() will display the “structure” of the data frame
Error: unexpected 'function' in "The function"
> str(mpg)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 234 obs. of 11 variables:
$ manufacturer: chr "audi" "audi" "audi" "audi" ...
$ model : chr "a4" "a4" "a4" "a4" ...
$ displ : num 1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
$ year : int 1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
$ cyl : int 4 4 4 4 6 6 6 4 4 4 ...
$ trans : chr "auto(l5)" "manual(m5)" "manual(m6)" "auto(av)" ...
$ drv : chr "f" "f" "f" "f" ...
$ cty : int 18 21 20 21 16 18 18 18 16 20 ...
$ hwy : int 29 29 31 30 26 26 27 26 25 28 ...
$ fl : chr "p" "p" "p" "p" ...
$ class : chr "compact" "compact" "compact" "compact" ...
>
> Central Tendency
Error: unexpected symbol in "Central Tendency"
> mean(mpg$cty)
[1] 16.85897
> median(mpg$cty)
[1] 17
> var(mpg$cty)
[1] 18.11307
> sd(mpg$cty)
[1] 4.255946
> IQR(mpg$cty)
[1] 5
> min(mpg$cty)
[1] 9
> max(mpg$cty)
[1] 35
> range(mpg$cty)
[1] 9 35
> table(mpg$drv)

4 f r
103 106 25
> table(mpg$drv) / nrow(mpg)

4 f r
0.4401709 0.4529915 0.1068376
>
> To visualize the data:
> • Histograms
> • Barplots
> • Boxplots
> • Scatterplots

> Simple Linear Regression

Error: unexpected symbol in "Simple Linear"
> View(cars)
> plot(dist ~ speed, data = cars,
+ xlab = "Speed (in Miles Per Hour)",
+ ylab = "Stopping Distance (in Feet)",
+ main = "Stopping Distance vs Speed",
+ pch = 20,
+ cex = 2,
+ col = "grey")
>
> In the cars example, we are interested in using the predictor variable speed to predict and explain the
Error: unexpected symbol in "In the"
> response variable dist.
> $Y = f(X) + \epsilon$
> • Response = Prediction + Error
> • Response = Signal + Noise
> • Response = Model + Unexplained
> • Response = Deterministic + Random
> • Response = Explainable + Unexplainable
> we will store the response variable as y and the
Error: unexpected symbol in "we will"
> predictor variable as x
> x = cars$speed
> y = cars$dist
> We then calculate the three sums of squares
> Sxy = sum((x - mean(x)) * (y - mean(y)))
> Sxx = sum((x - mean(x)) ^ 2)
> Syy = sum((y - mean(y)) ^ 2)
> c(Sxy, Sxx, Syy)
[1] 5387.40 1370.00 32538.98
> Then finally calculate $\hat{\beta}_0$ and $\hat{\beta}_1$.
> beta_1_hat = Sxy / Sxx
> beta_0_hat = mean(y) - beta_1_hat * mean(x)
> c(beta_0_hat, beta_1_hat)
[1] -17.579095 3.932409
> $\hat{y} = \hat{\beta}_0 + \hat{\beta}_1 x$
> unique(cars$speed)
[1] 4 7 8 9 10 11 12 13 14 15 16 17 18 19
[15] 20 22 23 24 25
>
> SST = sum((y - mean(y)) ^ 2)
> SSReg = sum((y_hat - mean(y)) ^ 2)
> SSE = sum((y - y_hat) ^ 2)
> c(SST = SST, SSReg = SSReg, SSE = SSE)
> plot(dist ~ speed, data = cars,
+ xlab = "Speed (in Miles Per Hour)",
+ ylab = "Stopping Distance (in Feet)",
+ main = "Stopping Distance vs Speed",
+ pch = 20,
+ cex = 2,
+ col = "grey")
> abline(stop_dist_model, lwd = 3, col = "darkorange")

Error in abline(stop_dist_model, lwd = 3, col = "darkorange") :
object 'stop_dist_model' not found
> abline(stop_dist_model, lwd = 3, col = "darkorange")
Error in abline(stop_dist_model, lwd = 3, col = "darkorange") :
object 'stop_dist_model' not found
> stop_dist_model = lm(dist ~ speed, data = cars)
> stop_dist_model = lm(dist ~ speed, data = cars)
> abline(stop_dist_model, lwd = 3, col = "darkorange")
> c(beta_0_hat, beta_1_hat)
[1] -17.579095 3.932409
>
> abline(stop_dist_model, lwd = 3, col = "darkorange")
> names(stop_dist_model)
[1] "coefficients" "residuals"
[3] "effects" "rank"
[5] "fitted.values" "assign"
[7] "qr" "df.residual"
[9] "xlevels" "call"
[11] "terms" "model"
>
> stop_dist_model$residuals
1 2 3 4
3.849460 11.849460 -5.947766 12.052234
5 6 7 8
2.119825 -7.812584 -3.744993 4.255007
9 10 11 12
12.255007 -8.677401 2.322599 -15.609810
13 14 15 16
-9.609810 -5.609810 -1.609810 -7.542219
17 18 19 20
0.457781 0.457781 12.457781 -11.474628
21 22 23 24
-1.474628 22.525372 42.525372 -21.407036
25 26 27 28
-15.407036 12.592964 -13.339445 -5.339445
29 30 31 32
-17.271854 -9.271854 0.728146 -11.204263
33 34 35 36
2.795737 22.795737 30.795737 -21.136672
37 38 39 40
-11.136672 10.863328 -29.069080 -13.069080
41 42 43 44
-9.069080 -5.069080 2.930920 -2.933898
45 46 47 48
-18.866307 -6.798715 15.201285 16.201285
49 50
43.201285 4.268876
> coef(stop_dist_model)
(Intercept) speed
-17.579095 3.932409
>
> resid(stop_dist_model)
1 2 3 4
3.849460 11.849460 -5.947766 12.052234
5 6 7 8
2.119825 -7.812584 -3.744993 4.255007
9 10 11 12
12.255007 -8.677401 2.322599 -15.609810
13 14 15 16
-9.609810 -5.609810 -1.609810 -7.542219
17 18 19 20
0.457781 0.457781 12.457781 -11.474628
21 22 23 24
-1.474628 22.525372 42.525372 -21.407036
25 26 27 28
-15.407036 12.592964 -13.339445 -5.339445
29 30 31 32
-17.271854 -9.271854 0.728146 -11.204263
33 34 35 36
2.795737 22.795737 30.795737 -21.136672
37 38 39 40
-11.136672 10.863328 -29.069080 -13.069080
41 42 43 44
-9.069080 -5.069080 2.930920 -2.933898
45 46 47 48
-18.866307 -6.798715 15.201285 16.201285
49 50
43.201285 4.268876
> fitted(stop_dist_model)
1 2 3 4
-1.849460 -1.849460 9.947766 9.947766
5 6 7 8
13.880175 17.812584 21.744993 21.744993
9 10 11 12
21.744993 25.677401 25.677401 29.609810
13 14 15 16
29.609810 29.609810 29.609810 33.542219
17 18 19 20
33.542219 33.542219 33.542219 37.474628
21 22 23 24
37.474628 37.474628 37.474628 41.407036
25 26 27 28
41.407036 41.407036 45.339445 45.339445
29 30 31 32
49.271854 49.271854 49.271854 53.204263
33 34 35 36
53.204263 53.204263 53.204263 57.136672
37 38 39 40
57.136672 57.136672 61.069080 61.069080
41 42 43 44
61.069080 61.069080 61.069080 68.933898
45 46 47 48
72.866307 76.798715 76.798715 76.798715
49 50
76.798715 80.731124
>
> summary(stop_dist_model)

Call:
lm(formula = dist ~ speed, data = cars)

Residuals:
Min 1Q Median 3Q Max
-29.069 -9.525 -2.272 9.215 43.201

Coefficients:
Estimate Std. Error t value
(Intercept) -17.5791 6.7584 -2.601
speed 3.9324 0.4155 9.464
Pr(>|t|)
(Intercept) 0.0123 *
speed 1.49e-12 ***
---
Signif. codes:
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 15.38 on 48 degrees of freedom
Multiple R-squared: 0.6511, Adjusted R-squared: 0.6438
F-statistic: 89.57 on 1 and 48 DF, p-value: 1.49e-12

>
> names(summary(stop_dist_model))
[1] "call" "terms"
[3] "residuals" "coefficients"
[5] "aliased" "sigma"
[7] "df" "r.squared"
[9] "adj.r.squared" "fstatistic"
[11] "cov.unscaled"
>
> summary(stop_dist_model)$r.squared
[1] 0.6510794
> summary(stop_dist_model)$sigma
[1] 15.37959
>
> predict(stop_dist_model, newdata = data.frame(speed = 8))
1
13.88018
> predict(stop_dist_model, newdata = data.frame(speed = c(8, 21, 50)))
1 2 3
13.88018 65.00149 179.04134
> predict(stop_dist_model, newdata = cars)
1 2 3 4
-1.849460 -1.849460 9.947766 9.947766
5 6 7 8
13.880175 17.812584 21.744993 21.744993
9 10 11 12
21.744993 25.677401 25.677401 29.609810
13 14 15 16
29.609810 29.609810 29.609810 33.542219
17 18 19 20
33.542219 33.542219 33.542219 37.474628
21 22 23 24
37.474628 37.474628 37.474628 41.407036
25 26 27 28
41.407036 41.407036 45.339445 45.339445
29 30 31 32
49.271854 49.271854 49.271854 53.204263
33 34 35 36
53.204263 53.204263 53.204263 57.136672
37 38 39 40
57.136672 57.136672 61.069080 61.069080
41 42 43 44
61.069080 61.069080 61.069080 68.933898
45 46 47 48
72.866307 76.798715 76.798715 76.798715
49 50
76.798715 80.731124
>
> predict(stop_dist_model)
1 2 3 4
-1.849460 -1.849460 9.947766 9.947766
5 6 7 8
13.880175 17.812584 21.744993 21.744993
9 10 11 12
21.744993 25.677401 25.677401 29.609810
13 14 15 16
29.609810 29.609810 29.609810 33.542219
17 18 19 20
33.542219 33.542219 33.542219 37.474628
21 22 23 24
37.474628 37.474628 37.474628 41.407036
25 26 27 28
41.407036 41.407036 45.339445 45.339445
29 30 31 32
49.271854 49.271854 49.271854 53.204263
33 34 35 36
53.204263 53.204263 53.204263 57.136672
37 38 39 40
57.136672 57.136672 61.069080 61.069080
41 42 43 44
61.069080 61.069080 61.069080 68.933898
45 46 47 48
72.866307 76.798715 76.798715 76.798715
49 50
76.798715 80.731124
>
> fitted(stop_dist_model)
1 2 3 4
-1.849460 -1.849460 9.947766 9.947766
5 6 7 8
13.880175 17.812584 21.744993 21.744993
9 10 11 12
21.744993 25.677401 25.677401 29.609810
13 14 15 16
29.609810 29.609810 29.609810 33.542219
17 18 19 20
33.542219 33.542219 33.542219 37.474628
21 22 23 24
37.474628 37.474628 37.474628 41.407036
25 26 27 28
41.407036 41.407036 45.339445 45.339445
29 30 31 32
49.271854 49.271854 49.271854 53.204263
33 34 35 36
53.204263 53.204263 53.204263 57.136672
37 38 39 40
57.136672 57.136672 61.069080 61.069080
41 42 43 44
61.069080 61.069080 61.069080 68.933898
45 46 47 48
72.866307 76.798715 76.798715 76.798715
49 50
76.798715 80.731124

R programming statistics

Simple Linear Regression