Part 1: Multiple Linear Regression using R
There are 1253 vehicles in the cars_19 dataset. I am running multiple linear regression (MLR) with TensorFlow purely for demonstration purposes; for such a small dataset, lm() in R is more efficient and more precise.
TensorFlow uses an algorithm that depends on convergence, whereas R computes the closed-form estimates of beta. I will be using 11 features and an intercept, so R will be inverting a 12 x 12 matrix, which is not computationally expensive with today's technology.
The 11 features in the dataset below comprise 7 factor variables and 4 numeric variables.
str(cars_19)
'data.frame': 1253 obs. of 12 variables:
$ fuel_economy_combined: int 21 28 21 26 28 11 15 18 17 15 ...
$ eng_disp : num 3.5 1.8 4 2 2 8 6.2 6.2 6.2 6.2 ...
$ num_cyl : int 6 4 8 4 4 16 8 8 8 8 ...
$ transmission : Factor w/ 7 levels "A","AM","AMS",..: 3 2 6 3 6 3 6 6 6 5 ...
$ num_gears : int 9 6 8 7 8 7 8 8 8 7 ...
$ air_aspired_method : Factor w/ 5 levels "Naturally Aspirated",..: 4 4 4 4 4 4 3 1 3 3 ...
$ regen_brake : Factor w/ 3 levels "","Electrical Regen Brake",..: 2 1 1 1 1 1 1 1 1 1 ...
$ batt_capacity_ah : num 4.25 0 0 0 0 0 0 0 0 0 ...
$ drive : Factor w/ 5 levels "2-Wheel Drive, Front",..: 4 2 2 4 2 4 2 2 2 2 ...
$ fuel_type : Factor w/ 5 levels "Diesel, ultra low sulfur (15 ppm, maximum)",..: 4 3 3 5 3 4 4 4 4 4 ...
$ cyl_deactivate : Factor w/ 2 levels "N","Y": 1 1 1 1 1 2 1 2 2 1 ...
$ variable_valve : Factor w/ 2 levels "N","Y": 2 2 2 2 2 2 2 2 2 2 ...
The factors need to be transformed into a format TensorFlow can understand.
# Feature columns describing the 4 numeric and 7 categorical predictors.
cols <- feature_columns(
  # Numeric predictors are listed by name (columns 2, 3, 5 and 8 of cars_19)
  # so the specification does not depend on the data frame's column order.
  column_numeric(
    c("eng_disp", "num_cyl", "num_gears", "batt_capacity_ah")
  ),
  # One categorical column per factor; num_buckets equals the number of
  # levels reported by str(cars_19).
  # NOTE(review): identity columns are documented for integer ids in
  # [0, num_buckets) -- confirm how the R factors are encoded on the way in.
  column_categorical_with_identity("transmission", num_buckets = 7),
  column_categorical_with_identity("air_aspired_method", num_buckets = 5),
  column_categorical_with_identity("regen_brake", num_buckets = 3),
  column_categorical_with_identity("drive", num_buckets = 5),
  column_categorical_with_identity("fuel_type", num_buckets = 5),
  column_categorical_with_identity("cyl_deactivate", num_buckets = 2),
  column_categorical_with_identity("variable_valve", num_buckets = 2)
)
Create an input function:
# Build the input function for a given subset of the data.
#
# data:       data frame containing the response column
#             "fuel_economy_combined" plus the predictor columns.
# num_epochs: value forwarded to input_fn() as num_epochs (default 1).
#
# Returns whatever input_fn() returns for these settings.
cars_19_input_fn <- function(data, num_epochs = 1) {
  input_fn(
    data,
    # Take the feature names from `data` itself (every column except the
    # response) instead of reading them by position from the global cars_19,
    # so the function only depends on the argument it is given.
    features = setdiff(colnames(data), "fuel_economy_combined"),
    response = "fuel_economy_combined",
    batch_size = 64,
    num_epochs = num_epochs
  )
}
Train, evaluate, predict:
# Linear regression estimator built on the feature columns in `cols`.
model <- linear_regressor(feature_columns = cols)

# Reproducible 75% / 25% train/test split. seq_len() stays correct for an
# empty data frame, and floor() states the intended whole-number sample size
# explicitly (0.75 * nrow(cars_19) is fractional for 1253 rows).
set.seed(123)
indices <- sample(
  seq_len(nrow(cars_19)),
  size = floor(0.75 * nrow(cars_19))
)
train <- cars_19[indices, ]
test <- cars_19[-indices, ]

# train model
# caret (used below for postResample()) also exports train(); qualifying the
# call keeps it bound to the tfestimators generic regardless of attach order.
model %>% tfestimators::train(cars_19_input_fn(train, num_epochs = 1000))

# evaluate model on the held-out rows
model %>% evaluate(cars_19_input_fn(test))

# predict on the held-out rows
yhat <- model %>% predict(cars_19_input_fn(test))
Results are very close to the R closed form estimates:
# Flatten the predictions to a plain vector and pull the observed response
# from the test set before scoring (both steps are part of the full script).
yhat <- unlist(yhat)
y <- test$fuel_economy_combined
postResample(yhat, y)
RMSE Rsquared MAE
2.5583185 0.7891934 1.9381757
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Estimator API used below (feature columns, input_fn, linear_regressor).
library(tfestimators)
# postResample() for RMSE / R-squared / MAE.
library(caret)
# Build the input function for a given subset of the data.
#
# data:       data frame containing the response column
#             "fuel_economy_combined" plus the predictor columns.
# num_epochs: value forwarded to input_fn() as num_epochs (default 1).
#
# Returns whatever input_fn() returns for these settings.
cars_19_input_fn <- function(data, num_epochs = 1) {
  input_fn(
    data,
    # Take the feature names from `data` itself (every column except the
    # response) instead of reading them by position from the global cars_19,
    # so the function only depends on the argument it is given.
    features = setdiff(colnames(data), "fuel_economy_combined"),
    response = "fuel_economy_combined",
    batch_size = 64,
    num_epochs = num_epochs
  )
}
# Feature columns describing the 4 numeric and 7 categorical predictors.
cols <- feature_columns(
  # Numeric predictors are listed by name (columns 2, 3, 5 and 8 of cars_19)
  # so the specification does not depend on the data frame's column order.
  column_numeric(
    c("eng_disp", "num_cyl", "num_gears", "batt_capacity_ah")
  ),
  # One categorical column per factor; num_buckets equals the number of
  # levels reported by str(cars_19).
  # NOTE(review): identity columns are documented for integer ids in
  # [0, num_buckets) -- confirm how the R factors are encoded on the way in.
  column_categorical_with_identity("transmission", num_buckets = 7),
  column_categorical_with_identity("air_aspired_method", num_buckets = 5),
  column_categorical_with_identity("regen_brake", num_buckets = 3),
  column_categorical_with_identity("drive", num_buckets = 5),
  column_categorical_with_identity("fuel_type", num_buckets = 5),
  column_categorical_with_identity("cyl_deactivate", num_buckets = 2),
  column_categorical_with_identity("variable_valve", num_buckets = 2)
)
# Linear regression estimator built on the feature columns in `cols`.
model <- linear_regressor(feature_columns = cols)

# Reproducible 75% / 25% train/test split. seq_len() stays correct for an
# empty data frame, and floor() states the intended whole-number sample size
# explicitly (0.75 * nrow(cars_19) is fractional for 1253 rows).
set.seed(123)
indices <- sample(
  seq_len(nrow(cars_19)),
  size = floor(0.75 * nrow(cars_19))
)
train <- cars_19[indices, ]
test <- cars_19[-indices, ]

# train model
# caret is attached after tfestimators and also exports train(), so the call
# is namespace-qualified to make sure the estimator generic is the one used.
model %>% tfestimators::train(cars_19_input_fn(train, num_epochs = 1000))

# evaluate model on the held-out rows
model %>% evaluate(cars_19_input_fn(test))

# predict on the held-out rows
yhat <- model %>% predict(cars_19_input_fn(test))
yhat <- unlist(yhat)  # flatten the prediction output to a plain vector
y <- test$fuel_economy_combined

# RMSE, R-squared and MAE of the predictions against the observed values
postResample(yhat, y)

# residuals: observed minus predicted
r <- y - yhat

# predicted vs. actual, with a least-squares reference line
plot(
  y, yhat,
  xlab = "actual",
  ylab = "predicted",
  main = "Multiple Linear Regression"
)
abline(lm(yhat ~ y))

# residuals in test-set row order
plot(r)
#RMSE Rsquared MAE
#2.5583185 0.7891934 1.9381757