amesHousing2018 {cheem}R Documentation

Ames housing data 2018

Description

House sales prices from Ames, Iowa, USA between 2006 and 2010. Only complete numeric observations remain.

Usage

amesHousing2018

amesHousing2018_raw

amesHousing2018_NorthAmes

Format

Complete data.frame with 2291 rows and 18 numeric variables, SalePrice (the response variable), and 3 class variables.

An object of class data.frame with 2930 rows and 82 columns.

An object of class data.frame with 338 rows and 11 columns.

Details

amesHousing2018

Complete data.frame, n = 2291, 18 numeric variables (including 2 temporal: MoSold, YrSold), response variable SalePrice, 3 class factors.

amesHousing2018_NorthAmes

A simplified subsample, just North Ames (largest neighborhood). Complete data.frame, n = 338, 9 numeric variables, response variable SalePrice, 1 class factor SubclassMS, a zoning subclass.

amesHousing2018_raw

Original data from Kaggle, 2930 rows of 82 variables. In the processed amesHousing2018, sparse rows (639) and sparse/defaulted columns (64) have been removed.

No data dictionary is provided on Kaggle, but amesHousing2018 variables are inferred to be:

Source

De Cock, D. (2011). "Ames, Iowa: Alternative to the Boston Housing Data as an End of Semester Regression Project," Journal of Statistics Education, Volume 19, Number 3. http://jse.amstat.org/v19n3/decock/DataDocumentation.txt http://jse.amstat.org/v19n3/decock.pdf

Kaggle, Ames Housing Dataset https://www.kaggle.com/prevek18/ames-housing-dataset

Replicating this dataset:

## Kaggle page hosting the CSV. Guarded so the URL is never opened by accident.
if (FALSE) {
  browseURL("https://www.kaggle.com/prevek18/ames-housing-dataset")
}
## Read the downloaded CSV, then keep a plain data.frame copy as the raw data.
## data.frame() repairs the column names (its check.names default).
ames <- readr::read_csv(file = "./buildignore/AmesHousing.csv")
amesHousing2018_raw <- data.frame(ames)
## save(amesHousing2018_raw, file = "./data/amesHousing2018_raw.rda")

## Keep the numeric columns, add Bathrooms (full + half baths), and move the
## new column (appended as column 38) to position 19.
ames1 <- ames[, vapply(ames, is.numeric, logical(1))]
ames1[["Bathrooms"]] <- ames1[["Full Bath"]] + ames1[["Half Bath"]]
ames1 <- ames1[, c(1:18, 38, 19:37)]

## Columns to keep: everything except the names listed here.
col_idx <- !(colnames(ames1) %in% c(
  "Order", "Mas Vnr Area", "BsmtFin SF 1", "BsmtFin SF 2",
  "Bsmt Full Bath", "Bsmt Half Bath", "Fireplaces",
  "Wood Deck SF", "Open Porch SF", "Enclosed Porch",
  "3Ssn Porch", "Screen Porch", "Pool Area", "Misc Val", "2nd Flr SF",
  "Low Qual Fin SF", "Full Bath", "Half Bath", "Kitchen AbvGr"))

## Rows to keep: no missing value in any of these four columns.
row_idx <- complete.cases(ames1[, c(
  "Garage Yr Blt", "Lot Frontage", "Bsmt Unf SF", "Total Bsmt SF")])

ames2 <- as.data.frame(ames1[row_idx, col_idx])

## Character columns are the candidate class variables: convert each one to a
## factor and drop the first of them by position.
ames_char <- ames[, vapply(ames, is.character, logical(1))]
ames_clas <- as.data.frame(lapply(ames_char, factor))[, -1]
## Integer codes of every factor (not referenced again in this script).
ames_clasint <- data.frame(lapply(ames_clas, as.integer))
## Keep three class columns, restricted to the same rows as the numeric data.
col_idx_char <- which(
  names(ames_clas) %in% c("MS.SubClass", "MS.Zoning", "Neighborhood")
)
classes <- ames_clas[row_idx, col_idx_char]

## Bind the numeric columns to the class factors and apply the short column
## names in a single step.
amesHousing2018 <- setNames(
  cbind(ames2, classes),
  c("LotFrontage", "LotArea","OverallQual", "OverallCond", "YearBuild",
    "YearRemod", "BsmtUnfArea", "TotBsmtArea", "1stFlrArea", "LivingArea",
    "Bathrms", "Bedrms", "TotRms", "GarageYrBlt", "GarageCars", "GarageArea",
    "MoSold", "YrSold", "SalePrice", "SubclassMS", "ZoneMS", "Neighborhd")
)
## save(amesHousing2018, file = "./data/amesHousing2018.rda")

## Logical mask over the columns of amesHousing2018 marking the reduced
## ("thin") variable set: 9 numeric variables, SalePrice and SubclassMS.
.thin_col_idx <- colnames(amesHousing2018) %in% c(
  "LotArea", "OverallQual", "YearBuild",
  "LivingArea", "Bathrms", "Bedrms", "TotRms",
  "GarageYrBlt", "GarageArea",
  "SalePrice", "SubclassMS"
)
amesHousing2018_thin <- amesHousing2018[, .thin_col_idx]

## Subset to North Ames, and only the 5 largest subclasses.
r_idx <- amesHousing2018$Neighborhd == "NAmes" &
  amesHousing2018$SubclassMS %in% c("020", "050", "080", "090", "060")
amesHousing2018_NorthAmes <- amesHousing2018_thin[r_idx, ]
## Rebuild the factor so that only the subclasses still present remain as
## levels, ordered by first appearance in the subset.
amesHousing2018_NorthAmes$SubclassMS <- factor(
  amesHousing2018_NorthAmes$SubclassMS,
  unique(amesHousing2018_NorthAmes$SubclassMS))
if(FALSE){ ## Don't accidentally save
  save(amesHousing2018_NorthAmes, file = "./data/amesHousing2018_NorthAmes.rda")
}

Examples

library(cheem)

## Regression setup: the first 9 columns are the explanatory variables,
## SalePrice is the response and SubclassMS is the class label.
dat  <- amesHousing2018_NorthAmes
X    <- dat[, seq_len(9)]
Y    <- dat[["SalePrice"]]
clas <- dat[["SubclassMS"]]

## Cheem list, built from the data plus the ames_rf_shap and ames_rf_pred
## objects.
ames_rf_chm <- cheem_ls(
  X, Y, ames_rf_shap, ames_rf_pred, clas,
  label = "North Ames, RF, SHAP"
)
## Cheem visuals
## Only run in an interactive session.
if(interactive()){
  ## Values passed below as the primary and comparison observations
  ## (presumably row numbers of the data -- confirm against the cheem docs).
  prim <- 1
  comp <- 2
  global_view(ames_rf_chm, primary_obs = prim, comparison_obs = comp)
  ## Suggested basis and manipulation variable, fed to the radial tour below.
  bas <- sug_basis(ames_rf_chm, prim, comp)
  mv  <- sug_manip_var(ames_rf_chm, primary_obs = prim, comp)
  ggt <- radial_cheem_tour(ames_rf_chm, basis = bas, manip_var = mv)
  animate_plotly(ggt)
}

## Save for use with shiny app (expects an rds file)
## The guard below is never TRUE, so nothing is written unless the lines are
## run by hand.
if(FALSE){ ## Don't accidentally save.
  ## Writes the cheem list to an rds file in the working directory.
  saveRDS(ames_rf_chm, "./chm_NAmes_rf_tshap.rds")
  run_app() ## Select the saved rds file from the data drop down.
}

[Package cheem version 0.4.0.0 Index]