final-project-kaiqing.Rmd

title: "Final Project"
author: "Sean Kan"
date: ""
output:
  html_document:
    theme: readable
    toc: yes
  pdf_document: default
urlcolor: cyan
# Document-wide setup.
# echo = TRUE makes knitr show the source of every chunk in the rendered output.
knitr::opts_chunk$set(echo = TRUE)
# A large scipen penalty makes R print numbers in fixed rather than scientific
# notation.
options(scipen=999)
library(readr)
library(dplyr)
# Load the raw listings export; expects listings.csv in the working directory.
airbnb = read_csv('listings.csv')

# Remove columns with irrelevant info, free text or consisting entirely of NA
airbnb = subset(airbnb, select = -c(id, listing_url, scrape_id, last_scraped,
                                    source, name, description, neighborhood_overview,
                                    picture_url, host_id, host_url, host_name,
                                    host_location, host_about, host_thumbnail_url,
                                    host_picture_url,host_neighbourhood,
                                    neighbourhood, license, host_verifications,
                                    bathrooms, calendar_last_scraped, amenities,
                                    calendar_updated))

# Remove entries with missing super host info.
# The explicit is.na() check matters: comparing NA with "" yields NA, and
# indexing rows with NA produces all-NA filler rows instead of dropping them.
airbnb = airbnb[!is.na(airbnb$host_is_superhost) &
                  airbnb$host_is_superhost != "", ]

# Replace the placeholder strings "N/A" and "NA" with real missing values.
# Only cells that consist entirely of a placeholder are blanked; a value that
# merely contains the letters "NA" is kept. Every column becomes character here
# and is converted to its final type further down.
airbnb = data.frame(lapply(airbnb, function(x) {
  x = as.character(x)
  x[x %in% c('N/A', 'NA')] = NA
  x
}), stringsAsFactors = FALSE)

# Remove rows that still contain any NA
airbnb = na.omit(airbnb)

# Clean up the price column: strip the currency symbol and thousands
# separators by character (not by position) so a value without a leading "$"
# does not lose its first digit.
airbnb$price = gsub('[$,]', '', airbnb$price)

# Extract the year (first four characters) from the date columns
airbnb$host_since = as.numeric(substring(airbnb$host_since, 1, 4))
airbnb$first_review = as.numeric(substring(airbnb$first_review, 1, 4))
airbnb$last_review = as.numeric(substring(airbnb$last_review, 1, 4))


# Convert to numeric: every column except those handled separately below
airbnb = airbnb %>% mutate_at(vars(-one_of('host_since','host_response_time',
                                 'host_response_rate','host_acceptance_rate',
                                 'host_is_superhost','host_has_profile_pic',
                                 'host_identity_verified','neighbourhood_cleansed',
                                 'neighbourhood_group_cleansed','property_type',
                                 'room_type','bathrooms_text','has_availability',
                                 'first_review','last_review','instant_bookable')),
                              as.numeric)

# Convert to logical
airbnb = airbnb %>% mutate_at(c('host_is_superhost','host_has_profile_pic',
                                'host_identity_verified','has_availability',
                                'instant_bookable'), as.logical)
# Convert to factors
airbnb = airbnb %>% mutate_at(c('host_response_time','neighbourhood_cleansed',
                                'neighbourhood_group_cleansed','property_type',
                                'room_type'), as.factor)

# Convert percentages such as "95%" to proportions in [0, 1]
airbnb$host_response_rate =
  as.numeric(sub('%$', '', airbnb$host_response_rate)) / 100
airbnb$host_acceptance_rate =
  as.numeric(sub('%$', '', airbnb$host_acceptance_rate)) / 100

# Convert bathrooms_text to numeric
## Any half-bath description ("Half-bath", "Shared half-bath",
## "Private half-bath", ...) counts as 0.5 bathrooms. The match is
## case-insensitive so no variant falls through and becomes NA below.
airbnb$bathrooms_text = gsub('.*half-bath.*', '0.5', airbnb$bathrooms_text,
                             ignore.case = TRUE)
## Drop the trailing words (e.g. " shared baths") and keep only the number
airbnb$bathrooms_text = as.numeric(gsub('\\s\\w+', '', airbnb$bathrooms_text))

str(airbnb)
library(faraway)
library(lmtest)


# Diagnose and/or summarise a fitted linear model.
#
# Arguments:
#   model   - a fitted model that supports fitted(), resid(), coef(),
#             summary() and AIC() (the script passes lm / step results).
#   log     - TRUE when the response was modelled as log(price); the RMSE is
#             then computed on the original price scale by back-transforming
#             the fitted values with exp().
#   dataset - data frame whose `price` column holds the observed prices for
#             the rows the model was fitted on. Only read when log = TRUE.
#   alpha   - significance level used to count non-significant predictors.
#   metrics - if TRUE, compute and return the metrics list.
#   plotit  - if TRUE, draw a fitted-vs-residuals plot and a normal Q-Q plot
#             side by side.
#
# Returns: when metrics is TRUE, a list with adj.r2, rmse, aic, n_pred
#   (coefficients excluding the intercept) and n_ns_pred (predictor
#   coefficients with p-value > alpha); otherwise invisible NULL.
abnb_metrics = function(model, log = FALSE, dataset = airbnb_train, alpha = 0.05,
                        metrics = TRUE, plotit = FALSE)
{
  if (plotit)
  {
    # Put both diagnostics on one row and restore the caller's layout on exit
    # so later plots are not forced into a 1 x 2 grid.
    old_par = par(mfrow = c(1, 2))
    on.exit(par(old_par), add = TRUE)

    # Fitted vs Residuals Plot
    plot(fitted(model), resid(model), col = "gray", pch = 20,
         xlab = "Fitted", ylab = "Residuals", main = "Fitted vs Residuals Plot")
    abline(h = 0, col = "dodgerblue", lwd = 2)

    # QQ Plot
    qqnorm(resid(model), main = "Normal Q-Q Plot", col = "gray")
    qqline(resid(model), col = "dodgerblue", lwd = 2)
  }

  if (!metrics)
  {
    return(invisible(NULL))
  }

  model_summary = summary(model)
  adj.r2 = model_summary$adj.r.squared

  # Root MEAN squared error. The mean must be taken over the squared residuals
  # themselves; averaging their sum would give sqrt(SSE), which grows with n.
  rmse = if (log == FALSE)
    sqrt(mean(resid(model) ^ 2))
  else
    sqrt(mean((dataset$price - exp(fitted(model))) ^ 2))

  aic = AIC(model)

  # The intercept is not a predictor, so leave it out of both counts.
  n_pred = sum(names(coef(model)) != "(Intercept)")
  coef_table = model_summary$coefficients
  is_predictor = rownames(coef_table) != "(Intercept)"
  n_ns_pred = sum(coef_table[is_predictor, 4] > alpha)

  list(adj.r2 = adj.r2, rmse = rmse, aic = aic, n_pred = n_pred,
       n_ns_pred = n_ns_pred)
}

# Fix the RNG seed so the random split below is reproducible
set.seed(123)

# Train-test split: a random 70% of the rows go to training and every
# remaining row goes to the test set.
airbnb_idx = sample(nrow(airbnb), size = round(0.7 * nrow(airbnb)))
airbnb_train = airbnb[airbnb_idx, ]
airbnb_test = airbnb[setdiff(seq_len(nrow(airbnb)), airbnb_idx), ]

# Full additive model: raw price regressed on every other column
full_add_mod = lm(price ~ ., data = airbnb_train)
# Number of observations used in the fit; reused below as log(n), the BIC
# penalty passed to step().
n = length(residuals(full_add_mod))
abnb_metrics(full_add_mod, metrics = FALSE, plotit = TRUE)

# Log transformation: log(price) on the top 5 correlated numeric predictors
# plus the logical and factor predictors.
top5_log_mod = lm(
  log(price) ~
    bathrooms_text + accommodates + bedrooms + beds + longitude +
    host_is_superhost + host_has_profile_pic + host_identity_verified +
    has_availability + instant_bookable +
    host_response_time + neighbourhood_group_cleansed +
    neighbourhood_cleansed + room_type,
  data = airbnb_train
)


# Given our focus on explainability, we perform a forward search to eliminate
# any unnecessary predictors.
# Forward BIC: start from the intercept-only model and add terms from the
# top-5 log model's formula; k = log(n) makes step() use the BIC penalty.
bic_mod_start = lm(log(price) ~ 1, data = airbnb_train)
all_vars = formula(top5_log_mod)
bic_mod_for = step(bic_mod_start, scope = all_vars, direction = "forward",
                   k = log(n), trace = 0)
abnb_metrics(bic_mod_for, metrics = FALSE, plotit = TRUE)

# What would happen if we remove predictors with numerous dummy variables and
# substitute 2-way interactions between the location variables?
# Fitting the full model is only a way to expand the "." in the formula; the
# expanded formula is the upper scope for the forward search.
all_vars = formula(
  lm(
    log(price) ~ . - neighbourhood_cleansed - property_type +
      (longitude + latitude + neighbourhood_group_cleansed) ^ 2,
    airbnb_train
  )
)
bic_mod_int2 = step(bic_mod_start, scope = all_vars, direction = "forward",
                    k = log(n), trace = 0)
abnb_metrics(bic_mod_int2, log = TRUE, metrics = FALSE, plotit = TRUE)

# What would happen if we try 3-way interactions?
# Same forward BIC search as before, but the upper scope now contains the full
# longitude x latitude x neighbourhood group interaction.
all_vars = formula(lm(
  log(price) ~ . - neighbourhood_cleansed - property_type +
    longitude * latitude * neighbourhood_group_cleansed,
  airbnb_train
))
bic_mod_int3 = step(bic_mod_start, scope = all_vars, direction = "forward",
                    k = log(n), trace = 0)
abnb_metrics(bic_mod_int3, log = TRUE, metrics = FALSE, plotit = TRUE)

# ANOVA test comparing the 2-way and 3-way interaction models
anova(bic_mod_int2, bic_mod_int3)

# Remove data with high influence and refit model.
# Observations whose Cook's distance exceeds the 4 / n rule of thumb are
# treated as highly influential.
influ_idx = which(cooks.distance(bic_mod_int3) > 4 /
                    length(cooks.distance(bic_mod_int3)))
# Drop them by row name with a logical mask instead of negative positions:
#  * airbnb_train[-integer(0), ] would return ZERO rows if nothing were flagged;
#  * the names carried by cooks.distance() are the row names of the
#    observations the model actually used, so the match stays correct even if
#    the model did not use every row of airbnb_train.
airbnb_train_reduced = airbnb_train[
  !(rownames(airbnb_train) %in% names(influ_idx)), ]
bic_mod_start = lm(log(price) ~ 1, data = airbnb_train_reduced)
# The BIC penalty must reflect the reduced sample size
n = nrow(airbnb_train_reduced)

# Same 3-way interaction scope as before, expanded against the reduced data
all_vars = formula(
  lm(
    log(price) ~ . - neighbourhood_cleansed - property_type + longitude * latitude *
      neighbourhood_group_cleansed,
    airbnb_train_reduced
  )
)
bic_mod_int3_refit = step(
  bic_mod_start,
  scope = all_vars,
  direction = "forward",
  k = log(n),
  trace = 0
)
abnb_metrics(bic_mod_int3_refit, dataset = airbnb_train_reduced, log = TRUE,
             plotit = TRUE, metrics = FALSE)

# VIF: show only the terms whose variance inflation factor exceeds 10
Filter(function(v) v > 10, vif(bic_mod_int3_refit))
# Picked out a few interesting beta coefficients. The response is log(price),
# so exp(coefficient) is shown (rounded to 5 decimals), largest first.
sort(
  round(
    exp(coef(bic_mod_int3_refit)[c(
      "room_typeHotel room", "review_scores_cleanliness", "bedrooms",
      "host_is_superhostTRUE", "review_scores_location", "bathrooms_text",
      "host_has_profile_picTRUE", "accommodates", "availability_30",
      "number_of_reviews", "review_scores_value", "room_typePrivate room"
    )]),
    5
  ),
  decreasing = TRUE
)
# Summary Table: one row per candidate model.
# abnb_metrics() is evaluated once per model and the five metrics are then
# picked out by position. Only the RMSE depends on `log` and `dataset`: the
# full additive model is on the raw price scale, all others on log(price).
df = local({
  model_metrics = list(
    abnb_metrics(full_add_mod),
    abnb_metrics(top5_log_mod, log = TRUE),
    abnb_metrics(bic_mod_for, log = TRUE),
    abnb_metrics(bic_mod_int2, log = TRUE),
    abnb_metrics(bic_mod_int3, log = TRUE),
    abnb_metrics(bic_mod_int3_refit, log = TRUE,
                 dataset = airbnb_train_reduced)
  )
  # k-th metric of every model, as a plain vector
  metric_column = function(k) unlist(lapply(model_metrics, `[[`, k))

  data.frame(
    results = c("Full Additive", "Log (High Corr Predictors)",
                "Log Forw BIC", "Log Forw BIC 2-Way",
                "Log Forw BIC 3-Way",
                "Log Forw BIC 3-Way Without Large Influence"),
    adj.r2 = metric_column(1),
    rmse = metric_column(2),
    aic = metric_column(3),
    n_pred = metric_column(4),
    n_non_pred = metric_column(5)
  )
})

colnames(df) = c("Model", "Adjusted R2", "Train RMSE", "AIC", "Num of Pred",
                 "Num of Pred (p-val > 0.05)")

knitr::kable(df)
# Evaluate with test set: predictions are on the log scale, so they are
# back-transformed with exp() before being compared with the observed prices.
pred_test = predict(bic_mod_int3_refit, newdata = airbnb_test)
test_rmse = sqrt(mean((exp(pred_test) - airbnb_test$price) ^ 2))

# Train vs test RMSE of the final model, side by side
knitr::kable(
  data.frame(
    Model = c("Train", "Test"),
    RMSE = c(
      abnb_metrics(bic_mod_int3_refit, log = TRUE,
                   dataset = airbnb_train_reduced)[[2]],
      test_rmse
    )
  )
)
# Correlation matrix of all numeric columns.
# Pairwise deletion keeps a single missing value from turning a column's whole
# row of correlations into NA; with no missing values the result is unchanged.
matrix_cor = cor(select_if(airbnb, is.numeric), use = "pairwise.complete.obs")

# Correlation between price and numeric predictors
sort(matrix_cor['price', ], decreasing = TRUE)

# Pair plot of variables with a relatively high correlation to price.
# The first 11 entries are price itself (|r| = 1) plus the top 10 predictors.
price_cor = sort(abs(matrix_cor['price', ]), decreasing = TRUE)
pairs(airbnb[, names(price_cor[1:11])],
      main = "Top 10 predictors that display correlation with price")

# Correlation between price and factorial predictors
pairs(airbnb[, c('price', names(select_if(airbnb, is.factor)))],
      main = "Correlation between price and factorial predictors")

# Pair plot of variables related to availability
pairs(airbnb %>% select(matches('availability_')),
      main = "Pair plot of predictors related to availability ")

# Pair plot of variables related to review scores
pairs(airbnb %>% select(matches('review_scores')),
      main = "Pair plot of predictors related to review scores ")
library(ggplot2)
# Visualize the geographical distribution and price of the listings
ggplot(airbnb, aes(x = longitude, y = latitude, color = price)) +
  geom_point() +
  scale_color_gradient(low = 'lightblue', high = 'red')

# Histogram of raw prices. Columns are referenced by name inside aes() rather
# than through airbnb$..., which ggplot2 discourages because it bypasses the
# plot's data argument.
ggplot(data = airbnb,
       aes(x = price)) +
    geom_histogram(bins = 100,
                   col = "#000000",
                   fill = "#99FFFF",
                   alpha = .5) +
    labs(x = "Price", y = "Frequency")

# Log-transformed price, stored as a new column on airbnb
airbnb$logp = log(airbnb$price)

# Histogram of log prices; the axis label names the transformed scale
ggplot(data = airbnb,
       aes(x = logp)) +
    geom_histogram(bins = 50,
                   col = "#000000",
                   fill = "#99FFFF",
                   alpha = .5) +
    labs(x = "Log(Price)", y = "Frequency")