Random Forest Benchmark (R)

# This script trains a Random Forest model based on the data,
# saves a sample submission, and plots the relative importance
# of the variables in making predictions.
#
# Download 1_random_forest_r_submission.csv from the output below
# and submit it through
# https://www.kaggle.com/c/titanic-gettingStarted/submissions/attach
# to enter this getting started competition!

library(ggplot2)
library(randomForest)

# Fix the RNG seed so the fitted forest is reproducible between runs.
set.seed(1)

train <- read.csv("../input/train.csv", stringsAsFactors = FALSE)
test <- read.csv("../input/test.csv", stringsAsFactors = FALSE)

# Build the model's feature table from a raw passenger data frame.
#
# Args:
#   data: data frame containing at least the columns Pclass, Age, Sex,
#         Parch, SibSp, Fare and Embarked.
#
# Returns:
#   A data frame restricted to those seven columns in which
#     * missing Age values are replaced by the sentinel -1,
#     * missing Fare values are replaced by the median Fare of `data`
#       itself (so train and test are each imputed from their own rows),
#     * empty Embarked strings are replaced by "S",
#     * Sex and Embarked are converted to factors.
extractFeatures <- function(data) {
  features <- c("Pclass", "Age", "Sex", "Parch", "SibSp", "Fare", "Embarked")
  fea <- data[, features]
  fea$Age[is.na(fea$Age)] <- -1
  fea$Fare[is.na(fea$Fare)] <- median(fea$Fare, na.rm = TRUE)
  fea$Embarked[fea$Embarked == ""] <- "S"
  fea$Sex <- as.factor(fea$Sex)
  fea$Embarked <- as.factor(fea$Embarked)
  fea
}

# The response is passed as a factor so randomForest fits a classifier;
# importance = TRUE is required for the importance() call further down.
rf <- randomForest(
  extractFeatures(train),
  as.factor(train$Survived),
  ntree = 100,
  importance = TRUE
)

# Predict on the test set and write the submission file.
submission <- data.frame(PassengerId = test$PassengerId)
submission$Survived <- predict(rf, extractFeatures(test))
write.csv(submission, file = "1_random_forest_r_submission.csv",
          row.names = FALSE)

# Importance measure of type 1 for each feature, collected into a data
# frame with one row per feature for plotting.
imp <- importance(rf, type = 1)
featureImportance <- data.frame(Feature = row.names(imp),
                                Importance = imp[, 1])

# NOTE(review): the excerpt this script was recovered from is truncated in
# the middle of the plotting statement. The visible part is kept below,
# commented out, so the file still parses; complete it from the full
# article before enabling it.
# p <- ggplot(featureImportance,
#             aes(x = reorder(Feature, Importance), y = Importance)) +
#   geom_bar(stat = "identity", …


Link to Full Article: Random Forest Benchmark (R)