Entry



# The train and test data is stored in the ../input directory.
# stringsAsFactors is pinned to TRUE so that text columns (Sex, Embarked, ...)
# are read as factors regardless of the R version: R >= 4.0 defaults to FALSE,
# and randomForest() cannot be fitted on character predictors.
train <- read.csv("../input/train.csv", stringsAsFactors = TRUE)
test <- read.csv("../input/test.csv", stringsAsFactors = TRUE)

# Create the column Child: 1 when the passenger is younger than 18, 0 when the
# passenger is 18 or older, and NA when Age is missing (the comparison with NA
# yields NA, which ifelse() propagates).
train$Child <- ifelse(train$Age < 18, 1, 0)

test$Child <- ifelse(test$Age < 18, 1, 0)

# Load in the R package  
library(rpart)

# Derive family_size: siblings/spouses + parents/children + the passenger
train$family_size <- with(train, SibSp + Parch + 1)
test$family_size <- with(test, SibSp + Parch + 1)


#Extract title to create a new feature

#' Extract the honorific title from a passenger name.
#'
#' The name is searched for each known title in a fixed priority order and the
#' first title found is returned. Titles are matched as literal strings
#' (`fixed = TRUE`), so the trailing "." is a real period and not a regex
#' wildcard; e.g. the surname "Drew" is no longer mistaken for "Dr.".
#'
#' @param Name A passenger name (character or factor).
#' @return One of "Miss.", "Master.", "Mrs.", "Mr.", "Rev.", "Dr.", "Lady.",
#'   "Mlle.", or "Other" when none of these titles occurs in `Name`.
extractTitle <- function(Name) {
  Name <- as.character(Name)

  # Checked in this order; the first hit wins.
  known_titles <- c(
    "Miss.", "Master.", "Mrs.", "Mr.", "Rev.", "Dr.", "Lady.", "Mlle."
  )

  for (title in known_titles) {
    if (length(grep(title, Name, fixed = TRUE)) > 0) {
      return(title)
    }
  }

  "Other"
}


# Build the Title factor for both data sets by applying extractTitle() to every
# passenger name. vapply() replaces the index loop that grew a vector with c()
# and iterated over 1:nrow(), and the Name column is addressed by name rather
# than by its position in each data frame.
train$Title <- as.factor(
  vapply(as.character(train$Name), extractTitle, character(1),
         USE.NAMES = FALSE)
)

test$Title <- as.factor(
  vapply(as.character(test$Name), extractTitle, character(1),
         USE.NAMES = FALSE)
)


# The two data sets are stacked into one table before the random forest is
# trained, so they must have the same columns first.

# The test set has no Survived column: add one filled with the placeholder
# "None" for every test row.
test$Survived <- rep_len("None", nrow(test))

# Put the test columns in the same order as the train columns before stacking.
test <- test[, c("PassengerId", "Survived", "Pclass", "Name", "Sex", "Age",
                 "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked",
                 "Child", "family_size", "Title"),
             drop = FALSE]

all_data <- rbind(train, test)


# Some passengers (rows 62 and 830 in the combined Titanic data) have no value
# for embarkment. Since many passengers embarked at Southampton, we give them
# the value S. The affected rows are detected (blank or NA) instead of being
# addressed by hard-coded row numbers.
# We then code all embarkment codes as factors, dropping the unused blank level.
embarked_missing <- is.na(all_data$Embarked) | all_data$Embarked == ""
all_data$Embarked[embarked_missing] <- "S"
all_data$Embarked <- factor(all_data$Embarked)

# A passenger (row 1044 in the combined Titanic data) has an NA Fare value.
# Replace every missing fare with the median of the known fares; the rows are
# found with is.na() rather than a hard-coded row number.
all_data$Fare[is.na(all_data$Fare)] <- median(all_data$Fare, na.rm = TRUE)

# Missing Age values are filled in with predictions from a decision tree.
# The tree is fitted on the rows where Age is known, using the other passenger
# variables as predictors; method = "anova" because Age is continuous.
age_missing <- is.na(all_data$Age)
predicted_age <- rpart(
  Age ~ Pclass + Sex + SibSp + Parch + Fare + Embarked + Title + family_size,
  data = all_data[!age_missing, ],
  method = "anova"
)
all_data$Age[age_missing] <- predict(predicted_age, all_data[age_missing, ])

# Split the data back into a train set and a test set.
# The training rows were stacked first by rbind(), so the first nrow(train)
# rows of all_data are the train set and the remainder is the test set. The
# boundary is derived from the data instead of hard-coding 891 / 1309.
is_train_row <- seq_len(nrow(all_data)) <= nrow(train)
train <- all_data[is_train_row, ]
test <- all_data[!is_train_row, ]

# Load in the package
library(randomForest)


# Fit a random forest of 2000 trees that classifies Survived from the
# passenger features, keeping the variable-importance measures.
my_forest <- randomForest(
  as.factor(Survived) ~ Pclass + Sex + Age + SibSp + Parch + Fare +
    Embarked + Title + family_size,
  data = train,
  ntree = 2000,
  importance = TRUE
)

# Predict survival for every passenger in the test set
my_prediction <- predict(my_forest, newdata = test)

# Assemble the submission: one row per test passenger with its prediction
my_solution <- data.frame(
  PassengerId = test$PassengerId,
  Survived = my_prediction
)

# Save the submission as my_solution.csv, without row names
write.csv(my_solution, file = "my_solution.csv", row.names = FALSE)
                

# This script has been released under the Apache 2.0 open source license.




Source: Entry

Via: Google Alert for ML

Pin It on Pinterest

Share This

Join Our Newsletter

Sign up to our mailing list to receive the latest news and updates about homeAI.info and the Informed.AI Network of AI related websites which includes Events.AI, Neurons.AI, Awards.AI, and Vocation.AI

You have Successfully Subscribed!