Titanic RandomForrest v1.2

# Titanic feature engineering for a random-forest model.
#
# Reads the train/test CSVs, stacks them so both receive the same
# pre-processing, derives features (Title, FamilySize, Mother, Child,
# FamilyId/FamilyId2, Deck, CabinNum, CabinPos), imputes missing values with
# mice, converts the categorical columns to factors, and splits the result
# back into `train` and `test`.
#
# NOTE(review): this script was reconstructed from a copy in which all line
# breaks were lost and quotes had been turned into typographic quotes. Two
# statements (Mother / Child) had text missing and were restored from
# context; they are marked below and should be confirmed.

library(mice)
library(randomForest)

# Empty strings are read as NA so that blank fields count as missing.
train <- read.csv("../input/train.csv", na.strings = c("NA", ""),
                  stringsAsFactors = FALSE)
test <- read.csv("../input/test.csv", na.strings = c("NA", ""),
                 stringsAsFactors = FALSE)

# Share of missing values in `x`, formatted as a percentage string.
check.missing <- function(x) {
  paste0(round(sum(is.na(x)) / length(x), 4) * 100, "%")
}

data.frame(vapply(train, check.missing, character(1)))
data.frame(vapply(test, check.missing, character(1)))

# combine train/test data for pre-processing
train$Cat <- "train"
test$Cat <- "test"
test$Survived <- NA
full <- rbind(train, test)

# Embarked: missing ports are filled with "S"
full$Embarked[is.na(full$Embarked)] <- "S"

# Extract Title from Name: the second piece when splitting on "." or ",",
# with spaces removed.
full$Title <- vapply(
  full$Name,
  function(x) strsplit(x, "[.,]")[[1]][2],
  character(1),
  USE.NAMES = FALSE
)
full$Title <- gsub(" ", "", full$Title)
#full$Title[full$Title %in% c('Capt', 'Don', 'Major', 'Sir')] <- 'Sir'
#full$Title[full$Title %in% c('Dona', 'Lady', 'the Countess', 'Jonkheer')] <- 'Lady'

# Adding FamilySize
full$FamilySize <- full$Parch + full$SibSp + 1

# Perform Imputation to remove NAs (every column except the target)
set.seed(144)
vars.for.imputation <- setdiff(names(full), "Survived")
imputed <- complete(mice(full[vars.for.imputation]))
full[vars.for.imputation] <- imputed

# Adding Mother
# NOTE(review): the source text was damaged here ("full$Mother0 &
# full$Age>18 & ..."). Only "> 0 & Age > 18 & Title != 'Miss'" survived; the
# zero-initialisation and the Sex/Parch terms are reconstructed -- confirm.
full$Mother <- 0
full$Mother[full$Sex == "female" & full$Parch > 0 & full$Age > 18 &
              full$Title != "Miss"] <- 1

# Adding Child
# NOTE(review): damaged in the source as well ("full$Child0 &
# full$Age<=18]<- 1"); the zero-initialisation and the Parch term are
# reconstructed -- confirm.
full$Child <- 0
full$Child[full$Parch > 0 & full$Age <= 18] <- 1

# FamilyId2: family size + surname, with families seen at most twice
# collapsed into a single "Small" level.
Surname <- vapply(
  full$Name,
  function(x) strsplit(x, "[.,]")[[1]][1],
  character(1),
  USE.NAMES = FALSE
)
FamilyId <- paste0(full$FamilySize, Surname)
full$FamilyId <- factor(FamilyId)
Family <- data.frame(table(FamilyId))
SmallFamily <- Family$FamilyId[Family$Freq <= 2]
FamilyId[FamilyId %in% SmallFamily] <- "Small"
full$FamilyId2 <- factor(FamilyId)

# Extract Deck from Cabin number (first character of the cabin string)
full$Deck <- vapply(
  full$Cabin,
  function(x) strsplit(x, NULL)[[1]][1],
  character(1),
  USE.NAMES = FALSE
)

# Extract Position from Cabin number: take the text after the first capital
# letter, convert it to a number, and cluster those numbers into 3 groups.
full$CabinNum <- vapply(
  full$Cabin,
  function(x) strsplit(x, "[A-Z]")[[1]][2],
  character(1),
  USE.NAMES = FALSE
)
full$num <- as.numeric(full$CabinNum)
num <- full$num[!is.na(full$num)]
Pos <- kmeans(num, 3)
# Rows without a usable cabin number keep CabinPos = NA.
full$CabinPos <- NA
full$CabinPos[!is.na(full$num)] <- Pos$cluster
full$CabinPos <- factor(full$CabinPos)
# NOTE(review): kmeans cluster ids are arbitrary, so which cluster receives
# which of these labels is not tied to the cabin-number ordering -- confirm
# whether the labels are meant to follow the cluster centers.
levels(full$CabinPos) <- c("Front", "End", "Middle")
full$num <- NULL

# Categorical predictors must be factors for the model.
full <- transform(
  full,
  Pclass = factor(Pclass),
  Sex = factor(Sex),
  Embarked = factor(Embarked),
  Title = factor(Title),
  Mother = factor(Mother),
  Child = factor(Child),
  FamilyId2 = factor(FamilyId2),
  Deck = factor(Deck)
)

# split train/test data
train <- full[full$Cat == "train", ]
test <- full[full$Cat == "test", ]
train$Survived <- factor(train$Survived)

#rf.fit = randomForest(Survived…


Link to Full Article: Titanic RandomForrest v1.2