# Packages used by this script: plotting (ggplot2, ggthemes, scales),
# data manipulation (dplyr), missing-value imputation (mice) and
# random-forest modelling (randomForest).
required_pkgs <- c(
  "ggplot2", "ggthemes", "scales", "dplyr", "mice", "randomForest"
)

# Install only the packages that are not available yet, so re-running the
# script does not download everything again.
is_installed <- vapply(
  required_pkgs, requireNamespace, logical(1), quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(required_pkgs[!is_installed])
}

library("ggplot2") # visualisation
library("ggthemes") # visualisation
library("scales") # visualisation
library("dplyr") # data manipulation
library("mice") # missing-value imputation
library("randomForest") # random-forest modelling

# Read both input files, keeping text columns as character vectors.
train <- read.csv("E:/train.csv", stringsAsFactors = FALSE)
test <- read.csv("E:/test.csv", stringsAsFactors = FALSE)

# Inspect the structure and the first rows of each input.
str(train)
str(test)
head(train)
head(test)

# Combine the two data sets so that later preprocessing and prediction work
# on the same variables in both.
full <- bind_rows(train, test)
str(full)
summary(full)
# (5) Locate missing values
#
# Count the entries of `x` that are missing: either `NA` or a string that is
# empty or consists only of whitespace.
#
# @param x An atomic vector or factor, e.g. one column of a data frame.
# @return The number of missing entries in `x` (0 for an empty vector).
na_full <- function(x) {
  # Compare as trimmed text so numeric, character and factor input are
  # handled uniformly and " " / "" are both recognised as blank.
  text <- trimws(as.character(x))
  # NA has to be tested explicitly: `NA == ""` is NA, never TRUE.
  sum(is.na(x) | !nzchar(text))
}
# Report the number of missing entries in each column of interest.
na_full(full$Name)
na_full(full$Pclass)
na_full(full$Sex)
na_full(full$Age)
na_full(full$SibSp)
na_full(full$Parch)
na_full(full$Ticket)
na_full(full$Fare)
na_full(full$Cabin)
na_full(full$Embarked)

# Drop the cabin column; the remaining columns are cleaned below.
newfull <- select(full, -Cabin)

# Rows with a missing fare and rows with a blank port of embarkation.
# The bare names print the indices, as the original `which()` calls did.
missing_fare <- which(is.na(newfull$Fare))
missing_fare
missing_embarked <- which(newfull$Embarked == "")
missing_embarked

# Replace missing fares with the median of the observed fares.
newfull$Fare[missing_fare] <- median(newfull$Fare, na.rm = TRUE)
# Replace blank ports with "C". Upper case so it matches the existing port
# codes instead of creating a new category.
# NOTE(review): assumes the data encodes ports as C/Q/S; confirm.
newfull$Embarked[missing_embarked] <- "C"

# Impute the remaining missing values with mice's random-forest method,
# leaving the identifier, the name and the outcome out of the model.
# NOTE(review): the imputation is random and no seed is set, so results
# differ between runs.
impute_cols <- !names(newfull) %in% c("PassengerId", "Name", "Survived")
mice_mod <- mice(newfull[, impute_cols], method = "rf")
mice_output <- complete(mice_mod)

# Only the imputed ages are copied back into the working data set.
newfull$Age <- mice_output$Age