加入收藏 | 设为首页 | 会员中心 | 我要投稿 李大同 (https://www.lidatong.com.cn/)- 科技、建站、经验、云计算、5G、大数据,站长网!
当前位置: 首页 > 大数据 > 正文

利用R语言对Titanic数据test.csv进行处理

发布时间:2020-12-14 03:10:17 所属栏目:大数据 来源:网络整理
导读:说明 现在使用另外给的数据集做为测试集,该数据是与titanic.csv是从同一个总体中抽取的。 test.csv的下载地址:https://www.kaggle.com/c/titanic/data 对数据做相同的分析(清洗,插补,筛选,初步可视化) #经过基础处理之后查看缺失值比例 setwd( "d:/R-

说明

现在使用另外提供的数据集作为测试集,该数据与titanic.csv是从同一个总体中抽取的。
test.csv的下载地址:https://www.kaggle.com/c/titanic/data

对数据做相同的分析(清洗,插补,筛选,初步可视化)

# After basic preprocessing, inspect the proportion of missing values.
setwd("d:/R-TT")
getwd()
# Both the literal "NA" and empty strings are read as missing values.
train.data <- read.csv("test.csv", na.strings = c("NA", ""))
str(train.data)
# The column listing printed for this file shows no Survived column, and
# assigning factor(NULL) to a data-frame column is an error, so the
# conversion is only applied when the column is actually present.
if ("Survived" %in% names(train.data)) {
  train.data$Survived <- factor(train.data$Survived)
}
train.data$Pclass <- factor(train.data$Pclass)
str(train.data)

# Missing-value flags, count and ratio for Age.
is.na(train.data$Age)
sum(is.na(train.data$Age))
sum(is.na(train.data$Age)) / length(train.data$Age)
# Ratio of missing values for every column; mean() of a logical vector is
# the share of TRUE entries, i.e. sum(is.na(x)) / length(x).
vapply(train.data, function(column) {
  mean(is.na(column))
}, numeric(1))
PassengerId      Pclass        Name         Sex         Age       SibSp       Parch      Ticket        Fare       Cabin 
0.000000000 0.000000000 0.000000000 0.000000000 0.205741627 0.000000000 0.000000000 0.000000000 0.002392344 0.782296651 
   Embarked 
0.000000000
# Visualize the missing values.
# Attach Amelia and Rcpp, then call missmap() on the data frame with the
# plot title "MISSING MAP" (missmap() is presumably the Amelia function).
library(Amelia)
library(Rcpp)
missmap(train.data,main = "MISSING MAP")

#发现有Cabin、Age、Fare三个属性存在缺失,其中Cabin缺失过多,不做处理。分别对Age与Fare做进一步分析。
#按照我们之前的方法处理Age这个数据时,发现称谓中还多出了Ms.。
# Inspect the honorifics (Mr., Mrs., ...) that appear in passenger names.
train.data$Name <- as.character(train.data$Name)

# Split every name on runs of whitespace. The backslash must be doubled in
# an R string literal; a bare "s+" would split on the letter "s" instead.
strsplit(train.data$Name, "\\s+")

unlist(strsplit(train.data$Name, "\\s+"))

# Frequency of every whitespace-separated token across all names.
table_words <- table(unlist(strsplit(train.data$Name, "\\s+")))

# Keep only tokens containing a literal period. An unescaped "." is a regex
# wildcard and would keep every token.
sort(table_words[grep("\\.", names(table_words))], decreasing = TRUE)

library(stringr)
# Column 1: Age; column 2: first run of letters followed by a literal
# period in the name. cbind() with a character matrix yields a character
# matrix, so Age is stored as text here.
tb <- cbind(train.data$Age, str_match(train.data$Name, "[a-zA-Z]+\\."))
tb
# Titles of the passengers whose Age is missing ...
tb[is.na(tb[, 1]), 2]

# ... and how many passengers with a missing Age carry each title.
table(tb[is.na(tb[, 1]), 2])
Master.   Miss.     Mr.    Mrs.     Ms. 
      4      14      57      10       1 
# Impute missing Age values with the mean age of passengers sharing the
# same title.
# Only one passenger carries the title "Ms." and that Age is missing, so no
# mean can be computed from this file; the "Ms." mean from titanic.csv (28)
# is used instead.
#
# fixed = TRUE makes the "." literal. As a regular expression "Mr." also
# matches "Mrs.", which would mix Mrs. ages into mean.mr and fill missing
# Mrs. ages with the Mr. mean before the Mrs. step runs.
mean.mr <- mean(train.data$Age[
  grepl("Mr.", train.data$Name, fixed = TRUE) & !is.na(train.data$Age)
])
mean.mrs <- mean(train.data$Age[
  grepl("Mrs.", train.data$Name, fixed = TRUE) & !is.na(train.data$Age)
])
mean.miss <- mean(train.data$Age[
  grepl("Miss.", train.data$Name, fixed = TRUE) & !is.na(train.data$Age)
])
mean.master <- mean(train.data$Age[
  grepl("Master.", train.data$Name, fixed = TRUE) & !is.na(train.data$Age)
])
mean.ms <- 28

# Fill each title's missing ages with that title's mean.
train.data$Age[
  grepl("Mr.", train.data$Name, fixed = TRUE) & is.na(train.data$Age)
] <- mean.mr
train.data$Age[
  grepl("Mrs.", train.data$Name, fixed = TRUE) & is.na(train.data$Age)
] <- mean.mrs
train.data$Age[
  grepl("Miss.", train.data$Name, fixed = TRUE) & is.na(train.data$Age)
] <- mean.miss
train.data$Age[
  grepl("Master.", train.data$Name, fixed = TRUE) & is.na(train.data$Age)
] <- mean.master

train.data$Age[
  grepl("Ms.", train.data$Name, fixed = TRUE) & is.na(train.data$Age)
] <- mean.ms
# Fare still has missing values; handle it the same way, using the mean
# fare of each passenger class (Pclass).
# %in% compares the class label exactly and never yields NA, unlike regex
# matching on the label.
mean.fare1 <- mean(train.data$Fare[
  train.data$Pclass %in% "1" & !is.na(train.data$Fare)
])
mean.fare2 <- mean(train.data$Fare[
  train.data$Pclass %in% "2" & !is.na(train.data$Fare)
])
mean.fare3 <- mean(train.data$Fare[
  train.data$Pclass %in% "3" & !is.na(train.data$Fare)
])

# Fill each class's missing fares with that class's mean.
train.data$Fare[
  train.data$Pclass %in% "1" & is.na(train.data$Fare)
] <- mean.fare1
train.data$Fare[
  train.data$Pclass %in% "2" & is.na(train.data$Fare)
] <- mean.fare2
# The original referenced the undefined name mean.far3 here.
train.data$Fare[
  train.data$Pclass %in% "3" & is.na(train.data$Fare)
] <- mean.fare3
最后的处理结果

(编辑:李大同)

【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时与联系站长删除相关内容!

    推荐文章
      热点阅读