R高效数据处理包dplyr和data.table,你选哪个?
背景介绍在我的数据处理任务中,我算是使用 dplyr
data.table
数据处理首先,我们需要加载一些R包: library(dplyr) library(data.table) library(lubridate) library(jsonlite) library(tidyr) library(ggplot2) library(compare) 本次实验数据来源于DATA.GOV,它是一份关于医院医保的索赔支出数据,数据从这里下载。本文使用 spending=fromJSON("https://data.medicare.gov/api/views/nrth-mfg3/rows.json?accessType=DOWNLOAD") names(spending) "meta" "data" meta=spending$meta hospital_spending=data.frame(spending$data) colnames(hospital_spending)=make.names(meta$view$columns$name) hospital_spending=select(hospital_spending,-c(sid:meta)) glimpse(hospital_spending) Observations: 70598 Variables: $ Hospital.Name ? ? ? ? ? ? ? ? ? ? ? (fctr) SOUTHEAST ALABAMA MEDICAL CENT... $ Provider.Number. ? ? ? ? ? ? ? ? ? ?(fctr) 010001,010001,010001... $ State ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? (fctr) AL,AL,AL... $ Period ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?(fctr) 1 to 3 days Prior to Index Hos... $ Claim.Type ? ? ? ? ? ? ? ? ? ? ? ? ?(fctr) Home Health Agency,Hospice,I... $ Avg.Spending.Per.Episode..Hospital. (fctr) 12,1,6,160,462,0... $ Avg.Spending.Per.Episode..State. ? ?(fctr) 14,85,2,9,492,... $ Avg.Spending.Per.Episode..Nation. ? (fctr) 13,5,117,532,0... $ Percent.of.Spending..Hospital. ? ? ?(fctr) 0.06,0.01,0.03,0.84,... $ Percent.of.Spending..State. ? ? ? ? (fctr) 0.07,0.46,... $ Percent.of.Spending..Nation. ? ? ? ?(fctr) 0.07,0.00,0.58,... $ Measure.Start.Date ? ? ? ? ? ? ? ? ?(fctr) 2014-01-01T00:00:00,2014-01-0... $ Measure.End.Date ? ? ? ? ? ? ? ? ? ?(fctr) 2014-12-31T00:00:00,2014-12-3... 如上所示,导入的所有数据列都是因子型数据。下面我们将列中数据为数值的列改为数值型数据: cols = 6:11; # 需要改变数据类型的列 hospital_spending[,cols] <- lapply(hospital_spending[,cols],as.character) hospital_spending[,as.numeric) 最后两列数据分别是数据收集的起始日期和结束日期。因此,我们需要使用 cols = 12:13; # 需要纠正的两列 hospital_spending[,ymd_hms) 现在,我们可以检查下数据列是否是我们想要的数据类型: sapply(hospital_spending,class) $Hospital.Name ? ?"factor" $Provider.Number. ? ?"factor" $State ? ?"factor" $Period ? ?"factor" $Claim.Type ? ?"factor" $Avg.Spending.Per.Episode..Hospital. ? ?"numeric" $Avg.Spending.Per.Episode..State. ? ?"numeric" $Avg.Spending.Per.Episode..Nation. ? ?"numeric" $Percent.of.Spending..Hospital. ? ?"numeric" $Percent.of.Spending..State. ? ?"numeric" $Percent.of.Spending..Nation. ? ?"numeric" $Measure.Start.Date ? ? ? ?"POSIXct" "POSIXt" $Measure.End.Date ? ? ? ?"POSIXct" "POSIXt" 创建data.table类型数据使用 hospital_spending_DT = data.table(hospital_spending) class(hospital_spending_DT) "data.table" "data.frame" 选取数据集的某些列对于选取数据列,我们可以使用 选取一个变量选取“Hospital Name”列: from_dplyr = select(hospital_spending,Hospital.Name) from_data_table = hospital_spending_DT[,.(Hospital.Name)] 现在,我们对比下 compare(from_dplyr,from_data_table,allowAll=TRUE) TRUE dropped attributes 删除一个变量from_dplyr = select(hospital_spending,-Hospital.Name) from_data_table = hospital_spending_DT[,!c("Hospital.Name"),with=FALSE] compare(from_dplyr,allowAll=TRUE) TRUE dropped attributes 我们也可以使用 DT=copy(hospital_spending_DT) DT=DT[,Hospital.Name:=NULL] "Hospital.Name"%in%names(DT)FALSE 我们也可以一次性删除多个变量: DT=copy(hospital_spending_DT) DT=DT[,c("Hospital.Name","State","Measure.Start.Date","Measure.End.Date"):=NULL] ?c("Hospital.Name","Measure.End.Date") %in% names(DT) FALSE FALSE FALSE FALSE 选取多个变量选取如下变量:“Hospital.Name”,“State”,“Measure.Start.Date”,“Measure.End.Date” from_dplyr = select(hospital_spending,Hospital.Name,State,Measure.Start.Date,Measure.End.Date) from_data_table = hospital_spending_DT[,.(Hospital.Name,Measure.End.Date)] compare(from_dplyr,allowAll=TRUE) TRUE dropped attributes 删除多个变量现在,我们要删除hospital_spending数据框和data.table类型数据hospital_spending_DT中的变量Hospital.Name,State,Measure.Start.Date,Measure.End.Date: from_dplyr = select(hospital_spending,-c(Hospital.Name,Measure.End.Date)) from_data_table = hospital_spending_DT[,!c("Hospital.Name","Measure.End.Date"),with=FALSE] compare(from_dplyr,allowAll=TRUE) TRUE dropped attributes
from_dplyr = select(hospital_spending,contains("Date")) from_data_table = subset(hospital_spending_DT,select=grep("Date",names(hospital_spending_DT))) compare(from_dplyr,allowAll=TRUE) TRUE dropped attributes names(from_dplyr) "Measure.Start.Date" "Measure.End.Date" 重命名列名setnames(hospital_spending_DT,c("Hospital","Start_Date","End_Date")) names(hospital_spending_DT) "Hospital" "Provider.Number." "State" "Period" "Claim.Type" "Avg.Spending.Per.Episode..Hospital." "Avg.Spending.Per.Episode..State." "Avg.Spending.Per.Episode..Nation." "Percent.of.Spending..Hospital." "Percent.of.Spending..State." "Percent.of.Spending..Nation." "Start_Date" "End_Date" hospital_spending = rename(hospital_spending,Hospital= Hospital.Name,Start_Date=Measure.Start.Date,End_Date=Measure.End.Date) compare(hospital_spending,hospital_spending_DT,allowAll=TRUE) TRUE ?dropped attributes 筛选数据集的某些行对于数据集特定行的筛选,我们可以使用 对单个变量进行筛选from_dplyr = filter(hospital_spending,State=='CA') # selecting rows for California from_data_table = hospital_spending_DT[State=='CA'] compare(from_dplyr,allowAll=TRUE) TRUE ?dropped attributes 对多个变量进行筛选from_dplyr = filter(hospital_spending,State=='CA' & Claim.Type!="Hospice") from_data_table = hospital_spending_DT[State=='CA' & Claim.Type!="Hospice"] compare(from_dplyr,allowAll=TRUE) TRUE ?dropped attributes from_dplyr = filter(hospital_spending,State %in% c('CA','MA',"TX")) from_data_table = hospital_spending_DT[State %in% c('CA',"TX")] unique(from_dplyr$State) CA MA TX compare(from_dplyr,allowAll=TRUE) TRUE dropped attributes 数据排序我们使用 升序from_dplyr = arrange(hospital_spending,State) from_data_table = setorder(hospital_spending_DT,State) compare(from_dplyr,allowAll=TRUE) TRUE dropped attributes 降序from_dplyr = arrange(hospital_spending,desc(State)) from_data_table = setorder(hospital_spending_DT,-State) compare(from_dplyr,allowAll=TRUE) TRUE dropped attributes 对多变量进行排序以下代码实现了State变量升序,End_Date变量降序排序: from_dplyr = arrange(hospital_spending,desc(End_Date)) from_data_table = setorder(hospital_spending_DT,-End_Date) compare(from_dplyr,allowAll=TRUE) TRUE dropped attributes 添加或更新列在 from_dplyr = mutate(hospital_spending,diff=Avg.Spending.Per.Episode..State. - Avg.Spending.Per.Episode..Nation.) from_data_table = copy(hospital_spending_DT) from_data_table = from_data_table[,diff := Avg.Spending.Per.Episode..State. - Avg.Spending.Per.Episode..Nation.] compare(from_dplyr,allowAll=TRUE) TRUE sorted renamed rows dropped row names dropped attributes from_dplyr = mutate(hospital_spending,diff1=Avg.Spending.Per.Episode..State. - Avg.Spending.Per.Episode..Nation.,diff2=End_Date-Start_Date) from_data_table = copy(hospital_spending_DT) from_data_table = from_data_table[,c("diff1","diff2") := list(Avg.Spending.Per.Episode..State. - Avg.Spending.Per.Episode..Nation.,diff2=End_Date-Start_Date)] compare(from_dplyr,allowAll=TRUE) TRUE dropped attributes 数据汇总我们可以使用 summarize(hospital_spending,mean=mean(Avg.Spending.Per.Episode..Nation.)) mean 1820.409 hospital_spending_DT[,.(mean=mean(Avg.Spending.Per.Episode..Nation.))] mean 1820.409 summarize(hospital_spending,mean=mean(Avg.Spending.Per.Episode..Nation.),? ? ? ? ? ? ? ? ? ? ? ? ? ?maximum=max(Avg.Spending.Per.Episode..Nation.),? ? ? ? ? ? ? ? ? ? ? ? ? ?minimum=min(Avg.Spending.Per.Episode..Nation.),? ? ? ? ? ? ? ? ? ? ? ? ? ?median=median(Avg.Spending.Per.Episode..Nation.)) mean ? ? maximum ? minimum ?median 1820.409 ?20025 ? ? ? 0 ? ? ?109 hospital_spending_DT[,.(mean=mean(Avg.Spending.Per.Episode..Nation.),? ? ? ? ? ? ? ? ? ? ? ?maximum=max(Avg.Spending.Per.Episode..Nation.),? ? ? ? ? ? ? ? ? ? ? ?minimum=min(Avg.Spending.Per.Episode..Nation.),? ? ? ? ? ? ? ? ? ? ? ?median=median(Avg.Spending.Per.Episode..Nation.))] mean ? ? ?maximum ? minimum ?median 1820.409 ?20025 ? ? ? 0 ? ? ?109 当然,我们也可以对各分组的数据块分别求概述性统计量。在 head(hospital_spending_DT[,.(mean=mean(Avg.Spending.Per.Episode..Hospital.)),by=.(Hospital)]) mygroup= group_by(hospital_spending,Hospital) from_dplyr = summarize(mygroup,mean=mean(Avg.Spending.Per.Episode..Hospital.)) from_data_table=hospital_spending_DT[,by=.(Hospital)] compare(from_dplyr,allowAll=TRUE) TRUE ?sorted ?renamed rows ?dropped row names ?dropped attributes 我们也可以对多个分组条件进行分组计算: head(hospital_spending_DT[,? ? ? ? ? ? ? ? ? ? ? ? ?by=.(Hospital,State)]) mygroup= group_by(hospital_spending,Hospital,State) from_dplyr = summarize(mygroup,by=.(Hospital,State)] compare(from_dplyr,allowAll=TRUE) TRUE ?sorted ?renamed rows ?dropped row names ?dropped attributes 链式操作在 from_dplyr=hospital_spending %>% group_by(Hospital,State) %>% summarize(mean=mean(Avg.Spending.Per.Episode..Hospital.)) from_data_table=hospital_spending_DT[,allowAll=TRUE) TRUE ?sorted ?renamed rows ?dropped row names ?dropped attributes hospital_spending %>% group_by(State) %>% summarize(mean=mean(Avg.Spending.Per.Episode..Hospital.)) %>% arrange(desc(mean)) %>% head(10) %>% ? ? ? ?mutate(State = factor(State,levels = State[order(mean,decreasing =TRUE)])) %>% ? ? ? ? ?ggplot(aes(x=State,y=mean))+geom_bar(stat='identity',color='darkred',fill='skyblue')+ ? ? ? ? ?xlab("")+ggtitle('Average Spending Per Episode by State')+ ? ? ? ? ?ylab('Average')+ coord_cartesian(ylim = c(3800,4000)) hospital_spending_DT[,? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? by=.(State)][order(-mean)][1:10] %>% ? ? ? ? ? ?mutate(State = factor(State,decreasing =TRUE)])) %>% ? ? ? ? ? ggplot(aes(x=State,4000)) 总结在本篇文章,我们展示了使用 (编辑:李大同) 【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时与联系站长删除相关内容! |