R 笔记:大型数据文件流读取与写入
发布时间:2020-12-14 03:05:30 所属栏目:大数据 来源:网络整理
导读:# ------------clear existed variants------------ rm(list=ls()) START_TIME <- Sys.time() StopWatch <- function(start_time){ dt <- difftime(Sys.time(),start_time,units='secs') print(paste('Time Cost',format(.POSIXct(dt,tz="GMT"),"%H:%M:%S"),sep=': ')) } ……
# ------------ streaming down-sampling of a large ';'-separated file ----------
# Reads <path>/data/data.csv line by line (never loading it whole), keeps every
# row whose timestamp (first field) lies within `border` hours of the first or
# the last data row, thins out the rows in between with a fixed stride, and
# writes the result to <path>/resample.csv.
#
# The script deliberately does not clear the workspace, change the working
# directory, or terminate the R session; all paths are built explicitly.
# Rows are assumed to be in chronological order -- TODO confirm for your data.

START_TIME <- Sys.time()

# Print the wall-clock time elapsed since `start_time` as "Time Cost: HH:MM:SS".
StopWatch <- function(start_time) {
  dt <- difftime(Sys.time(), start_time, units = "secs")
  elapsed <- format(.POSIXct(as.numeric(dt), tz = "GMT"), "%H:%M:%S")
  print(paste("Time Cost", elapsed, sep = ": "))
}

# Tell whether a row is far enough from a boundary to be thinned out.
#
# mtime  : timestamp of the row (anything difftime() accepts).
# btime  : boundary timestamp to compare against.
# border : distance threshold in hours.
# Returns TRUE when |mtime - btime| is strictly greater than `border` hours.
resample <- function(mtime, btime, border) {
  abs(as.numeric(difftime(mtime, btime), units = "hours")) > border
}

# Extract the first ';'-separated field of one line.
first_field <- function(line) {
  strsplit(line, split = ";", fixed = TRUE)[[1]][1]
}

# Pass 1: count the lines and find the first/last data timestamps.
# Returns list(n_lines, stime, etime); n_lines includes the header line.
scan_time_range <- function(filename) {
  con <- file(filename, open = "r")
  on.exit(close(con), add = TRUE)

  line <- readLines(con, n = 1)
  if (length(line) == 0) {
    stop("Data file is empty: ", filename, call. = FALSE)
  }
  print("Titles>>>")
  print(strsplit(line, split = ";", fixed = TRUE)[[1]])

  n_lines <- 0
  stime <- NULL
  last_row <- NULL
  while (length(line) != 0) {
    n_lines <- n_lines + 1
    if (grepl(";", line, fixed = TRUE)) {
      # Line 1 is the header; only later lines carry timestamps.
      if (n_lines > 1) {
        last_row <- line
        if (is.null(stime)) {
          stime <- first_field(line)
          print(paste("startTime", stime, sep = ": "))
        }
      }
    } else {
      print("Unexpected line:")
      print(n_lines)
      print(line)
    }
    line <- readLines(con, n = 1)
  }

  if (is.null(last_row)) {
    stop("No data rows found in: ", filename, call. = FALSE)
  }
  list(n_lines = n_lines, stime = stime, etime = first_field(last_row))
}

# Pass 2: count the rows lying within `border` hours of the start (head) and
# of the end (tail). These are the rows that are later copied unthinned, so a
# row counts when resample() is FALSE for the respective boundary.
count_boundary_rows <- function(filename, stime, etime, border) {
  con <- file(filename, open = "r")
  on.exit(close(con), add = TRUE)

  invisible(readLines(con, n = 1)) # skip the header
  head_count <- 0
  tail_count <- 0
  repeat {
    line <- readLines(con, n = 1)
    if (length(line) == 0) {
      break
    }
    if (grepl(";", line, fixed = TRUE)) {
      mtime <- first_field(line)
      if (!resample(mtime, stime, border)) {
        head_count <- head_count + 1
      }
      if (!resample(mtime, etime, border)) {
        tail_count <- tail_count + 1
      }
    }
  }
  list(head = head_count, tail = tail_count)
}

# Stride for the middle section so that kept + thinned rows fit in `max_rows`.
# Always returns a finite value >= 1, so `%% interval` is well defined.
compute_interval <- function(n_rows, n_kept, max_rows) {
  n_middle <- n_rows - n_kept
  if (n_middle <= 0) {
    return(1)
  }
  budget <- max_rows - n_kept
  if (budget <= 0) {
    warning(
      "Head and tail rows alone exceed the row budget; ",
      "keeping a single row from the middle section.",
      call. = FALSE
    )
    return(n_middle)
  }
  max(1, ceiling(n_middle / budget))
}

# Pass 3: copy the header, the head rows and the tail rows verbatim and every
# `interval`-th row of the middle section.
write_resampled <- function(filename, out_filename, n_lines,
                            head_count, tail_count, interval) {
  in_con <- file(filename, open = "r")
  on.exit(close(in_con), add = TRUE)
  out_con <- file(out_filename, open = "w")
  on.exit(close(out_con), add = TRUE)

  idx <- 0
  middle_seen <- 0
  repeat {
    line <- readLines(in_con, n = 1)
    if (length(line) == 0) {
      break
    }
    idx <- idx + 1
    # "+ 1" accounts for the header occupying line 1.
    in_head <- idx <= head_count + 1
    in_tail <- idx > n_lines - tail_count
    if (in_head || in_tail) {
      writeLines(line, out_con)
    } else {
      if (middle_seen %% interval == 0) {
        writeLines(line, out_con)
      }
      middle_seen <- middle_seen + 1
    }
  }
  invisible(NULL)
}

# Run the three passes on `filename` and write the result to `out_filename`.
#
# border     : hours around the first/last timestamp that are kept in full.
# max_rows   : target row budget for the output (default 1048570, presumably
#              chosen to stay below Excel's row limit -- TODO confirm).
# start_time : reference time for the StopWatch() progress reports.
# Returns (invisibly) list(n_lines, head, tail, interval).
resample_file <- function(filename, out_filename, border = 24,
                          max_rows = 1048570, start_time = Sys.time()) {
  range_info <- scan_time_range(filename)
  print(paste("endTime", range_info$etime, sep = ": "))
  # Number of lines read, header included.
  print(paste("Count", range_info$n_lines, sep = ": "))
  StopWatch(start_time)

  kept <- count_boundary_rows(
    filename, range_info$stime, range_info$etime, border
  )
  interval <- compute_interval(
    range_info$n_lines - 1, kept$head + kept$tail, max_rows
  )
  print(paste("Interval", interval, sep = ": "))
  StopWatch(start_time)

  write_resampled(
    filename, out_filename, range_info$n_lines,
    kept$head, kept$tail, interval
  )
  StopWatch(start_time)

  invisible(list(
    n_lines = range_info$n_lines,
    head = kept$head,
    tail = kept$tail,
    interval = interval
  ))
}

# ------------ script entry ------------
path <- "C:/Users/Public/Data Analysis in R"
if (!dir.exists(path)) {
  print("Creating work directory...")
  dir.create(path, recursive = TRUE)
}

data_folder <- "data"
filename <- file.path(path, data_folder, "data.csv")
border <- 24

if (!file.exists(filename)) {
  # Report and skip the processing instead of killing the R session.
  print("Data file does not exists.")
} else {
  resample_file(
    filename,
    file.path(path, "resample.csv"),
    border = border,
    max_rows = 1048570,
    start_time = START_TIME
  )
}

# Author's notes:
# 1. A very large CSV file cannot be opened in Microsoft Excel, and read.csv()
#    cannot load all of it in R either, so readLines() and writeLines() are
#    used to stream the file and reduce memory consumption.
# 2. No better way has been found yet to locate the start/end positions in the
#    data file, so the file has to be traversed twice.
# (Editor: Li Datong)
# [Disclaimer] The content of this site comes from the Internet; the views
# expressed are those of the author only and do not represent the position of
# this site. If your rights have been unintentionally infringed, please
# contact the webmaster promptly to have the content removed. |