R 笔记：大型数据文件流读取与写入

发布时间：2020-12-14 03:05:30 　所属栏目：大数据 　来源：网络整理

导读：# ------------clear existed variants------------ rm(list=ls()); START_TIME <- Sys.time(); StopWatch <- function(start_time){ dt <- difftime(Sys.time(),start_time,units='secs'); print(paste('Time Cost',format(.POSIXct(dt,tz="GMT"),"%H:%M:%S"),sep=': ')) } ……

# ------------clear existing variables------------
# Remove all (non-hidden) objects from the current workspace so the script
# does not pick up values left over from an earlier run.
rm(list=ls())

# Time at which the script started; passed to StopWatch() after each pass
# to report the elapsed time so far.
START_TIME <- Sys.time()
# Print the wall-clock time elapsed since `start_time` as "Time Cost: HH:MM:SS".
#
# start_time: a date-time (e.g. the value of Sys.time()) marking when timing began.
# Returns the printed string invisibly (the value of print()).
#
# The hour field is computed arithmetically instead of formatting the elapsed
# seconds as a clock time, so a run of 24 hours or more is reported as e.g.
# "25:00:00" rather than wrapping around to "01:00:00".
StopWatch <- function(start_time){
	dt <- difftime(Sys.time(),start_time,units='secs')
	# Whole seconds only (fractions are dropped); clamp at zero so a start
	# time slightly in the future never produces a negative duration.
	secs <- max(0, floor(as.numeric(dt, units = 'secs')))
	elapsed <- sprintf('%02d:%02d:%02d', secs %/% 3600, (secs %% 3600) %/% 60, secs %% 60)
	print(paste('Time Cost', elapsed, sep = ': '))
}

# Working directory for the analysis; created on first use.
path <- "C:/Users/Public/Data Analysis in R"
if (!dir.exists(path)){
	print("Creating work directory...")
	# recursive = TRUE so a missing parent directory does not make creation fail.
	dir.create(path, recursive = TRUE)
}
setwd(path)

# Input file, resolved relative to the working directory set above.
data_folder <- "data"
filename <- "data.csv"
filename <- file.path(data_folder, filename)

if (!file.exists(filename)){
	# stop() aborts the script with an error (non-zero exit status under
	# Rscript) and leaves an interactive session open; quit() would end the
	# whole R session and report success to the caller.
	stop('Data file does not exists.', call. = FALSE)
}

# Tell whether a timestamp lies farther than `border` hours from a reference.
#
# mtime:  timestamp to test (any value difftime() accepts).
# btime:  reference timestamp.
# border: threshold, in hours.
# Returns TRUE when the absolute gap between mtime and btime exceeds border hours.
resample <- function(mtime,btime,border){
	gap <- difftime(mtime,btime)
	gap_hours <- as.numeric(gap,units='hours')
	abs(gap_hours) > border
}

# Window size in hours, and the counters filled in by the later passes.
border <- 24
resample_length_first <- 0
resample_length_second <- 0
# Starts at 1 and is incremented after every read, including the final empty
# read, so it ends one greater than the number of lines in the file.
data_length <- 1

# First pass: print the header fields, find the first field of line 2 (start
# time) and of the last ';'-separated line (end time), and count the lines.
fcon <- file(filename,open='r')
line <- readLines(fcon,n=1)
print('Titles>>>')
print(strsplit(line,split=';')[[1]])
repeat{
	# readLines() returns a zero-length vector at end of file.
	if(length(line) == 0){
		break
	}
	if(!grepl(";",line)){
		# Lines without a field separator are reported and otherwise ignored.
		print("Unexpected line:")
		print(data_length)
		print(line)
	}else{
		# Remember the latest well-formed line; after the loop it is the last one.
		mtime <- line
		if (data_length == 2){
			start_fields <- strsplit(mtime,split=';')[[1]]
			stime <- start_fields[1]
			print(paste('startTime',stime,sep=': '))
		}
	}
	line <- readLines(fcon,n=1)
	data_length <- data_length + 1
}
close(fcon)

end_fields <- strsplit(mtime,split=';')[[1]]
etime <- end_fields[1]
print(paste('endTime',etime,sep=': '))
print(paste('Count',data_length,sep=': '))

StopWatch(START_TIME)

# Second pass: count the data rows whose timestamp (first field) lies within
# `border` hours of the start time and within `border` hours of the end time.
# The third pass copies that many leading and trailing rows verbatim.
#
# resample() returns TRUE for timestamps FARTHER than `border` hours from the
# reference, so its result is negated here. Each call also needs its reference
# time (stime / etime); calling it with only (mtime, border) leaves `border`
# missing inside resample() and fails.

# Parse the two reference times once instead of on every row.
start_ref <- as.POSIXct(stime)
end_ref <- as.POSIXct(etime)

scon <- file(filename,open='r')
# The first read consumes the header line, which is not counted.
line <- readLines(scon,n=1)
while(length(line) != 0){
	line <- readLines(scon,n=1)
	if(length(line) > 0 && grepl(";",line)){
		mtime <- strsplit(line,';')[[1]][1]
		row_time <- as.POSIXct(mtime)
		if(!resample(row_time,start_ref,border)){
			resample_length_first <- resample_length_first + 1
		}
		if(!resample(row_time,end_ref,border)){
			resample_length_second <- resample_length_second + 1
		}
	}
}
close(scon)

# Maximum number of rows allowed in the output.
# NOTE(review): presumably chosen to stay just under Excel's row limit - confirm.
max_rows <- 1048570
resample_length <- resample_length_first + resample_length_second
# Keep every `interval`-th row of the middle section so that the middle rows
# plus the verbatim head/tail rows fit into max_rows.
interval <- ceiling((data_length - resample_length) / (max_rows - resample_length))
print(paste("Interval",interval,sep=': '))

StopWatch(START_TIME)

# Third pass: copy the input to "resample.csv". Lines whose index is at most
# resample_length_first, or at least data_length - resample_length_second,
# are copied verbatim; of the lines in between only every `interval`-th one
# is written.
idx <- 1
idx_tmp <- 0
tcon <- file(filename,open='r')
d_con <- file("resample.csv",open='w')
line <- readLines(tcon,n=1)
repeat{
	# readLines() returns a zero-length vector at end of file.
	if(length(line) == 0){
		break
	}
	if(idx > resample_length_first && idx < data_length - resample_length_second){
		# Middle section: idx_tmp counts middle lines seen so far.
		if(idx_tmp %% interval == 0){
			writeLines(line,d_con)
		}
		idx_tmp <- idx_tmp + 1
	}else{
		writeLines(line,d_con)
	}
	line <- readLines(tcon,n=1)
	idx <- idx + 1
}
close(d_con)
close(tcon)

StopWatch(START_TIME)

1. 对于体积较大的 CSV 文件，不仅用 Microsoft Excel 打不开，而且在用 R 处理时，使用 read.csv() 也无法一次性全部读入内存，所以改用 R 中的 readLines() 和 writeLines() 逐行读取和写入，以减少内存消耗。

2. 对数据文件中起止时间段所对应行的定位，暂时想不到好的办法，不得不在写出结果之前先把文件遍历两次。

（编辑：李大同）

【声明】本站内容均来自网络，其相关言论仅代表作者个人观点，不代表本站立场。若无意侵犯到您的权利，请及时联系站长删除相关内容！