大数据处理之一

发布时间：2020-12-14 02:33:46 所属栏目：大数据来源：网络整理

导读：搜索引擎会通过日志文件把用户每次检索使用的所有检索串都记录下来，每个查询串的长度为1-255字节。假设目前有一千万个记录（这些查询串的重复度比较高，虽然总数是1千万，但如果除去重复后，不超过3百万个。一个查询串的重复度越高，说明查询它的用户越多，

搜索引擎会通过日志文件把用户每次检索使用的所有检索串都记录下来，每个查询串的长度为1-255字节。假设目前有一千万个记录（这些查询串的重复度比较高，虽然总数是1千万，但如果除去重复后，不超过3百万个。一个查询串的重复度越高，说明查询它的用户越多，也就是越热门），请你统计最热门的10个查询串，要求使用的内存不能超过1G。

思路：每个查询串的长度不超过255个字节，统计时需要对这些查询串逐条进行查找和计数，所以首先要估算一下数据规模，再看看我们应该怎么进行查找
1000 0000 *256=25 6000 0000（保证最后有一个作为字符串结束标记）
1G也即 1024*1024*1024 = 10 7374 1824 字节
我们来看下根据内存要求，每一次处理最多可以处理1G，而全部数据一共 25 6000 0000 字节，25 6000 0000/10 7374 1824≈2.4，也就是说至少需要分3次处理
为了留出余量，我们在1G内存的要求下把数据分成5次运行，保证每一次运行使用的内存都小于1G

每一次运行处理 25 6000 0000/5 = 5 1200 0000 字节，按每行256字节计算，也就是 5 1200 0000/256 = 200 0000 行数据
现在讨论下：fstream进行

1、所以使用内存映射文件进行大文件的读取
2、首先呢，将51 200 0000个字符放在内存当中
3、查找换行符，每当找到第200 0000的整数倍个换行符的时候，对这一段数据进行处理
4、然后呢，我们将所有的查询字符串进行哈希值计算，然后进行对10求余，将查询字符串分发在i.txt文件中(i是0~9)，
   注意，这里面记录了所有查询字符串的访问次数
5、然后遍历这10个文件，依次对每个文件中的查询字符串进行次数统计，并将每个文件中频度排名前10的查询字符串放在result_i.txt当中

6、读取所有的result_i.txt文档，然后在这排名前100的查询字符串中找到排名前10的放在result.txt当中

// ConsoleApplication1.cpp : Defines the entry point for the console application.
//

#include "stdafx.h"

#include<assert.h>
#include<algorithm>
#include<cstdio>
#include<cstdlib>
#include<cstring>
#include<fstream>
#include<hash_map>
#include<iostream>
#include<map>
#include<string>
#include<utility>
#include<vector>

#include<Windows.h>

#include"Header.h"
using namespace std;
// Name of the input file that main() opens.
#define FILENAME "proc.txt"
// Nominal number of bytes handled per chunk (512,000,000).
const int read_byte_count = 512000000;
// 2^32: one more than the largest value an unsigned 32-bit integer can hold.
const _int64 max_ul_int =   4294967296;
// Byte offset in the input file at which the current chunk starts.
_int64 file_start_pos;
// Byte offset in the input file at which the current chunk ends; proc_str
// treats (file_end_pos - file_start_pos) as the chunk length.
_int64 file_end_pos;
// Functor for NUL-terminated C strings that bundles a content-equality test
// (two-argument call) with a simple additive hash (one-argument call).
// NOTE(review): this file passes it as the third template argument of
// hash_map<char*, int, ...>. MSVC documents that argument as a
// hash_compare-style traits class (ordering predicate plus bucket_size /
// min_buckets) -- confirm this functor meets that contract before relying on it.
struct str_equal
{
	// Returns true when `a` and `b` hold the same characters.
	bool operator()(const char* a, const char* b) const
	{
		return strcmp(a, b) == 0;
	}

	// Returns the sum of the bytes of `a`, reduced modulo 100.
	int operator()(const char* a) const
	{
		int sum = 0;
		for (int i = 0; a[i] != '\0'; ++i)
		{
			// Sum bytes as unsigned so characters with the high bit set
			// cannot drive the result negative.
			sum += static_cast<unsigned char>(a[i]);
		}
		return sum % 100;
	}
};

// Maps a NUL-terminated query string to a partition index.
//
// @param str  NUL-terminated string to hash.
// @return     Sum of the string's bytes modulo 10, always in [0, 9].
int hash_char(const char* str)
{
	int sum = 0;
	for (int i = 0; str[i] != '\0'; ++i)
	{
		// Bytes are added as unsigned values: with a signed `char`, text
		// containing bytes >= 0x80 would make the sum (and therefore the
		// returned index) negative.
		sum += static_cast<unsigned char>(str[i]);
	}
	return sum % 10;
}
// Appends the characters of the NUL-terminated string `str`, exactly as
// given, to the file whose name is the decimal text of `rand`.
// Nothing is added after the text; any separator must already be part of `str`.
void save2txt(char* str,int rand)
{
	// The file name is simply the number printed in base 10.
	char file_name[20] = { 0 };
	sprintf(file_name, "%d", rand);

	// ios::app positions every write at the current end of the file.
	ofstream sink(file_name, ios::app);
	const size_t byte_total = strlen(str);
	sink.write(str, byte_total);
	sink.close();
}
// Splits the current chunk into newline-delimited query strings and appends
// each one to the partition file selected by hash_char().
//
// @param str  Address of the first byte of the chunk. The chunk length is
//             taken from the globals as file_end_pos - file_start_pos.
//
// Each record is handed to save2txt() followed by a single '\n', so every
// record ends up on its own line in its partition file.
void proc_str(LPVOID & str)
{
	// Longest query string kept per record; longer lines are truncated so the
	// copy below can never run past the end of `buf`.
	const int max_query_len = 255;

	const char* data = (const char*)str;
	// Length of the chunk handled in this pass.
	const _int64 str_length = file_end_pos - file_start_pos;
	if (data == NULL || str_length <= 0)
	{
		return;
	}

	_int64 index = 0;	// offset of the first byte of the record being scanned
	for (_int64 byte_count = 0; byte_count <= str_length; byte_count++)
	{
		// byte_count == str_length is a virtual terminator, so a final
		// record that lacks a trailing newline is still processed.
		const bool at_chunk_end = (byte_count == str_length);
		if (!at_chunk_end && data[byte_count] != '\n')
		{
			continue;
		}

		const char* record = data + index;
		_int64 length = byte_count - index;
		index = byte_count + 1;	// the next record starts after the newline

		// Drop the '\r' of a CRLF line ending.
		if (length > 0 && record[length - 1] == '\r')
		{
			length--;
		}
		if (length <= 0)
		{
			continue;	// blank line, or nothing after the last newline
		}
		if (length > max_query_len)
		{
			length = max_query_len;
		}

		// Room for the record, its '\n' separator and the terminating NUL.
		char buf[max_query_len + 2] = { 0 };
		memcpy(buf, record, static_cast<size_t>(length));

		// Hash the bare query first, then append the separator that keeps
		// records apart inside the partition file.
		int a = hash_char(buf);
		buf[strlen(buf)] = '\n';
		save2txt(buf, a);
	}
}
// Shrinks the current chunk so that it ends immediately after its
// 2,000,000th newline character.
//
// @param lpBase  Address of the first byte of the chunk (the byte located at
//                file_start_pos).
//
// On entry file_end_pos must hold the provisional end of the chunk; the scan
// never goes past it. On return file_end_pos is file_start_pos plus the
// number of bytes scanned: either just past the 2,000,000th newline, or the
// provisional end when fewer newlines are present.
// NOTE(review): if the provisional end is reached in the middle of a line that
// continues in the file, that line is split across two chunks -- this assumes
// lines are at most 256 bytes including the newline; confirm for real input.
void rePosition(LPVOID lpBase)
{
	const int lines_per_chunk = 2000000;

	const char* mm = (const char*)lpBase;
	const _int64 limit = file_end_pos - file_start_pos;
	if (mm == NULL || limit <= 0)
	{
		// Nothing readable: report an empty chunk.
		file_end_pos = file_start_pos;
		return;
	}

	int Enter_num = 0;
	_int64 byte_count = 0;
	// The explicit bound replaces the former catch (...) around the read:
	// reading past the end of a buffer is not reported as a C++ exception in
	// standard C++, so it could not be relied on to stop the scan.
	while (byte_count < limit && Enter_num != lines_per_chunk)
	{
		if (mm[byte_count] == '\n')
		{
			Enter_num++;
		}
		byte_count++;
	}
	file_end_pos = file_start_pos + byte_count;
}
// Maps the next chunk of the input file into memory, trims it to a whole
// number of lines with rePosition(), distributes its records with proc_str(),
// and advances file_start_pos to the end of the processed chunk.
//
// @param hFile  Open handle of the input file; must allow read access.
//
// When nothing is left to read, or when mapping fails, file_end_pos is set to
// file_start_pos and file_start_pos is left unchanged, so callers can detect
// that no progress was made.
void getFilePointer(HANDLE& hFile)
{
	assert(hFile != NULL && hFile != INVALID_HANDLE_VALUE);

	LARGE_INTEGER file_size;
	if (!::GetFileSizeEx(hFile, &file_size) || file_start_pos >= file_size.QuadPart)
	{
		file_end_pos = file_start_pos;
		return;
	}

	// A view must start at a multiple of the allocation granularity, but
	// chunks start wherever the previous one ended. Map from the aligned
	// offset just below file_start_pos and skip the leading bytes.
	SYSTEM_INFO sys_info;
	::GetSystemInfo(&sys_info);
	const _int64 granularity = sys_info.dwAllocationGranularity;
	const _int64 map_offset = file_start_pos - (file_start_pos % granularity);
	const _int64 lead = file_start_pos - map_offset;

	// Never ask for more bytes than the file still contains.
	_int64 chunk = file_size.QuadPart - file_start_pos;
	if (chunk > read_byte_count)
	{
		chunk = read_byte_count;
	}
	file_end_pos = file_start_pos + chunk;

	// Maximum size 0/0 makes the mapping object cover the whole file.
	HANDLE hMap = ::CreateFileMapping(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
	if (hMap == NULL)
	{
		file_end_pos = file_start_pos;
		return;
	}

	// The 64-bit offset is passed as separate high and low DWORDs, which
	// covers offsets below and above 4 GB alike.
	const DWORD offset_high = static_cast<DWORD>(map_offset >> 32);
	const DWORD offset_low = static_cast<DWORD>(map_offset & 0xFFFFFFFF);
	LPVOID view = ::MapViewOfFile(hMap, FILE_MAP_READ, offset_high, offset_low,
		static_cast<SIZE_T>(lead + chunk));
	if (view == NULL)
	{
		::CloseHandle(hMap);
		file_end_pos = file_start_pos;
		return;
	}

	LPVOID lpBase = static_cast<char*>(view) + lead;
	// Cut the chunk after its 2,000,000th newline, then process it.
	rePosition(lpBase);
	proc_str(lpBase);

	::UnmapViewOfFile(view);
	::CloseHandle(hMap);
	file_start_pos = file_end_pos;
}
int main()
{
	add(5,6);
	file_start_pos = 0;
	file_end_pos = 0;
	HANDLE hFile = ::CreateFileA(FILENAME,GENERIC_ALL,FILE_SHARE_READ,FILE_ATTRIBUTE_NORMAL,NULL);
	for (int i = 0; i < 5; i++)
	{
		getFilePointer(hFile);
	}
	CloseHandle(hFile);
	hash_map<char*,int,str_equal> hash_obj;
	hash_map<char*,str_equal>::iterator itr;
	//最后开始对0-9的txt文档
	for (int i = 0; i < 10; i++)
	{
		hash_obj.clear();
		char buf[20] = { 0 };
		sprintf(buf,i);
		ifstream in_txt;
		in_txt.open(buf);
		//开始对每一个文件的查询字符串进行统计频次
		char con_buf[256] = { 0 };
		in_txt.getline(con_buf,256);
		itr = hash_obj.find(con_buf);
		if (itr == hash_obj.end())
		{
			//不存在这个查询字符串
			pair<char*,int> tmp;
			tmp.first = con_buf;
			tmp.second = 1;
			hash_obj.insert(tmp);
		}
		else
		{
			//存在查询字符串，修改当前频度
			itr->second++;
		}
		in_txt.close();
		//修改完成之后将所有hash_map中的内容放在结果txt当中
		//使用堆排序进行完成，放在i对应的result_i.txt 当中
		int ele_count = hash_obj.size();
		itr = hash_obj.begin(); 
		hash_map<char*,str_equal>::iterator the_max;
		the_max = hash_obj.begin();
		//将前10个
		for (int j = 0; j<10; j++)
		{
			for (itr=hash_obj.begin(); itr != hash_obj.end(); itr++)
			{
				if (itr->second >(the_max->second))
				{
					the_max = itr;
					
				}
			}
			//将记录放在result_i.txt当中
			ofstream result_i;
			char result_txt[20];
			sprintf(result_txt,"result_%d.txt",j);
			result_i.open(result_txt,ios::app);
			result_i.write(the_max->first,strlen(the_max->first));
			result_i.write("n",strlen("n"));
			char mm[5] = { 0 };
			sprintf(mm,the_max->second);
			result_i.write(mm,strlen(mm));
			result_i.write("n",strlen("n"));
			result_i.close();
			hash_obj.erase(the_max);
		}
	}
	//最后从result_0~result_9这10个文件中读取出前十个
	hash_map<char*,str_equal> result_hash;
	for (int i = 0; i < 10; i++)
	{
		char file_name[20] = { 0 };
		sprintf(file_name,"result_i",i);
		ifstream in_txt;
		in_txt.open(file_name);
		char buf[256] = { 0 };
		in_txt.getline(buf,256);
		char uh[10] = { 0 };
		in_txt.getline(uh,10);
		int value = atoi(uh);
		pair<char*,int> mm;
		mm.first = buf;
		mm.second = value;
		result_hash.insert(mm);
	}
	//最后根据冒泡法，进行前十的输出
	itr = result_hash.begin();
	hash_map<char*,str_equal>::iterator the_max;

	for (int i = 0; i < 10; i++)
	{
		the_max = result_hash.begin();
		for (itr = result_hash.begin(); itr != result_hash.end(); itr++)
		{
			if (the_max->second < itr->second)
			{
				the_max = itr;
			}
		}
		ofstream result;
		result.open("result.txt",ios::app);
		result.write(the_max->first,strlen(the_max->first));
		result.write("n",strlen("n"));
		char buf[20] = { 0 };
		sprintf(buf,the_max->second);
		result.write(buf,20);
		result.write("n",strlen("n"));
		result.close();

	}
}

（编辑：李大同）

【声明】本站内容均来自网络，其相关言论仅代表作者个人观点，不代表本站立场。若无意侵犯到您的权利，请及时与联系站长删除相关内容!