关联规则挖掘之Apriori优化

发布时间：2020-12-14 02:58:52 所属栏目：大数据来源：网络整理

导读：一、仔细看代码会发现如果要加入一个L4是很简单的。计算L4的代码如下： /*construct C4...................................................*/vectorNode4 c4;vectorNode4 l4;vectorNode3::iterator ite41=l3.begin();//iter 41,42for(;ite41!=l3.end();it

一、仔细看代码会发现如果要加入一个L4是很简单的。计算L4的代码如下：

	/*construct C4...................................................*/
	vector<Node4> c4;
	vector<Node4> l4;

	vector<Node3>::iterator ite41=l3.begin();//iter 41,42
	for(;ite41!=l3.end();ite41++)
	{
		vector<Node3>::iterator ite42=l3.begin();
		for(;ite42!=l3.end();ite42++)
		{

			if(ite41->index_j==ite42->index_i && ite41->index_k == ite42->index_j)
			{
				Node4 n4(ite41->index_i,ite41->index_j,ite41->index_k,ite42->index_k,0);
				c4.push_back(n4);
			}

		}
	}
	/*countC3.................................................*/
	for(int out=0;out<D;out++)
	{
		for(int i=0;a[out][i]!='';i++)
		{
			int h1 = hash1(a[out][i]);
			for(int j=i+1;a[out][j]!='';j++)
			{
				int h2=hash1(a[out][j]);
				for(int k=j+1;a[out][k]!='';k++)
				{
					int h3=hash1(a[out][k]);
					for(int m=k+1;a[out][m]!='';m++)
					{
						int h4=hash1(a[out][m]);
						vector<Node4>::iterator ite=c4.begin();
						for(;ite!=c4.end();ite++)
						{
							if(ite->index_i ==h1 && ite->index_j == h2 && ite->index_k==h3 && ite->index_m == h4)
								ite->count++;
						}
					}
	
				}
	
			}
		}
	}
	/*prune C4 to l4....................................*/
	l4.clear();
	vector<Node4>::iterator ite4=c4.begin(); //ite4
	for(;ite4!=c4.end();ite4++)
	{
		if(ite4->count>=MinSupCount)
		{
			l4.push_back(*ite4);
		}
	}
	cout<<"print L4..........................."<<endl;
	printNode4(l4);

二、算法优化

2.1 事务性压缩

我们可以发现求L3时至少需要事物D的长度为3.于是增加一个长度为D的一维数组，来存储各个事务的长度。这段代码可以镶嵌在求C1的第一次扫描中。因为第一次基本不用事务限制。

	int lengthOfD[D]={0};
	for(int i=0;i<D;i++)     
	{  
		int length = 0;
		for(int j=0;a[i][j]!='';j++)    
		{
			length++;
		}
		lengthOfD[i]=length;
	}

于是改善的扫描代码如下：

	/*countC3.................................................*/
	for(int out=0;out<D;out++)
	{
		if(lengthOfD[out]<3)
		{
			continue;
		}
		for(int i=0;a[out][i]!='';i++)
		{
			int h1 = hash1(a[out][i]);
			for(int j=i+1;a[out][j]!='';j++)
			{
				int h2=hash1(a[out][j]);
				for(int k=j+1;a[out][k]!='';k++)
				{
					int h3=hash1(a[out][k]);
					vector<Node3>::iterator ite=c3.begin();
					for(;ite!=c3.end();ite++)
					{
						if(ite->index_i ==h1 && ite->index_j == h2 && ite->index_k==h3)
						ite->count++;
					}
				}
	
			}
		}
	}

2.2 杂凑法

由于频繁项集的复杂度基本都在生成C2和count C2的过程，但是在第一次扫描由于内存空间很少被使用，所以可以将第一次扫描和第二次扫描拼凑在一起。我们可以减少construct C2和count C2的过程。但是需要注意的是在裁剪的时候我们需要裁剪掉那些一介子集不是频繁项集的项。这是由于我们没有直接从L1来生成候选项。

	vector<Node2> c2;
    int b[10]={0};
	int  ItemNumber=0;
	
	for(int out=0;out<D;out++)     
	{  
		if(lengthOfD[out]<1)
		{
			continue;
		}
		for(int i=0;a[out][i]!='';i++)         
		{ 
			int h1 = hash1(a[out][i]);

			if(b[h1]==0)
			{
				b[h1]++;
				ItemNumber++;
			}
			else
			{				
				b[h1]++;
			}
			for(int j=i+1;a[out][j]!='';j++)
			{
				int h2 = hash1(a[out][j]);
				if(!searchinC2(c2,h1,h2))
				{
					Node2 n2=Node2(h1,h2,1);
					c2.push_back(n2);
				}
			}
        
		} 
	}
 
	/*prune C2 to L2.......................................................*/
	l2.clear();
	vector<Node2>::iterator ite2=c2.begin(); //ite2
	for(;ite2!=c2.end();ite2++)
	{
		//check l1
		if(ite2->count>=MinSupCount_2&&b[ite2->index_i]>=MinSupCount_1 &&b[ite2->index_j]>=MinSupCount_1)
		{
			l2.push_back(*ite2);
		}
	}

2.3 PCY

由于没有使用过位图，并且目前的书上对于阐述PCY的哈希表不是很清晰也没有具体的示例，我也不太清楚我第一遍扫描使用的哈希表是不是就是PCY的主旨，因为我也确实没有按照A priori来老老实实的实现C1和L1.

2.4 多阶段算法

2.5 多哈希算法

三、有限扫描算法

包括简单的随机化算法、SON算法、Toivonen算法实现起来也并不难，只是有他们的理论，留到以后再来深究。

（编辑：李大同）

【声明】本站内容均来自网络，其相关言论仅代表作者个人观点，不代表本站立场。若无意侵犯到您的权利，请及时与联系站长删除相关内容!