MAX-MINER 频繁模式挖掘 Apriori算法
发布时间:2020-12-14 04:13:15 所属栏目:大数据 来源:网络整理
导读:这几天小白让我做一个max-miner 最大项集的挖掘,一般的算法有apriori和FP-TREE 考虑到用FP-TREE 可能有点复杂就用apriori算法先测试下,在小样本测试的时候速度非常快,当对一个5W行的文本测试效率变得不可承受了,因此对算法进行了分析,假设每行有100个单
这几天小白让我做一个max-miner 最大项集的挖掘,一般的算法有apriori和FP-TREE 考虑到用FP-TREE 可能有点复杂就用apriori算法先测试下,在小样本测试的时候速度非常快,当对一个5W行的文本测试效率变得不可承受了,因此对算法进行了分析,假设每行有100个单词当用apriori算法对3-候选集进行count的时候每一个候选集需要:
5W * 100*3 = 1500W 次的比较 当3-候选集有1000个那么需要150亿次比较效率变得不可接受即使用优化的算法在1天也很难得到结果。因此在样本很大的情况下Apriori算法并不适用。 原始Apriori算法借鉴了别人的一个程序进行了自己的改进,找不到原作者了在这里对作者表示感谢。 下面是代码: #ifndef APRIOR_H #define APRIOR_H #include <vector> #include <map> using namespace std; class aprior{ public: aprior(int supp,string name):support(supp),file(name){} ~aprior(); void openData(); void generate_1Itemset(); void generate_Alternative2(); void generate_Alternative(); void generate_ItemSet(); void get_Big_set(); void get2B(); void begin(); void countWord(char s); private: void countSupport(); void getNextSet(); void output1(); void output2(); bool compareS(const vector<string> s1,const vector<string> s2); string getRes(); struct item{ string key; int value; }; struct mutiItem{ vector<string> key; int value; }; int support; vector<mutiItem> vec_n_current; vector<mutiItem> vec_muti_pre; vector<mutiItem> vec_muti_current; vector<item> vec_item; vector<string> vec_str; map<string,int> sequence; string file; }; #endif //end APRIOR_H //aprior.cpp #include <algorithm> #include <iostream> #include <map> #include <sstream> #include <vector> #include <fstream> #include "aprior.h" #include <string> #include <cassert> #include <cstdio> using namespace std; //open file and each line of file as input database; void aprior::openData() { ifstream in(file.c_str()); assert(in != NULL); string word; while(getline(in,word)) { vec_str.push_back(word); } } //1 genetative whit structure map void aprior::countWord(char split) { string line,word; for(vector<string>::iterator iter = vec_str.begin(); iter != vec_str.end(); ++iter){ line = *iter; while(line.find(split) != -1) { word = line.substr(0,line.find(split)); ++sequence[word]; line = line.substr(line.find(split)+1,line.size()-1); } ++sequence[line]; } } void aprior::generate_1Itemset() { item ite; for(map<string,int>::iterator iter = sequence.begin(); iter !=sequence.end(); ++iter) { if(iter->second >= support) { ite.key = iter->first; ite.value = iter->second; vec_item.push_back(ite); } } } void aprior::generate_Alternative2() { mutiItem mutiTmp; vector<string> vecTmp; for(vector<item>::iterator iter1 = vec_item.begin(); iter1 != vec_item.end() -1 ; ++iter1){ vecTmp.push_back(iter1->key); for(vector<item>::iterator iter2 = iter1+1; iter2 !=vec_item.end(); ++iter2){ vecTmp.push_back(iter2->key); mutiTmp.key = vecTmp; mutiTmp.value = 0; vec_muti_pre.push_back(mutiTmp); vecTmp.pop_back(); } vecTmp.clear(); } } void aprior::countSupport() { bool flag = true; vector<string> vec; for(vector<mutiItem>::iterator iter1 = vec_muti_pre.begin(); iter1 != vec_muti_pre.end(); ++iter1){ vec = iter1->key; for(vector<string>::iterator iter2 = vec_str.begin(); iter2 != vec_str.end(); ++iter2) { flag = true; for(vector<string>::iterator iter3 = vec.begin(); iter3 != vec.end(); ++iter3){ if(iter2->find(*iter3) == -1) { flag =false; break; } } if(flag) ++iter1->value; } } } void aprior::get2B() { ofstream out; out.open(getRes().c_str(),ios::app); out<<endl; out<<"***************************************"<<endl; vector<string> vecTmp; bool flag = true; for(vector<item>::iterator iter1 = vec_item.begin(); iter1 != vec_item.end(); ++iter1) { flag = true; for(vector<mutiItem>::iterator iter = vec_muti_current.begin(); iter != vec_muti_current.end(); ++iter) { vecTmp =iter->key; if(find(vecTmp.begin(),vecTmp.end(),iter1->key) != vecTmp.end()) { flag =false; break; } } if(flag) out<<iter1->key<<endl; } out<<"****************************************"<<endl; out<<endl; out.close(); } void aprior::generate_ItemSet() { vec_muti_current.clear(); for(vector<mutiItem>::iterator iter = vec_muti_pre.begin(); iter != vec_muti_pre.end(); ++iter){ if(iter->value >= support){ vec_muti_current.push_back(*iter); } } } void aprior::getNextSet() { vec_n_current.clear(); vec_n_current = vec_muti_current; } void aprior::generate_Alternative(){ vec_muti_pre.clear(); int diff = 0; vector<string> vecTmp; string tmp; mutiItem muti; bool sa; for(vector<mutiItem>::iterator iter1 = vec_muti_current.begin(); iter1 != vec_muti_current.end()-1; ++iter1) { vecTmp = iter1->key; for(vector<mutiItem>::iterator iter2 = iter1+1; iter2 != vec_muti_current.end(); ++iter2){ diff = 0; for(vector<string>::iterator iter3 = iter2->key.begin(); iter3 != iter2->key.end(); ++iter3){ if(find(vecTmp.begin(),*iter3) == vecTmp.end()) { ++diff; tmp = *iter3; } } if(diff == 1) { vecTmp.push_back(tmp); muti.key = vecTmp; muti.value = 0; sa = true; for(vector<mutiItem>::iterator itersa = vec_muti_pre.begin(); itersa != vec_muti_pre.end(); ++itersa) { if(compareS(itersa->key,muti.key)) { sa = false; break; } } if(sa) vec_muti_pre.push_back(muti); vecTmp.pop_back(); } } } } bool aprior::compareS(vector<string> s1,vector<string> s2) { for(vector<string>::iterator iter1 = s1.begin(); iter1 != s1.end(); ++iter1) { if(find(s2.begin(),s2.end(),*iter1)==s2.end()) return false; } return true; } void aprior::get_Big_set() { ofstream out; out.open(getRes().c_str(),ios::app); out<<endl; out<<"****************************************************************"<<endl; bool flag = false; vector<string> vecTmp; for(vector<mutiItem>::iterator iter1 = vec_n_current.begin(); iter1 != vec_n_current.end(); ++iter1){ vecTmp = iter1->key; flag = false; for(vector<mutiItem>::iterator iter2 = vec_muti_current.begin(); iter2 != vec_muti_current.end(); ++iter2) { if(compareS(vecTmp,iter2->key)) flag = true; } if(!flag) { for(vector<string>::iterator iter = vecTmp.begin(); iter != vecTmp.end(); ++iter) out<<*iter<<"t"; out<<endl; } } out<<"*************************************************************"<<endl; out<<endl; } void aprior::begin() { generate_1Itemset(); output1(); generate_Alternative2(); countSupport(); generate_ItemSet(); output2(); get2B(); while(1) { generate_Alternative(); getNextSet(); countSupport(); generate_ItemSet(); if(vec_muti_current.size() == 0) return; output2(); get_Big_set(); } } string aprior::getRes() { string tmp; stringstream ss; ss<<support; tmp = ss.str(); tmp = file+tmp; return tmp; } void aprior::output1() { ofstream out; string tmp = getRes(); out.open(tmp.c_str(),ios::app); for(vector<item>::iterator iter = vec_item.begin(); iter != vec_item.end(); ++iter) out<<iter->key<<'t'<<iter->value<<endl; out.close(); } void aprior::output2() { ofstream out; out.open(getRes().c_str(),ios::app); vector<string> vecTmp; string tmp; for(vector<mutiItem>::iterator iter = vec_muti_current.begin(); iter != vec_muti_current.end(); ++iter){ vecTmp = iter->key; for(vector<string>::iterator iters = vecTmp.begin(); iters!= vecTmp.end(); ++iters) out<<*iters<<"t"; out<<iter->value<<endl; } out.close(); } aprior::~aprior() { } //main.cpp #include <iostream> #include "aprior.h" using namespace std; void test(int support,string file) { aprior ap(support,file); ap.openData(); ap.countWord(' '); ap.begin(); } int main() { test(50,"data.txt"); return 0; }在编译时候用g++ -c aprior.cpp main.cpp 就可以 要生成可执行文件需要g++ aprior.cpp main.cpp -o aprior就可以了。 在接下来要写FP-TREE的算法对样本就行测试对比效率。? (编辑:李大同) 【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时与联系站长删除相关内容! |