資料探勘 Homework #1

--- tags: Data Mining, C++ disqus: HackMD --- 資料探勘 Homework #1 === Association Analysis --- * Dataset1: Select from kaggle.com / UCI * Dataset2: Use IBM Quest Synthetic Data Generator * https://sourceforge.net/projects/ibmquestdatagen/ * Generate different datasets * Implement Apriori Algorithm and apply on these datasets * Hash? Tree? (optional) * FP-growth (optional) * Compare your results ```cpp= #include <iostream> #include <fstream> #include <cstring> #include <string> #include <cstdlib> #include <map> #include <set> #include <vector> #include <algorithm> using namespace std; char *input_filename; char *output_filename; double min_support; //double min_confidence; vector<vector<string>> transactions; // DB vector<string> Items; // Frequent 1-itemsets (L1) vector<int> SupportCount; void getTransactions(string input_filename, int startRow, int startCol, char splitChar, char lineChar); void splitStr2Vec(string s, vector<string> &buf, char splitChar); int getSupportCount(vector<string> Itemset); vector<vector<string>> prune(vector<vector<string>> S); vector<vector<string>> removeDup(vector<vector<string>> itemset); vector<vector<string>> handleFreqItemset(vector<vector<string>> S1, vector<vector<string>> S2); int present_in_vec(vector<string> S, string a); void displayItemsets(ofstream &outfile, vector<vector<string>> itemset); void TestdisplayItemsets(vector<vector<string>> itemset); int main(int argc,char *argv[]) { if (argc < 4) { cout << "請輸入參數：[0] input_filename [1] min_support [2] output_filename" << endl; return 0; } input_filename = argv[1]; min_support = atof(argv[2]); output_filename = argv[3]; //min_confidence = atof(argv[4]); clock_t startClock, finishClock; //計算處理時間 // 資料整理 string filename = input_filename; int startRow = 0; // 欲擷取開始列 int startCol = 0; // 欲擷取欄位 char splitChar = ' '; // 分割字元 char lineChar = '\n'; // 換行符號 string ext; // 檔案副檔名 ext = filename.substr(filename.length() - 4, 4); if (ext == ".csv") { splitChar = ','; } if (filename == "BreadBasket_DMS.csv") { startRow = 1; startCol = 2; lineChar = '\r'; } else if (filename == "IBMData.data") { startRow = 0; startCol = 2; lineChar = '\r'; } else if (filename == "Test.csv") { startRow = 1; startCol = 0; } else if (filename == "Test.txt") { startRow = 0; startCol = 0; } getTransactions(input_filename, startRow, startCol, splitChar, lineChar); ofstream outfile(output_filename); outfile << "source:" << filename << endl; outfile << "minSupport:" << min_support << endl; // 顯示DB資料 //TestdisplayItemsets(transactions); //cout << endl; //outfile << endl; //return 0; // 取得Large itemsets與Candicate itemsets vector<vector<string>> C1; vector<vector<vector<string>>> L; // Large itemsets(Frequent itemsets) vector<vector<vector<string>>> C; // 置信度之集合 Candicate itemsets startClock = clock(); // 開始計時 for (int i = 0; i < Items.size(); i++) { vector<string> temp; temp.emplace_back(Items[i]); C1.emplace_back(temp); temp.clear(); } C.emplace_back(C1); // 將 L1 存到 L[0] L.emplace_back(prune(C1)); // 將 C1 存到 C[0] for (int i = 1; i < C1.size(); i++) { C.emplace_back(handleFreqItemset(L[i - 1], L[0])); C[i] = removeDup(C[i]); L.emplace_back(prune(C[i])); } finishClock = clock(); // 結束計時 cout << "C1" << endl; outfile << "C1" << endl; displayItemsets(outfile, C1); cout << "L1" << endl; outfile << "L1" << endl; displayItemsets(outfile, L[0]); for (int i = 1; i < C1.size(); i++) { if (C[i].size() != 0) { cout << "C" << i + 1 << endl; outfile << "C" << i + 1 << endl; displayItemsets(outfile, C[i]); } if (L[i].size() != 0) { cout << "L" << i + 1 << endl; outfile << "L" << i + 1 << endl; displayItemsets(outfile, L[i]); } } double duration = (double)(finishClock - startClock) / CLOCKS_PER_SEC; printf( "%f seconds\n", duration); outfile << to_string(duration) << " seconds" << endl; outfile.close(); return 1; } void getTransactions(string input_filename, int startRow, int startCol, char splitChar, char lineChar) { string str; ifstream file(input_filename); if (!file) { cerr << "input_filename open fail:" << input_filename << endl; exit(1); } // 資料整理 int x = 0; vector<string> temp; vector<string> buf; while (getline(file, str, lineChar)) { if (startRow != 0) { startRow--; continue; } buf.clear(); splitStr2Vec(str, buf, splitChar); if (x != 0 && x != stoi(buf[startCol])) { transactions.emplace_back(temp); temp.clear(); } x = stoi(buf[startCol]); temp.emplace_back(buf[startCol + 1]); Items.emplace_back(buf[startCol + 1]); } transactions.emplace_back(temp); temp.clear(); buf.clear(); sort(Items.begin(), Items.end()); Items.erase(unique(Items.begin(), Items.end()), Items.end()); file.close(); } void splitStr2Vec(string s, vector<string> &buf, char splitChar) { int current = 0; //最初由 0 的位置開始找 int next; while (1) { next = s.find_first_of(splitChar, current); if (next != current) { string tmp = s.substr(current, next - current); if (tmp.size() != 0) //忽略空字串 buf.emplace_back(tmp); } if (next == string::npos) break; current = next + 1; //下次由 next + 1 的位置開始找起。 } } int getSupportCount(vector<string> Itemset) { int count = 0; for (int i = 0; i < transactions.size(); i++) { int cnt = 0; for (int j = 0; j < transactions[i].size(); j++) { for (int k = 0; k < Itemset.size(); k++) { if (Itemset[k] == transactions[i][j]) cnt++; } } if (cnt==Itemset.size()) count++; } return count; } vector<vector<string>> prune(vector<vector<string>> C) { vector<vector<string>> L; for (int i = 0; i < C.size(); i++) { int supportCount = getSupportCount(C[i]); if (supportCount >= min_support) { L.push_back(C[i]); } } return L; } vector<vector<string>> removeDup(vector<vector<string>> itemset) { sort(itemset.begin(), itemset.end()); itemset.erase(unique(itemset.begin(), itemset.end()), itemset.end()); return itemset; } vector<vector<string>> handleFreqItemset(vector<vector<string>> L1, vector<vector<string>> C2) { vector<vector<string>> result; for (int i = 0; i < L1.size(); i++) { for (int j = i; j < C2.size(); j++) { vector<string> row; for (int k = 0; k < L1[i].size(); k++) { row.emplace_back(L1[i][k]); } if (!present_in_vec(row, C2[j][0])) { row.emplace_back(C2[j][0]); sort(row.begin(), row.end()); result.emplace_back(row); } } } return result; } int present_in_vec(vector<string> S, string a) { int flag = 0; for (int i = 0; i < S.size(); i++) { if (S[i] == a) flag = 1; } return flag; } void displayItemsets(ofstream &outfile, vector<vector<string>> itemset) { for (int i = 0; i < itemset.size(); i++) { int sup = getSupportCount(itemset[i]); cout << "count:" << sup << " "; outfile << "count:" << sup << "\t"; for (int j = 0; j < itemset[i].size(); j++) { cout << itemset[i][j]; outfile << itemset[i][j]; if (j != itemset[i].size() - 1) { cout << ","; outfile << ","; } } cout << endl; outfile << endl; } } void TestdisplayItemsets(vector<vector<string>> itemset) { //ofstream outfile(output_filename); for (int i = 0; i < itemset.size(); i++) { for (int j = 0; j < itemset[i].size(); j++) { cout << itemset[i][j]; //outfile << itemset[i][j]; if (j != itemset[i].size() - 1) { cout << ","; //outfile << ","; } } cout << endl; //outfile << endl; } //outfile.close(); } ```