---
tags: Data Mining, C++
disqus: HackMD
---
資料探勘 Homework #1
===
Association Analysis
---
* Dataset1: Select from kaggle.com / UCI
* Dataset2: Use IBM Quest Synthetic Data Generator
* https://sourceforge.net/projects/ibmquestdatagen/
* Generate different datasets
* Implement Apriori Algorithm and apply on these datasets
* Hash? Tree? (optional)
* FP-growth (optional)
* Compare your results
```cpp=
#include <iostream>
#include <fstream>
#include <cstring>
#include <string>
#include <cstdlib>
#include <map>
#include <set>
#include <vector>
#include <algorithm>
using namespace std;
char *input_filename;
char *output_filename;
double min_support;
//double min_confidence;
vector<vector<string>> transactions; // DB
vector<string> Items; // Frequent 1-itemsets (L1)
vector<int> SupportCount;
void getTransactions(string input_filename, int startRow, int startCol, char splitChar, char lineChar);
void splitStr2Vec(string s, vector<string> &buf, char splitChar);
int getSupportCount(vector<string> Itemset);
vector<vector<string>> prune(vector<vector<string>> S);
vector<vector<string>> removeDup(vector<vector<string>> itemset);
vector<vector<string>> handleFreqItemset(vector<vector<string>> S1, vector<vector<string>> S2);
int present_in_vec(vector<string> S, string a);
void displayItemsets(ofstream &outfile, vector<vector<string>> itemset);
void TestdisplayItemsets(vector<vector<string>> itemset);
int main(int argc,char *argv[]) {
if (argc < 4) {
cout << "請輸入參數:[0] input_filename [1] min_support [2] output_filename" << endl;
return 0;
}
input_filename = argv[1];
min_support = atof(argv[2]);
output_filename = argv[3];
//min_confidence = atof(argv[4]);
clock_t startClock, finishClock; //計算處理時間
// 資料整理
string filename = input_filename;
int startRow = 0; // 欲擷取開始列
int startCol = 0; // 欲擷取欄位
char splitChar = ' '; // 分割字元
char lineChar = '\n'; // 換行符號
string ext; // 檔案副檔名
ext = filename.substr(filename.length() - 4, 4);
if (ext == ".csv") {
splitChar = ',';
}
if (filename == "BreadBasket_DMS.csv") {
startRow = 1;
startCol = 2;
lineChar = '\r';
}
else if (filename == "IBMData.data") {
startRow = 0;
startCol = 2;
lineChar = '\r';
}
else if (filename == "Test.csv") {
startRow = 1;
startCol = 0;
}
else if (filename == "Test.txt") {
startRow = 0;
startCol = 0;
}
getTransactions(input_filename, startRow, startCol, splitChar, lineChar);
ofstream outfile(output_filename);
outfile << "source:" << filename << endl;
outfile << "minSupport:" << min_support << endl;
// 顯示DB資料
//TestdisplayItemsets(transactions);
//cout << endl;
//outfile << endl;
//return 0;
// 取得Large itemsets與Candicate itemsets
vector<vector<string>> C1;
vector<vector<vector<string>>> L; // Large itemsets(Frequent itemsets)
vector<vector<vector<string>>> C; // 置信度之集合 Candicate itemsets
startClock = clock(); // 開始計時
for (int i = 0; i < Items.size(); i++) {
vector<string> temp;
temp.emplace_back(Items[i]);
C1.emplace_back(temp);
temp.clear();
}
C.emplace_back(C1); // 將 L1 存到 L[0]
L.emplace_back(prune(C1)); // 將 C1 存到 C[0]
for (int i = 1; i < C1.size(); i++) {
C.emplace_back(handleFreqItemset(L[i - 1], L[0]));
C[i] = removeDup(C[i]);
L.emplace_back(prune(C[i]));
}
finishClock = clock(); // 結束計時
cout << "C1" << endl;
outfile << "C1" << endl;
displayItemsets(outfile, C1);
cout << "L1" << endl;
outfile << "L1" << endl;
displayItemsets(outfile, L[0]);
for (int i = 1; i < C1.size(); i++) {
if (C[i].size() != 0) {
cout << "C" << i + 1 << endl;
outfile << "C" << i + 1 << endl;
displayItemsets(outfile, C[i]);
}
if (L[i].size() != 0) {
cout << "L" << i + 1 << endl;
outfile << "L" << i + 1 << endl;
displayItemsets(outfile, L[i]);
}
}
double duration = (double)(finishClock - startClock) / CLOCKS_PER_SEC;
printf( "%f seconds\n", duration);
outfile << to_string(duration) << " seconds" << endl;
outfile.close();
return 1;
}
void getTransactions(string input_filename, int startRow, int startCol, char splitChar, char lineChar) {
string str;
ifstream file(input_filename);
if (!file) {
cerr << "input_filename open fail:" << input_filename << endl;
exit(1);
}
// 資料整理
int x = 0;
vector<string> temp;
vector<string> buf;
while (getline(file, str, lineChar)) {
if (startRow != 0) {
startRow--;
continue;
}
buf.clear();
splitStr2Vec(str, buf, splitChar);
if (x != 0 && x != stoi(buf[startCol])) {
transactions.emplace_back(temp);
temp.clear();
}
x = stoi(buf[startCol]);
temp.emplace_back(buf[startCol + 1]);
Items.emplace_back(buf[startCol + 1]);
}
transactions.emplace_back(temp);
temp.clear();
buf.clear();
sort(Items.begin(), Items.end());
Items.erase(unique(Items.begin(), Items.end()), Items.end());
file.close();
}
void splitStr2Vec(string s, vector<string> &buf, char splitChar) {
int current = 0; //最初由 0 的位置開始找
int next;
while (1)
{
next = s.find_first_of(splitChar, current);
if (next != current)
{
string tmp = s.substr(current, next - current);
if (tmp.size() != 0) //忽略空字串
buf.emplace_back(tmp);
}
if (next == string::npos) break;
current = next + 1; //下次由 next + 1 的位置開始找起。
}
}
int getSupportCount(vector<string> Itemset) {
int count = 0;
for (int i = 0; i < transactions.size(); i++) {
int cnt = 0;
for (int j = 0; j < transactions[i].size(); j++) {
for (int k = 0; k < Itemset.size(); k++) {
if (Itemset[k] == transactions[i][j])
cnt++;
}
}
if (cnt==Itemset.size())
count++;
}
return count;
}
vector<vector<string>> prune(vector<vector<string>> C) {
vector<vector<string>> L;
for (int i = 0; i < C.size(); i++) {
int supportCount = getSupportCount(C[i]);
if (supportCount >= min_support) {
L.push_back(C[i]);
}
}
return L;
}
vector<vector<string>> removeDup(vector<vector<string>> itemset) {
sort(itemset.begin(), itemset.end());
itemset.erase(unique(itemset.begin(), itemset.end()), itemset.end());
return itemset;
}
vector<vector<string>> handleFreqItemset(vector<vector<string>> L1, vector<vector<string>> C2) {
vector<vector<string>> result;
for (int i = 0; i < L1.size(); i++) {
for (int j = i; j < C2.size(); j++) {
vector<string> row;
for (int k = 0; k < L1[i].size(); k++) {
row.emplace_back(L1[i][k]);
}
if (!present_in_vec(row, C2[j][0])) {
row.emplace_back(C2[j][0]);
sort(row.begin(), row.end());
result.emplace_back(row);
}
}
}
return result;
}
int present_in_vec(vector<string> S, string a) {
int flag = 0;
for (int i = 0; i < S.size(); i++) {
if (S[i] == a)
flag = 1;
}
return flag;
}
void displayItemsets(ofstream &outfile, vector<vector<string>> itemset) {
for (int i = 0; i < itemset.size(); i++) {
int sup = getSupportCount(itemset[i]);
cout << "count:" << sup << " ";
outfile << "count:" << sup << "\t";
for (int j = 0; j < itemset[i].size(); j++) {
cout << itemset[i][j];
outfile << itemset[i][j];
if (j != itemset[i].size() - 1) {
cout << ",";
outfile << ",";
}
}
cout << endl;
outfile << endl;
}
}
void TestdisplayItemsets(vector<vector<string>> itemset) {
//ofstream outfile(output_filename);
for (int i = 0; i < itemset.size(); i++) {
for (int j = 0; j < itemset[i].size(); j++) {
cout << itemset[i][j];
//outfile << itemset[i][j];
if (j != itemset[i].size() - 1) {
cout << ",";
//outfile << ",";
}
}
cout << endl;
//outfile << endl;
}
//outfile.close();
}
```