Hash-Based Apriori Algorithm

hash_based_apriori.cpp

#include <iostream>
#include <fstream>
#include <vector>
#include <set>
#include <map>
#include <unordered_map>
#include <sstream>
#include <algorithm>

using namespace std;

typedef set<int> Itemset;


// Maps an itemset to a bucket index by summing its items and reducing the
// sum modulo the bucket count.
//
// @param itemset  Items to hash (any size, may be empty).
// @return         Bucket index in the range [0, 40). An empty itemset maps to 0.
//
// The sum is accumulated in a 64-bit integer so that large item identifiers
// cannot overflow a signed int (undefined behaviour). The remainder is
// normalised because `%` yields a negative result for a negative sum, which
// would fall outside the bucket range.
int hashItemset(const Itemset& itemset) {
    const long long bucketCount = 40; // 40 buckets

    long long sum = 0;
    for (int item : itemset) {
        sum += item;
    }

    // Fold a possibly negative remainder back into [0, bucketCount).
    const long long bucket = ((sum % bucketCount) + bucketCount) % bucketCount;
    return static_cast<int>(bucket);
}


// Builds candidate 2-itemsets using hash-bucket pruning.
//
// Pass 1 enumerates every unordered pair of items in every transaction,
// increments the bucket that hashItemset() assigns to the pair, and remembers
// each distinct pair once. Pass 2 keeps a pair only when its bucket total has
// reached minSupport.
//
// @param transactions  Transactions to scan.
// @param hashBuckets   In/out bucket counters keyed by hash value; this call
//                      adds to whatever counts are already stored.
// @param minSupport    Minimum bucket count a pair's bucket must reach.
// @return              Surviving pairs as 2-item sets, ordered by
//                      (smaller item, larger item).
//
// Note: a bucket count is an upper bound on the support of any pair hashed
// into it, so the returned candidates still need an exact support count.
vector<Itemset> generateHashBasedCandidates(const vector<Itemset>& transactions, map<int, int>& hashBuckets, int minSupport) {
    // Distinct pairs seen so far; set iteration is ascending, so `first` is
    // always the smaller item of each pair.
    set<pair<int, int>> seenPairs;

    for (const Itemset& basket : transactions) {
        for (auto first = basket.begin(); first != basket.end(); ++first) {
            auto second = first;
            for (++second; second != basket.end(); ++second) {
                const Itemset pairSet = { *first, *second };
                ++hashBuckets[hashItemset(pairSet)];
                seenPairs.insert(make_pair(*first, *second));
            }
        }
    }

    vector<Itemset> survivors;
    for (const auto& seen : seenPairs) {
        const Itemset pairSet = { seen.first, seen.second };

        // Every recorded pair had its bucket incremented above, so the
        // lookup always succeeds; the end() check is purely defensive.
        const auto bucket = hashBuckets.find(hashItemset(pairSet));
        if (bucket != hashBuckets.end() && bucket->second >= minSupport) {
            survivors.push_back(pairSet);
        }
    }

    return survivors;
}


// Counts, for each candidate itemset, how many transactions contain it.
//
// @param transactions  Transactions to scan.
// @param candidates    Itemsets whose support should be counted.
// @return              Map from candidate to the number of transactions that
//                      include every item of that candidate. Candidates that
//                      match no transaction are absent from the map.
map<Itemset, int> countSupport(const vector<Itemset>& transactions, const vector<Itemset>& candidates) {
    map<Itemset, int> support;

    for (const Itemset& wanted : candidates) {
        // Both ranges are ordered sets, which is what includes() requires.
        const auto hits = count_if(transactions.begin(), transactions.end(),
            [&wanted](const Itemset& basket) {
                return includes(basket.begin(), basket.end(), wanted.begin(), wanted.end());
            });

        if (hits > 0) {
            // += so that a candidate listed more than once accumulates.
            support[wanted] += static_cast<int>(hits);
        }
    }

    return support;
}


// Selects the itemsets whose recorded count reaches the support threshold.
//
// @param candidateCounts  Map from itemset to its support count.
// @param minSupport       Inclusive lower bound on the count.
// @return                 Itemsets with count >= minSupport, in the map's
//                         iteration order.
vector<Itemset> filterBySupport(const map<Itemset, int>& candidateCounts, int minSupport) {
    vector<Itemset> kept;

    for (auto entry = candidateCounts.begin(); entry != candidateCounts.end(); ++entry) {
        if (entry->second < minSupport) {
            continue; // below threshold
        }
        kept.push_back(entry->first);
    }

    return kept;
}

// Writes each itemset on its own line, every item followed by a single space.
static void writeItemsets(ostream& out, const vector<Itemset>& itemsets) {
    for (const auto& itemset : itemsets) {
        for (int item : itemset) {
            out << item << " ";
        }
        out << '\n';
    }
}

// Hash-based Apriori algorithm.
//
// Finds the frequent 1-itemsets and frequent 2-itemsets of `transactions`
// and writes them to `outputFilename`.
//
// @param transactions      Transactions to mine; each is a set of item ids.
// @param globalMinSupport  Minimum number of transactions an itemset must
//                          appear in to be reported (absolute count).
// @param outputFilename    Path of the text file to create/overwrite.
//
// On success prints "Results saved to <file>" to stdout. If the output file
// cannot be opened or written, an error is printed to stderr instead and no
// success message is emitted.
void hashBasedApriori(const vector<Itemset>& transactions, int globalMinSupport, const string& outputFilename) {
    ofstream outputFile(outputFilename);
    if (!outputFile) {
        // Previously an unopened stream was silently ignored and success was
        // still reported; fail loudly before doing any work.
        cerr << "Unable to open output file " << outputFilename << endl;
        return;
    }

    // Support of single items: each transaction is a set, so an item is
    // counted at most once per transaction.
    map<int, int> itemCounts;
    for (const auto& transaction : transactions) {
        for (int item : transaction) {
            itemCounts[item]++;
        }
    }

    vector<Itemset> frequentItemsets;
    for (const auto& itemCount : itemCounts) {
        if (itemCount.second >= globalMinSupport) {
            frequentItemsets.push_back({ itemCount.first });
        }
    }

    // The 1-itemset header is written even when no item is frequent.
    outputFile << "Frequent 1-itemsets:\n";
    writeItemsets(outputFile, frequentItemsets);

    // Candidate pairs are pruned by hash-bucket counts first ...
    map<int, int> hashBuckets;
    vector<Itemset> candidates = generateHashBasedCandidates(transactions, hashBuckets, globalMinSupport);

    // ... then confirmed with an exact support count, because several pairs
    // can share one bucket.
    map<Itemset, int> candidateCounts = countSupport(transactions, candidates);
    frequentItemsets = filterBySupport(candidateCounts, globalMinSupport);

    if (!frequentItemsets.empty()) {
        outputFile << "\nFrequent 2-itemsets:\n";
        writeItemsets(outputFile, frequentItemsets);
    }

    // close() flushes; a failed write or flush leaves the stream in a failed state.
    outputFile.close();
    if (!outputFile) {
        cerr << "Failed to write results to " << outputFilename << endl;
        return;
    }
    cout << "Results saved to " << outputFilename << endl;
}

// Reads transactions from a text file.
//
// Each line of the file becomes one transaction: the whitespace-separated
// integers on the line are inserted into a set, so duplicates collapse.
// Parsing of a line stops at the first token that is not an integer. A line
// with no integers yields an empty transaction.
//
// @param filename  Path of the file to read.
// @return          One Itemset per line, in file order. If the file cannot be
//                  opened, a message is written to stderr and the result is
//                  empty.
vector<Itemset> readTransactions(const string& filename) {
    vector<Itemset> parsed;
    ifstream input(filename);

    if (!input.is_open()) {
        cerr << "Unable to open file" << endl;
        return parsed;
    }

    for (string row; getline(input, row); ) {
        istringstream tokens(row);
        Itemset basket;
        for (int value; tokens >> value; ) {
            basket.insert(value);
        }
        parsed.push_back(basket);
    }

    return parsed;
}

int main() {
    string inputFilename = "td.txt";
    string outputFilename = "output_hash.txt";
    int globalMinSupport = 10;  

    vector<Itemset> transactions = readTransactions(inputFilename);

    hashBasedApriori(transactions, globalMinSupport, outputFilename);

    return 0;
}